/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/fileattr.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/shmem_fs.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
#include <linux/iversion.h>
#include "swap.h"

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
#include <linux/uuid.h>

#include <linux/uaccess.h>

#include "internal.h"

#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
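
/*
 * For example, with a 4KiB PAGE_SIZE: BLOCKS_PER_PAGE is 8, so each data
 * page is reported as eight 512-byte blocks in i_blocks, and VM_ACCT(5000)
 * is 2, the number of whole pages a 5000-byte object is charged for.
 */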

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_rwsem making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};

struct shmem_options {
	unsigned long long blocks;
	unsigned long long inodes;
	struct mempolicy *mpol;
	kuid_t uid;
	kgid_t gid;
	umode_t mode;
	bool full_inums;
	int huge;
	int seen;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
};

#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages() / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	unsigned long nr_pages = totalram_pages();

	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
}
#endif

static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
			     struct folio **foliop, enum sgp_type sgp,
			     gfp_t gfp, struct vm_area_struct *vma,
			     vm_fault_t *fault_type);

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

static inline int shmem_reacct_size(unsigned long flags,
		loff_t oldsize, loff_t newsize)
{
	if (!(flags & VM_NORESERVE)) {
		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
			return security_vm_enough_memory_mm(current->mm,
					VM_ACCT(newsize) - VM_ACCT(oldsize));
		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
	}
	return 0;
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_block(unsigned long flags, long pages)
{
	if (!(flags & VM_NORESERVE))
		return 0;

	return security_vm_enough_memory_mm(current->mm,
			pages * VM_ACCT(PAGE_SIZE));
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}

static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (shmem_acct_block(info->flags, pages))
		return false;

	if (sbinfo->max_blocks) {
		if (percpu_counter_compare(&sbinfo->used_blocks,
					   sbinfo->max_blocks - pages) > 0)
			goto unacct;
		percpu_counter_add(&sbinfo->used_blocks, pages);
	}

	return true;

unacct:
	shmem_unacct_blocks(info->flags, pages);
	return false;
}

static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (sbinfo->max_blocks)
		percpu_counter_sub(&sbinfo->used_blocks, pages);
	shmem_unacct_blocks(info->flags, pages);
}
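
/*
 * Note the two layers above: shmem_acct_block() charges the VM overcommit
 * pool, but only for VM_NORESERVE objects (others were charged in full by
 * shmem_acct_size() at setup time); shmem_inode_acct_block() additionally
 * enforces the per-superblock block limit through the used_blocks percpu
 * counter whenever max_blocks is non-zero (0 means no limit).
 */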

static const struct super_operations shmem_ops;
const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;

bool vma_is_anon_shmem(struct vm_area_struct *vma)
{
	return vma->vm_ops == &shmem_anon_vm_ops;
}

bool vma_is_shmem(struct vm_area_struct *vma)
{
	return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

/*
 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
 * produces a novel ino for the newly allocated inode.
 *
 * It may also be called when making a hard link to permit the space needed by
 * each dentry. However, in that case, no new inode number is needed since that
 * internally draws from another pool of inode numbers (currently global
 * get_next_ino()). This case is indicated by passing NULL as inop.
 */
#define SHMEM_INO_BATCH 1024
static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	ino_t ino;

	if (!(sb->s_flags & SB_KERNMOUNT)) {
		raw_spin_lock(&sbinfo->stat_lock);
		if (sbinfo->max_inodes) {
			if (!sbinfo->free_inodes) {
				raw_spin_unlock(&sbinfo->stat_lock);
				return -ENOSPC;
			}
			sbinfo->free_inodes--;
		}
		if (inop) {
			ino = sbinfo->next_ino++;
			if (unlikely(is_zero_ino(ino)))
				ino = sbinfo->next_ino++;
			if (unlikely(!sbinfo->full_inums &&
				     ino > UINT_MAX)) {
				/*
				 * Emulate get_next_ino uint wraparound for
				 * compatibility
				 */
				if (IS_ENABLED(CONFIG_64BIT))
					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
						__func__, MINOR(sb->s_dev));
				sbinfo->next_ino = 1;
				ino = sbinfo->next_ino++;
			}
			*inop = ino;
		}
		raw_spin_unlock(&sbinfo->stat_lock);
	} else if (inop) {
		/*
		 * __shmem_file_setup, one of our callers, is lock-free: it
		 * doesn't hold stat_lock in shmem_reserve_inode since
		 * max_inodes is always 0, and is called from potentially
		 * unknown contexts. As such, use a per-cpu batched allocator
		 * which doesn't require the per-sb stat_lock unless we are at
		 * the batch boundary.
		 *
		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
		 * shmem mounts are not exposed to userspace, so we don't need
		 * to worry about things like glibc compatibility.
		 */
		ino_t *next_ino;

		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
		ino = *next_ino;
		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
			raw_spin_lock(&sbinfo->stat_lock);
			ino = sbinfo->next_ino;
			sbinfo->next_ino += SHMEM_INO_BATCH;
			raw_spin_unlock(&sbinfo->stat_lock);
			if (unlikely(is_zero_ino(ino)))
				ino++;
		}
		*inop = ino;
		*next_ino = ++ino;
		put_cpu();
	}

	return 0;
}
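
/*
 * Illustration of the batching above: on an SB_KERNMOUNT mount each CPU
 * hands out inode numbers from its own run of SHMEM_INO_BATCH (1024)
 * values, and only takes stat_lock to refill when the next value to hand
 * out is a multiple of the batch size, so the common path is lock-free.
 */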

static void shmem_free_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		raw_spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		raw_spin_unlock(&sbinfo->stat_lock);
	}
}

/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
static void shmem_recalc_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
	if (freed > 0) {
		info->alloced -= freed;
		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
		shmem_inode_unacct_blocks(inode, freed);
	}
}

bool shmem_charge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long flags;

	if (!shmem_inode_acct_block(inode, pages))
		return false;

	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
	inode->i_mapping->nrpages += pages;

	spin_lock_irqsave(&info->lock, flags);
	info->alloced += pages;
	inode->i_blocks += pages * BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irqrestore(&info->lock, flags);

	return true;
}

void shmem_uncharge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long flags;

	/* nrpages adjustment done by __filemap_remove_folio() or caller */

	spin_lock_irqsave(&info->lock, flags);
	info->alloced -= pages;
	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irqrestore(&info->lock, flags);

	shmem_inode_unacct_blocks(inode, pages);
}

/*
 * Replace item expected in xarray by a new item, while holding xa_lock.
 */
static int shmem_replace_entry(struct address_space *mapping,
			pgoff_t index, void *expected, void *replacement)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *item;

	VM_BUG_ON(!expected);
	VM_BUG_ON(!replacement);
	item = xas_load(&xas);
	if (item != expected)
		return -ENOENT;
	xas_store(&xas, replacement);
	return 0;
}

/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking page is not enough: by the time a SwapCache page is locked, it
 * might be reused, and again be SwapCache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
			       pgoff_t index, swp_entry_t swap)
{
	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
}

/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *	disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *	enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *	only allocate huge pages if the page will be fully within i_size,
 *	also respect fadvise()/madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *	only allocate huge pages if requested with fadvise()/madvise();
 */

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3

/*
 * Special values.
 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */

static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;

bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
		   struct mm_struct *mm, unsigned long vm_flags)
{
	loff_t i_size;

	if (!S_ISREG(inode->i_mode))
		return false;
	if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
		return false;
	if (shmem_huge == SHMEM_HUGE_DENY)
		return false;
	if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
		return true;

	switch (SHMEM_SB(inode->i_sb)->huge) {
	case SHMEM_HUGE_ALWAYS:
		return true;
	case SHMEM_HUGE_WITHIN_SIZE:
		index = round_up(index + 1, HPAGE_PMD_NR);
		i_size = round_up(i_size_read(inode), PAGE_SIZE);
		if (i_size >> PAGE_SHIFT >= index)
			return true;
		fallthrough;
	case SHMEM_HUGE_ADVISE:
		if (mm && (vm_flags & VM_HUGEPAGE))
			return true;
		fallthrough;
	default:
		return false;
	}
}
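
/*
 * Worked example for SHMEM_HUGE_WITHIN_SIZE above: with 4KiB pages and
 * 2MiB PMD huge pages (HPAGE_PMD_NR == 512), a fault at index 100 rounds
 * up to index 512, so a huge page is used only if the page-rounded i_size
 * covers at least 512 pages (2MiB); otherwise the decision falls through
 * to the fadvise()/madvise() test.
 */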

#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
{
	if (!strcmp(str, "never"))
		return SHMEM_HUGE_NEVER;
	if (!strcmp(str, "always"))
		return SHMEM_HUGE_ALWAYS;
	if (!strcmp(str, "within_size"))
		return SHMEM_HUGE_WITHIN_SIZE;
	if (!strcmp(str, "advise"))
		return SHMEM_HUGE_ADVISE;
	if (!strcmp(str, "deny"))
		return SHMEM_HUGE_DENY;
	if (!strcmp(str, "force"))
		return SHMEM_HUGE_FORCE;
	return -EINVAL;
}
#endif

#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static const char *shmem_format_huge(int huge)
{
	switch (huge) {
	case SHMEM_HUGE_NEVER:
		return "never";
	case SHMEM_HUGE_ALWAYS:
		return "always";
	case SHMEM_HUGE_WITHIN_SIZE:
		return "within_size";
	case SHMEM_HUGE_ADVISE:
		return "advise";
	case SHMEM_HUGE_DENY:
		return "deny";
	case SHMEM_HUGE_FORCE:
		return "force";
	default:
		VM_BUG_ON(1);
		return "bad_val";
	}
}
#endif

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	LIST_HEAD(list), *pos, *next;
	LIST_HEAD(to_remove);
	struct inode *inode;
	struct shmem_inode_info *info;
	struct folio *folio;
	unsigned long batch = sc ? sc->nr_to_scan : 128;
	int split = 0;

	if (list_empty(&sbinfo->shrinklist))
		return SHRINK_STOP;

	spin_lock(&sbinfo->shrinklist_lock);
	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);

		/* pin the inode */
		inode = igrab(&info->vfs_inode);

		/* inode is about to be evicted */
		if (!inode) {
			list_del_init(&info->shrinklist);
			goto next;
		}

		/* Check if there's anything to gain */
		if (round_up(inode->i_size, PAGE_SIZE) ==
				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
			list_move(&info->shrinklist, &to_remove);
			goto next;
		}

		list_move(&info->shrinklist, &list);
next:
		sbinfo->shrinklist_len--;
		if (!--batch)
			break;
	}
	spin_unlock(&sbinfo->shrinklist_lock);

	list_for_each_safe(pos, next, &to_remove) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;
		list_del_init(&info->shrinklist);
		iput(inode);
	}

	list_for_each_safe(pos, next, &list) {
		int ret;
		pgoff_t index;

		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;

		if (nr_to_split && split >= nr_to_split)
			goto move_back;

		index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
		folio = filemap_get_folio(inode->i_mapping, index);
		if (!folio)
			goto drop;

		/* No huge page at the end of the file: nothing to split */
		if (!folio_test_large(folio)) {
			folio_put(folio);
			goto drop;
		}

		/*
		 * Move the inode on the list back to shrinklist if we failed
		 * to lock the page at this time.
		 *
		 * Waiting for the lock may lead to deadlock in the
		 * reclaim path.
		 */
		if (!folio_trylock(folio)) {
			folio_put(folio);
			goto move_back;
		}

		ret = split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);

		/* If split failed move the inode on the list back to shrinklist */
		if (ret)
			goto move_back;

		split++;
drop:
		list_del_init(&info->shrinklist);
		goto put;
move_back:
		/*
		 * Make sure the inode is either on the global list or deleted
		 * from any local list before iput() since it could be deleted
		 * in another thread once we put the inode (then the local list
		 * is corrupted).
		 */
		spin_lock(&sbinfo->shrinklist_lock);
		list_move(&info->shrinklist, &sbinfo->shrinklist);
		sbinfo->shrinklist_len++;
		spin_unlock(&sbinfo->shrinklist_lock);
put:
		iput(inode);
	}

	return split;
}

static long shmem_unused_huge_scan(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (!READ_ONCE(sbinfo->shrinklist_len))
		return SHRINK_STOP;

	return shmem_unused_huge_shrink(sbinfo, sc, 0);
}

static long shmem_unused_huge_count(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	return READ_ONCE(sbinfo->shrinklist_len);
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */

#define shmem_huge SHMEM_HUGE_DENY

bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
		   struct mm_struct *mm, unsigned long vm_flags)
{
	return false;
}

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Like filemap_add_folio, but error if expected item has gone.
 */
static int shmem_add_to_page_cache(struct folio *folio,
				   struct address_space *mapping,
				   pgoff_t index, void *expected, gfp_t gfp,
				   struct mm_struct *charge_mm)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
	long nr = folio_nr_pages(folio);
	int error;

	VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
	VM_BUG_ON(expected && folio_test_large(folio));

	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = index;

	if (!folio_test_swapcache(folio)) {
		error = mem_cgroup_charge(folio, charge_mm, gfp);
		if (error) {
			if (folio_test_pmd_mappable(folio)) {
				count_vm_event(THP_FILE_FALLBACK);
				count_vm_event(THP_FILE_FALLBACK_CHARGE);
			}
			goto error;
		}
	}
	folio_throttle_swaprate(folio, gfp);

	do {
		xas_lock_irq(&xas);
		if (expected != xas_find_conflict(&xas)) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}
		if (expected && xas_find_conflict(&xas)) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}
		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;
		if (folio_test_pmd_mappable(folio)) {
			count_vm_event(THP_FILE_ALLOC);
			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
		}
		mapping->nrpages += nr;
		__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
		__lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas)) {
		error = xas_error(&xas);
		goto error;
	}

	return 0;
error:
	folio->mapping = NULL;
	folio_ref_sub(folio, nr);
	return error;
}

/*
 * Like delete_from_page_cache, but substitutes swap for @folio.
 */
static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{
	struct address_space *mapping = folio->mapping;
	long nr = folio_nr_pages(folio);
	int error;

	xa_lock_irq(&mapping->i_pages);
	error = shmem_replace_entry(mapping, folio->index, folio, radswap);
	folio->mapping = NULL;
	mapping->nrpages -= nr;
	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
	xa_unlock_irq(&mapping->i_pages);
	folio_put(folio);
	BUG_ON(error);
}

/*
 * Remove swap entry from page cache, free the swap and its page cache.
 */
static int shmem_free_swap(struct address_space *mapping,
			   pgoff_t index, void *radswap)
{
	void *old;

	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
	if (old != radswap)
		return -ENOENT;
	free_swap_and_cache(radix_to_swp_entry(radswap));
	return 0;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
						pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
	unsigned long swapped = 0;

	rcu_read_lock();
	xas_for_each(&xas, page, end - 1) {
		if (xas_retry(&xas, page))
			continue;
		if (xa_is_value(page))
			swapped++;

		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}

	rcu_read_unlock();

	return swapped << PAGE_SHIFT;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given vma is swapped out.
 *
 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;
	unsigned long swapped;

	/* Be careful as we don't hold info->lock */
	swapped = READ_ONCE(info->swapped);

	/*
	 * The easier cases are when the shmem object has nothing in swap, or
	 * the vma maps it whole. Then we can simply use the stats that we
	 * already track.
	 */
	if (!swapped)
		return 0;

	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
		return swapped << PAGE_SHIFT;

	/* Here comes the more involved part */
	return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
					vma->vm_pgoff + vma_pages(vma));
}

/*
 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
	struct folio_batch fbatch;
	pgoff_t index = 0;

	folio_batch_init(&fbatch);
	/*
	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
	 */
	while (!mapping_unevictable(mapping) &&
	       filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
		check_move_unevictable_folios(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}
}

static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
{
	struct folio *folio;

	/*
	 * At first avoid shmem_get_folio(,,,SGP_READ): that fails
	 * beyond i_size, and reports fallocated pages as holes.
	 */
	folio = __filemap_get_folio(inode->i_mapping, index,
					FGP_ENTRY | FGP_LOCK, 0);
	if (!xa_is_value(folio))
		return folio;
	/*
	 * But read a page back from swap if any of it is within i_size
	 * (although in some cases this is just a waste of time).
	 */
	folio = NULL;
	shmem_get_folio(inode, index, &folio, SGP_READ);
	return folio;
}

/*
 * Remove range of pages and swap entries from page cache, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
								 bool unfalloc)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
	struct folio_batch fbatch;
	pgoff_t indices[PAGEVEC_SIZE];
	struct folio *folio;
	bool same_folio;
	long nr_swaps_freed = 0;
	pgoff_t index;
	int i;

	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */
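
	/*
	 * For instance, punching bytes 1000..8191 of a file with 4KiB pages
	 * gives start == 1 and end == 2: page 0 is only partially covered,
	 * so it is left to the partial-folio handling below rather than
	 * being removed here as a whole page.
	 */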

	if (info->fallocend > start && info->fallocend <= end && !unfalloc)
		info->fallocend = start;

	folio_batch_init(&fbatch);
	index = start;
	while (index < end && find_lock_entries(mapping, &index, end - 1,
			&fbatch, indices)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];

			if (xa_is_value(folio)) {
				if (unfalloc)
					continue;
				nr_swaps_freed += !shmem_free_swap(mapping,
							indices[i], folio);
				continue;
			}

			if (!unfalloc || !folio_test_uptodate(folio))
				truncate_inode_folio(mapping, folio);
			folio_unlock(folio);
		}
		folio_batch_remove_exceptionals(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}

	/*
	 * When undoing a failed fallocate, we want none of the partial folio
	 * zeroing and splitting below, but shall want to truncate the whole
	 * folio when !uptodate indicates that it was added by this fallocate,
	 * even when [lstart, lend] covers only a part of the folio.
	 */
	if (unfalloc)
		goto whole_folios;

	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
	folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
	if (folio) {
		same_folio = lend < folio_pos(folio) + folio_size(folio);
		folio_mark_dirty(folio);
		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
			start = folio->index + folio_nr_pages(folio);
			if (same_folio)
				end = folio->index;
		}
		folio_unlock(folio);
		folio_put(folio);
		folio = NULL;
	}

	if (!same_folio)
		folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
	if (folio) {
		folio_mark_dirty(folio);
		if (!truncate_inode_partial_folio(folio, lstart, lend))
			end = folio->index;
		folio_unlock(folio);
		folio_put(folio);
	}

whole_folios:

	index = start;
	while (index < end) {
		cond_resched();

		if (!find_get_entries(mapping, &index, end - 1, &fbatch,
				indices)) {
			/* If all gone or hole-punch or unfalloc, we're done */
			if (index == start || end != -1)
				break;
			/* But if truncating, restart to make sure all gone */
			index = start;
			continue;
		}
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];

			if (xa_is_value(folio)) {
				if (unfalloc)
					continue;
				if (shmem_free_swap(mapping, indices[i], folio)) {
					/* Swap was replaced by page: retry */
					index = indices[i];
					break;
				}
				nr_swaps_freed++;
				continue;
			}

			folio_lock(folio);

			if (!unfalloc || !folio_test_uptodate(folio)) {
				if (folio_mapping(folio) != mapping) {
					/* Page was replaced by swap: retry */
					folio_unlock(folio);
					index = indices[i];
					break;
				}
				VM_BUG_ON_FOLIO(folio_test_writeback(folio),
						folio);
				truncate_inode_folio(mapping, folio);
			}
			folio_unlock(folio);
		}
		folio_batch_remove_exceptionals(&fbatch);
		folio_batch_release(&fbatch);
	}

	spin_lock_irq(&info->lock);
	info->swapped -= nr_swaps_freed;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);
}

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	shmem_undo_range(inode, lstart, lend, false);
	inode->i_ctime = inode->i_mtime = current_time(inode);
	inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

static int shmem_getattr(struct mnt_idmap *idmap,
			 const struct path *path, struct kstat *stat,
			 u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;
	struct shmem_inode_info *info = SHMEM_I(inode);

	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
		spin_lock_irq(&info->lock);
		shmem_recalc_inode(inode);
		spin_unlock_irq(&info->lock);
	}
	if (info->fsflags & FS_APPEND_FL)
		stat->attributes |= STATX_ATTR_APPEND;
	if (info->fsflags & FS_IMMUTABLE_FL)
		stat->attributes |= STATX_ATTR_IMMUTABLE;
	if (info->fsflags & FS_NODUMP_FL)
		stat->attributes |= STATX_ATTR_NODUMP;
	stat->attributes_mask |= (STATX_ATTR_APPEND |
			STATX_ATTR_IMMUTABLE |
			STATX_ATTR_NODUMP);
	generic_fillattr(idmap, inode, stat);

	if (shmem_is_huge(inode, 0, false, NULL, 0))
		stat->blksize = HPAGE_PMD_SIZE;

	if (request_mask & STATX_BTIME) {
		stat->result_mask |= STATX_BTIME;
		stat->btime.tv_sec = info->i_crtime.tv_sec;
		stat->btime.tv_nsec = info->i_crtime.tv_nsec;
	}

	return 0;
}

static int shmem_setattr(struct mnt_idmap *idmap,
			 struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct shmem_inode_info *info = SHMEM_I(inode);
	int error;
	bool update_mtime = false;
	bool update_ctime = true;

	error = setattr_prepare(idmap, dentry, attr);
	if (error)
		return error;

	if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
		if ((inode->i_mode ^ attr->ia_mode) & 0111)
			return -EPERM;
	}

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;

		/* protected by i_rwsem */
		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
			return -EPERM;

		if (newsize != oldsize) {
			error = shmem_reacct_size(SHMEM_I(inode)->flags,
					oldsize, newsize);
			if (error)
				return error;
			i_size_write(inode, newsize);
			update_mtime = true;
		} else {
			update_ctime = false;
		}
		if (newsize <= oldsize) {
			loff_t holebegin = round_up(newsize, PAGE_SIZE);
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);
			if (info->alloced)
				shmem_truncate_range(inode,
							newsize, (loff_t)-1);
			/* unmap again to remove racily COWed private pages */
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);
		}
	}

	setattr_copy(idmap, inode, attr);
	if (attr->ia_valid & ATTR_MODE)
		error = posix_acl_chmod(idmap, dentry, inode->i_mode);
	if (!error && update_ctime) {
		inode->i_ctime = current_time(inode);
		if (update_mtime)
			inode->i_mtime = inode->i_ctime;
		inode_inc_iversion(inode);
	}
	return error;
}

static void shmem_evict_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (shmem_mapping(inode->i_mapping)) {
		shmem_unacct_size(info->flags, inode->i_size);
		inode->i_size = 0;
		mapping_set_exiting(inode->i_mapping);
		shmem_truncate_range(inode, 0, (loff_t)-1);
		if (!list_empty(&info->shrinklist)) {
			spin_lock(&sbinfo->shrinklist_lock);
			if (!list_empty(&info->shrinklist)) {
				list_del_init(&info->shrinklist);
				sbinfo->shrinklist_len--;
			}
			spin_unlock(&sbinfo->shrinklist_lock);
		}
		while (!list_empty(&info->swaplist)) {
			/* Wait while shmem_unuse() is scanning this inode... */
			wait_var_event(&info->stop_eviction,
				       !atomic_read(&info->stop_eviction));
			mutex_lock(&shmem_swaplist_mutex);
			/* ...but beware of the race if we peeked too early */
			if (!atomic_read(&info->stop_eviction))
				list_del_init(&info->swaplist);
			mutex_unlock(&shmem_swaplist_mutex);
		}
	}

	simple_xattrs_free(&info->xattrs);
	WARN_ON(inode->i_blocks);
	shmem_free_inode(inode->i_sb);
	clear_inode(inode);
}

static int shmem_find_swap_entries(struct address_space *mapping,
				   pgoff_t start, struct folio_batch *fbatch,
				   pgoff_t *indices, unsigned int type)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct folio *folio;
	swp_entry_t entry;

	rcu_read_lock();
	xas_for_each(&xas, folio, ULONG_MAX) {
		if (xas_retry(&xas, folio))
			continue;

		if (!xa_is_value(folio))
			continue;

		entry = radix_to_swp_entry(folio);
		/*
		 * swapin error entries can be found in the mapping. But they're
		 * deliberately ignored here as we've done everything we can do.
		 */
		if (swp_type(entry) != type)
			continue;

		indices[folio_batch_count(fbatch)] = xas.xa_index;
		if (!folio_batch_add(fbatch, folio))
			break;

		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();

	return xas.xa_index;
}

/*
 * Move the swapped pages for an inode to page cache. Returns the count
 * of pages swapped in, or the error in case of failure.
 */
static int shmem_unuse_swap_entries(struct inode *inode,
		struct folio_batch *fbatch, pgoff_t *indices)
{
	int i = 0;
	int ret = 0;
	int error = 0;
	struct address_space *mapping = inode->i_mapping;

	for (i = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];

		if (!xa_is_value(folio))
			continue;
		error = shmem_swapin_folio(inode, indices[i],
					  &folio, SGP_CACHE,
					  mapping_gfp_mask(mapping),
					  NULL, NULL);
		if (error == 0) {
			folio_unlock(folio);
			folio_put(folio);
			ret++;
		}
		if (error == -ENOMEM)
			break;
		error = 0;
	}
	return error ? error : ret;
}

/*
 * If swap found in inode, free it and move page from swapcache to filecache.
 */
static int shmem_unuse_inode(struct inode *inode, unsigned int type)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start = 0;
	struct folio_batch fbatch;
	pgoff_t indices[PAGEVEC_SIZE];
	int ret = 0;

	do {
		folio_batch_init(&fbatch);
		shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
		if (folio_batch_count(&fbatch) == 0) {
			ret = 0;
			break;
		}

		ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
		if (ret < 0)
			break;

		start = indices[folio_batch_count(&fbatch) - 1];
	} while (true);

	return ret;
}

/*
 * Read all the shared memory data that resides in the swap
 * device 'type' back into memory, so the swap device can be
 * unused.
 */
int shmem_unuse(unsigned int type)
{
	struct shmem_inode_info *info, *next;
	int error = 0;

	if (list_empty(&shmem_swaplist))
		return 0;

	mutex_lock(&shmem_swaplist_mutex);
	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
		if (!info->swapped) {
			list_del_init(&info->swaplist);
			continue;
		}
		/*
		 * Drop the swaplist mutex while searching the inode for swap;
		 * but before doing so, make sure shmem_evict_inode() will not
		 * remove placeholder inode from swaplist, nor let it be freed
		 * (igrab() would protect from unlink, but not from unmount).
		 */
		atomic_inc(&info->stop_eviction);
		mutex_unlock(&shmem_swaplist_mutex);

		error = shmem_unuse_inode(&info->vfs_inode, type);
		cond_resched();

		mutex_lock(&shmem_swaplist_mutex);
		next = list_next_entry(info, swaplist);
		if (!info->swapped)
			list_del_init(&info->swaplist);
		if (atomic_dec_and_test(&info->stop_eviction))
			wake_up_var(&info->stop_eviction);
		if (error)
			break;
	}
	mutex_unlock(&shmem_swaplist_mutex);

	return error;
}

/*
 * Move the page from the page cache to the swap cache.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct folio *folio = page_folio(page);
	struct shmem_inode_info *info;
	struct address_space *mapping;
	struct inode *inode;
	swp_entry_t swap;
	pgoff_t index;

	/*
	 * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
	 * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
	 * and its shmem_writeback() needs them to be split when swapping.
	 */
	if (folio_test_large(folio)) {
		/* Ensure the subpages are still dirty */
		folio_test_set_dirty(folio);
		if (split_huge_page(page) < 0)
			goto redirty;
		folio = page_folio(page);
		folio_clear_dirty(folio);
	}

	BUG_ON(!folio_test_locked(folio));
	mapping = folio->mapping;
	index = folio->index;
	inode = mapping->host;
	info = SHMEM_I(inode);
	if (info->flags & VM_LOCKED)
		goto redirty;
	if (!total_swap_pages)
		goto redirty;

	/*
	 * Our capabilities prevent regular writeback or sync from ever calling
	 * shmem_writepage; but a stacking filesystem might use ->writepage of
	 * its underlying filesystem, in which case tmpfs should write out to
	 * swap only in response to memory pressure, and not for the writeback
	 * threads or sync.
	 */
	if (!wbc->for_reclaim) {
		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
		goto redirty;
	}

	/*
	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
	 * value into swapfile.c, the only way we can correctly account for a
	 * fallocated folio arriving here is now to initialize it and write it.
	 *
	 * That's okay for a folio already fallocated earlier, but if we have
	 * not yet completed the fallocation, then (a) we want to keep track
	 * of this folio in case we have to undo it, and (b) it may not be a
	 * good idea to continue anyway, once we're pushing into swap. So
	 * reactivate the folio, and let shmem_fallocate() quit when too many.
	 */
	if (!folio_test_uptodate(folio)) {
		if (inode->i_private) {
			struct shmem_falloc *shmem_falloc;
			spin_lock(&inode->i_lock);
			shmem_falloc = inode->i_private;
			if (shmem_falloc &&
			    !shmem_falloc->waitq &&
			    index >= shmem_falloc->start &&
			    index < shmem_falloc->next)
				shmem_falloc->nr_unswapped++;
			else
				shmem_falloc = NULL;
			spin_unlock(&inode->i_lock);
			if (shmem_falloc)
				goto redirty;
		}
		folio_zero_range(folio, 0, folio_size(folio));
		flush_dcache_folio(folio);
		folio_mark_uptodate(folio);
	}

	swap = folio_alloc_swap(folio);
	if (!swap.val)
		goto redirty;

	/*
	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
	 * if it's not already there. Do it now before the folio is
	 * moved to swap cache, when its pagelock no longer protects
	 * the inode from eviction. But don't unlock the mutex until
	 * we've incremented swapped, because shmem_unuse_inode() will
	 * prune a !swapped inode from the swaplist under this mutex.
	 */
	mutex_lock(&shmem_swaplist_mutex);
	if (list_empty(&info->swaplist))
		list_add(&info->swaplist, &shmem_swaplist);

	if (add_to_swap_cache(folio, swap,
			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
			NULL) == 0) {
		spin_lock_irq(&info->lock);
		shmem_recalc_inode(inode);
		info->swapped++;
		spin_unlock_irq(&info->lock);

		swap_shmem_alloc(swap);
		shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));

		mutex_unlock(&shmem_swaplist_mutex);
		BUG_ON(folio_mapped(folio));
		swap_writepage(&folio->page, wbc);
		return 0;
	}

	mutex_unlock(&shmem_swaplist_mutex);
	put_swap_folio(folio, swap);
redirty:
	folio_mark_dirty(folio);
	if (wbc->for_reclaim)
		return AOP_WRITEPAGE_ACTIVATE;	/* Return with folio locked */
	folio_unlock(folio);
	return 0;
}

#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
	char buffer[64];

	if (!mpol || mpol->mode == MPOL_DEFAULT)
		return;		/* show nothing */

	mpol_to_str(buffer, sizeof(buffer), mpol);

	seq_printf(seq, ",mpol=%s", buffer);
}

static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	struct mempolicy *mpol = NULL;
	if (sbinfo->mpol) {
		raw_spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
		mpol = sbinfo->mpol;
		mpol_get(mpol);
		raw_spin_unlock(&sbinfo->stat_lock);
	}
	return mpol;
}
#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	return NULL;
}
#endif /* CONFIG_NUMA && CONFIG_TMPFS */
#ifndef CONFIG_NUMA
#define vm_policy vm_private_data
#endif

static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
		struct shmem_inode_info *info, pgoff_t index)
{
	/* Create a pseudo vma that just contains the policy */
	vma_init(vma, NULL);
	/* Bias interleave by inode number to distribute better across nodes */
	vma->vm_pgoff = index + info->vfs_inode.i_ino;
	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
}

static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
{
	/* Drop reference taken by mpol_shared_policy_lookup() */
	mpol_cond_put(vma->vm_policy);
}

static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pvma;
	struct page *page;
	struct vm_fault vmf = {
		.vma = &pvma,
	};

	shmem_pseudo_vma_init(&pvma, info, index);
	page = swap_cluster_readahead(swap, gfp, &vmf);
	shmem_pseudo_vma_destroy(&pvma);

	if (!page)
		return NULL;
	return page_folio(page);
}

/*
 * Make sure huge_gfp is always more limited than limit_gfp.
 * Some of the flags set permissions, while others set limitations.
 */
static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
{
	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
	gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);

	/* Allow allocations only from the originally specified zones. */
	result |= zoneflags;

	/*
	 * Minimize the result gfp by taking the union with the deny flags,
	 * and the intersection of the allow flags.
	 */
	result |= (limit_gfp & denyflags);
	result |= (huge_gfp & limit_gfp) & allowflags;

	return result;
}
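
/*
 * Example of the combining above: if huge_gfp allows __GFP_DIRECT_RECLAIM
 * but limit_gfp does not, reclaim is dropped (allow flags are intersected);
 * if limit_gfp carries __GFP_NORETRY or __GFP_NOWARN, they are added (deny
 * flags are unioned); and the zone bits are always taken from limit_gfp.
 */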

static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
		struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pvma;
	struct address_space *mapping = info->vfs_inode.i_mapping;
	pgoff_t hindex;
	struct folio *folio;

	hindex = round_down(index, HPAGE_PMD_NR);
	if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
								XA_PRESENT))
		return NULL;

	shmem_pseudo_vma_init(&pvma, info, hindex);
	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
	shmem_pseudo_vma_destroy(&pvma);
	if (!folio)
		count_vm_event(THP_FILE_FALLBACK);
	return folio;
}

static struct folio *shmem_alloc_folio(gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pvma;
	struct folio *folio;

	shmem_pseudo_vma_init(&pvma, info, index);
	folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
	shmem_pseudo_vma_destroy(&pvma);

	return folio;
}

static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
		pgoff_t index, bool huge)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct folio *folio;
	int nr;
	int err = -ENOSPC;

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		huge = false;
	nr = huge ? HPAGE_PMD_NR : 1;

	if (!shmem_inode_acct_block(inode, nr))
		goto failed;

	if (huge)
		folio = shmem_alloc_hugefolio(gfp, info, index);
	else
		folio = shmem_alloc_folio(gfp, info, index);
	if (folio) {
		__folio_set_locked(folio);
		__folio_set_swapbacked(folio);
		return folio;
	}

	err = -ENOMEM;
	shmem_inode_unacct_blocks(inode, nr);
failed:
	return ERR_PTR(err);
}

/*
 * When a page is moved from swapcache to shmem filecache (either by the
 * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
 * shmem_unuse_inode()), it may have been read in earlier from swap, in
 * ignorance of the mapping it belongs to.  If that mapping has special
 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
 * we may need to copy to a suitable page before moving to filecache.
 *
 * In a future release, this may well be extended to respect cpuset and
 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
 * but for now it is a simple matter of zone.
 */
static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
{
	return folio_zonenum(folio) > gfp_zone(gfp);
}

static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index)
{
	struct folio *old, *new;
	struct address_space *swap_mapping;
	swp_entry_t entry;
	pgoff_t swap_index;
	int error;

	old = *foliop;
	entry = folio_swap_entry(old);
	swap_index = swp_offset(entry);
	swap_mapping = swap_address_space(entry);

	/*
	 * We have arrived here because our zones are constrained, so don't
	 * limit chance of success by further cpuset and node constraints.
	 */
	gfp &= ~GFP_CONSTRAINT_MASK;
	VM_BUG_ON_FOLIO(folio_test_large(old), old);
	new = shmem_alloc_folio(gfp, info, index);
	if (!new)
		return -ENOMEM;

	folio_get(new);
	folio_copy(new, old);
	flush_dcache_folio(new);

	__folio_set_locked(new);
	__folio_set_swapbacked(new);
	folio_mark_uptodate(new);
	folio_set_swap_entry(new, entry);
	folio_set_swapcache(new);

	/*
	 * Our caller will very soon move newpage out of swapcache, but it's
	 * a nice clean interface for us to replace oldpage by newpage there.
	 */
	xa_lock_irq(&swap_mapping->i_pages);
	error = shmem_replace_entry(swap_mapping, swap_index, old, new);
	if (!error) {
		mem_cgroup_migrate(old, new);
		__lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
		__lruvec_stat_mod_folio(new, NR_SHMEM, 1);
		__lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
		__lruvec_stat_mod_folio(old, NR_SHMEM, -1);
	}
	xa_unlock_irq(&swap_mapping->i_pages);

	if (unlikely(error)) {
		/*
		 * Is this possible?  I think not, now that our callers check
		 * both PageSwapCache and page_private after getting page lock;
		 * but be defensive.  Reverse old to newpage for clear and free.
		 */
		old = new;
	} else {
		folio_add_lru(new);
		*foliop = new;
	}

	folio_clear_swapcache(old);
	old->private = NULL;

	folio_unlock(old);
	folio_put_refs(old, 2);
	return error;
}

static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
					 struct folio *folio, swp_entry_t swap)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	swp_entry_t swapin_error;
	void *old;

	swapin_error = make_swapin_error_entry();
	old = xa_cmpxchg_irq(&mapping->i_pages, index,
			     swp_to_radix_entry(swap),
			     swp_to_radix_entry(swapin_error), 0);
	if (old != swp_to_radix_entry(swap))
		return;

	folio_wait_writeback(folio);
	delete_from_swap_cache(folio);
	spin_lock_irq(&info->lock);
	/*
	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
	 * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
	 * shmem_evict_inode.
	 */
	info->alloced--;
	info->swapped--;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);
	swap_free(swap);
}

/*
 * Swap in the folio pointed to by *foliop.
 * Caller has to make sure that *foliop contains a valid swapped folio.
 * Returns 0 and the folio in foliop if success. On failure, returns the
 * error code and NULL in *foliop.
 */
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
			     struct folio **foliop, enum sgp_type sgp,
			     gfp_t gfp, struct vm_area_struct *vma,
			     vm_fault_t *fault_type)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
	struct swap_info_struct *si;
	struct folio *folio = NULL;
	swp_entry_t swap;
	int error;

	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
	swap = radix_to_swp_entry(*foliop);
	*foliop = NULL;

	if (is_swapin_error_entry(swap))
		return -EIO;

	si = get_swap_device(swap);
	if (!si) {
		if (!shmem_confirm_swap(mapping, index, swap))
			return -EEXIST;
		else
			return -EINVAL;
	}

	/* Look it up and read it in.. */
	folio = swap_cache_get_folio(swap, NULL, 0);
	if (!folio) {
		/* Or update major stats only when swapin succeeds?? */
		if (fault_type) {
			*fault_type |= VM_FAULT_MAJOR;
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(charge_mm, PGMAJFAULT);
		}
		/* Here we actually start the io */
		folio = shmem_swapin(swap, gfp, info, index);
		if (!folio) {
			error = -ENOMEM;
			goto failed;
		}
	}

	/* We have to do this with folio locked to prevent races */
	folio_lock(folio);
	if (!folio_test_swapcache(folio) ||
	    folio_swap_entry(folio).val != swap.val ||
	    !shmem_confirm_swap(mapping, index, swap)) {
		error = -EEXIST;
		goto unlock;
	}
	if (!folio_test_uptodate(folio)) {
		error = -EIO;
		goto failed;
	}
	folio_wait_writeback(folio);

	/*
	 * Some architectures may have to restore extra metadata to the
	 * folio after reading from swap.
	 */
	arch_swap_restore(swap, folio);

	if (shmem_should_replace_folio(folio, gfp)) {
		error = shmem_replace_folio(&folio, gfp, info, index);
		if (error)
			goto failed;
	}

	error = shmem_add_to_page_cache(folio, mapping, index,
					swp_to_radix_entry(swap), gfp,
					charge_mm);
	if (error)
		goto failed;

	spin_lock_irq(&info->lock);
	info->swapped--;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);

	if (sgp == SGP_WRITE)
		folio_mark_accessed(folio);

	delete_from_swap_cache(folio);
	folio_mark_dirty(folio);
	swap_free(swap);
	put_swap_device(si);

	*foliop = folio;
	return 0;
failed:
	if (!shmem_confirm_swap(mapping, index, swap))
		error = -EEXIST;
	if (error == -EIO)
		shmem_set_folio_swapin_error(inode, index, folio, swap);
unlock:
	if (folio) {
		folio_unlock(folio);
		folio_put(folio);
	}
	put_swap_device(si);

	return error;
}

/*
 * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry since a page cannot live in both the swap and page cache.
 *
 * vma, vmf, and fault_type are only supplied by shmem_fault:
 * otherwise they are NULL.
 */
static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
		struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
		struct vm_area_struct *vma, struct vm_fault *vmf,
		vm_fault_t *fault_type)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo;
	struct mm_struct *charge_mm;
	struct folio *folio;
	pgoff_t hindex;
	gfp_t huge_gfp;
	int error;
	int once = 0;
	int alloced = 0;

	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
		return -EFBIG;
repeat:
	if (sgp <= SGP_CACHE &&
	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
		return -EINVAL;
	}

	sbinfo = SHMEM_SB(inode->i_sb);
	charge_mm = vma ? vma->vm_mm : NULL;

	folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0);
	if (folio && vma && userfaultfd_minor(vma)) {
		if (!xa_is_value(folio)) {
			folio_unlock(folio);
			folio_put(folio);
		}
		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
		return 0;
	}

	if (xa_is_value(folio)) {
		error = shmem_swapin_folio(inode, index, &folio,
					  sgp, gfp, vma, fault_type);
		if (error == -EEXIST)
			goto repeat;

		*foliop = folio;
		return error;
	}

	if (folio) {
		if (sgp == SGP_WRITE)
			folio_mark_accessed(folio);
		if (folio_test_uptodate(folio))
			goto out;
		/* fallocated folio */
		if (sgp != SGP_READ)
			goto clear;
		folio_unlock(folio);
		folio_put(folio);
	}

	/*
	 * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
	 * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
	 */
	*foliop = NULL;
	if (sgp == SGP_READ)
		return 0;
	if (sgp == SGP_NOALLOC)
		return -ENOENT;

	/*
	 * Fast cache lookup and swap lookup did not find it: allocate.
	 */

	if (vma && userfaultfd_missing(vma)) {
		*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
		return 0;
	}

	if (!shmem_is_huge(inode, index, false,
			   vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0))
		goto alloc_nohuge;

	huge_gfp = vma_thp_gfp_mask(vma);
	huge_gfp = limit_gfp_mask(huge_gfp, gfp);
	folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
	if (IS_ERR(folio)) {
alloc_nohuge:
		folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
	}
	if (IS_ERR(folio)) {
		int retry = 5;

		error = PTR_ERR(folio);
		folio = NULL;
		if (error != -ENOSPC)
			goto unlock;
		/*
		 * Try to reclaim some space by splitting a large folio
		 * beyond i_size on the filesystem.
		 */
1949 */ 1950 while (retry--) { 1951 int ret; 1952 1953 ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); 1954 if (ret == SHRINK_STOP) 1955 break; 1956 if (ret) 1957 goto alloc_nohuge; 1958 } 1959 goto unlock; 1960 } 1961 1962 hindex = round_down(index, folio_nr_pages(folio)); 1963 1964 if (sgp == SGP_WRITE) 1965 __folio_set_referenced(folio); 1966 1967 error = shmem_add_to_page_cache(folio, mapping, hindex, 1968 NULL, gfp & GFP_RECLAIM_MASK, 1969 charge_mm); 1970 if (error) 1971 goto unacct; 1972 folio_add_lru(folio); 1973 1974 spin_lock_irq(&info->lock); 1975 info->alloced += folio_nr_pages(folio); 1976 inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio); 1977 shmem_recalc_inode(inode); 1978 spin_unlock_irq(&info->lock); 1979 alloced = true; 1980 1981 if (folio_test_pmd_mappable(folio) && 1982 DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < 1983 folio_next_index(folio) - 1) { 1984 /* 1985 * Part of the large folio is beyond i_size: subject 1986 * to shrink under memory pressure. 1987 */ 1988 spin_lock(&sbinfo->shrinklist_lock); 1989 /* 1990 * _careful to defend against unlocked access to 1991 * ->shrink_list in shmem_unused_huge_shrink() 1992 */ 1993 if (list_empty_careful(&info->shrinklist)) { 1994 list_add_tail(&info->shrinklist, 1995 &sbinfo->shrinklist); 1996 sbinfo->shrinklist_len++; 1997 } 1998 spin_unlock(&sbinfo->shrinklist_lock); 1999 } 2000 2001 /* 2002 * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. 2003 */ 2004 if (sgp == SGP_FALLOC) 2005 sgp = SGP_WRITE; 2006 clear: 2007 /* 2008 * Let SGP_WRITE caller clear ends if write does not fill folio; 2009 * but SGP_FALLOC on a folio fallocated earlier must initialize 2010 * it now, lest undo on failure cancel our earlier guarantee. 2011 */ 2012 if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { 2013 long i, n = folio_nr_pages(folio); 2014 2015 for (i = 0; i < n; i++) 2016 clear_highpage(folio_page(folio, i)); 2017 flush_dcache_folio(folio); 2018 folio_mark_uptodate(folio); 2019 } 2020 2021 /* Perhaps the file has been truncated since we checked */ 2022 if (sgp <= SGP_CACHE && 2023 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 2024 if (alloced) { 2025 folio_clear_dirty(folio); 2026 filemap_remove_folio(folio); 2027 spin_lock_irq(&info->lock); 2028 shmem_recalc_inode(inode); 2029 spin_unlock_irq(&info->lock); 2030 } 2031 error = -EINVAL; 2032 goto unlock; 2033 } 2034 out: 2035 *foliop = folio; 2036 return 0; 2037 2038 /* 2039 * Error recovery. 2040 */ 2041 unacct: 2042 shmem_inode_unacct_blocks(inode, folio_nr_pages(folio)); 2043 2044 if (folio_test_large(folio)) { 2045 folio_unlock(folio); 2046 folio_put(folio); 2047 goto alloc_nohuge; 2048 } 2049 unlock: 2050 if (folio) { 2051 folio_unlock(folio); 2052 folio_put(folio); 2053 } 2054 if (error == -ENOSPC && !once++) { 2055 spin_lock_irq(&info->lock); 2056 shmem_recalc_inode(inode); 2057 spin_unlock_irq(&info->lock); 2058 goto repeat; 2059 } 2060 if (error == -EEXIST) 2061 goto repeat; 2062 return error; 2063 } 2064 2065 int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, 2066 enum sgp_type sgp) 2067 { 2068 return shmem_get_folio_gfp(inode, index, foliop, sgp, 2069 mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); 2070 } 2071 2072 /* 2073 * This is like autoremove_wake_function, but it removes the wait queue 2074 * entry unconditionally - even if something else had already woken the 2075 * target. 
2076 */ 2077 static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) 2078 { 2079 int ret = default_wake_function(wait, mode, sync, key); 2080 list_del_init(&wait->entry); 2081 return ret; 2082 } 2083 2084 static vm_fault_t shmem_fault(struct vm_fault *vmf) 2085 { 2086 struct vm_area_struct *vma = vmf->vma; 2087 struct inode *inode = file_inode(vma->vm_file); 2088 gfp_t gfp = mapping_gfp_mask(inode->i_mapping); 2089 struct folio *folio = NULL; 2090 int err; 2091 vm_fault_t ret = VM_FAULT_LOCKED; 2092 2093 /* 2094 * Trinity finds that probing a hole which tmpfs is punching can 2095 * prevent the hole-punch from ever completing: which in turn 2096 * locks writers out with its hold on i_rwsem. So refrain from 2097 * faulting pages into the hole while it's being punched. Although 2098 * shmem_undo_range() does remove the additions, it may be unable to 2099 * keep up, as each new page needs its own unmap_mapping_range() call, 2100 * and the i_mmap tree grows ever slower to scan if new vmas are added. 2101 * 2102 * It does not matter if we sometimes reach this check just before the 2103 * hole-punch begins, so that one fault then races with the punch: 2104 * we just need to make racing faults a rare case. 2105 * 2106 * The implementation below would be much simpler if we just used a 2107 * standard mutex or completion: but we cannot take i_rwsem in fault, 2108 * and bloating every shmem inode for this unlikely case would be sad. 2109 */ 2110 if (unlikely(inode->i_private)) { 2111 struct shmem_falloc *shmem_falloc; 2112 2113 spin_lock(&inode->i_lock); 2114 shmem_falloc = inode->i_private; 2115 if (shmem_falloc && 2116 shmem_falloc->waitq && 2117 vmf->pgoff >= shmem_falloc->start && 2118 vmf->pgoff < shmem_falloc->next) { 2119 struct file *fpin; 2120 wait_queue_head_t *shmem_falloc_waitq; 2121 DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); 2122 2123 ret = VM_FAULT_NOPAGE; 2124 fpin = maybe_unlock_mmap_for_io(vmf, NULL); 2125 if (fpin) 2126 ret = VM_FAULT_RETRY; 2127 2128 shmem_falloc_waitq = shmem_falloc->waitq; 2129 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, 2130 TASK_UNINTERRUPTIBLE); 2131 spin_unlock(&inode->i_lock); 2132 schedule(); 2133 2134 /* 2135 * shmem_falloc_waitq points into the shmem_fallocate() 2136 * stack of the hole-punching task: shmem_falloc_waitq 2137 * is usually invalid by the time we reach here, but 2138 * finish_wait() does not dereference it in that case; 2139 * though i_lock needed lest racing with wake_up_all(). 
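 *
 * A rough timeline of the handshake implemented below - a sketch that
 * only restates the code which follows, not an extra locking rule:
 *
 *	faulting task				hole-punching task
 *	-------------				------------------
 *	spin_lock(i_lock)
 *	sees i_private->waitq for pgoff		shmem_undo_range() running
 *	prepare_to_wait(waitq)
 *	spin_unlock(i_lock); schedule()
 *						spin_lock(i_lock)
 *						i_private = NULL
 *						wake_up_all(waitq)
 *						spin_unlock(i_lock)
 *	spin_lock(i_lock)
 *	finish_wait(waitq); spin_unlock(i_lock)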
2140 */ 2141 spin_lock(&inode->i_lock); 2142 finish_wait(shmem_falloc_waitq, &shmem_fault_wait); 2143 spin_unlock(&inode->i_lock); 2144 2145 if (fpin) 2146 fput(fpin); 2147 return ret; 2148 } 2149 spin_unlock(&inode->i_lock); 2150 } 2151 2152 err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, 2153 gfp, vma, vmf, &ret); 2154 if (err) 2155 return vmf_error(err); 2156 if (folio) 2157 vmf->page = folio_file_page(folio, vmf->pgoff); 2158 return ret; 2159 } 2160 2161 unsigned long shmem_get_unmapped_area(struct file *file, 2162 unsigned long uaddr, unsigned long len, 2163 unsigned long pgoff, unsigned long flags) 2164 { 2165 unsigned long (*get_area)(struct file *, 2166 unsigned long, unsigned long, unsigned long, unsigned long); 2167 unsigned long addr; 2168 unsigned long offset; 2169 unsigned long inflated_len; 2170 unsigned long inflated_addr; 2171 unsigned long inflated_offset; 2172 2173 if (len > TASK_SIZE) 2174 return -ENOMEM; 2175 2176 get_area = current->mm->get_unmapped_area; 2177 addr = get_area(file, uaddr, len, pgoff, flags); 2178 2179 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 2180 return addr; 2181 if (IS_ERR_VALUE(addr)) 2182 return addr; 2183 if (addr & ~PAGE_MASK) 2184 return addr; 2185 if (addr > TASK_SIZE - len) 2186 return addr; 2187 2188 if (shmem_huge == SHMEM_HUGE_DENY) 2189 return addr; 2190 if (len < HPAGE_PMD_SIZE) 2191 return addr; 2192 if (flags & MAP_FIXED) 2193 return addr; 2194 /* 2195 * Our priority is to support MAP_SHARED mapped hugely; 2196 * and support MAP_PRIVATE mapped hugely too, until it is COWed. 2197 * But if caller specified an address hint and we allocated area there 2198 * successfully, respect that as before. 2199 */ 2200 if (uaddr == addr) 2201 return addr; 2202 2203 if (shmem_huge != SHMEM_HUGE_FORCE) { 2204 struct super_block *sb; 2205 2206 if (file) { 2207 VM_BUG_ON(file->f_op != &shmem_file_operations); 2208 sb = file_inode(file)->i_sb; 2209 } else { 2210 /* 2211 * Called directly from mm/mmap.c, or drivers/char/mem.c 2212 * for "/dev/zero", to create a shared anonymous object. 
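 *
 * For illustration (userspace sketch, not kernel code), the two usual
 * routes into this NULL-file case are a shared anonymous mapping and a
 * shared mapping of /dev/zero:
 *
 *	mmap(NULL, len, PROT_READ|PROT_WRITE,
 *	     MAP_SHARED|MAP_ANONYMOUS, -1, 0);
 *
 *	fd = open("/dev/zero", O_RDWR);
 *	mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);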
2213 */ 2214 if (IS_ERR(shm_mnt)) 2215 return addr; 2216 sb = shm_mnt->mnt_sb; 2217 } 2218 if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) 2219 return addr; 2220 } 2221 2222 offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); 2223 if (offset && offset + len < 2 * HPAGE_PMD_SIZE) 2224 return addr; 2225 if ((addr & (HPAGE_PMD_SIZE-1)) == offset) 2226 return addr; 2227 2228 inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; 2229 if (inflated_len > TASK_SIZE) 2230 return addr; 2231 if (inflated_len < len) 2232 return addr; 2233 2234 inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); 2235 if (IS_ERR_VALUE(inflated_addr)) 2236 return addr; 2237 if (inflated_addr & ~PAGE_MASK) 2238 return addr; 2239 2240 inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); 2241 inflated_addr += offset - inflated_offset; 2242 if (inflated_offset > offset) 2243 inflated_addr += HPAGE_PMD_SIZE; 2244 2245 if (inflated_addr > TASK_SIZE - len) 2246 return addr; 2247 return inflated_addr; 2248 } 2249 2250 #ifdef CONFIG_NUMA 2251 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 2252 { 2253 struct inode *inode = file_inode(vma->vm_file); 2254 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 2255 } 2256 2257 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 2258 unsigned long addr) 2259 { 2260 struct inode *inode = file_inode(vma->vm_file); 2261 pgoff_t index; 2262 2263 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2264 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); 2265 } 2266 #endif 2267 2268 int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) 2269 { 2270 struct inode *inode = file_inode(file); 2271 struct shmem_inode_info *info = SHMEM_I(inode); 2272 int retval = -ENOMEM; 2273 2274 /* 2275 * What serializes the accesses to info->flags? 2276 * ipc_lock_object() when called from shmctl_do_lock(), 2277 * no serialization needed when called from shm_destroy(). 2278 */ 2279 if (lock && !(info->flags & VM_LOCKED)) { 2280 if (!user_shm_lock(inode->i_size, ucounts)) 2281 goto out_nomem; 2282 info->flags |= VM_LOCKED; 2283 mapping_set_unevictable(file->f_mapping); 2284 } 2285 if (!lock && (info->flags & VM_LOCKED) && ucounts) { 2286 user_shm_unlock(inode->i_size, ucounts); 2287 info->flags &= ~VM_LOCKED; 2288 mapping_clear_unevictable(file->f_mapping); 2289 } 2290 retval = 0; 2291 2292 out_nomem: 2293 return retval; 2294 } 2295 2296 static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 2297 { 2298 struct inode *inode = file_inode(file); 2299 struct shmem_inode_info *info = SHMEM_I(inode); 2300 int ret; 2301 2302 ret = seal_check_future_write(info->seals, vma); 2303 if (ret) 2304 return ret; 2305 2306 /* arm64 - allow memory tagging on RAM-based files */ 2307 vm_flags_set(vma, VM_MTE_ALLOWED); 2308 2309 file_accessed(file); 2310 /* This is anonymous shared memory if it is unlinked at the time of mmap */ 2311 if (inode->i_nlink) 2312 vma->vm_ops = &shmem_vm_ops; 2313 else 2314 vma->vm_ops = &shmem_anon_vm_ops; 2315 return 0; 2316 } 2317 2318 #ifdef CONFIG_TMPFS_XATTR 2319 static int shmem_initxattrs(struct inode *, const struct xattr *, void *); 2320 2321 /* 2322 * chattr's fsflags are unrelated to extended attributes, 2323 * but tmpfs has chosen to enable them under the same config option. 
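 *
 * For illustration (userspace sketch, not kernel code), chattr(1) boils
 * down to the FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls, which reach
 * shmem_fileattr_get()/shmem_fileattr_set() further down this file:
 *
 *	int attr;
 *
 *	ioctl(fd, FS_IOC_GETFLAGS, &attr);
 *	attr |= FS_NOATIME_FL;
 *	ioctl(fd, FS_IOC_SETFLAGS, &attr);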
2324 */ 2325 static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) 2326 { 2327 unsigned int i_flags = 0; 2328 2329 if (fsflags & FS_NOATIME_FL) 2330 i_flags |= S_NOATIME; 2331 if (fsflags & FS_APPEND_FL) 2332 i_flags |= S_APPEND; 2333 if (fsflags & FS_IMMUTABLE_FL) 2334 i_flags |= S_IMMUTABLE; 2335 /* 2336 * But FS_NODUMP_FL does not require any action in i_flags. 2337 */ 2338 inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); 2339 } 2340 #else 2341 static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) 2342 { 2343 } 2344 #define shmem_initxattrs NULL 2345 #endif 2346 2347 static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, 2348 struct inode *dir, umode_t mode, dev_t dev, 2349 unsigned long flags) 2350 { 2351 struct inode *inode; 2352 struct shmem_inode_info *info; 2353 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2354 ino_t ino; 2355 2356 if (shmem_reserve_inode(sb, &ino)) 2357 return NULL; 2358 2359 inode = new_inode(sb); 2360 if (inode) { 2361 inode->i_ino = ino; 2362 inode_init_owner(idmap, inode, dir, mode); 2363 inode->i_blocks = 0; 2364 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); 2365 inode->i_generation = get_random_u32(); 2366 info = SHMEM_I(inode); 2367 memset(info, 0, (char *)inode - (char *)info); 2368 spin_lock_init(&info->lock); 2369 atomic_set(&info->stop_eviction, 0); 2370 info->seals = F_SEAL_SEAL; 2371 info->flags = flags & VM_NORESERVE; 2372 info->i_crtime = inode->i_mtime; 2373 info->fsflags = (dir == NULL) ? 0 : 2374 SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; 2375 if (info->fsflags) 2376 shmem_set_inode_flags(inode, info->fsflags); 2377 INIT_LIST_HEAD(&info->shrinklist); 2378 INIT_LIST_HEAD(&info->swaplist); 2379 simple_xattrs_init(&info->xattrs); 2380 cache_no_acl(inode); 2381 mapping_set_large_folios(inode->i_mapping); 2382 2383 switch (mode & S_IFMT) { 2384 default: 2385 inode->i_op = &shmem_special_inode_operations; 2386 init_special_inode(inode, mode, dev); 2387 break; 2388 case S_IFREG: 2389 inode->i_mapping->a_ops = &shmem_aops; 2390 inode->i_op = &shmem_inode_operations; 2391 inode->i_fop = &shmem_file_operations; 2392 mpol_shared_policy_init(&info->policy, 2393 shmem_get_sbmpol(sbinfo)); 2394 break; 2395 case S_IFDIR: 2396 inc_nlink(inode); 2397 /* Some things misbehave if size == 0 on a directory */ 2398 inode->i_size = 2 * BOGO_DIRENT_SIZE; 2399 inode->i_op = &shmem_dir_inode_operations; 2400 inode->i_fop = &simple_dir_operations; 2401 break; 2402 case S_IFLNK: 2403 /* 2404 * Must not load anything in the rbtree, 2405 * mpol_free_shared_policy will not be called. 
2406 */ 2407 mpol_shared_policy_init(&info->policy, NULL); 2408 break; 2409 } 2410 2411 lockdep_annotate_inode_mutex_key(inode); 2412 } else 2413 shmem_free_inode(sb); 2414 return inode; 2415 } 2416 2417 #ifdef CONFIG_USERFAULTFD 2418 int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, 2419 pmd_t *dst_pmd, 2420 struct vm_area_struct *dst_vma, 2421 unsigned long dst_addr, 2422 unsigned long src_addr, 2423 bool zeropage, bool wp_copy, 2424 struct page **pagep) 2425 { 2426 struct inode *inode = file_inode(dst_vma->vm_file); 2427 struct shmem_inode_info *info = SHMEM_I(inode); 2428 struct address_space *mapping = inode->i_mapping; 2429 gfp_t gfp = mapping_gfp_mask(mapping); 2430 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 2431 void *page_kaddr; 2432 struct folio *folio; 2433 int ret; 2434 pgoff_t max_off; 2435 2436 if (!shmem_inode_acct_block(inode, 1)) { 2437 /* 2438 * We may have got a page, returned -ENOENT triggering a retry, 2439 * and now we find ourselves with -ENOMEM. Release the page, to 2440 * avoid a BUG_ON in our caller. 2441 */ 2442 if (unlikely(*pagep)) { 2443 put_page(*pagep); 2444 *pagep = NULL; 2445 } 2446 return -ENOMEM; 2447 } 2448 2449 if (!*pagep) { 2450 ret = -ENOMEM; 2451 folio = shmem_alloc_folio(gfp, info, pgoff); 2452 if (!folio) 2453 goto out_unacct_blocks; 2454 2455 if (!zeropage) { /* COPY */ 2456 page_kaddr = kmap_local_folio(folio, 0); 2457 /* 2458 * The read mmap_lock is held here. Despite the 2459 * mmap_lock being read recursive a deadlock is still 2460 * possible if a writer has taken a lock. For example: 2461 * 2462 * process A thread 1 takes read lock on own mmap_lock 2463 * process A thread 2 calls mmap, blocks taking write lock 2464 * process B thread 1 takes page fault, read lock on own mmap lock 2465 * process B thread 2 calls mmap, blocks taking write lock 2466 * process A thread 1 blocks taking read lock on process B 2467 * process B thread 1 blocks taking read lock on process A 2468 * 2469 * Disable page faults to prevent potential deadlock 2470 * and retry the copy outside the mmap_lock. 
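 *
 * For reference (userspace sketch with placeholder names, not kernel
 * code), the copy being retried here is typically requested by a
 * UFFDIO_COPY ioctl on a userfaultfd registered over a tmpfs mapping,
 * with src pointing at the caller's own buffer:
 *
 *	struct uffdio_copy copy = {
 *		.dst = page_aligned_fault_addr,
 *		.src = (unsigned long)src_buf,
 *		.len = page_size,
 *	};
 *	ioctl(uffd, UFFDIO_COPY, &copy);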
2471 */ 2472 pagefault_disable(); 2473 ret = copy_from_user(page_kaddr, 2474 (const void __user *)src_addr, 2475 PAGE_SIZE); 2476 pagefault_enable(); 2477 kunmap_local(page_kaddr); 2478 2479 /* fallback to copy_from_user outside mmap_lock */ 2480 if (unlikely(ret)) { 2481 *pagep = &folio->page; 2482 ret = -ENOENT; 2483 /* don't free the page */ 2484 goto out_unacct_blocks; 2485 } 2486 2487 flush_dcache_folio(folio); 2488 } else { /* ZEROPAGE */ 2489 clear_user_highpage(&folio->page, dst_addr); 2490 } 2491 } else { 2492 folio = page_folio(*pagep); 2493 VM_BUG_ON_FOLIO(folio_test_large(folio), folio); 2494 *pagep = NULL; 2495 } 2496 2497 VM_BUG_ON(folio_test_locked(folio)); 2498 VM_BUG_ON(folio_test_swapbacked(folio)); 2499 __folio_set_locked(folio); 2500 __folio_set_swapbacked(folio); 2501 __folio_mark_uptodate(folio); 2502 2503 ret = -EFAULT; 2504 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2505 if (unlikely(pgoff >= max_off)) 2506 goto out_release; 2507 2508 ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, 2509 gfp & GFP_RECLAIM_MASK, dst_mm); 2510 if (ret) 2511 goto out_release; 2512 2513 ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, 2514 &folio->page, true, wp_copy); 2515 if (ret) 2516 goto out_delete_from_cache; 2517 2518 spin_lock_irq(&info->lock); 2519 info->alloced++; 2520 inode->i_blocks += BLOCKS_PER_PAGE; 2521 shmem_recalc_inode(inode); 2522 spin_unlock_irq(&info->lock); 2523 2524 folio_unlock(folio); 2525 return 0; 2526 out_delete_from_cache: 2527 filemap_remove_folio(folio); 2528 out_release: 2529 folio_unlock(folio); 2530 folio_put(folio); 2531 out_unacct_blocks: 2532 shmem_inode_unacct_blocks(inode, 1); 2533 return ret; 2534 } 2535 #endif /* CONFIG_USERFAULTFD */ 2536 2537 #ifdef CONFIG_TMPFS 2538 static const struct inode_operations shmem_symlink_inode_operations; 2539 static const struct inode_operations shmem_short_symlink_operations; 2540 2541 static int 2542 shmem_write_begin(struct file *file, struct address_space *mapping, 2543 loff_t pos, unsigned len, 2544 struct page **pagep, void **fsdata) 2545 { 2546 struct inode *inode = mapping->host; 2547 struct shmem_inode_info *info = SHMEM_I(inode); 2548 pgoff_t index = pos >> PAGE_SHIFT; 2549 struct folio *folio; 2550 int ret = 0; 2551 2552 /* i_rwsem is held by caller */ 2553 if (unlikely(info->seals & (F_SEAL_GROW | 2554 F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { 2555 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) 2556 return -EPERM; 2557 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) 2558 return -EPERM; 2559 } 2560 2561 ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); 2562 2563 if (ret) 2564 return ret; 2565 2566 *pagep = folio_file_page(folio, index); 2567 if (PageHWPoison(*pagep)) { 2568 folio_unlock(folio); 2569 folio_put(folio); 2570 *pagep = NULL; 2571 return -EIO; 2572 } 2573 2574 return 0; 2575 } 2576 2577 static int 2578 shmem_write_end(struct file *file, struct address_space *mapping, 2579 loff_t pos, unsigned len, unsigned copied, 2580 struct page *page, void *fsdata) 2581 { 2582 struct folio *folio = page_folio(page); 2583 struct inode *inode = mapping->host; 2584 2585 if (pos + copied > inode->i_size) 2586 i_size_write(inode, pos + copied); 2587 2588 if (!folio_test_uptodate(folio)) { 2589 if (copied < folio_size(folio)) { 2590 size_t from = offset_in_folio(folio, pos); 2591 folio_zero_segments(folio, 0, from, 2592 from + copied, folio_size(folio)); 2593 } 2594 folio_mark_uptodate(folio); 2595 } 2596 folio_mark_dirty(folio); 2597 
folio_unlock(folio); 2598 folio_put(folio); 2599 2600 return copied; 2601 } 2602 2603 static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 2604 { 2605 struct file *file = iocb->ki_filp; 2606 struct inode *inode = file_inode(file); 2607 struct address_space *mapping = inode->i_mapping; 2608 pgoff_t index; 2609 unsigned long offset; 2610 int error = 0; 2611 ssize_t retval = 0; 2612 loff_t *ppos = &iocb->ki_pos; 2613 2614 index = *ppos >> PAGE_SHIFT; 2615 offset = *ppos & ~PAGE_MASK; 2616 2617 for (;;) { 2618 struct folio *folio = NULL; 2619 struct page *page = NULL; 2620 pgoff_t end_index; 2621 unsigned long nr, ret; 2622 loff_t i_size = i_size_read(inode); 2623 2624 end_index = i_size >> PAGE_SHIFT; 2625 if (index > end_index) 2626 break; 2627 if (index == end_index) { 2628 nr = i_size & ~PAGE_MASK; 2629 if (nr <= offset) 2630 break; 2631 } 2632 2633 error = shmem_get_folio(inode, index, &folio, SGP_READ); 2634 if (error) { 2635 if (error == -EINVAL) 2636 error = 0; 2637 break; 2638 } 2639 if (folio) { 2640 folio_unlock(folio); 2641 2642 page = folio_file_page(folio, index); 2643 if (PageHWPoison(page)) { 2644 folio_put(folio); 2645 error = -EIO; 2646 break; 2647 } 2648 } 2649 2650 /* 2651 * We must evaluate after, since reads (unlike writes) 2652 * are called without i_rwsem protection against truncate 2653 */ 2654 nr = PAGE_SIZE; 2655 i_size = i_size_read(inode); 2656 end_index = i_size >> PAGE_SHIFT; 2657 if (index == end_index) { 2658 nr = i_size & ~PAGE_MASK; 2659 if (nr <= offset) { 2660 if (folio) 2661 folio_put(folio); 2662 break; 2663 } 2664 } 2665 nr -= offset; 2666 2667 if (folio) { 2668 /* 2669 * If users can be writing to this page using arbitrary 2670 * virtual addresses, take care about potential aliasing 2671 * before reading the page on the kernel side. 2672 */ 2673 if (mapping_writably_mapped(mapping)) 2674 flush_dcache_page(page); 2675 /* 2676 * Mark the page accessed if we read the beginning. 2677 */ 2678 if (!offset) 2679 folio_mark_accessed(folio); 2680 /* 2681 * Ok, we have the page, and it's up-to-date, so 2682 * now we can copy it to user space... 2683 */ 2684 ret = copy_page_to_iter(page, offset, nr, to); 2685 folio_put(folio); 2686 2687 } else if (user_backed_iter(to)) { 2688 /* 2689 * Copy to user tends to be so well optimized, but 2690 * clear_user() not so much, that it is noticeably 2691 * faster to copy the zero page instead of clearing. 2692 */ 2693 ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); 2694 } else { 2695 /* 2696 * But submitting the same page twice in a row to 2697 * splice() - or others? - can result in confusion: 2698 * so don't attempt that optimization on pipes etc. 2699 */ 2700 ret = iov_iter_zero(nr, to); 2701 } 2702 2703 retval += ret; 2704 offset += ret; 2705 index += offset >> PAGE_SHIFT; 2706 offset &= ~PAGE_MASK; 2707 2708 if (!iov_iter_count(to)) 2709 break; 2710 if (ret < nr) { 2711 error = -EFAULT; 2712 break; 2713 } 2714 cond_resched(); 2715 } 2716 2717 *ppos = ((loff_t) index << PAGE_SHIFT) + offset; 2718 file_accessed(file); 2719 return retval ? 
retval : error; 2720 } 2721 2722 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) 2723 { 2724 struct address_space *mapping = file->f_mapping; 2725 struct inode *inode = mapping->host; 2726 2727 if (whence != SEEK_DATA && whence != SEEK_HOLE) 2728 return generic_file_llseek_size(file, offset, whence, 2729 MAX_LFS_FILESIZE, i_size_read(inode)); 2730 if (offset < 0) 2731 return -ENXIO; 2732 2733 inode_lock(inode); 2734 /* We're holding i_rwsem so we can access i_size directly */ 2735 offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); 2736 if (offset >= 0) 2737 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); 2738 inode_unlock(inode); 2739 return offset; 2740 } 2741 2742 static long shmem_fallocate(struct file *file, int mode, loff_t offset, 2743 loff_t len) 2744 { 2745 struct inode *inode = file_inode(file); 2746 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 2747 struct shmem_inode_info *info = SHMEM_I(inode); 2748 struct shmem_falloc shmem_falloc; 2749 pgoff_t start, index, end, undo_fallocend; 2750 int error; 2751 2752 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2753 return -EOPNOTSUPP; 2754 2755 inode_lock(inode); 2756 2757 if (mode & FALLOC_FL_PUNCH_HOLE) { 2758 struct address_space *mapping = file->f_mapping; 2759 loff_t unmap_start = round_up(offset, PAGE_SIZE); 2760 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 2761 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); 2762 2763 /* protected by i_rwsem */ 2764 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { 2765 error = -EPERM; 2766 goto out; 2767 } 2768 2769 shmem_falloc.waitq = &shmem_falloc_waitq; 2770 shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; 2771 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; 2772 spin_lock(&inode->i_lock); 2773 inode->i_private = &shmem_falloc; 2774 spin_unlock(&inode->i_lock); 2775 2776 if ((u64)unmap_end > (u64)unmap_start) 2777 unmap_mapping_range(mapping, unmap_start, 2778 1 + unmap_end - unmap_start, 0); 2779 shmem_truncate_range(inode, offset, offset + len - 1); 2780 /* No need to unmap again: hole-punching leaves COWed pages */ 2781 2782 spin_lock(&inode->i_lock); 2783 inode->i_private = NULL; 2784 wake_up_all(&shmem_falloc_waitq); 2785 WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); 2786 spin_unlock(&inode->i_lock); 2787 error = 0; 2788 goto out; 2789 } 2790 2791 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 2792 error = inode_newsize_ok(inode, offset + len); 2793 if (error) 2794 goto out; 2795 2796 if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { 2797 error = -EPERM; 2798 goto out; 2799 } 2800 2801 start = offset >> PAGE_SHIFT; 2802 end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 2803 /* Try to avoid a swapstorm if len is impossible to satisfy */ 2804 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { 2805 error = -ENOSPC; 2806 goto out; 2807 } 2808 2809 shmem_falloc.waitq = NULL; 2810 shmem_falloc.start = start; 2811 shmem_falloc.next = start; 2812 shmem_falloc.nr_falloced = 0; 2813 shmem_falloc.nr_unswapped = 0; 2814 spin_lock(&inode->i_lock); 2815 inode->i_private = &shmem_falloc; 2816 spin_unlock(&inode->i_lock); 2817 2818 /* 2819 * info->fallocend is only relevant when huge pages might be 2820 * involved: to prevent split_huge_page() freeing fallocated 2821 * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. 
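 *
 * For illustration (userspace sketch, not kernel code), the case in
 * question is a preallocation past EOF which leaves st_size untouched:
 *
 *	fstat(fd, &st);
 *	fallocate(fd, FALLOC_FL_KEEP_SIZE, st.st_size, 2 << 20);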
2822 */ 2823 undo_fallocend = info->fallocend; 2824 if (info->fallocend < end) 2825 info->fallocend = end; 2826 2827 for (index = start; index < end; ) { 2828 struct folio *folio; 2829 2830 /* 2831 * Good, the fallocate(2) manpage permits EINTR: we may have 2832 * been interrupted because we are using up too much memory. 2833 */ 2834 if (signal_pending(current)) 2835 error = -EINTR; 2836 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 2837 error = -ENOMEM; 2838 else 2839 error = shmem_get_folio(inode, index, &folio, 2840 SGP_FALLOC); 2841 if (error) { 2842 info->fallocend = undo_fallocend; 2843 /* Remove the !uptodate folios we added */ 2844 if (index > start) { 2845 shmem_undo_range(inode, 2846 (loff_t)start << PAGE_SHIFT, 2847 ((loff_t)index << PAGE_SHIFT) - 1, true); 2848 } 2849 goto undone; 2850 } 2851 2852 /* 2853 * Here is a more important optimization than it appears: 2854 * a second SGP_FALLOC on the same large folio will clear it, 2855 * making it uptodate and un-undoable if we fail later. 2856 */ 2857 index = folio_next_index(folio); 2858 /* Beware 32-bit wraparound */ 2859 if (!index) 2860 index--; 2861 2862 /* 2863 * Inform shmem_writepage() how far we have reached. 2864 * No need for lock or barrier: we have the page lock. 2865 */ 2866 if (!folio_test_uptodate(folio)) 2867 shmem_falloc.nr_falloced += index - shmem_falloc.next; 2868 shmem_falloc.next = index; 2869 2870 /* 2871 * If !uptodate, leave it that way so that freeable folios 2872 * can be recognized if we need to rollback on error later. 2873 * But mark it dirty so that memory pressure will swap rather 2874 * than free the folios we are allocating (and SGP_CACHE folios 2875 * might still be clean: we now need to mark those dirty too). 2876 */ 2877 folio_mark_dirty(folio); 2878 folio_unlock(folio); 2879 folio_put(folio); 2880 cond_resched(); 2881 } 2882 2883 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) 2884 i_size_write(inode, offset + len); 2885 undone: 2886 spin_lock(&inode->i_lock); 2887 inode->i_private = NULL; 2888 spin_unlock(&inode->i_lock); 2889 out: 2890 if (!error) 2891 file_modified(file); 2892 inode_unlock(inode); 2893 return error; 2894 } 2895 2896 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 2897 { 2898 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 2899 2900 buf->f_type = TMPFS_MAGIC; 2901 buf->f_bsize = PAGE_SIZE; 2902 buf->f_namelen = NAME_MAX; 2903 if (sbinfo->max_blocks) { 2904 buf->f_blocks = sbinfo->max_blocks; 2905 buf->f_bavail = 2906 buf->f_bfree = sbinfo->max_blocks - 2907 percpu_counter_sum(&sbinfo->used_blocks); 2908 } 2909 if (sbinfo->max_inodes) { 2910 buf->f_files = sbinfo->max_inodes; 2911 buf->f_ffree = sbinfo->free_inodes; 2912 } 2913 /* else leave those fields 0 like simple_statfs */ 2914 2915 buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); 2916 2917 return 0; 2918 } 2919 2920 /* 2921 * File creation. Allocate an inode, and we're done.. 
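 *
 * For illustration (userspace sketch, not kernel code), each of these
 * ends up in shmem_mknod() on a tmpfs mount such as /dev/shm:
 *
 *	open("/dev/shm/f", O_CREAT | O_RDWR, 0600)	-> shmem_create()
 *	mkdir("/dev/shm/d", 0700)			-> shmem_mkdir()
 *	mknod("/dev/shm/p", S_IFIFO | 0600, 0)		-> shmem_mknod()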
2922 */ 2923 static int 2924 shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, 2925 struct dentry *dentry, umode_t mode, dev_t dev) 2926 { 2927 struct inode *inode; 2928 int error = -ENOSPC; 2929 2930 inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); 2931 if (inode) { 2932 error = simple_acl_create(dir, inode); 2933 if (error) 2934 goto out_iput; 2935 error = security_inode_init_security(inode, dir, 2936 &dentry->d_name, 2937 shmem_initxattrs, NULL); 2938 if (error && error != -EOPNOTSUPP) 2939 goto out_iput; 2940 2941 error = 0; 2942 dir->i_size += BOGO_DIRENT_SIZE; 2943 dir->i_ctime = dir->i_mtime = current_time(dir); 2944 inode_inc_iversion(dir); 2945 d_instantiate(dentry, inode); 2946 dget(dentry); /* Extra count - pin the dentry in core */ 2947 } 2948 return error; 2949 out_iput: 2950 iput(inode); 2951 return error; 2952 } 2953 2954 static int 2955 shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, 2956 struct file *file, umode_t mode) 2957 { 2958 struct inode *inode; 2959 int error = -ENOSPC; 2960 2961 inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); 2962 if (inode) { 2963 error = security_inode_init_security(inode, dir, 2964 NULL, 2965 shmem_initxattrs, NULL); 2966 if (error && error != -EOPNOTSUPP) 2967 goto out_iput; 2968 error = simple_acl_create(dir, inode); 2969 if (error) 2970 goto out_iput; 2971 d_tmpfile(file, inode); 2972 } 2973 return finish_open_simple(file, error); 2974 out_iput: 2975 iput(inode); 2976 return error; 2977 } 2978 2979 static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, 2980 struct dentry *dentry, umode_t mode) 2981 { 2982 int error; 2983 2984 error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); 2985 if (error) 2986 return error; 2987 inc_nlink(dir); 2988 return 0; 2989 } 2990 2991 static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, 2992 struct dentry *dentry, umode_t mode, bool excl) 2993 { 2994 return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); 2995 } 2996 2997 /* 2998 * Link a file.. 2999 */ 3000 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 3001 { 3002 struct inode *inode = d_inode(old_dentry); 3003 int ret = 0; 3004 3005 /* 3006 * No ordinary (disk based) filesystem counts links as inodes; 3007 * but each new link needs a new dentry, pinning lowmem, and 3008 * tmpfs dentries cannot be pruned until they are unlinked. 3009 * But if an O_TMPFILE file is linked into the tmpfs, the 3010 * first link must skip that, to get the accounting right. 
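 *
 * For illustration (userspace sketch, not kernel code), that O_TMPFILE
 * case arrives here via linkat(2):
 *
 *	fd = open("/dev/shm", O_TMPFILE | O_RDWR, 0600);
 *	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
 *	linkat(AT_FDCWD, path, AT_FDCWD, "/dev/shm/file", AT_SYMLINK_FOLLOW);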
3011 */ 3012 if (inode->i_nlink) { 3013 ret = shmem_reserve_inode(inode->i_sb, NULL); 3014 if (ret) 3015 goto out; 3016 } 3017 3018 dir->i_size += BOGO_DIRENT_SIZE; 3019 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 3020 inode_inc_iversion(dir); 3021 inc_nlink(inode); 3022 ihold(inode); /* New dentry reference */ 3023 dget(dentry); /* Extra pinning count for the created dentry */ 3024 d_instantiate(dentry, inode); 3025 out: 3026 return ret; 3027 } 3028 3029 static int shmem_unlink(struct inode *dir, struct dentry *dentry) 3030 { 3031 struct inode *inode = d_inode(dentry); 3032 3033 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 3034 shmem_free_inode(inode->i_sb); 3035 3036 dir->i_size -= BOGO_DIRENT_SIZE; 3037 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 3038 inode_inc_iversion(dir); 3039 drop_nlink(inode); 3040 dput(dentry); /* Undo the count from "create" - this does all the work */ 3041 return 0; 3042 } 3043 3044 static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 3045 { 3046 if (!simple_empty(dentry)) 3047 return -ENOTEMPTY; 3048 3049 drop_nlink(d_inode(dentry)); 3050 drop_nlink(dir); 3051 return shmem_unlink(dir, dentry); 3052 } 3053 3054 static int shmem_whiteout(struct mnt_idmap *idmap, 3055 struct inode *old_dir, struct dentry *old_dentry) 3056 { 3057 struct dentry *whiteout; 3058 int error; 3059 3060 whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); 3061 if (!whiteout) 3062 return -ENOMEM; 3063 3064 error = shmem_mknod(idmap, old_dir, whiteout, 3065 S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); 3066 dput(whiteout); 3067 if (error) 3068 return error; 3069 3070 /* 3071 * Cheat and hash the whiteout while the old dentry is still in 3072 * place, instead of playing games with FS_RENAME_DOES_D_MOVE. 3073 * 3074 * d_lookup() will consistently find one of them at this point, 3075 * not sure which one, but that isn't even important. 3076 */ 3077 d_rehash(whiteout); 3078 return 0; 3079 } 3080 3081 /* 3082 * The VFS layer already does all the dentry stuff for rename, 3083 * we just have to decrement the usage count for the target if 3084 * it exists so that the VFS layer correctly free's it when it 3085 * gets overwritten. 
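 *
 * For illustration (userspace sketch, not kernel code), the flags
 * handled below correspond to renameat2(2):
 *
 *	renameat2(AT_FDCWD, "old", AT_FDCWD, "new", RENAME_NOREPLACE);
 *	renameat2(AT_FDCWD, "old", AT_FDCWD, "new", RENAME_EXCHANGE);
 *	renameat2(AT_FDCWD, "old", AT_FDCWD, "new", RENAME_WHITEOUT);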
3086 */ 3087 static int shmem_rename2(struct mnt_idmap *idmap, 3088 struct inode *old_dir, struct dentry *old_dentry, 3089 struct inode *new_dir, struct dentry *new_dentry, 3090 unsigned int flags) 3091 { 3092 struct inode *inode = d_inode(old_dentry); 3093 int they_are_dirs = S_ISDIR(inode->i_mode); 3094 3095 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 3096 return -EINVAL; 3097 3098 if (flags & RENAME_EXCHANGE) 3099 return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); 3100 3101 if (!simple_empty(new_dentry)) 3102 return -ENOTEMPTY; 3103 3104 if (flags & RENAME_WHITEOUT) { 3105 int error; 3106 3107 error = shmem_whiteout(idmap, old_dir, old_dentry); 3108 if (error) 3109 return error; 3110 } 3111 3112 if (d_really_is_positive(new_dentry)) { 3113 (void) shmem_unlink(new_dir, new_dentry); 3114 if (they_are_dirs) { 3115 drop_nlink(d_inode(new_dentry)); 3116 drop_nlink(old_dir); 3117 } 3118 } else if (they_are_dirs) { 3119 drop_nlink(old_dir); 3120 inc_nlink(new_dir); 3121 } 3122 3123 old_dir->i_size -= BOGO_DIRENT_SIZE; 3124 new_dir->i_size += BOGO_DIRENT_SIZE; 3125 old_dir->i_ctime = old_dir->i_mtime = 3126 new_dir->i_ctime = new_dir->i_mtime = 3127 inode->i_ctime = current_time(old_dir); 3128 inode_inc_iversion(old_dir); 3129 inode_inc_iversion(new_dir); 3130 return 0; 3131 } 3132 3133 static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, 3134 struct dentry *dentry, const char *symname) 3135 { 3136 int error; 3137 int len; 3138 struct inode *inode; 3139 struct folio *folio; 3140 3141 len = strlen(symname) + 1; 3142 if (len > PAGE_SIZE) 3143 return -ENAMETOOLONG; 3144 3145 inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, 3146 VM_NORESERVE); 3147 if (!inode) 3148 return -ENOSPC; 3149 3150 error = security_inode_init_security(inode, dir, &dentry->d_name, 3151 shmem_initxattrs, NULL); 3152 if (error && error != -EOPNOTSUPP) { 3153 iput(inode); 3154 return error; 3155 } 3156 3157 inode->i_size = len-1; 3158 if (len <= SHORT_SYMLINK_LEN) { 3159 inode->i_link = kmemdup(symname, len, GFP_KERNEL); 3160 if (!inode->i_link) { 3161 iput(inode); 3162 return -ENOMEM; 3163 } 3164 inode->i_op = &shmem_short_symlink_operations; 3165 } else { 3166 inode_nohighmem(inode); 3167 error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); 3168 if (error) { 3169 iput(inode); 3170 return error; 3171 } 3172 inode->i_mapping->a_ops = &shmem_aops; 3173 inode->i_op = &shmem_symlink_inode_operations; 3174 memcpy(folio_address(folio), symname, len); 3175 folio_mark_uptodate(folio); 3176 folio_mark_dirty(folio); 3177 folio_unlock(folio); 3178 folio_put(folio); 3179 } 3180 dir->i_size += BOGO_DIRENT_SIZE; 3181 dir->i_ctime = dir->i_mtime = current_time(dir); 3182 inode_inc_iversion(dir); 3183 d_instantiate(dentry, inode); 3184 dget(dentry); 3185 return 0; 3186 } 3187 3188 static void shmem_put_link(void *arg) 3189 { 3190 folio_mark_accessed(arg); 3191 folio_put(arg); 3192 } 3193 3194 static const char *shmem_get_link(struct dentry *dentry, 3195 struct inode *inode, 3196 struct delayed_call *done) 3197 { 3198 struct folio *folio = NULL; 3199 int error; 3200 3201 if (!dentry) { 3202 folio = filemap_get_folio(inode->i_mapping, 0); 3203 if (!folio) 3204 return ERR_PTR(-ECHILD); 3205 if (PageHWPoison(folio_page(folio, 0)) || 3206 !folio_test_uptodate(folio)) { 3207 folio_put(folio); 3208 return ERR_PTR(-ECHILD); 3209 } 3210 } else { 3211 error = shmem_get_folio(inode, 0, &folio, SGP_READ); 3212 if (error) 3213 return ERR_PTR(error); 3214 if (!folio) 3215 
return ERR_PTR(-ECHILD); 3216 if (PageHWPoison(folio_page(folio, 0))) { 3217 folio_unlock(folio); 3218 folio_put(folio); 3219 return ERR_PTR(-ECHILD); 3220 } 3221 folio_unlock(folio); 3222 } 3223 set_delayed_call(done, shmem_put_link, folio); 3224 return folio_address(folio); 3225 } 3226 3227 #ifdef CONFIG_TMPFS_XATTR 3228 3229 static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) 3230 { 3231 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 3232 3233 fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); 3234 3235 return 0; 3236 } 3237 3238 static int shmem_fileattr_set(struct mnt_idmap *idmap, 3239 struct dentry *dentry, struct fileattr *fa) 3240 { 3241 struct inode *inode = d_inode(dentry); 3242 struct shmem_inode_info *info = SHMEM_I(inode); 3243 3244 if (fileattr_has_fsx(fa)) 3245 return -EOPNOTSUPP; 3246 if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) 3247 return -EOPNOTSUPP; 3248 3249 info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | 3250 (fa->flags & SHMEM_FL_USER_MODIFIABLE); 3251 3252 shmem_set_inode_flags(inode, info->fsflags); 3253 inode->i_ctime = current_time(inode); 3254 inode_inc_iversion(inode); 3255 return 0; 3256 } 3257 3258 /* 3259 * Superblocks without xattr inode operations may get some security.* xattr 3260 * support from the LSM "for free". As soon as we have any other xattrs 3261 * like ACLs, we also need to implement the security.* handlers at 3262 * filesystem level, though. 3263 */ 3264 3265 /* 3266 * Callback for security_inode_init_security() for acquiring xattrs. 3267 */ 3268 static int shmem_initxattrs(struct inode *inode, 3269 const struct xattr *xattr_array, 3270 void *fs_info) 3271 { 3272 struct shmem_inode_info *info = SHMEM_I(inode); 3273 const struct xattr *xattr; 3274 struct simple_xattr *new_xattr; 3275 size_t len; 3276 3277 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 3278 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); 3279 if (!new_xattr) 3280 return -ENOMEM; 3281 3282 len = strlen(xattr->name) + 1; 3283 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 3284 GFP_KERNEL); 3285 if (!new_xattr->name) { 3286 kvfree(new_xattr); 3287 return -ENOMEM; 3288 } 3289 3290 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, 3291 XATTR_SECURITY_PREFIX_LEN); 3292 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 3293 xattr->name, len); 3294 3295 simple_xattr_add(&info->xattrs, new_xattr); 3296 } 3297 3298 return 0; 3299 } 3300 3301 static int shmem_xattr_handler_get(const struct xattr_handler *handler, 3302 struct dentry *unused, struct inode *inode, 3303 const char *name, void *buffer, size_t size) 3304 { 3305 struct shmem_inode_info *info = SHMEM_I(inode); 3306 3307 name = xattr_full_name(handler, name); 3308 return simple_xattr_get(&info->xattrs, name, buffer, size); 3309 } 3310 3311 static int shmem_xattr_handler_set(const struct xattr_handler *handler, 3312 struct mnt_idmap *idmap, 3313 struct dentry *unused, struct inode *inode, 3314 const char *name, const void *value, 3315 size_t size, int flags) 3316 { 3317 struct shmem_inode_info *info = SHMEM_I(inode); 3318 int err; 3319 3320 name = xattr_full_name(handler, name); 3321 err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); 3322 if (!err) { 3323 inode->i_ctime = current_time(inode); 3324 inode_inc_iversion(inode); 3325 } 3326 return err; 3327 } 3328 3329 static const struct xattr_handler shmem_security_xattr_handler = { 3330 .prefix = XATTR_SECURITY_PREFIX, 3331 .get = shmem_xattr_handler_get, 3332 .set = 
shmem_xattr_handler_set, 3333 }; 3334 3335 static const struct xattr_handler shmem_trusted_xattr_handler = { 3336 .prefix = XATTR_TRUSTED_PREFIX, 3337 .get = shmem_xattr_handler_get, 3338 .set = shmem_xattr_handler_set, 3339 }; 3340 3341 static const struct xattr_handler *shmem_xattr_handlers[] = { 3342 #ifdef CONFIG_TMPFS_POSIX_ACL 3343 &posix_acl_access_xattr_handler, 3344 &posix_acl_default_xattr_handler, 3345 #endif 3346 &shmem_security_xattr_handler, 3347 &shmem_trusted_xattr_handler, 3348 NULL 3349 }; 3350 3351 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 3352 { 3353 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 3354 return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); 3355 } 3356 #endif /* CONFIG_TMPFS_XATTR */ 3357 3358 static const struct inode_operations shmem_short_symlink_operations = { 3359 .getattr = shmem_getattr, 3360 .get_link = simple_get_link, 3361 #ifdef CONFIG_TMPFS_XATTR 3362 .listxattr = shmem_listxattr, 3363 #endif 3364 }; 3365 3366 static const struct inode_operations shmem_symlink_inode_operations = { 3367 .getattr = shmem_getattr, 3368 .get_link = shmem_get_link, 3369 #ifdef CONFIG_TMPFS_XATTR 3370 .listxattr = shmem_listxattr, 3371 #endif 3372 }; 3373 3374 static struct dentry *shmem_get_parent(struct dentry *child) 3375 { 3376 return ERR_PTR(-ESTALE); 3377 } 3378 3379 static int shmem_match(struct inode *ino, void *vfh) 3380 { 3381 __u32 *fh = vfh; 3382 __u64 inum = fh[2]; 3383 inum = (inum << 32) | fh[1]; 3384 return ino->i_ino == inum && fh[0] == ino->i_generation; 3385 } 3386 3387 /* Find any alias of inode, but prefer a hashed alias */ 3388 static struct dentry *shmem_find_alias(struct inode *inode) 3389 { 3390 struct dentry *alias = d_find_alias(inode); 3391 3392 return alias ?: d_find_any_alias(inode); 3393 } 3394 3395 3396 static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 3397 struct fid *fid, int fh_len, int fh_type) 3398 { 3399 struct inode *inode; 3400 struct dentry *dentry = NULL; 3401 u64 inum; 3402 3403 if (fh_len < 3) 3404 return NULL; 3405 3406 inum = fid->raw[2]; 3407 inum = (inum << 32) | fid->raw[1]; 3408 3409 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 3410 shmem_match, fid->raw); 3411 if (inode) { 3412 dentry = shmem_find_alias(inode); 3413 iput(inode); 3414 } 3415 3416 return dentry; 3417 } 3418 3419 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, 3420 struct inode *parent) 3421 { 3422 if (*len < 3) { 3423 *len = 3; 3424 return FILEID_INVALID; 3425 } 3426 3427 if (inode_unhashed(inode)) { 3428 /* Unfortunately insert_inode_hash is not idempotent, 3429 * so as we hash inodes here rather than at creation 3430 * time, we need a lock to ensure we only try 3431 * to do it once 3432 */ 3433 static DEFINE_SPINLOCK(lock); 3434 spin_lock(&lock); 3435 if (inode_unhashed(inode)) 3436 __insert_inode_hash(inode, 3437 inode->i_ino + inode->i_generation); 3438 spin_unlock(&lock); 3439 } 3440 3441 fh[0] = inode->i_generation; 3442 fh[1] = inode->i_ino; 3443 fh[2] = ((__u64)inode->i_ino) >> 32; 3444 3445 *len = 3; 3446 return 1; 3447 } 3448 3449 static const struct export_operations shmem_export_ops = { 3450 .get_parent = shmem_get_parent, 3451 .encode_fh = shmem_encode_fh, 3452 .fh_to_dentry = shmem_fh_to_dentry, 3453 }; 3454 3455 enum shmem_param { 3456 Opt_gid, 3457 Opt_huge, 3458 Opt_mode, 3459 Opt_mpol, 3460 Opt_nr_blocks, 3461 Opt_nr_inodes, 3462 Opt_size, 3463 Opt_uid, 3464 Opt_inode32, 3465 Opt_inode64, 3466 }; 3467 3468 static const 
struct constant_table shmem_param_enums_huge[] = { 3469 {"never", SHMEM_HUGE_NEVER }, 3470 {"always", SHMEM_HUGE_ALWAYS }, 3471 {"within_size", SHMEM_HUGE_WITHIN_SIZE }, 3472 {"advise", SHMEM_HUGE_ADVISE }, 3473 {} 3474 }; 3475 3476 const struct fs_parameter_spec shmem_fs_parameters[] = { 3477 fsparam_u32 ("gid", Opt_gid), 3478 fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), 3479 fsparam_u32oct("mode", Opt_mode), 3480 fsparam_string("mpol", Opt_mpol), 3481 fsparam_string("nr_blocks", Opt_nr_blocks), 3482 fsparam_string("nr_inodes", Opt_nr_inodes), 3483 fsparam_string("size", Opt_size), 3484 fsparam_u32 ("uid", Opt_uid), 3485 fsparam_flag ("inode32", Opt_inode32), 3486 fsparam_flag ("inode64", Opt_inode64), 3487 {} 3488 }; 3489 3490 static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) 3491 { 3492 struct shmem_options *ctx = fc->fs_private; 3493 struct fs_parse_result result; 3494 unsigned long long size; 3495 char *rest; 3496 int opt; 3497 3498 opt = fs_parse(fc, shmem_fs_parameters, param, &result); 3499 if (opt < 0) 3500 return opt; 3501 3502 switch (opt) { 3503 case Opt_size: 3504 size = memparse(param->string, &rest); 3505 if (*rest == '%') { 3506 size <<= PAGE_SHIFT; 3507 size *= totalram_pages(); 3508 do_div(size, 100); 3509 rest++; 3510 } 3511 if (*rest) 3512 goto bad_value; 3513 ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); 3514 ctx->seen |= SHMEM_SEEN_BLOCKS; 3515 break; 3516 case Opt_nr_blocks: 3517 ctx->blocks = memparse(param->string, &rest); 3518 if (*rest || ctx->blocks > S64_MAX) 3519 goto bad_value; 3520 ctx->seen |= SHMEM_SEEN_BLOCKS; 3521 break; 3522 case Opt_nr_inodes: 3523 ctx->inodes = memparse(param->string, &rest); 3524 if (*rest) 3525 goto bad_value; 3526 ctx->seen |= SHMEM_SEEN_INODES; 3527 break; 3528 case Opt_mode: 3529 ctx->mode = result.uint_32 & 07777; 3530 break; 3531 case Opt_uid: 3532 ctx->uid = make_kuid(current_user_ns(), result.uint_32); 3533 if (!uid_valid(ctx->uid)) 3534 goto bad_value; 3535 break; 3536 case Opt_gid: 3537 ctx->gid = make_kgid(current_user_ns(), result.uint_32); 3538 if (!gid_valid(ctx->gid)) 3539 goto bad_value; 3540 break; 3541 case Opt_huge: 3542 ctx->huge = result.uint_32; 3543 if (ctx->huge != SHMEM_HUGE_NEVER && 3544 !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 3545 has_transparent_hugepage())) 3546 goto unsupported_parameter; 3547 ctx->seen |= SHMEM_SEEN_HUGE; 3548 break; 3549 case Opt_mpol: 3550 if (IS_ENABLED(CONFIG_NUMA)) { 3551 mpol_put(ctx->mpol); 3552 ctx->mpol = NULL; 3553 if (mpol_parse_str(param->string, &ctx->mpol)) 3554 goto bad_value; 3555 break; 3556 } 3557 goto unsupported_parameter; 3558 case Opt_inode32: 3559 ctx->full_inums = false; 3560 ctx->seen |= SHMEM_SEEN_INUMS; 3561 break; 3562 case Opt_inode64: 3563 if (sizeof(ino_t) < 8) { 3564 return invalfc(fc, 3565 "Cannot use inode64 with <64bit inums in kernel\n"); 3566 } 3567 ctx->full_inums = true; 3568 ctx->seen |= SHMEM_SEEN_INUMS; 3569 break; 3570 } 3571 return 0; 3572 3573 unsupported_parameter: 3574 return invalfc(fc, "Unsupported parameter '%s'", param->key); 3575 bad_value: 3576 return invalfc(fc, "Bad value for '%s'", param->key); 3577 } 3578 3579 static int shmem_parse_options(struct fs_context *fc, void *data) 3580 { 3581 char *options = data; 3582 3583 if (options) { 3584 int err = security_sb_eat_lsm_opts(options, &fc->security); 3585 if (err) 3586 return err; 3587 } 3588 3589 while (options != NULL) { 3590 char *this_char = options; 3591 for (;;) { 3592 /* 3593 * NUL-terminate this option: unfortunately, 3594 * mount 
options form a comma-separated list, 3595 * but mpol's nodelist may also contain commas. 3596 */ 3597 options = strchr(options, ','); 3598 if (options == NULL) 3599 break; 3600 options++; 3601 if (!isdigit(*options)) { 3602 options[-1] = '\0'; 3603 break; 3604 } 3605 } 3606 if (*this_char) { 3607 char *value = strchr(this_char, '='); 3608 size_t len = 0; 3609 int err; 3610 3611 if (value) { 3612 *value++ = '\0'; 3613 len = strlen(value); 3614 } 3615 err = vfs_parse_fs_string(fc, this_char, value, len); 3616 if (err < 0) 3617 return err; 3618 } 3619 } 3620 return 0; 3621 } 3622 3623 /* 3624 * Reconfigure a shmem filesystem. 3625 * 3626 * Note that we disallow change from limited->unlimited blocks/inodes while any 3627 * are in use; but we must separately disallow unlimited->limited, because in 3628 * that case we have no record of how much is already in use. 3629 */ 3630 static int shmem_reconfigure(struct fs_context *fc) 3631 { 3632 struct shmem_options *ctx = fc->fs_private; 3633 struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); 3634 unsigned long inodes; 3635 struct mempolicy *mpol = NULL; 3636 const char *err; 3637 3638 raw_spin_lock(&sbinfo->stat_lock); 3639 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 3640 3641 if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { 3642 if (!sbinfo->max_blocks) { 3643 err = "Cannot retroactively limit size"; 3644 goto out; 3645 } 3646 if (percpu_counter_compare(&sbinfo->used_blocks, 3647 ctx->blocks) > 0) { 3648 err = "Too small a size for current use"; 3649 goto out; 3650 } 3651 } 3652 if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { 3653 if (!sbinfo->max_inodes) { 3654 err = "Cannot retroactively limit inodes"; 3655 goto out; 3656 } 3657 if (ctx->inodes < inodes) { 3658 err = "Too few inodes for current use"; 3659 goto out; 3660 } 3661 } 3662 3663 if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && 3664 sbinfo->next_ino > UINT_MAX) { 3665 err = "Current inum too high to switch to 32-bit inums"; 3666 goto out; 3667 } 3668 3669 if (ctx->seen & SHMEM_SEEN_HUGE) 3670 sbinfo->huge = ctx->huge; 3671 if (ctx->seen & SHMEM_SEEN_INUMS) 3672 sbinfo->full_inums = ctx->full_inums; 3673 if (ctx->seen & SHMEM_SEEN_BLOCKS) 3674 sbinfo->max_blocks = ctx->blocks; 3675 if (ctx->seen & SHMEM_SEEN_INODES) { 3676 sbinfo->max_inodes = ctx->inodes; 3677 sbinfo->free_inodes = ctx->inodes - inodes; 3678 } 3679 3680 /* 3681 * Preserve previous mempolicy unless mpol remount option was specified. 
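 *
 * For illustration (userspace sketch, not kernel code), a remount that
 * takes this path could look like:
 *
 *	mount(NULL, "/dev/shm", NULL, MS_REMOUNT,
 *	      "size=2G,nr_inodes=100000,mpol=interleave");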
3682 */ 3683 if (ctx->mpol) { 3684 mpol = sbinfo->mpol; 3685 sbinfo->mpol = ctx->mpol; /* transfers initial ref */ 3686 ctx->mpol = NULL; 3687 } 3688 raw_spin_unlock(&sbinfo->stat_lock); 3689 mpol_put(mpol); 3690 return 0; 3691 out: 3692 raw_spin_unlock(&sbinfo->stat_lock); 3693 return invalfc(fc, "%s", err); 3694 } 3695 3696 static int shmem_show_options(struct seq_file *seq, struct dentry *root) 3697 { 3698 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); 3699 3700 if (sbinfo->max_blocks != shmem_default_max_blocks()) 3701 seq_printf(seq, ",size=%luk", 3702 sbinfo->max_blocks << (PAGE_SHIFT - 10)); 3703 if (sbinfo->max_inodes != shmem_default_max_inodes()) 3704 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 3705 if (sbinfo->mode != (0777 | S_ISVTX)) 3706 seq_printf(seq, ",mode=%03ho", sbinfo->mode); 3707 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 3708 seq_printf(seq, ",uid=%u", 3709 from_kuid_munged(&init_user_ns, sbinfo->uid)); 3710 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 3711 seq_printf(seq, ",gid=%u", 3712 from_kgid_munged(&init_user_ns, sbinfo->gid)); 3713 3714 /* 3715 * Showing inode{64,32} might be useful even if it's the system default, 3716 * since then people don't have to resort to checking both here and 3717 * /proc/config.gz to confirm 64-bit inums were successfully applied 3718 * (which may not even exist if IKCONFIG_PROC isn't enabled). 3719 * 3720 * We hide it when inode64 isn't the default and we are using 32-bit 3721 * inodes, since that probably just means the feature isn't even under 3722 * consideration. 3723 * 3724 * As such: 3725 * 3726 * +-----------------+-----------------+ 3727 * | TMPFS_INODE64=y | TMPFS_INODE64=n | 3728 * +------------------+-----------------+-----------------+ 3729 * | full_inums=true | show | show | 3730 * | full_inums=false | show | hide | 3731 * +------------------+-----------------+-----------------+ 3732 * 3733 */ 3734 if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) 3735 seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32)); 3736 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3737 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ 3738 if (sbinfo->huge) 3739 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); 3740 #endif 3741 shmem_show_mpol(seq, sbinfo->mpol); 3742 return 0; 3743 } 3744 3745 #endif /* CONFIG_TMPFS */ 3746 3747 static void shmem_put_super(struct super_block *sb) 3748 { 3749 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 3750 3751 free_percpu(sbinfo->ino_batch); 3752 percpu_counter_destroy(&sbinfo->used_blocks); 3753 mpol_put(sbinfo->mpol); 3754 kfree(sbinfo); 3755 sb->s_fs_info = NULL; 3756 } 3757 3758 static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) 3759 { 3760 struct shmem_options *ctx = fc->fs_private; 3761 struct inode *inode; 3762 struct shmem_sb_info *sbinfo; 3763 3764 /* Round up to L1_CACHE_BYTES to resist false sharing */ 3765 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 3766 L1_CACHE_BYTES), GFP_KERNEL); 3767 if (!sbinfo) 3768 return -ENOMEM; 3769 3770 sb->s_fs_info = sbinfo; 3771 3772 #ifdef CONFIG_TMPFS 3773 /* 3774 * Per default we only allow half of the physical ram per 3775 * tmpfs instance, limiting inodes to one per page of lowmem; 3776 * but the internal instance is left unlimited. 
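 *
 * For illustration (userspace sketch, not kernel code), an explicit
 * mount overriding those defaults:
 *
 *	mount("tmpfs", "/mnt/tmp", "tmpfs", 0,
 *	      "size=50%,nr_inodes=0,mode=1777");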
3777 */ 3778 if (!(sb->s_flags & SB_KERNMOUNT)) { 3779 if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) 3780 ctx->blocks = shmem_default_max_blocks(); 3781 if (!(ctx->seen & SHMEM_SEEN_INODES)) 3782 ctx->inodes = shmem_default_max_inodes(); 3783 if (!(ctx->seen & SHMEM_SEEN_INUMS)) 3784 ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); 3785 } else { 3786 sb->s_flags |= SB_NOUSER; 3787 } 3788 sb->s_export_op = &shmem_export_ops; 3789 sb->s_flags |= SB_NOSEC | SB_I_VERSION; 3790 #else 3791 sb->s_flags |= SB_NOUSER; 3792 #endif 3793 sbinfo->max_blocks = ctx->blocks; 3794 sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; 3795 if (sb->s_flags & SB_KERNMOUNT) { 3796 sbinfo->ino_batch = alloc_percpu(ino_t); 3797 if (!sbinfo->ino_batch) 3798 goto failed; 3799 } 3800 sbinfo->uid = ctx->uid; 3801 sbinfo->gid = ctx->gid; 3802 sbinfo->full_inums = ctx->full_inums; 3803 sbinfo->mode = ctx->mode; 3804 sbinfo->huge = ctx->huge; 3805 sbinfo->mpol = ctx->mpol; 3806 ctx->mpol = NULL; 3807 3808 raw_spin_lock_init(&sbinfo->stat_lock); 3809 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) 3810 goto failed; 3811 spin_lock_init(&sbinfo->shrinklist_lock); 3812 INIT_LIST_HEAD(&sbinfo->shrinklist); 3813 3814 sb->s_maxbytes = MAX_LFS_FILESIZE; 3815 sb->s_blocksize = PAGE_SIZE; 3816 sb->s_blocksize_bits = PAGE_SHIFT; 3817 sb->s_magic = TMPFS_MAGIC; 3818 sb->s_op = &shmem_ops; 3819 sb->s_time_gran = 1; 3820 #ifdef CONFIG_TMPFS_XATTR 3821 sb->s_xattr = shmem_xattr_handlers; 3822 #endif 3823 #ifdef CONFIG_TMPFS_POSIX_ACL 3824 sb->s_flags |= SB_POSIXACL; 3825 #endif 3826 uuid_gen(&sb->s_uuid); 3827 3828 inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, 3829 VM_NORESERVE); 3830 if (!inode) 3831 goto failed; 3832 inode->i_uid = sbinfo->uid; 3833 inode->i_gid = sbinfo->gid; 3834 sb->s_root = d_make_root(inode); 3835 if (!sb->s_root) 3836 goto failed; 3837 return 0; 3838 3839 failed: 3840 shmem_put_super(sb); 3841 return -ENOMEM; 3842 } 3843 3844 static int shmem_get_tree(struct fs_context *fc) 3845 { 3846 return get_tree_nodev(fc, shmem_fill_super); 3847 } 3848 3849 static void shmem_free_fc(struct fs_context *fc) 3850 { 3851 struct shmem_options *ctx = fc->fs_private; 3852 3853 if (ctx) { 3854 mpol_put(ctx->mpol); 3855 kfree(ctx); 3856 } 3857 } 3858 3859 static const struct fs_context_operations shmem_fs_context_ops = { 3860 .free = shmem_free_fc, 3861 .get_tree = shmem_get_tree, 3862 #ifdef CONFIG_TMPFS 3863 .parse_monolithic = shmem_parse_options, 3864 .parse_param = shmem_parse_one, 3865 .reconfigure = shmem_reconfigure, 3866 #endif 3867 }; 3868 3869 static struct kmem_cache *shmem_inode_cachep; 3870 3871 static struct inode *shmem_alloc_inode(struct super_block *sb) 3872 { 3873 struct shmem_inode_info *info; 3874 info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); 3875 if (!info) 3876 return NULL; 3877 return &info->vfs_inode; 3878 } 3879 3880 static void shmem_free_in_core_inode(struct inode *inode) 3881 { 3882 if (S_ISLNK(inode->i_mode)) 3883 kfree(inode->i_link); 3884 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 3885 } 3886 3887 static void shmem_destroy_inode(struct inode *inode) 3888 { 3889 if (S_ISREG(inode->i_mode)) 3890 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 3891 } 3892 3893 static void shmem_init_inode(void *foo) 3894 { 3895 struct shmem_inode_info *info = foo; 3896 inode_init_once(&info->vfs_inode); 3897 } 3898 3899 static void shmem_init_inodecache(void) 3900 { 3901 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 3902 sizeof(struct 

static void shmem_destroy_inodecache(void)
{
	kmem_cache_destroy(shmem_inode_cachep);
}

/* Keep the page in page cache instead of truncating it */
static int shmem_error_remove_page(struct address_space *mapping,
				   struct page *page)
{
	return 0;
}

const struct address_space_operations shmem_aops = {
	.writepage	= shmem_writepage,
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_TMPFS
	.write_begin	= shmem_write_begin,
	.write_end	= shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
	.error_remove_page = shmem_error_remove_page,
};
EXPORT_SYMBOL(shmem_aops);

static const struct file_operations shmem_file_operations = {
	.mmap		= shmem_mmap,
	.open		= generic_file_open,
	.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
	.llseek		= shmem_file_llseek,
	.read_iter	= shmem_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.fsync		= noop_fsync,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= shmem_fallocate,
#endif
};

static const struct inode_operations shmem_inode_operations = {
	.getattr	= shmem_getattr,
	.setattr	= shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
	.set_acl	= simple_set_acl,
	.fileattr_get	= shmem_fileattr_get,
	.fileattr_set	= shmem_fileattr_set,
#endif
};

static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
	.getattr	= shmem_getattr,
	.create		= shmem_create,
	.lookup		= simple_lookup,
	.link		= shmem_link,
	.unlink		= shmem_unlink,
	.symlink	= shmem_symlink,
	.mkdir		= shmem_mkdir,
	.rmdir		= shmem_rmdir,
	.mknod		= shmem_mknod,
	.rename		= shmem_rename2,
	.tmpfile	= shmem_tmpfile,
#endif
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
	.fileattr_get	= shmem_fileattr_get,
	.fileattr_set	= shmem_fileattr_set,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};

static const struct inode_operations shmem_special_inode_operations = {
	.getattr	= shmem_getattr,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};

static const struct super_operations shmem_ops = {
	.alloc_inode	= shmem_alloc_inode,
	.free_inode	= shmem_free_in_core_inode,
	.destroy_inode	= shmem_destroy_inode,
#ifdef CONFIG_TMPFS
	.statfs		= shmem_statfs,
	.show_options	= shmem_show_options,
#endif
	.evict_inode	= shmem_evict_inode,
	.drop_inode	= generic_delete_inode,
	.put_super	= shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	.nr_cached_objects	= shmem_unused_huge_count,
	.free_cached_objects	= shmem_unused_huge_scan,
#endif
};

static const struct vm_operations_struct shmem_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy	= shmem_set_policy,
	.get_policy	= shmem_get_policy,
#endif
};

static const struct vm_operations_struct shmem_anon_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy	= shmem_set_policy,
	.get_policy	= shmem_get_policy,
#endif
};

int shmem_init_fs_context(struct fs_context *fc)
{
	struct shmem_options *ctx;

	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	ctx->mode = 0777 | S_ISVTX;
	ctx->uid = current_fsuid();
	ctx->gid = current_fsgid();

	fc->fs_private = ctx;
	fc->ops = &shmem_fs_context_ops;
	return 0;
}

static struct file_system_type shmem_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "tmpfs",
	.init_fs_context = shmem_init_fs_context,
#ifdef CONFIG_TMPFS
	.parameters	= shmem_fs_parameters,
#endif
	.kill_sb	= kill_litter_super,
#ifdef CONFIG_SHMEM
	.fs_flags	= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags	= FS_USERNS_MOUNT,
#endif
};

void __init shmem_init(void)
{
	int error;

	shmem_init_inodecache();

	error = register_filesystem(&shmem_fs_type);
	if (error) {
		pr_err("Could not register tmpfs\n");
		goto out2;
	}

	shm_mnt = kern_mount(&shmem_fs_type);
	if (IS_ERR(shm_mnt)) {
		error = PTR_ERR(shm_mnt);
		pr_err("Could not kern_mount tmpfs\n");
		goto out1;
	}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	else
		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
#endif
	return;

out1:
	unregister_filesystem(&shmem_fs_type);
out2:
	shmem_destroy_inodecache();
	shm_mnt = ERR_PTR(error);
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
static ssize_t shmem_enabled_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	static const int values[] = {
		SHMEM_HUGE_ALWAYS,
		SHMEM_HUGE_WITHIN_SIZE,
		SHMEM_HUGE_ADVISE,
		SHMEM_HUGE_NEVER,
		SHMEM_HUGE_DENY,
		SHMEM_HUGE_FORCE,
	};
	int len = 0;
	int i;

	for (i = 0; i < ARRAY_SIZE(values); i++) {
		len += sysfs_emit_at(buf, len,
				     shmem_huge == values[i] ? "%s[%s]" : "%s%s",
				     i ? " " : "",
				     shmem_format_huge(values[i]));
	}

	len += sysfs_emit_at(buf, len, "\n");

	return len;
}

static ssize_t shmem_enabled_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	char tmp[16];
	int huge;

	if (count + 1 > sizeof(tmp))
		return -EINVAL;
	memcpy(tmp, buf, count);
	tmp[count] = '\0';
	if (count && tmp[count - 1] == '\n')
		tmp[count - 1] = '\0';

	huge = shmem_parse_huge(tmp);
	if (huge == -EINVAL)
		return -EINVAL;
	if (!has_transparent_hugepage() &&
			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
		return -EINVAL;

	shmem_huge = huge;
	if (shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	return count;
}

struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */

#else /* !CONFIG_SHMEM */

/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * their complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */

static struct file_system_type shmem_fs_type = {
	.name		= "tmpfs",
	.init_fs_context = ramfs_init_fs_context,
	.parameters	= ramfs_fs_parameters,
	.kill_sb	= kill_litter_super,
	.fs_flags	= FS_USERNS_MOUNT,
};

void __init shmem_init(void)
{
	BUG_ON(register_filesystem(&shmem_fs_type) != 0);

	shm_mnt = kern_mount(&shmem_fs_type);
	BUG_ON(IS_ERR(shm_mnt));
}

int shmem_unuse(unsigned int type)
{
	return 0;
}

int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
	return 0;
}

void shmem_unlock_mapping(struct address_space *mapping)
{
}

#ifdef CONFIG_MMU
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long addr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
}
#endif

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

#define shmem_vm_ops				generic_file_vm_ops
#define shmem_anon_vm_ops			generic_file_vm_ops
#define shmem_file_operations			ramfs_file_operations
#define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size)		0
#define shmem_unacct_size(flags, size)		do {} while (0)

#endif /* CONFIG_SHMEM */

/* common code */

static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
			loff_t size, unsigned long flags, unsigned int i_flags)
{
	struct inode *inode;
	struct file *res;

	if (IS_ERR(mnt))
		return ERR_CAST(mnt);

	if (size < 0 || size > MAX_LFS_FILESIZE)
		return ERR_PTR(-EINVAL);

	if (shmem_acct_size(flags, size))
		return ERR_PTR(-ENOMEM);

	if (is_idmapped_mnt(mnt))
		return ERR_PTR(-EINVAL);

	inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
				S_IFREG | S_IRWXUGO, 0, flags);
	if (unlikely(!inode)) {
		shmem_unacct_size(flags, size);
		return ERR_PTR(-ENOSPC);
	}
	inode->i_flags |= i_flags;
	inode->i_size = size;
	clear_nlink(inode);	/* It is unlinked */
	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
	if (!IS_ERR(res))
		res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
				&shmem_file_operations);
	if (IS_ERR(res))
		iput(inode);
	return res;
}

/**
 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
 *	kernel internal. There will be NO LSM permission checks against the
 *	underlying inode. So users of this interface must do LSM checks at a
 *	higher layer. The users are the big_key and shm implementations. LSM
 *	checks are provided at the key or shm level rather than the inode.
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}

/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);

/**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
				       loff_t size, unsigned long flags)
{
	return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
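
/*
 * A minimal usage sketch (not part of the original source): a kernel user
 * might back an object with an unlinked tmpfs file roughly as below. The
 * function name example_shmem_buffer() is hypothetical; shmem_file_setup()
 * and fput() are the real interfaces documented above. The block is kept
 * under #if 0 because it exists only to illustrate the API.
 */
#if 0
static struct file *example_shmem_buffer(loff_t size)
{
	struct file *file;

	/* Create an unlinked tmpfs file on the internal mount; size is pre-accounted. */
	file = shmem_file_setup("example-buffer", size, 0);
	if (IS_ERR(file))
		return file;

	/*
	 * The caller now owns a reference: pages can be read or faulted in
	 * through file->f_mapping, and the file is released with fput(file)
	 * when no longer needed.
	 */
	return file;
}
#endif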

/**
 * shmem_zero_setup - setup a shared anonymous mapping
 * @vma: the vma to be mmapped is prepared by do_mmap
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
	struct file *file;
	loff_t size = vma->vm_end - vma->vm_start;

	/*
	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
	 * between XFS directory reading and selinux: since this file is only
	 * accessible to the user through its mapping, use S_PRIVATE flag to
	 * bypass file security, in the same way as shmem_kernel_file_setup().
	 */
	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
	if (IS_ERR(file))
		return PTR_ERR(file);

	if (vma->vm_file)
		fput(vma->vm_file);
	vma->vm_file = file;
	vma->vm_ops = &shmem_anon_vm_ops;

	return 0;
}

/**
 * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
 * @mapping: the folio's address_space
 * @index: the folio index
 * @gfp: the page allocator flags to use if allocating
 *
 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
 * with any new page allocations done using the specified allocation flags.
 * But read_cache_page_gfp() uses the ->read_folio() method, which does not
 * suit tmpfs, since it may have pages in swapcache, and needs to find those
 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
 *
 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
 */
struct folio *shmem_read_folio_gfp(struct address_space *mapping,
		pgoff_t index, gfp_t gfp)
{
#ifdef CONFIG_SHMEM
	struct inode *inode = mapping->host;
	struct folio *folio;
	int error;

	BUG_ON(!shmem_mapping(mapping));
	error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
				    gfp, NULL, NULL, NULL);
	if (error)
		return ERR_PTR(error);

	folio_unlock(folio);
	return folio;
#else
	/*
	 * The tiny !SHMEM case uses ramfs without swap
	 */
	return mapping_read_folio_gfp(mapping, index, gfp);
#endif
}
EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);

struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
					 pgoff_t index, gfp_t gfp)
{
	struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
	struct page *page;

	if (IS_ERR(folio))
		return &folio->page;

	page = folio_file_page(folio, index);
	if (PageHWPoison(page)) {
		folio_put(folio);
		return ERR_PTR(-EIO);
	}

	return page;
}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
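
/*
 * A minimal usage sketch (not part of the original source), along the lines
 * of the i915 pattern mentioned in the shmem_read_folio_gfp() comment: a
 * driver holding a shmem-backed file can pull individual pages from it while
 * softening the allocation with __GFP_NORETRY | __GFP_NOWARN. The function
 * name example_pin_shmem_page() is hypothetical; shmem_read_mapping_page_gfp()
 * and mapping_gfp_mask() are the real interfaces. Kept under #if 0 because it
 * is purely illustrative.
 */
#if 0
static struct page *example_pin_shmem_page(struct file *shmem_file, pgoff_t index)
{
	struct address_space *mapping = shmem_file->f_mapping;
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;
	struct page *page;

	/* Finds the page in page cache or swap cache, or allocates it. */
	page = shmem_read_mapping_page_gfp(mapping, index, gfp);
	if (IS_ERR(page))
		return page;

	/* The page is returned with a reference held; drop it later with put_page(). */
	return page;
}
#endif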