// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1997 Linus Torvalds
 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
 */
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/cdev.h>
#include <linux/memblock.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
#include <linux/prefetch.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <trace/events/writeback.h>
#include "internal.h"

/*
 * Inode locking rules:
 *
 * inode->i_lock protects:
 *   inode->i_state, inode->i_hash, __iget()
 * Inode LRU list locks protect:
 *   inode->i_sb->s_inode_lru, inode->i_lru
 * inode->i_sb->s_inode_list_lock protects:
 *   inode->i_sb->s_inodes, inode->i_sb_list
 * bdi->wb.list_lock protects:
 *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
 * inode_hash_lock protects:
 *   inode_hashtable, inode->i_hash
 *
 * Lock ordering:
 *
 * inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *     Inode LRU list locks
 *
 * bdi->wb.list_lock
 *   inode->i_lock
 *
 * inode_hash_lock
 *   inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *
 * iunique_lock
 *   inode_hash_lock
 */

static unsigned int i_hash_mask __read_mostly;
static unsigned int i_hash_shift __read_mostly;
static struct hlist_head *inode_hashtable __read_mostly;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);

/*
 * Empty aops. Can be used for the cases where the user does not
 * define any of the address_space operations.
 */
const struct address_space_operations empty_aops = {
};
EXPORT_SYMBOL(empty_aops);

/*
 * Statistics gathering..
 */
struct inodes_stat_t inodes_stat;

static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);

static struct kmem_cache *inode_cachep __read_mostly;

static long get_nr_inodes(void)
{
	int i;
	long sum = 0;
	for_each_possible_cpu(i)
		sum += per_cpu(nr_inodes, i);
	return sum < 0 ? 0 : sum;
}

static inline long get_nr_inodes_unused(void)
{
	int i;
	long sum = 0;
	for_each_possible_cpu(i)
		sum += per_cpu(nr_unused, i);
	return sum < 0 ? 0 : sum;
}

long get_nr_dirty_inodes(void)
{
	/* not actually dirty inodes, but a wild approximation */
	long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
	return nr_dirty > 0 ? nr_dirty : 0;
}

/*
 * Handle nr_inode sysctl
 */
#ifdef CONFIG_SYSCTL
int proc_nr_inodes(struct ctl_table *table, int write,
		   void *buffer, size_t *lenp, loff_t *ppos)
{
	inodes_stat.nr_inodes = get_nr_inodes();
	inodes_stat.nr_unused = get_nr_inodes_unused();
	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif

static int no_open(struct inode *inode, struct file *file)
{
	return -ENXIO;
}

/**
 * inode_init_always - perform inode structure initialisation
 * @sb: superblock inode belongs to
 * @inode: inode to initialise
 *
 * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
 */
int inode_init_always(struct super_block *sb, struct inode *inode)
{
	static const struct inode_operations empty_iops;
	static const struct file_operations no_open_fops = {.open = no_open};
	struct address_space *const mapping = &inode->i_data;

	inode->i_sb = sb;
	inode->i_blkbits = sb->s_blocksize_bits;
	inode->i_flags = 0;
	atomic64_set(&inode->i_sequence, 0);
	atomic_set(&inode->i_count, 1);
	inode->i_op = &empty_iops;
	inode->i_fop = &no_open_fops;
	inode->i_ino = 0;
	inode->__i_nlink = 1;
	inode->i_opflags = 0;
	if (sb->s_xattr)
		inode->i_opflags |= IOP_XATTR;
	i_uid_write(inode, 0);
	i_gid_write(inode, 0);
	atomic_set(&inode->i_writecount, 0);
	inode->i_size = 0;
	inode->i_write_hint = WRITE_LIFE_NOT_SET;
	inode->i_blocks = 0;
	inode->i_bytes = 0;
	inode->i_generation = 0;
	inode->i_pipe = NULL;
	inode->i_cdev = NULL;
	inode->i_link = NULL;
	inode->i_dir_seq = 0;
	inode->i_rdev = 0;
	inode->dirtied_when = 0;

#ifdef CONFIG_CGROUP_WRITEBACK
	inode->i_wb_frn_winner = 0;
	inode->i_wb_frn_avg_time = 0;
	inode->i_wb_frn_history = 0;
#endif

	if (security_inode_alloc(inode))
		goto out;
	spin_lock_init(&inode->i_lock);
	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);

	init_rwsem(&inode->i_rwsem);
	lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);

	atomic_set(&inode->i_dio_count, 0);

	mapping->a_ops = &empty_aops;
	mapping->host = inode;
	mapping->flags = 0;
	if (sb->s_type->fs_flags & FS_THP_SUPPORT)
		__set_bit(AS_THP_SUPPORT, &mapping->flags);
	mapping->wb_err = 0;
	atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
	atomic_set(&mapping->nr_thps, 0);
#endif
	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
	mapping->private_data = NULL;
	mapping->writeback_index = 0;
	init_rwsem(&mapping->invalidate_lock);
	lockdep_set_class_and_name(&mapping->invalidate_lock,
				   &sb->s_type->invalidate_lock_key,
				   "mapping.invalidate_lock");
	inode->i_private = NULL;
	inode->i_mapping = mapping;
	INIT_HLIST_HEAD(&inode->i_dentry);	/* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif

#ifdef CONFIG_FSNOTIFY
	inode->i_fsnotify_mask = 0;
#endif
	inode->i_flctx = NULL;
	this_cpu_inc(nr_inodes);

	return 0;
out:
	return -ENOMEM;
}
EXPORT_SYMBOL(inode_init_always);

void free_inode_nonrcu(struct inode *inode)
{
	kmem_cache_free(inode_cachep, inode);
}
EXPORT_SYMBOL(free_inode_nonrcu);

static void i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	if (inode->free_inode)
		inode->free_inode(inode);
	else
		free_inode_nonrcu(inode);
}

static struct inode *alloc_inode(struct super_block *sb)
{
	const struct super_operations *ops = sb->s_op;
	struct inode *inode;

	if (ops->alloc_inode)
		inode = ops->alloc_inode(sb);
	else
		inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);

	if (!inode)
		return NULL;

	if (unlikely(inode_init_always(sb, inode))) {
		if (ops->destroy_inode) {
			ops->destroy_inode(inode);
			if (!ops->free_inode)
				return NULL;
		}
		inode->free_inode = ops->free_inode;
		i_callback(&inode->i_rcu);
		return NULL;
	}

	return inode;
}
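
/*
 * Illustrative sketch (not part of this file): a filesystem typically pairs
 * ->alloc_inode() with ->free_inode() in its super_operations, embedding the
 * VFS inode in its own in-core inode.  The "foo_inode" and "foo_inode_cachep"
 * names below are hypothetical; alloc_inode() above then calls
 * inode_init_always() on whatever ->alloc_inode() returned.
 *
 *	struct foo_inode {
 *		unsigned long	private_state;
 *		struct inode	vfs_inode;
 *	};
 *
 *	static struct inode *foo_alloc_inode(struct super_block *sb)
 *	{
 *		struct foo_inode *fi;
 *
 *		fi = kmem_cache_alloc(foo_inode_cachep, GFP_KERNEL);
 *		if (!fi)
 *			return NULL;
 *		fi->private_state = 0;
 *		return &fi->vfs_inode;
 *	}
 *
 *	static void foo_free_inode(struct inode *inode)
 *	{
 *		kmem_cache_free(foo_inode_cachep,
 *				container_of(inode, struct foo_inode, vfs_inode));
 *	}
 */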
void __destroy_inode(struct inode *inode)
{
	BUG_ON(inode_has_buffers(inode));
	inode_detach_wb(inode);
	security_inode_free(inode);
	fsnotify_inode_delete(inode);
	locks_free_lock_context(inode);
	if (!inode->i_nlink) {
		WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
		atomic_long_dec(&inode->i_sb->s_remove_count);
	}

#ifdef CONFIG_FS_POSIX_ACL
	if (inode->i_acl && !is_uncached_acl(inode->i_acl))
		posix_acl_release(inode->i_acl);
	if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
		posix_acl_release(inode->i_default_acl);
#endif
	this_cpu_dec(nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);

static void destroy_inode(struct inode *inode)
{
	const struct super_operations *ops = inode->i_sb->s_op;

	BUG_ON(!list_empty(&inode->i_lru));
	__destroy_inode(inode);
	if (ops->destroy_inode) {
		ops->destroy_inode(inode);
		if (!ops->free_inode)
			return;
	}
	inode->free_inode = ops->free_inode;
	call_rcu(&inode->i_rcu, i_callback);
}

/**
 * drop_nlink - directly drop an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  In cases
 * where we are attempting to track writes to the
 * filesystem, a decrement to zero means an imminent
 * write when the file is truncated and actually unlinked
 * on the filesystem.
 */
void drop_nlink(struct inode *inode)
{
	WARN_ON(inode->i_nlink == 0);
	inode->__i_nlink--;
	if (!inode->i_nlink)
		atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(drop_nlink);

/**
 * clear_nlink - directly zero an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  See
 * drop_nlink() for why we care about i_nlink hitting zero.
 */
void clear_nlink(struct inode *inode)
{
	if (inode->i_nlink) {
		inode->__i_nlink = 0;
		atomic_long_inc(&inode->i_sb->s_remove_count);
	}
}
EXPORT_SYMBOL(clear_nlink);

/**
 * set_nlink - directly set an inode's link count
 * @inode: inode
 * @nlink: new nlink (should be non-zero)
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.
 */
void set_nlink(struct inode *inode, unsigned int nlink)
{
	if (!nlink) {
		clear_nlink(inode);
	} else {
		/* Yes, some filesystems do change nlink from zero to one */
		if (inode->i_nlink == 0)
			atomic_long_dec(&inode->i_sb->s_remove_count);

		inode->__i_nlink = nlink;
	}
}
EXPORT_SYMBOL(set_nlink);

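/*
 * Illustrative sketch (not part of this file): instead of touching i_nlink
 * directly, filesystem code is expected to go through the helpers above.  A
 * hypothetical unlink path (all "foo_*" names are made up) might look like:
 *
 *	static int foo_unlink(struct inode *dir, struct dentry *dentry)
 *	{
 *		struct inode *inode = d_inode(dentry);
 *
 *		inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
 *		drop_nlink(inode);		// may push i_nlink to zero
 *		mark_inode_dirty(inode);
 *		return 0;
 *	}
 *
 * A hard-link path would call inc_nlink() (defined below) on the target
 * before instantiating the new dentry, while set_nlink() is typically used
 * when reading an existing inode from disk.
 */
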
/**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  Currently,
 * it is only here for parity with dec_nlink().
 */
void inc_nlink(struct inode *inode)
{
	if (unlikely(inode->i_nlink == 0)) {
		WARN_ON(!(inode->i_state & I_LINKABLE));
		atomic_long_dec(&inode->i_sb->s_remove_count);
	}

	inode->__i_nlink++;
}
EXPORT_SYMBOL(inc_nlink);

static void __address_space_init_once(struct address_space *mapping)
{
	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
	init_rwsem(&mapping->i_mmap_rwsem);
	INIT_LIST_HEAD(&mapping->private_list);
	spin_lock_init(&mapping->private_lock);
	mapping->i_mmap = RB_ROOT_CACHED;
}

void address_space_init_once(struct address_space *mapping)
{
	memset(mapping, 0, sizeof(*mapping));
	__address_space_init_once(mapping);
}
EXPORT_SYMBOL(address_space_init_once);

/*
 * These are initializations that only need to be done
 * once, because the fields are idempotent across use
 * of the inode, so let the slab be aware of that.
 */
void inode_init_once(struct inode *inode)
{
	memset(inode, 0, sizeof(*inode));
	INIT_HLIST_NODE(&inode->i_hash);
	INIT_LIST_HEAD(&inode->i_devices);
	INIT_LIST_HEAD(&inode->i_io_list);
	INIT_LIST_HEAD(&inode->i_wb_list);
	INIT_LIST_HEAD(&inode->i_lru);
	__address_space_init_once(&inode->i_data);
	i_size_ordered_init(inode);
}
EXPORT_SYMBOL(inode_init_once);

static void init_once(void *foo)
{
	struct inode *inode = (struct inode *) foo;

	inode_init_once(inode);
}

/*
 * inode->i_lock must be held
 */
void __iget(struct inode *inode)
{
	atomic_inc(&inode->i_count);
}

/*
 * get additional reference to inode; caller must already hold one.
 */
void ihold(struct inode *inode)
{
	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}
EXPORT_SYMBOL(ihold);

static void __inode_add_lru(struct inode *inode, bool rotate)
{
	if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
		return;
	if (atomic_read(&inode->i_count))
		return;
	if (!(inode->i_sb->s_flags & SB_ACTIVE))
		return;
	if (!mapping_shrinkable(&inode->i_data))
		return;

	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
		this_cpu_inc(nr_unused);
	else if (rotate)
		inode->i_state |= I_REFERENCED;
}

/*
 * Add inode to LRU if needed (inode is unused and clean).
 *
 * Needs inode->i_lock held.
 */
void inode_add_lru(struct inode *inode)
{
	__inode_add_lru(inode, false);
}

static void inode_lru_list_del(struct inode *inode)
{
	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
		this_cpu_dec(nr_unused);
}

/**
 * inode_sb_list_add - add inode to the superblock list of inodes
 * @inode: inode to add
 */
void inode_sb_list_add(struct inode *inode)
{
	spin_lock(&inode->i_sb->s_inode_list_lock);
	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
	spin_unlock(&inode->i_sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);

static inline void inode_sb_list_del(struct inode *inode)
{
	if (!list_empty(&inode->i_sb_list)) {
		spin_lock(&inode->i_sb->s_inode_list_lock);
		list_del_init(&inode->i_sb_list);
		spin_unlock(&inode->i_sb->s_inode_list_lock);
	}
}

static unsigned long hash(struct super_block *sb, unsigned long hashval)
{
	unsigned long tmp;

	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
			L1_CACHE_BYTES;
	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
	return tmp & i_hash_mask;
}

/**
 * __insert_inode_hash - hash an inode
 * @inode: unhashed inode
 * @hashval: unsigned long value used to locate this object in the
 *		inode_hashtable.
 *
 * Add an inode to the inode hash for this superblock.
 */
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);

	spin_lock(&inode_hash_lock);
	spin_lock(&inode->i_lock);
	hlist_add_head_rcu(&inode->i_hash, b);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__insert_inode_hash);

/**
 * __remove_inode_hash - remove an inode from the hash
 * @inode: inode to unhash
 *
 * Remove an inode from the inode hash of its superblock.
 */
void __remove_inode_hash(struct inode *inode)
{
	spin_lock(&inode_hash_lock);
	spin_lock(&inode->i_lock);
	hlist_del_init_rcu(&inode->i_hash);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__remove_inode_hash);

void clear_inode(struct inode *inode)
{
	/*
	 * We have to cycle the i_pages lock here because reclaim can be in the
	 * process of removing the last page (in __delete_from_page_cache())
	 * and we must not free the mapping under it.
	 */
	xa_lock_irq(&inode->i_data.i_pages);
	BUG_ON(inode->i_data.nrpages);
	/*
	 * Almost always, mapping_empty(&inode->i_data) here; but there are
	 * two known and long-standing ways in which nodes may get left behind
	 * (when deep radix-tree node allocation failed partway; or when THP
	 * collapse_file() failed).  Until those two known cases are cleaned up,
	 * or a cleanup function is called here, do not BUG_ON(!mapping_empty),
	 * nor even WARN_ON(!mapping_empty).
	 */
	xa_unlock_irq(&inode->i_data.i_pages);
	BUG_ON(!list_empty(&inode->i_data.private_list));
	BUG_ON(!(inode->i_state & I_FREEING));
	BUG_ON(inode->i_state & I_CLEAR);
	BUG_ON(!list_empty(&inode->i_wb_list));
	/* don't need i_lock here, no concurrent mods to i_state */
	inode->i_state = I_FREEING | I_CLEAR;
}
EXPORT_SYMBOL(clear_inode);

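/*
 * Illustrative sketch (not part of this file): a filesystem that provides its
 * own ->evict_inode() is expected to end up calling clear_inode() itself,
 * mirroring what evict() below does in the default case.  The "foo_*" names
 * are hypothetical:
 *
 *	static void foo_evict_inode(struct inode *inode)
 *	{
 *		truncate_inode_pages_final(&inode->i_data);
 *		clear_inode(inode);
 *		if (!inode->i_nlink)
 *			foo_free_on_disk_inode(inode);	// fs-specific cleanup
 *	}
 */
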
/*
 * Free the inode passed in, removing it from the lists it is still connected
 * to.  We remove any pages still attached to the inode and wait for any IO that
 * is still in progress before finally destroying the inode.
 *
 * An inode must already be marked I_FREEING so that we avoid the inode being
 * moved back onto lists if we race with other code that manipulates the lists
 * (e.g. writeback_single_inode).  The caller is responsible for setting this.
 *
 * An inode must already be removed from the LRU list before being evicted from
 * the cache.  This should occur atomically with setting the I_FREEING state
 * flag, so no inodes here should ever be on the LRU when being evicted.
 */
static void evict(struct inode *inode)
{
	const struct super_operations *op = inode->i_sb->s_op;

	BUG_ON(!(inode->i_state & I_FREEING));
	BUG_ON(!list_empty(&inode->i_lru));

	if (!list_empty(&inode->i_io_list))
		inode_io_list_del(inode);

	inode_sb_list_del(inode);

	/*
	 * Wait for flusher thread to be done with the inode so that filesystem
	 * does not start destroying it while writeback is still running.  Since
	 * the inode has I_FREEING set, flusher thread won't start new work on
	 * the inode.  We just have to wait for running writeback to finish.
	 */
	inode_wait_for_writeback(inode);

	if (op->evict_inode) {
		op->evict_inode(inode);
	} else {
		truncate_inode_pages_final(&inode->i_data);
		clear_inode(inode);
	}
	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
		cd_forget(inode);

	remove_inode_hash(inode);

	spin_lock(&inode->i_lock);
	wake_up_bit(&inode->i_state, __I_NEW);
	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
	spin_unlock(&inode->i_lock);

	destroy_inode(inode);
}

/*
 * dispose_list - dispose of the contents of a local list
 * @head: the head of the list to free
 *
 * Dispose-list gets a local list with local inodes in it, so it doesn't
 * need to worry about list corruption and SMP locks.
 */
static void dispose_list(struct list_head *head)
{
	while (!list_empty(head)) {
		struct inode *inode;

		inode = list_first_entry(head, struct inode, i_lru);
		list_del_init(&inode->i_lru);

		evict(inode);
		cond_resched();
	}
}

/**
 * evict_inodes - evict all evictable inodes for a superblock
 * @sb: superblock to operate on
 *
 * Make sure that no inodes with zero refcount are retained.  This is
 * called by superblock shutdown after having SB_ACTIVE flag removed,
 * so any inode reaching zero refcount during or after that call will
 * be immediately evicted.
 */
void evict_inodes(struct super_block *sb)
{
	struct inode *inode, *next;
	LIST_HEAD(dispose);

again:
	spin_lock(&sb->s_inode_list_lock);
	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
		if (atomic_read(&inode->i_count))
			continue;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			continue;
		}

		inode->i_state |= I_FREEING;
		inode_lru_list_del(inode);
		spin_unlock(&inode->i_lock);
		list_add(&inode->i_lru, &dispose);

		/*
		 * We can have a ton of inodes to evict at unmount time given
		 * enough memory, check to see if we need to go to sleep for a
		 * bit so we don't livelock.
		 */
		if (need_resched()) {
			spin_unlock(&sb->s_inode_list_lock);
			cond_resched();
			dispose_list(&dispose);
			goto again;
		}
	}
	spin_unlock(&sb->s_inode_list_lock);

	dispose_list(&dispose);
}
EXPORT_SYMBOL_GPL(evict_inodes);

/**
 * invalidate_inodes - attempt to free all inodes on a superblock
 * @sb: superblock to operate on
 * @kill_dirty: flag to guide handling of dirty inodes
 *
 * Attempts to free all inodes for a given superblock.  If there were any
 * busy inodes return a non-zero value, else zero.
 * If @kill_dirty is set, discard dirty inodes too, otherwise treat
 * them as busy.
 */
int invalidate_inodes(struct super_block *sb, bool kill_dirty)
{
	int busy = 0;
	struct inode *inode, *next;
	LIST_HEAD(dispose);

again:
	spin_lock(&sb->s_inode_list_lock);
	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
			spin_unlock(&inode->i_lock);
			busy = 1;
			continue;
		}
		if (atomic_read(&inode->i_count)) {
			spin_unlock(&inode->i_lock);
			busy = 1;
			continue;
		}

		inode->i_state |= I_FREEING;
		inode_lru_list_del(inode);
		spin_unlock(&inode->i_lock);
		list_add(&inode->i_lru, &dispose);
		if (need_resched()) {
			spin_unlock(&sb->s_inode_list_lock);
			cond_resched();
			dispose_list(&dispose);
			goto again;
		}
	}
	spin_unlock(&sb->s_inode_list_lock);

	dispose_list(&dispose);

	return busy;
}

/*
 * Isolate the inode from the LRU in preparation for freeing it.
 *
 * If the inode has the I_REFERENCED flag set, then it means that it has been
 * used recently - the flag is set in iput_final().  When we encounter such an
 * inode, clear the flag and move it to the back of the LRU so it gets another
 * pass through the LRU before it gets reclaimed.  This is necessary because of
 * the fact we are doing lazy LRU updates to minimise lock contention so the
 * LRU does not have strict ordering.  Hence we don't want to reclaim inodes
 * with this flag set because they are the inodes that are out of order.
 */
static enum lru_status inode_lru_isolate(struct list_head *item,
		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
	struct list_head *freeable = arg;
	struct inode *inode = container_of(item, struct inode, i_lru);

	/*
	 * We are inverting the lru lock/inode->i_lock here, so use a
	 * trylock.  If we fail to get the lock, just skip it.
	 */
	if (!spin_trylock(&inode->i_lock))
		return LRU_SKIP;

	/*
	 * Inodes can get referenced, redirtied, or repopulated while
	 * they're already on the LRU, and this can make them
	 * unreclaimable for a while.  Remove them lazily here; iput,
	 * sync, or the last page cache deletion will requeue them.
	 */
	if (atomic_read(&inode->i_count) ||
	    (inode->i_state & ~I_REFERENCED) ||
	    !mapping_shrinkable(&inode->i_data)) {
		list_lru_isolate(lru, &inode->i_lru);
		spin_unlock(&inode->i_lock);
		this_cpu_dec(nr_unused);
		return LRU_REMOVED;
	}

	/* Recently referenced inodes get one more pass */
	if (inode->i_state & I_REFERENCED) {
		inode->i_state &= ~I_REFERENCED;
		spin_unlock(&inode->i_lock);
		return LRU_ROTATE;
	}

	/*
	 * On highmem systems, mapping_shrinkable() permits dropping
	 * page cache in order to free up struct inodes: lowmem might
	 * be under pressure before the cache inside the highmem zone.
	 */
	if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(lru_lock);
		if (remove_inode_buffers(inode)) {
			unsigned long reap;
			reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
			if (current_is_kswapd())
				__count_vm_events(KSWAPD_INODESTEAL, reap);
			else
				__count_vm_events(PGINODESTEAL, reap);
			if (current->reclaim_state)
				current->reclaim_state->reclaimed_slab += reap;
		}
		iput(inode);
		spin_lock(lru_lock);
		return LRU_RETRY;
	}

	WARN_ON(inode->i_state & I_NEW);
	inode->i_state |= I_FREEING;
	list_lru_isolate_move(lru, &inode->i_lru, freeable);
	spin_unlock(&inode->i_lock);

	this_cpu_dec(nr_unused);
	return LRU_REMOVED;
}

/*
 * Walk the superblock inode LRU for freeable inodes and attempt to free them.
 * This is called from the superblock shrinker function with a number of inodes
 * to trim from the LRU.  Inodes to be freed are moved to a temporary list and
 * then are freed outside inode_lock by dispose_list().
 */
long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
	LIST_HEAD(freeable);
	long freed;

	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
				     inode_lru_isolate, &freeable);
	dispose_list(&freeable);
	return freed;
}

static void __wait_on_freeing_inode(struct inode *inode);
/*
 * Called with the inode lock held.
 */
static struct inode *find_inode(struct super_block *sb,
				struct hlist_head *head,
				int (*test)(struct inode *, void *),
				void *data)
{
	struct inode *inode = NULL;

repeat:
	hlist_for_each_entry(inode, head, i_hash) {
		if (inode->i_sb != sb)
			continue;
		if (!test(inode, data))
			continue;
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
			__wait_on_freeing_inode(inode);
			goto repeat;
		}
		if (unlikely(inode->i_state & I_CREATING)) {
			spin_unlock(&inode->i_lock);
			return ERR_PTR(-ESTALE);
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		return inode;
	}
	return NULL;
}

/*
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 */
static struct inode *find_inode_fast(struct super_block *sb,
				struct hlist_head *head, unsigned long ino)
{
	struct inode *inode = NULL;

repeat:
	hlist_for_each_entry(inode, head, i_hash) {
		if (inode->i_ino != ino)
			continue;
		if (inode->i_sb != sb)
			continue;
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
			__wait_on_freeing_inode(inode);
			goto repeat;
		}
		if (unlikely(inode->i_state & I_CREATING)) {
			spin_unlock(&inode->i_lock);
			return ERR_PTR(-ESTALE);
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		return inode;
	}
	return NULL;
}

/*
 * Each cpu owns a range of LAST_INO_BATCH numbers.
 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
 * to renew the exhausted range.
 *
 * This does not significantly increase overflow rate because every CPU can
 * consume at most LAST_INO_BATCH-1 unused inode numbers.  So there is
 * NR_CPUS*(LAST_INO_BATCH-1) wastage.  At 4096 and 1024, this is ~0.1% of the
 * 2^32 range, and is a worst-case.  Even a 50% wastage would only increase
 * overflow rate by 2x, which does not seem too significant.
 *
 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
 * error if st_ino won't fit in target struct field.  Use 32bit counter
 * here to attempt to avoid that.
 */
#define LAST_INO_BATCH 1024
static DEFINE_PER_CPU(unsigned int, last_ino);

unsigned int get_next_ino(void)
{
	unsigned int *p = &get_cpu_var(last_ino);
	unsigned int res = *p;

#ifdef CONFIG_SMP
	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
		static atomic_t shared_last_ino;
		int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);

		res = next - LAST_INO_BATCH;
	}
#endif

	res++;
	/* get_next_ino should not provide a 0 inode number */
	if (unlikely(!res))
		res++;
	*p = res;
	put_cpu_var(last_ino);
	return res;
}
EXPORT_SYMBOL(get_next_ino);

/**
 * new_inode_pseudo - obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for given superblock.
 * The inode won't be chained into the superblock's s_inodes list.
 * This means:
 * - fs can't be unmounted
 * - quotas, fsnotify, writeback can't work
 */
struct inode *new_inode_pseudo(struct super_block *sb)
{
	struct inode *inode = alloc_inode(sb);

	if (inode) {
		spin_lock(&inode->i_lock);
		inode->i_state = 0;
		spin_unlock(&inode->i_lock);
		INIT_LIST_HEAD(&inode->i_sb_list);
	}
	return inode;
}

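/*
 * Illustrative sketch (not part of this file): pseudo filesystems and
 * in-kernel users that never write their inodes back typically combine
 * new_inode_pseudo() (or new_inode()) with get_next_ino() above, since they
 * have no on-disk inode numbers of their own.  The "foo_*" names are made up:
 *
 *	static struct inode *foo_make_internal_inode(struct super_block *sb)
 *	{
 *		struct inode *inode = new_inode_pseudo(sb);
 *
 *		if (!inode)
 *			return ERR_PTR(-ENOMEM);
 *		inode->i_ino = get_next_ino();
 *		inode->i_mode = S_IFREG | 0600;
 *		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 *		return inode;
 *	}
 */
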
/**
 * new_inode - obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for given superblock.  The default gfp_mask
 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
 * If HIGHMEM pages are unsuitable or it is known that pages allocated
 * for the page cache are not reclaimable or migratable,
 * mapping_set_gfp_mask() must be called with suitable flags on the
 * newly created inode's mapping.
 *
 */
struct inode *new_inode(struct super_block *sb)
{
	struct inode *inode;

	spin_lock_prefetch(&sb->s_inode_list_lock);

	inode = new_inode_pseudo(sb);
	if (inode)
		inode_sb_list_add(inode);
	return inode;
}
EXPORT_SYMBOL(new_inode);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
void lockdep_annotate_inode_mutex_key(struct inode *inode)
{
	if (S_ISDIR(inode->i_mode)) {
		struct file_system_type *type = inode->i_sb->s_type;

		/* Set new key only if filesystem hasn't already changed it */
		if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
			/*
			 * ensure nobody is actually holding i_mutex
			 */
			// mutex_destroy(&inode->i_mutex);
			init_rwsem(&inode->i_rwsem);
			lockdep_set_class(&inode->i_rwsem,
					  &type->i_mutex_dir_key);
		}
	}
}
EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
#endif

/**
 * unlock_new_inode - clear the I_NEW state and wake up any waiters
 * @inode: new inode to unlock
 *
 * Called when the inode is fully initialised to clear the new state of the
 * inode and wake up anyone waiting for the inode to finish initialisation.
 */
void unlock_new_inode(struct inode *inode)
{
	lockdep_annotate_inode_mutex_key(inode);
	spin_lock(&inode->i_lock);
	WARN_ON(!(inode->i_state & I_NEW));
	inode->i_state &= ~I_NEW & ~I_CREATING;
	smp_mb();
	wake_up_bit(&inode->i_state, __I_NEW);
	spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(unlock_new_inode);

void discard_new_inode(struct inode *inode)
{
	lockdep_annotate_inode_mutex_key(inode);
	spin_lock(&inode->i_lock);
	WARN_ON(!(inode->i_state & I_NEW));
	inode->i_state &= ~I_NEW;
	smp_mb();
	wake_up_bit(&inode->i_state, __I_NEW);
	spin_unlock(&inode->i_lock);
	iput(inode);
}
EXPORT_SYMBOL(discard_new_inode);

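/*
 * Illustrative sketch (not part of this file): a simple in-memory filesystem
 * usually builds its create/mknod path on new_inode() above.  The "foo_*"
 * names and operations structures are hypothetical:
 *
 *	static struct inode *foo_get_inode(struct super_block *sb,
 *					   const struct inode *dir, umode_t mode)
 *	{
 *		struct inode *inode = new_inode(sb);
 *
 *		if (!inode)
 *			return NULL;
 *		inode->i_ino = get_next_ino();
 *		inode_init_owner(&init_user_ns, inode, dir, mode);
 *		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 *		inode->i_op = &foo_file_inode_operations;
 *		inode->i_fop = &foo_file_operations;
 *		return inode;
 *	}
 *
 * On-disk filesystems that hash a freshly allocated inode themselves must
 * instead publish it with unlock_new_inode() once it is fully set up, or drop
 * it with discard_new_inode() on failure, so that waiters blocked on I_NEW
 * make progress.
 */
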
/**
 * lock_two_nondirectories - take two i_mutexes on non-directory objects
 *
 * Lock any non-NULL argument that is not a directory.
 * Zero, one or two objects may be locked by this function.
 *
 * @inode1: first inode to lock
 * @inode2: second inode to lock
 */
void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
	if (inode1 > inode2)
		swap(inode1, inode2);

	if (inode1 && !S_ISDIR(inode1->i_mode))
		inode_lock(inode1);
	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
		inode_lock_nested(inode2, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);

/**
 * unlock_two_nondirectories - release locks from lock_two_nondirectories()
 * @inode1: first inode to unlock
 * @inode2: second inode to unlock
 */
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
	if (inode1 && !S_ISDIR(inode1->i_mode))
		inode_unlock(inode1);
	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
		inode_unlock(inode2);
}
EXPORT_SYMBOL(unlock_two_nondirectories);

/**
 * inode_insert5 - obtain an inode from a mounted file system
 * @inode:	pre-allocated inode to use for insert to cache
 * @hashval:	hash value (usually inode number) to get
 * @test:	callback used for comparisons between inodes
 * @set:	callback used to initialize a new struct inode
 * @data:	opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count.  This is a
 * variant of iget5_locked() for callers that don't want to fail on memory
 * allocation of inode.
 *
 * If the inode is not in cache, insert the pre-allocated inode to cache and
 * return it locked, hashed, and with the I_NEW flag set.  The file system gets
 * to fill it in before unlocking it via unlock_new_inode().
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
 */
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
			    int (*test)(struct inode *, void *),
			    int (*set)(struct inode *, void *), void *data)
{
	struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
	struct inode *old;
	bool creating = inode->i_state & I_CREATING;

again:
	spin_lock(&inode_hash_lock);
	old = find_inode(inode->i_sb, head, test, data);
	if (unlikely(old)) {
		/*
		 * Uhhuh, somebody else created the same inode under us.
		 * Use the old inode instead of the preallocated one.
		 */
		spin_unlock(&inode_hash_lock);
		if (IS_ERR(old))
			return NULL;
		wait_on_inode(old);
		if (unlikely(inode_unhashed(old))) {
			iput(old);
			goto again;
		}
		return old;
	}

	if (set && unlikely(set(inode, data))) {
		inode = NULL;
		goto unlock;
	}

	/*
	 * Return the locked inode with I_NEW set, the
	 * caller is responsible for filling in the contents
	 */
	spin_lock(&inode->i_lock);
	inode->i_state |= I_NEW;
	hlist_add_head_rcu(&inode->i_hash, head);
	spin_unlock(&inode->i_lock);
	if (!creating)
		inode_sb_list_add(inode);
unlock:
	spin_unlock(&inode_hash_lock);

	return inode;
}
EXPORT_SYMBOL(inode_insert5);

/**
 * iget5_locked - obtain an inode from a mounted file system
 * @sb:		super block of file system
 * @hashval:	hash value (usually inode number) to get
 * @test:	callback used for comparisons between inodes
 * @set:	callback used to initialize a new struct inode
 * @data:	opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count.  This is a
 * generalized version of iget_locked() for file systems where the inode
 * number is not sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set.  The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
 */
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *),
		int (*set)(struct inode *, void *), void *data)
{
	struct inode *inode = ilookup5(sb, hashval, test, data);

	if (!inode) {
		struct inode *new = alloc_inode(sb);

		if (new) {
			new->i_state = 0;
			inode = inode_insert5(new, hashval, test, set, data);
			if (unlikely(inode != new))
				destroy_inode(new);
		}
	}
	return inode;
}
EXPORT_SYMBOL(iget5_locked);

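/*
 * Illustrative sketch (not part of this file): a filesystem whose inodes are
 * not uniquely identified by i_ino alone (for example one that also needs an
 * object id) supplies @test/@set callbacks to iget5_locked().  The "foo_*"
 * names and the foo_iget_args structure are hypothetical:
 *
 *	struct foo_iget_args {
 *		u64 objectid;
 *	};
 *
 *	static int foo_iget_test(struct inode *inode, void *opaque)
 *	{
 *		struct foo_iget_args *args = opaque;
 *
 *		return foo_objectid(inode) == args->objectid;
 *	}
 *
 *	static int foo_iget_set(struct inode *inode, void *opaque)
 *	{
 *		struct foo_iget_args *args = opaque;
 *
 *		foo_set_objectid(inode, args->objectid);
 *		return 0;
 *	}
 *
 *	struct inode *foo_iget(struct super_block *sb, u64 objectid)
 *	{
 *		struct foo_iget_args args = { .objectid = objectid };
 *		struct inode *inode;
 *
 *		inode = iget5_locked(sb, objectid, foo_iget_test,
 *				     foo_iget_set, &args);
 *		if (!inode)
 *			return ERR_PTR(-ENOMEM);
 *		if (inode->i_state & I_NEW) {
 *			// read the inode from disk here ...
 *			unlock_new_inode(inode);
 *		}
 *		return inode;
 *	}
 */
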
/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb:		super block of file system
 * @ino:	inode number to get
 *
 * Search for the inode specified by @ino in the inode cache and if present
 * return it with an increased reference count.  This is for file systems
 * where the inode number is sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set.  The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
	struct hlist_head *head = inode_hashtable + hash(sb, ino);
	struct inode *inode;
again:
	spin_lock(&inode_hash_lock);
	inode = find_inode_fast(sb, head, ino);
	spin_unlock(&inode_hash_lock);
	if (inode) {
		if (IS_ERR(inode))
			return NULL;
		wait_on_inode(inode);
		if (unlikely(inode_unhashed(inode))) {
			iput(inode);
			goto again;
		}
		return inode;
	}

	inode = alloc_inode(sb);
	if (inode) {
		struct inode *old;

		spin_lock(&inode_hash_lock);
		/* We released the lock, so.. */
		old = find_inode_fast(sb, head, ino);
		if (!old) {
			inode->i_ino = ino;
			spin_lock(&inode->i_lock);
			inode->i_state = I_NEW;
			hlist_add_head_rcu(&inode->i_hash, head);
			spin_unlock(&inode->i_lock);
			inode_sb_list_add(inode);
			spin_unlock(&inode_hash_lock);

			/* Return the locked inode with I_NEW set, the
			 * caller is responsible for filling in the contents
			 */
			return inode;
		}

		/*
		 * Uhhuh, somebody else created the same inode under
		 * us.  Use the old inode instead of the one we just
		 * allocated.
		 */
		spin_unlock(&inode_hash_lock);
		destroy_inode(inode);
		if (IS_ERR(old))
			return NULL;
		inode = old;
		wait_on_inode(inode);
		if (unlikely(inode_unhashed(inode))) {
			iput(inode);
			goto again;
		}
	}
	return inode;
}
EXPORT_SYMBOL(iget_locked);

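/*
 * Illustrative sketch (not part of this file): the classic lookup pattern
 * built on iget_locked() above, for filesystems whose inode number alone
 * identifies an inode.  The "foo_*" names are hypothetical:
 *
 *	struct inode *foo_iget(struct super_block *sb, unsigned long ino)
 *	{
 *		struct inode *inode;
 *		int err;
 *
 *		inode = iget_locked(sb, ino);
 *		if (!inode)
 *			return ERR_PTR(-ENOMEM);
 *		if (!(inode->i_state & I_NEW))
 *			return inode;		// cached and fully set up
 *
 *		err = foo_read_inode_from_disk(inode);
 *		if (err) {
 *			iget_failed(inode);
 *			return ERR_PTR(err);
 *		}
 *		unlock_new_inode(inode);
 *		return inode;
 *	}
 */
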
/*
 * Search the inode cache for a matching inode number.
 * If we find one, then the inode number we are trying to
 * allocate is not unique and so we should not use it.
 *
 * Returns 1 if the inode number is unique, 0 if it is not.
 */
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
{
	struct hlist_head *b = inode_hashtable + hash(sb, ino);
	struct inode *inode;

	hlist_for_each_entry_rcu(inode, b, i_hash) {
		if (inode->i_ino == ino && inode->i_sb == sb)
			return 0;
	}
	return 1;
}

/**
 * iunique - get a unique inode number
 * @sb: superblock
 * @max_reserved: highest reserved inode number
 *
 * Obtain an inode number that is unique on the system for a given
 * superblock.  This is used by file systems that have no natural
 * permanent inode numbering system.  An inode number is returned that
 * is higher than the reserved limit but unique.
 *
 * BUGS:
 * With a large number of inodes live on the file system this function
 * currently becomes quite slow.
 */
ino_t iunique(struct super_block *sb, ino_t max_reserved)
{
	/*
	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
	 * error if st_ino won't fit in target struct field.  Use 32bit counter
	 * here to attempt to avoid that.
	 */
	static DEFINE_SPINLOCK(iunique_lock);
	static unsigned int counter;
	ino_t res;

	rcu_read_lock();
	spin_lock(&iunique_lock);
	do {
		if (counter <= max_reserved)
			counter = max_reserved + 1;
		res = counter++;
	} while (!test_inode_iunique(sb, res));
	spin_unlock(&iunique_lock);
	rcu_read_unlock();

	return res;
}
EXPORT_SYMBOL(iunique);

struct inode *igrab(struct inode *inode)
{
	spin_lock(&inode->i_lock);
	if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
		__iget(inode);
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		/*
		 * Handle the case where s_op->clear_inode has not been
		 * called yet, and somebody is calling igrab
		 * while the inode is getting freed.
		 */
		inode = NULL;
	}
	return inode;
}
EXPORT_SYMBOL(igrab);

/**
 * ilookup5_nowait - search for an inode in the inode cache
 * @sb:		super block of file system to search
 * @hashval:	hash value (usually inode number) to search for
 * @test:	callback used for comparisons between inodes
 * @data:	opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache.
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Note: I_NEW is not waited upon so you have to be very careful what you do
 * with the returned inode.  You probably should be using ilookup5() instead.
 *
 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *), void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
	struct inode *inode;

	spin_lock(&inode_hash_lock);
	inode = find_inode(sb, head, test, data);
	spin_unlock(&inode_hash_lock);

	return IS_ERR(inode) ? NULL : inode;
}
EXPORT_SYMBOL(ilookup5_nowait);

/**
 * ilookup5 - search for an inode in the inode cache
 * @sb:		super block of file system to search
 * @hashval:	hash value (usually inode number) to search for
 * @test:	callback used for comparisons between inodes
 * @data:	opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if the inode is in the cache, return the inode with an incremented
 * reference count.  Waits on I_NEW before returning the inode.
 *
 * This is a generalized version of ilookup() for file systems where the
 * inode number is not sufficient for unique identification of an inode.
 *
 * Note: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *), void *data)
{
	struct inode *inode;
again:
	inode = ilookup5_nowait(sb, hashval, test, data);
	if (inode) {
		wait_on_inode(inode);
		if (unlikely(inode_unhashed(inode))) {
			iput(inode);
			goto again;
		}
	}
	return inode;
}
EXPORT_SYMBOL(ilookup5);

/**
 * ilookup - search for an inode in the inode cache
 * @sb:		super block of file system to search
 * @ino:	inode number to search for
 *
 * Search for the inode @ino in the inode cache, and if the inode is in the
 * cache, the inode is returned with an incremented reference count.
 */
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
	struct hlist_head *head = inode_hashtable + hash(sb, ino);
	struct inode *inode;
again:
	spin_lock(&inode_hash_lock);
	inode = find_inode_fast(sb, head, ino);
	spin_unlock(&inode_hash_lock);

	if (inode) {
		if (IS_ERR(inode))
			return NULL;
		wait_on_inode(inode);
		if (unlikely(inode_unhashed(inode))) {
			iput(inode);
			goto again;
		}
	}
	return inode;
}
EXPORT_SYMBOL(ilookup);

/**
 * find_inode_nowait - find an inode in the inode cache
 * @sb:		super block of file system to search
 * @hashval:	hash value (usually inode number) to search for
 * @match:	callback used for comparisons between inodes
 * @data:	opaque data pointer to pass to @match
 *
 * Search for the inode specified by @hashval and @data in the inode
 * cache, where the helper function @match will return 0 if the inode
 * does not match, 1 if the inode does match, and -1 if the search
 * should be stopped.  The @match function must be responsible for
 * taking the i_lock spin_lock and checking i_state for an inode being
 * freed or being initialized, and incrementing the reference count
 * before returning 1.  It also must not sleep, since it is called with
 * the inode_hash_lock spinlock held.
 *
 * This is an even more generalized version of ilookup5() when the
 * function must never block --- find_inode() can block in
 * __wait_on_freeing_inode() --- or when the caller cannot increment
 * the reference count because the resulting iput() might cause an
 * inode eviction.  The tradeoff is that the @match function must be
 * very carefully implemented.
 */
struct inode *find_inode_nowait(struct super_block *sb,
				unsigned long hashval,
				int (*match)(struct inode *, unsigned long,
					     void *),
				void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
	struct inode *inode, *ret_inode = NULL;
	int mval;

	spin_lock(&inode_hash_lock);
	hlist_for_each_entry(inode, head, i_hash) {
		if (inode->i_sb != sb)
			continue;
		mval = match(inode, hashval, data);
		if (mval == 0)
			continue;
		if (mval == 1)
			ret_inode = inode;
		goto out;
	}
out:
	spin_unlock(&inode_hash_lock);
	return ret_inode;
}
EXPORT_SYMBOL(find_inode_nowait);

/**
 * find_inode_rcu - find an inode in the inode cache
 * @sb:		Super block of file system to search
 * @hashval:	Key to hash
 * @test:	Function to test match on an inode
 * @data:	Data for test function
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * where the helper function @test will return 0 if the inode does not match
 * and 1 if it does.  The @test function must be responsible for taking the
 * i_lock spin_lock and checking i_state for an inode being freed or being
 * initialized.
 *
 * If successful, this will return the inode for which the @test function
 * returned 1 and NULL otherwise.
 *
 * The @test function is not permitted to take a ref on any inode presented.
 * It is also not permitted to sleep.
 *
 * The caller must hold the RCU read lock.
 */
struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
			     int (*test)(struct inode *, void *), void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
	struct inode *inode;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
			 "suspicious find_inode_rcu() usage");

	hlist_for_each_entry_rcu(inode, head, i_hash) {
		if (inode->i_sb == sb &&
		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) &&
		    test(inode, data))
			return inode;
	}
	return NULL;
}
EXPORT_SYMBOL(find_inode_rcu);

/**
 * find_inode_by_ino_rcu - Find an inode in the inode cache
 * @sb:		Super block of file system to search
 * @ino:	The inode number to match
 *
 * Search for the inode specified by @ino in the inode cache, returning the
 * inode if it is in the cache and is not being freed, and NULL otherwise.
 *
 * No reference is taken on the returned inode; it is only guaranteed to
 * remain valid for the duration of the RCU read-side critical section.
 *
 * The caller must hold the RCU read lock.
 */
struct inode *find_inode_by_ino_rcu(struct super_block *sb,
				    unsigned long ino)
{
	struct hlist_head *head = inode_hashtable + hash(sb, ino);
	struct inode *inode;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
			 "suspicious find_inode_by_ino_rcu() usage");

	hlist_for_each_entry_rcu(inode, head, i_hash) {
		if (inode->i_ino == ino &&
		    inode->i_sb == sb &&
		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)))
			return inode;
	}
	return NULL;
}
EXPORT_SYMBOL(find_inode_by_ino_rcu);

int insert_inode_locked(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	ino_t ino = inode->i_ino;
	struct hlist_head *head = inode_hashtable + hash(sb, ino);

	while (1) {
		struct inode *old = NULL;
		spin_lock(&inode_hash_lock);
		hlist_for_each_entry(old, head, i_hash) {
			if (old->i_ino != ino)
				continue;
			if (old->i_sb != sb)
				continue;
			spin_lock(&old->i_lock);
			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
				spin_unlock(&old->i_lock);
				continue;
			}
			break;
		}
		if (likely(!old)) {
			spin_lock(&inode->i_lock);
			inode->i_state |= I_NEW | I_CREATING;
			hlist_add_head_rcu(&inode->i_hash, head);
			spin_unlock(&inode->i_lock);
			spin_unlock(&inode_hash_lock);
			return 0;
		}
		if (unlikely(old->i_state & I_CREATING)) {
			spin_unlock(&old->i_lock);
			spin_unlock(&inode_hash_lock);
			return -EBUSY;
		}
		__iget(old);
		spin_unlock(&old->i_lock);
		spin_unlock(&inode_hash_lock);
		wait_on_inode(old);
		if (unlikely(!inode_unhashed(old))) {
			iput(old);
			return -EBUSY;
		}
		iput(old);
	}
}
EXPORT_SYMBOL(insert_inode_locked);

int insert_inode_locked4(struct inode *inode, unsigned long hashval,
		int (*test)(struct inode *, void *), void *data)
{
	struct inode *old;

	inode->i_state |= I_CREATING;
	old = inode_insert5(inode, hashval, test, NULL, data);

	if (old != inode) {
		iput(old);
		return -EBUSY;
	}
	return 0;
}
EXPORT_SYMBOL(insert_inode_locked4);


int generic_delete_inode(struct inode *inode)
{
	return 1;
}
EXPORT_SYMBOL(generic_delete_inode);

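/*
 * Illustrative sketch (not part of this file): filesystems that never want
 * unreferenced inodes to linger in the inode cache point their ->drop_inode
 * method at generic_delete_inode(), so iput_final() below always evicts.
 * The "foo_*" names are hypothetical:
 *
 *	static const struct super_operations foo_super_ops = {
 *		.statfs		= simple_statfs,
 *		.drop_inode	= generic_delete_inode,
 *		.evict_inode	= foo_evict_inode,
 *	};
 */
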
/*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop_inode()" function, defaulting to
 * the legacy UNIX filesystem behaviour.  If it tells
 * us to evict inode, do so.  Otherwise, retain inode
 * in cache if fs is alive, sync and evict if fs is
 * shutting down.
 */
static void iput_final(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	const struct super_operations *op = inode->i_sb->s_op;
	unsigned long state;
	int drop;

	WARN_ON(inode->i_state & I_NEW);

	if (op->drop_inode)
		drop = op->drop_inode(inode);
	else
		drop = generic_drop_inode(inode);

	if (!drop &&
	    !(inode->i_state & I_DONTCACHE) &&
	    (sb->s_flags & SB_ACTIVE)) {
		__inode_add_lru(inode, true);
		spin_unlock(&inode->i_lock);
		return;
	}

	state = inode->i_state;
	if (!drop) {
		WRITE_ONCE(inode->i_state, state | I_WILL_FREE);
		spin_unlock(&inode->i_lock);

		write_inode_now(inode, 1);

		spin_lock(&inode->i_lock);
		state = inode->i_state;
		WARN_ON(state & I_NEW);
		state &= ~I_WILL_FREE;
	}

	WRITE_ONCE(inode->i_state, state | I_FREEING);
	if (!list_empty(&inode->i_lru))
		inode_lru_list_del(inode);
	spin_unlock(&inode->i_lock);

	evict(inode);
}

/**
 * iput - put an inode
 * @inode: inode to put
 *
 * Puts an inode, dropping its usage count.  If the inode use count hits
 * zero, the inode is then freed and may also be destroyed.
 *
 * Consequently, iput() can sleep.
 */
void iput(struct inode *inode)
{
	if (!inode)
		return;
	BUG_ON(inode->i_state & I_CLEAR);
retry:
	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
		if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
			atomic_inc(&inode->i_count);
			spin_unlock(&inode->i_lock);
			trace_writeback_lazytime_iput(inode);
			mark_inode_dirty_sync(inode);
			goto retry;
		}
		iput_final(inode);
	}
}
EXPORT_SYMBOL(iput);

#ifdef CONFIG_BLOCK
/**
 * bmap - find a block number in a file
 * @inode:  inode owning the block number being requested
 * @block: pointer containing the block to find
 *
 * Replaces the value in ``*block`` with the block number on the device
 * holding the data corresponding to the requested block number in the file.
 * That is, asked for block 4 of inode 1 the function will replace the
 * 4 in ``*block`` with the disk block, relative to the disk start, that holds
 * that block of the file.
 *
 * Returns -EINVAL in case of error, 0 otherwise.  If mapping falls into a
 * hole, returns 0 and ``*block`` is also set to 0.
 */
int bmap(struct inode *inode, sector_t *block)
{
	if (!inode->i_mapping->a_ops->bmap)
		return -EINVAL;

	*block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block);
	return 0;
}
EXPORT_SYMBOL(bmap);
#endif

/*
 * With relative atime, only update atime if the previous atime is
 * earlier than either the ctime or mtime or if at least a day has
 * passed since the last atime update.
 */
static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
			     struct timespec64 now)
{

	if (!(mnt->mnt_flags & MNT_RELATIME))
		return 1;
	/*
	 * Is mtime younger than atime? If yes, update atime:
	 */
	if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
		return 1;
	/*
	 * Is ctime younger than atime? If yes, update atime:
	 */
	if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0)
		return 1;

	/*
	 * Is the previous atime value older than a day? If yes,
	 * update atime:
	 */
	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
		return 1;
	/*
	 * Good, we can skip the atime update:
	 */
	return 0;
}

int generic_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
	int dirty_flags = 0;

	if (flags & (S_ATIME | S_CTIME | S_MTIME)) {
		if (flags & S_ATIME)
			inode->i_atime = *time;
		if (flags & S_CTIME)
			inode->i_ctime = *time;
		if (flags & S_MTIME)
			inode->i_mtime = *time;

		if (inode->i_sb->s_flags & SB_LAZYTIME)
			dirty_flags |= I_DIRTY_TIME;
		else
			dirty_flags |= I_DIRTY_SYNC;
	}

	if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false))
		dirty_flags |= I_DIRTY_SYNC;

	__mark_inode_dirty(inode, dirty_flags);
	return 0;
}
EXPORT_SYMBOL(generic_update_time);

/*
 * This does the actual work of updating an inode's time or version.  The
 * caller must have called mnt_want_write() before calling this.
 */
int inode_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
	if (inode->i_op->update_time)
		return inode->i_op->update_time(inode, time, flags);
	return generic_update_time(inode, time, flags);
}
EXPORT_SYMBOL(inode_update_time);

/**
 * atime_needs_update - update the access time
 * @path: the &struct path to update
 * @inode: inode to update
 *
 * Update the accessed time on an inode and mark it for writeback.
 * This function automatically handles read only file systems and media,
 * as well as the "noatime" flag and inode specific "noatime" markers.
 */
bool atime_needs_update(const struct path *path, struct inode *inode)
{
	struct vfsmount *mnt = path->mnt;
	struct timespec64 now;

	if (inode->i_flags & S_NOATIME)
		return false;

	/* Atime updates will likely cause i_uid and i_gid to be written
	 * back improperly if their true value is unknown to the vfs.
	 */
	if (HAS_UNMAPPED_ID(mnt_user_ns(mnt), inode))
		return false;

	if (IS_NOATIME(inode))
		return false;
	if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
		return false;

	if (mnt->mnt_flags & MNT_NOATIME)
		return false;
	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
		return false;

	now = current_time(inode);

	if (!relatime_need_update(mnt, inode, now))
		return false;

	if (timespec64_equal(&inode->i_atime, &now))
		return false;

	return true;
}

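/*
 * Illustrative sketch (not part of this file): a filesystem that needs to
 * journal timestamp updates can provide ->update_time() and still reuse
 * generic_update_time() above for the in-core part.  The "foo_*" names are
 * hypothetical:
 *
 *	static int foo_update_time(struct inode *inode, struct timespec64 *time,
 *				   int flags)
 *	{
 *		int ret = generic_update_time(inode, time, flags);
 *
 *		if (ret)
 *			return ret;
 *		return foo_log_inode_update(inode);	// fs-specific journalling
 *	}
 */
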
void touch_atime(const struct path *path)
{
	struct vfsmount *mnt = path->mnt;
	struct inode *inode = d_inode(path->dentry);
	struct timespec64 now;

	if (!atime_needs_update(path, inode))
		return;

	if (!sb_start_write_trylock(inode->i_sb))
		return;

	if (__mnt_want_write(mnt) != 0)
		goto skip_update;
	/*
	 * File systems can error out when updating inodes if they need to
	 * allocate new space to modify an inode (such is the case for
	 * Btrfs), but since we touch atime while walking down the path we
	 * really don't care if we failed to update the atime of the file,
	 * so just ignore the return value.
	 * We may also fail on filesystems that have the ability to make parts
	 * of the fs read only, e.g. subvolumes in Btrfs.
	 */
	now = current_time(inode);
	inode_update_time(inode, &now, S_ATIME);
	__mnt_drop_write(mnt);
skip_update:
	sb_end_write(inode->i_sb);
}
EXPORT_SYMBOL(touch_atime);

/*
 * The logic we want is
 *
 *	if suid or (sgid and xgrp)
 *		remove privs
 */
int should_remove_suid(struct dentry *dentry)
{
	umode_t mode = d_inode(dentry)->i_mode;
	int kill = 0;

	/* suid always must be killed */
	if (unlikely(mode & S_ISUID))
		kill = ATTR_KILL_SUID;

	/*
	 * sgid without any exec bits is just a mandatory locking mark; leave
	 * it alone. If some exec bits are set, it's a real sgid; kill it.
	 */
	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
		kill |= ATTR_KILL_SGID;

	if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
		return kill;

	return 0;
}
EXPORT_SYMBOL(should_remove_suid);

/*
 * Return mask of changes for notify_change() that need to be done as a
 * response to write or truncate. Return 0 if nothing has to be changed.
 * Negative value on error (change should be denied).
 */
int dentry_needs_remove_privs(struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	int mask = 0;
	int ret;

	if (IS_NOSEC(inode))
		return 0;

	mask = should_remove_suid(dentry);
	ret = security_inode_need_killpriv(dentry);
	if (ret < 0)
		return ret;
	if (ret)
		mask |= ATTR_KILL_PRIV;
	return mask;
}

static int __remove_privs(struct user_namespace *mnt_userns,
			  struct dentry *dentry, int kill)
{
	struct iattr newattrs;

	newattrs.ia_valid = ATTR_FORCE | kill;
	/*
	 * Note we call this on write, so notify_change will not
	 * encounter any conflicting delegations:
	 */
	return notify_change(mnt_userns, dentry, &newattrs, NULL);
}

/*
 * Remove special file privileges (suid, capabilities) when the file is
 * written to or truncated.
 */
int file_remove_privs(struct file *file)
{
	struct dentry *dentry = file_dentry(file);
	struct inode *inode = file_inode(file);
	int kill;
	int error = 0;

	/*
	 * Fast path for nothing security related.
	 * As well for non-regular files, e.g. blkdev inodes.
	 * For example, blkdev_write_iter() might get here
	 * trying to remove privs which it is not allowed to.
	 */
	if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
		return 0;

	kill = dentry_needs_remove_privs(dentry);
	if (kill < 0)
		return kill;
	if (kill)
		error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
	if (!error)
		inode_has_no_xattr(inode);

	return error;
}
EXPORT_SYMBOL(file_remove_privs);
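
/*
 * Illustrative sketch of a hypothetical write path (myfs_write_iter is a
 * made-up name, not a real filesystem): privileges are dropped and
 * timestamps updated through file_modified() below, which calls
 * file_remove_privs() and file_update_time() in turn:
 *
 *	static ssize_t myfs_write_iter(struct kiocb *iocb, struct iov_iter *from)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		ssize_t ret;
 *
 *		inode_lock(inode);
 *		ret = file_modified(iocb->ki_filp);
 *		if (ret)
 *			goto out;
 *		... copy the data ...
 *	out:
 *		inode_unlock(inode);
 *		return ret;
 *	}
 */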
/**
 * file_update_time - update mtime and ctime
 * @file: file accessed
 *
 * Update the mtime and ctime members of an inode and mark the inode for
 * writeback. Note that this function is meant exclusively for usage in
 * the file write path of filesystems, and filesystems may choose to
 * explicitly ignore updates via this function with the S_NOCMTIME inode
 * flag, e.g. for network filesystems where these timestamps are handled
 * by the server. This can return an error for file systems that need to
 * allocate space in order to update an inode.
 */
int file_update_time(struct file *file)
{
	struct inode *inode = file_inode(file);
	struct timespec64 now;
	int sync_it = 0;
	int ret;

	/* First try to exhaust all avenues to not sync */
	if (IS_NOCMTIME(inode))
		return 0;

	now = current_time(inode);
	if (!timespec64_equal(&inode->i_mtime, &now))
		sync_it = S_MTIME;

	if (!timespec64_equal(&inode->i_ctime, &now))
		sync_it |= S_CTIME;

	if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
		sync_it |= S_VERSION;

	if (!sync_it)
		return 0;

	/* Finally allowed to write? Takes lock. */
	if (__mnt_want_write_file(file))
		return 0;

	ret = inode_update_time(inode, &now, sync_it);
	__mnt_drop_write_file(file);

	return ret;
}
EXPORT_SYMBOL(file_update_time);

/* Caller must hold the file's inode lock */
int file_modified(struct file *file)
{
	int err;

	/*
	 * Clear the security bits if the process is not being run by root.
	 * This keeps people from modifying setuid and setgid binaries.
	 */
	err = file_remove_privs(file);
	if (err)
		return err;

	if (unlikely(file->f_mode & FMODE_NOCMTIME))
		return 0;

	return file_update_time(file);
}
EXPORT_SYMBOL(file_modified);

int inode_needs_sync(struct inode *inode)
{
	if (IS_SYNC(inode))
		return 1;
	if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
		return 1;
	return 0;
}
EXPORT_SYMBOL(inode_needs_sync);

/*
 * If we try to find an inode in the inode hash while it is being
 * deleted, we have to wait until the filesystem completes its
 * deletion before reporting that it isn't found. This function waits
 * until the deletion _might_ have completed. Callers are responsible
 * to recheck inode state.
 *
 * It doesn't matter if I_NEW is not set initially, a call to
 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
 * will DTRT.
 */
static void __wait_on_freeing_inode(struct inode *inode)
{
	wait_queue_head_t *wq;
	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
	wq = bit_waitqueue(&inode->i_state, __I_NEW);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_hash_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
	spin_lock(&inode_hash_lock);
}

static __initdata unsigned long ihash_entries;
static int __init set_ihash_entries(char *str)
{
	if (!str)
		return 0;
	ihash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("ihash_entries=", set_ihash_entries);
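
/*
 * Example (illustrative): the inode hash size can be forced from the
 * kernel command line, e.g.
 *
 *	ihash_entries=131072
 *
 * which overrides the automatic sizing performed by inode_init_early()
 * and inode_init() below.
 */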
/*
 * Initialize the waitqueues and inode hash table.
 */
void __init inode_init_early(void)
{
	/* If hashes are distributed across NUMA nodes, defer
	 * hash allocation until vmalloc space is available.
	 */
	if (hashdist)
		return;

	inode_hashtable =
		alloc_large_system_hash("Inode-cache",
					sizeof(struct hlist_head),
					ihash_entries,
					14,
					HASH_EARLY | HASH_ZERO,
					&i_hash_shift,
					&i_hash_mask,
					0,
					0);
}

void __init inode_init(void)
{
	/* inode slab cache */
	inode_cachep = kmem_cache_create("inode_cache",
					 sizeof(struct inode),
					 0,
					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
					 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
					 init_once);

	/* Hash may have been set up in inode_init_early */
	if (!hashdist)
		return;

	inode_hashtable =
		alloc_large_system_hash("Inode-cache",
					sizeof(struct hlist_head),
					ihash_entries,
					14,
					HASH_ZERO,
					&i_hash_shift,
					&i_hash_mask,
					0,
					0);
}

void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
	inode->i_mode = mode;
	if (S_ISCHR(mode)) {
		inode->i_fop = &def_chr_fops;
		inode->i_rdev = rdev;
	} else if (S_ISBLK(mode)) {
		inode->i_fop = &def_blk_fops;
		inode->i_rdev = rdev;
	} else if (S_ISFIFO(mode))
		inode->i_fop = &pipefifo_fops;
	else if (S_ISSOCK(mode))
		;	/* leave it no_open_fops */
	else
		printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
				  " inode %s:%lu\n", mode, inode->i_sb->s_id,
				  inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);

/**
 * inode_init_owner - Init uid, gid, mode for new inode according to POSIX standards
 * @mnt_userns: User namespace of the mount the inode was created from
 * @inode: New inode
 * @dir: Directory inode
 * @mode: mode of the new inode
 *
 * If the inode has been created through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions
 * and initializing i_uid and i_gid. On non-idmapped mounts or if permission
 * checking is to be performed on the raw inode simply pass init_user_ns.
 */
void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
		      const struct inode *dir, umode_t mode)
{
	inode_fsuid_set(inode, mnt_userns);
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;

		/* Directories are special, and always inherit S_ISGID */
		if (S_ISDIR(mode))
			mode |= S_ISGID;
		else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
			 !in_group_p(i_gid_into_mnt(mnt_userns, dir)) &&
			 !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
			mode &= ~S_ISGID;
	} else
		inode_fsgid_set(inode, mnt_userns);
	inode->i_mode = mode;
}
EXPORT_SYMBOL(inode_init_owner);
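
/*
 * Illustrative sketch (myfs_mkdir is a made-up name): a filesystem's
 * create/mkdir path typically allocates the inode and then lets
 * inode_init_owner() fill in i_uid, i_gid and i_mode:
 *
 *	static int myfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
 *			      struct dentry *dentry, umode_t mode)
 *	{
 *		struct inode *inode = new_inode(dir->i_sb);
 *
 *		if (!inode)
 *			return -ENOMEM;
 *		inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode);
 *		... set i_op, i_fop, timestamps, then d_instantiate() ...
 *		return 0;
 *	}
 */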
/**
 * inode_owner_or_capable - check current task permissions to inode
 * @mnt_userns: user namespace of the mount the inode was found from
 * @inode: inode being checked
 *
 * Return true if current either has CAP_FOWNER in a namespace with the
 * inode owner uid mapped, or owns the file.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass init_user_ns.
 */
bool inode_owner_or_capable(struct user_namespace *mnt_userns,
			    const struct inode *inode)
{
	kuid_t i_uid;
	struct user_namespace *ns;

	i_uid = i_uid_into_mnt(mnt_userns, inode);
	if (uid_eq(current_fsuid(), i_uid))
		return true;

	ns = current_user_ns();
	if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER))
		return true;
	return false;
}
EXPORT_SYMBOL(inode_owner_or_capable);

/*
 * Direct i/o helper functions
 */
static void __inode_dio_wait(struct inode *inode)
{
	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);

	do {
		prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&inode->i_dio_count))
			schedule();
	} while (atomic_read(&inode->i_dio_count));
	finish_wait(wq, &q.wq_entry);
}

/**
 * inode_dio_wait - wait for outstanding DIO requests to finish
 * @inode: inode to wait for
 *
 * Waits for all pending direct I/O requests to finish so that we can
 * proceed with a truncate or equivalent operation.
 *
 * Must be called under a lock that serializes taking new references
 * to i_dio_count, usually by inode->i_mutex.
 */
void inode_dio_wait(struct inode *inode)
{
	if (atomic_read(&inode->i_dio_count))
		__inode_dio_wait(inode);
}
EXPORT_SYMBOL(inode_dio_wait);

/*
 * inode_set_flags - atomically set some inode flags
 *
 * Note: the caller should be holding i_mutex, or else be sure that
 * they have exclusive access to the inode structure (i.e., while the
 * inode is being instantiated). The reason for the cmpxchg() loop
 * (which wouldn't be necessary if all code paths that modify i_flags
 * actually followed this rule) is that there is at least one code path
 * which doesn't today, so we use cmpxchg() out of an abundance of
 * caution.
 *
 * In the long run, i_mutex is overkill, and we should probably look
 * at using the i_lock spinlock to protect i_flags, and then make sure
 * it is so documented in include/linux/fs.h and that all code follows
 * the locking convention!!
 */
void inode_set_flags(struct inode *inode, unsigned int flags,
		     unsigned int mask)
{
	WARN_ON_ONCE(flags & ~mask);
	set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);

void inode_nohighmem(struct inode *inode)
{
	mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
}
EXPORT_SYMBOL(inode_nohighmem);
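
/*
 * Illustrative example (hypothetical setattr/truncate path): before
 * shrinking a file, a filesystem drains outstanding direct I/O with
 * inode_dio_wait() above, under the inode lock, and only then truncates
 * the pagecache:
 *
 *	inode_lock(inode);
 *	inode_dio_wait(inode);
 *	truncate_setsize(inode, newsize);
 *	inode_unlock(inode);
 */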
/**
 * timestamp_truncate - Truncate timespec to a granularity
 * @t: Timespec
 * @inode: inode being updated
 *
 * Truncate a timespec to the granularity supported by the fs
 * containing the inode. Always rounds down. gran must
 * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
 */
struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	unsigned int gran = sb->s_time_gran;

	t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max);
	if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
		t.tv_nsec = 0;

	/* Avoid division in the common cases 1 ns and 1 s. */
	if (gran == 1)
		; /* nothing */
	else if (gran == NSEC_PER_SEC)
		t.tv_nsec = 0;
	else if (gran > 1 && gran < NSEC_PER_SEC)
		t.tv_nsec -= t.tv_nsec % gran;
	else
		WARN(1, "invalid file time granularity: %u", gran);
	return t;
}
EXPORT_SYMBOL(timestamp_truncate);

/**
 * current_time - Return FS time
 * @inode: inode.
 *
 * Return the current time truncated to the time granularity supported by
 * the fs.
 *
 * Note that inode and inode->i_sb must not be NULL.
 * Otherwise, the function warns and returns the time without truncation.
 */
struct timespec64 current_time(struct inode *inode)
{
	struct timespec64 now;

	ktime_get_coarse_real_ts64(&now);

	if (unlikely(!inode->i_sb)) {
		WARN(1, "current_time() called with uninitialized super_block in the inode");
		return now;
	}

	return timestamp_truncate(now, inode);
}
EXPORT_SYMBOL(current_time);
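
/*
 * Example (illustrative): filesystems use current_time() when stamping a
 * freshly created or modified inode, so that the stored value is already
 * truncated to the filesystem's timestamp granularity:
 *
 *	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 */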