1 /* 2 * (C) 1997 Linus Torvalds 3 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation) 4 */ 5 #include <linux/export.h> 6 #include <linux/fs.h> 7 #include <linux/mm.h> 8 #include <linux/backing-dev.h> 9 #include <linux/hash.h> 10 #include <linux/swap.h> 11 #include <linux/security.h> 12 #include <linux/cdev.h> 13 #include <linux/bootmem.h> 14 #include <linux/fsnotify.h> 15 #include <linux/mount.h> 16 #include <linux/posix_acl.h> 17 #include <linux/prefetch.h> 18 #include <linux/buffer_head.h> /* for inode_has_buffers */ 19 #include <linux/ratelimit.h> 20 #include <linux/list_lru.h> 21 #include <trace/events/writeback.h> 22 #include "internal.h" 23 24 /* 25 * Inode locking rules: 26 * 27 * inode->i_lock protects: 28 * inode->i_state, inode->i_hash, __iget() 29 * Inode LRU list locks protect: 30 * inode->i_sb->s_inode_lru, inode->i_lru 31 * inode->i_sb->s_inode_list_lock protects: 32 * inode->i_sb->s_inodes, inode->i_sb_list 33 * bdi->wb.list_lock protects: 34 * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list 35 * inode_hash_lock protects: 36 * inode_hashtable, inode->i_hash 37 * 38 * Lock ordering: 39 * 40 * inode->i_sb->s_inode_list_lock 41 * inode->i_lock 42 * Inode LRU list locks 43 * 44 * bdi->wb.list_lock 45 * inode->i_lock 46 * 47 * inode_hash_lock 48 * inode->i_sb->s_inode_list_lock 49 * inode->i_lock 50 * 51 * iunique_lock 52 * inode_hash_lock 53 */ 54 55 static unsigned int i_hash_mask __read_mostly; 56 static unsigned int i_hash_shift __read_mostly; 57 static struct hlist_head *inode_hashtable __read_mostly; 58 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); 59 60 /* 61 * Empty aops. Can be used for the cases where the user does not 62 * define any of the address_space operations. 63 */ 64 const struct address_space_operations empty_aops = { 65 }; 66 EXPORT_SYMBOL(empty_aops); 67 68 /* 69 * Statistics gathering.. 70 */ 71 struct inodes_stat_t inodes_stat; 72 73 static DEFINE_PER_CPU(unsigned long, nr_inodes); 74 static DEFINE_PER_CPU(unsigned long, nr_unused); 75 76 static struct kmem_cache *inode_cachep __read_mostly; 77 78 static long get_nr_inodes(void) 79 { 80 int i; 81 long sum = 0; 82 for_each_possible_cpu(i) 83 sum += per_cpu(nr_inodes, i); 84 return sum < 0 ? 0 : sum; 85 } 86 87 static inline long get_nr_inodes_unused(void) 88 { 89 int i; 90 long sum = 0; 91 for_each_possible_cpu(i) 92 sum += per_cpu(nr_unused, i); 93 return sum < 0 ? 0 : sum; 94 } 95 96 long get_nr_dirty_inodes(void) 97 { 98 /* not actually dirty inodes, but a wild approximation */ 99 long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); 100 return nr_dirty > 0 ? nr_dirty : 0; 101 } 102 103 /* 104 * Handle nr_inode sysctl 105 */ 106 #ifdef CONFIG_SYSCTL 107 int proc_nr_inodes(struct ctl_table *table, int write, 108 void __user *buffer, size_t *lenp, loff_t *ppos) 109 { 110 inodes_stat.nr_inodes = get_nr_inodes(); 111 inodes_stat.nr_unused = get_nr_inodes_unused(); 112 return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 113 } 114 #endif 115 116 static int no_open(struct inode *inode, struct file *file) 117 { 118 return -ENXIO; 119 } 120 121 /** 122 * inode_init_always - perform inode structure intialisation 123 * @sb: superblock inode belongs to 124 * @inode: inode to initialise 125 * 126 * These are initializations that need to be done on every inode 127 * allocation as the fields are not initialised by slab allocation. 
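 *
 * Illustrative sketch only (the foo_* names are hypothetical): a filesystem
 * that allocates inodes from its own cache normally gets this called for it
 * via alloc_inode(), but a filesystem that re-uses in-core inode objects may
 * call it directly and must check the return value, e.g.:
 *
 *	static struct inode *foo_reuse_inode(struct super_block *sb,
 *					     struct foo_inode_info *fi)
 *	{
 *		if (inode_init_always(sb, &fi->vfs_inode))
 *			return NULL;	(security blob allocation failed)
 *		return &fi->vfs_inode;
 *	}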
128 */ 129 int inode_init_always(struct super_block *sb, struct inode *inode) 130 { 131 static const struct inode_operations empty_iops; 132 static const struct file_operations no_open_fops = {.open = no_open}; 133 struct address_space *const mapping = &inode->i_data; 134 135 inode->i_sb = sb; 136 inode->i_blkbits = sb->s_blocksize_bits; 137 inode->i_flags = 0; 138 atomic_set(&inode->i_count, 1); 139 inode->i_op = &empty_iops; 140 inode->i_fop = &no_open_fops; 141 inode->__i_nlink = 1; 142 inode->i_opflags = 0; 143 i_uid_write(inode, 0); 144 i_gid_write(inode, 0); 145 atomic_set(&inode->i_writecount, 0); 146 inode->i_size = 0; 147 inode->i_blocks = 0; 148 inode->i_bytes = 0; 149 inode->i_generation = 0; 150 inode->i_pipe = NULL; 151 inode->i_bdev = NULL; 152 inode->i_cdev = NULL; 153 inode->i_link = NULL; 154 inode->i_rdev = 0; 155 inode->dirtied_when = 0; 156 157 #ifdef CONFIG_CGROUP_WRITEBACK 158 inode->i_wb_frn_winner = 0; 159 inode->i_wb_frn_avg_time = 0; 160 inode->i_wb_frn_history = 0; 161 #endif 162 163 if (security_inode_alloc(inode)) 164 goto out; 165 spin_lock_init(&inode->i_lock); 166 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); 167 168 mutex_init(&inode->i_mutex); 169 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); 170 171 atomic_set(&inode->i_dio_count, 0); 172 173 mapping->a_ops = &empty_aops; 174 mapping->host = inode; 175 mapping->flags = 0; 176 atomic_set(&mapping->i_mmap_writable, 0); 177 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 178 mapping->private_data = NULL; 179 mapping->writeback_index = 0; 180 inode->i_private = NULL; 181 inode->i_mapping = mapping; 182 INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ 183 #ifdef CONFIG_FS_POSIX_ACL 184 inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; 185 #endif 186 187 #ifdef CONFIG_FSNOTIFY 188 inode->i_fsnotify_mask = 0; 189 #endif 190 inode->i_flctx = NULL; 191 this_cpu_inc(nr_inodes); 192 193 return 0; 194 out: 195 return -ENOMEM; 196 } 197 EXPORT_SYMBOL(inode_init_always); 198 199 static struct inode *alloc_inode(struct super_block *sb) 200 { 201 struct inode *inode; 202 203 if (sb->s_op->alloc_inode) 204 inode = sb->s_op->alloc_inode(sb); 205 else 206 inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); 207 208 if (!inode) 209 return NULL; 210 211 if (unlikely(inode_init_always(sb, inode))) { 212 if (inode->i_sb->s_op->destroy_inode) 213 inode->i_sb->s_op->destroy_inode(inode); 214 else 215 kmem_cache_free(inode_cachep, inode); 216 return NULL; 217 } 218 219 return inode; 220 } 221 222 void free_inode_nonrcu(struct inode *inode) 223 { 224 kmem_cache_free(inode_cachep, inode); 225 } 226 EXPORT_SYMBOL(free_inode_nonrcu); 227 228 void __destroy_inode(struct inode *inode) 229 { 230 BUG_ON(inode_has_buffers(inode)); 231 inode_detach_wb(inode); 232 security_inode_free(inode); 233 fsnotify_inode_delete(inode); 234 locks_free_lock_context(inode); 235 if (!inode->i_nlink) { 236 WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); 237 atomic_long_dec(&inode->i_sb->s_remove_count); 238 } 239 240 #ifdef CONFIG_FS_POSIX_ACL 241 if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED) 242 posix_acl_release(inode->i_acl); 243 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) 244 posix_acl_release(inode->i_default_acl); 245 #endif 246 this_cpu_dec(nr_inodes); 247 } 248 EXPORT_SYMBOL(__destroy_inode); 249 250 static void i_callback(struct rcu_head *head) 251 { 252 struct inode *inode = container_of(head, struct inode, i_rcu); 253 kmem_cache_free(inode_cachep, 
inode); 254 } 255 256 static void destroy_inode(struct inode *inode) 257 { 258 BUG_ON(!list_empty(&inode->i_lru)); 259 __destroy_inode(inode); 260 if (inode->i_sb->s_op->destroy_inode) 261 inode->i_sb->s_op->destroy_inode(inode); 262 else 263 call_rcu(&inode->i_rcu, i_callback); 264 } 265 266 /** 267 * drop_nlink - directly drop an inode's link count 268 * @inode: inode 269 * 270 * This is a low-level filesystem helper to replace any 271 * direct filesystem manipulation of i_nlink. In cases 272 * where we are attempting to track writes to the 273 * filesystem, a decrement to zero means an imminent 274 * write when the file is truncated and actually unlinked 275 * on the filesystem. 276 */ 277 void drop_nlink(struct inode *inode) 278 { 279 WARN_ON(inode->i_nlink == 0); 280 inode->__i_nlink--; 281 if (!inode->i_nlink) 282 atomic_long_inc(&inode->i_sb->s_remove_count); 283 } 284 EXPORT_SYMBOL(drop_nlink); 285 286 /** 287 * clear_nlink - directly zero an inode's link count 288 * @inode: inode 289 * 290 * This is a low-level filesystem helper to replace any 291 * direct filesystem manipulation of i_nlink. See 292 * drop_nlink() for why we care about i_nlink hitting zero. 293 */ 294 void clear_nlink(struct inode *inode) 295 { 296 if (inode->i_nlink) { 297 inode->__i_nlink = 0; 298 atomic_long_inc(&inode->i_sb->s_remove_count); 299 } 300 } 301 EXPORT_SYMBOL(clear_nlink); 302 303 /** 304 * set_nlink - directly set an inode's link count 305 * @inode: inode 306 * @nlink: new nlink (should be non-zero) 307 * 308 * This is a low-level filesystem helper to replace any 309 * direct filesystem manipulation of i_nlink. 310 */ 311 void set_nlink(struct inode *inode, unsigned int nlink) 312 { 313 if (!nlink) { 314 clear_nlink(inode); 315 } else { 316 /* Yes, some filesystems do change nlink from zero to one */ 317 if (inode->i_nlink == 0) 318 atomic_long_dec(&inode->i_sb->s_remove_count); 319 320 inode->__i_nlink = nlink; 321 } 322 } 323 EXPORT_SYMBOL(set_nlink); 324 325 /** 326 * inc_nlink - directly increment an inode's link count 327 * @inode: inode 328 * 329 * This is a low-level filesystem helper to replace any 330 * direct filesystem manipulation of i_nlink. Currently, 331 * it is only here for parity with dec_nlink(). 332 */ 333 void inc_nlink(struct inode *inode) 334 { 335 if (unlikely(inode->i_nlink == 0)) { 336 WARN_ON(!(inode->i_state & I_LINKABLE)); 337 atomic_long_dec(&inode->i_sb->s_remove_count); 338 } 339 340 inode->__i_nlink++; 341 } 342 EXPORT_SYMBOL(inc_nlink); 343 344 void address_space_init_once(struct address_space *mapping) 345 { 346 memset(mapping, 0, sizeof(*mapping)); 347 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); 348 spin_lock_init(&mapping->tree_lock); 349 init_rwsem(&mapping->i_mmap_rwsem); 350 INIT_LIST_HEAD(&mapping->private_list); 351 spin_lock_init(&mapping->private_lock); 352 mapping->i_mmap = RB_ROOT; 353 } 354 EXPORT_SYMBOL(address_space_init_once); 355 356 /* 357 * These are initializations that only need to be done 358 * once, because the fields are idempotent across use 359 * of the inode, so let the slab aware of that. 
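 *
 * A minimal sketch of the usual pattern (the foo_* names are hypothetical,
 * but this mirrors what most filesystems do): the constructor passed to
 * kmem_cache_create() calls inode_init_once() on the embedded VFS inode, so
 * these fields are set up exactly once per slab object:
 *
 *	static void foo_init_once(void *p)
 *	{
 *		struct foo_inode_info *fi = p;
 *
 *		inode_init_once(&fi->vfs_inode);
 *	}
 *
 *	foo_inode_cachep = kmem_cache_create("foo_inode_cache",
 *				sizeof(struct foo_inode_info), 0,
 *				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
 *				foo_init_once);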
360 */ 361 void inode_init_once(struct inode *inode) 362 { 363 memset(inode, 0, sizeof(*inode)); 364 INIT_HLIST_NODE(&inode->i_hash); 365 INIT_LIST_HEAD(&inode->i_devices); 366 INIT_LIST_HEAD(&inode->i_io_list); 367 INIT_LIST_HEAD(&inode->i_lru); 368 address_space_init_once(&inode->i_data); 369 i_size_ordered_init(inode); 370 #ifdef CONFIG_FSNOTIFY 371 INIT_HLIST_HEAD(&inode->i_fsnotify_marks); 372 #endif 373 } 374 EXPORT_SYMBOL(inode_init_once); 375 376 static void init_once(void *foo) 377 { 378 struct inode *inode = (struct inode *) foo; 379 380 inode_init_once(inode); 381 } 382 383 /* 384 * inode->i_lock must be held 385 */ 386 void __iget(struct inode *inode) 387 { 388 atomic_inc(&inode->i_count); 389 } 390 391 /* 392 * get additional reference to inode; caller must already hold one. 393 */ 394 void ihold(struct inode *inode) 395 { 396 WARN_ON(atomic_inc_return(&inode->i_count) < 2); 397 } 398 EXPORT_SYMBOL(ihold); 399 400 static void inode_lru_list_add(struct inode *inode) 401 { 402 if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) 403 this_cpu_inc(nr_unused); 404 } 405 406 /* 407 * Add inode to LRU if needed (inode is unused and clean). 408 * 409 * Needs inode->i_lock held. 410 */ 411 void inode_add_lru(struct inode *inode) 412 { 413 if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | 414 I_FREEING | I_WILL_FREE)) && 415 !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) 416 inode_lru_list_add(inode); 417 } 418 419 420 static void inode_lru_list_del(struct inode *inode) 421 { 422 423 if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) 424 this_cpu_dec(nr_unused); 425 } 426 427 /** 428 * inode_sb_list_add - add inode to the superblock list of inodes 429 * @inode: inode to add 430 */ 431 void inode_sb_list_add(struct inode *inode) 432 { 433 spin_lock(&inode->i_sb->s_inode_list_lock); 434 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); 435 spin_unlock(&inode->i_sb->s_inode_list_lock); 436 } 437 EXPORT_SYMBOL_GPL(inode_sb_list_add); 438 439 static inline void inode_sb_list_del(struct inode *inode) 440 { 441 if (!list_empty(&inode->i_sb_list)) { 442 spin_lock(&inode->i_sb->s_inode_list_lock); 443 list_del_init(&inode->i_sb_list); 444 spin_unlock(&inode->i_sb->s_inode_list_lock); 445 } 446 } 447 448 static unsigned long hash(struct super_block *sb, unsigned long hashval) 449 { 450 unsigned long tmp; 451 452 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / 453 L1_CACHE_BYTES; 454 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); 455 return tmp & i_hash_mask; 456 } 457 458 /** 459 * __insert_inode_hash - hash an inode 460 * @inode: unhashed inode 461 * @hashval: unsigned long value used to locate this object in the 462 * inode_hashtable. 463 * 464 * Add an inode to the inode hash for this superblock. 465 */ 466 void __insert_inode_hash(struct inode *inode, unsigned long hashval) 467 { 468 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); 469 470 spin_lock(&inode_hash_lock); 471 spin_lock(&inode->i_lock); 472 hlist_add_head(&inode->i_hash, b); 473 spin_unlock(&inode->i_lock); 474 spin_unlock(&inode_hash_lock); 475 } 476 EXPORT_SYMBOL(__insert_inode_hash); 477 478 /** 479 * __remove_inode_hash - remove an inode from the hash 480 * @inode: inode to unhash 481 * 482 * Remove an inode from the superblock. 
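 *
 * Purely illustrative: most filesystems never call this directly, since
 * iget_locked() and friends hash the inode and evict() unhashes it.  A
 * filesystem managing the hash by hand would pair the helpers roughly like:
 *
 *	insert_inode_hash(inode);	(once the inode is fully set up)
 *	...
 *	remove_inode_hash(inode);	(before the inode is torn down)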
483 */ 484 void __remove_inode_hash(struct inode *inode) 485 { 486 spin_lock(&inode_hash_lock); 487 spin_lock(&inode->i_lock); 488 hlist_del_init(&inode->i_hash); 489 spin_unlock(&inode->i_lock); 490 spin_unlock(&inode_hash_lock); 491 } 492 EXPORT_SYMBOL(__remove_inode_hash); 493 494 void clear_inode(struct inode *inode) 495 { 496 might_sleep(); 497 /* 498 * We have to cycle tree_lock here because reclaim can be still in the 499 * process of removing the last page (in __delete_from_page_cache()) 500 * and we must not free mapping under it. 501 */ 502 spin_lock_irq(&inode->i_data.tree_lock); 503 BUG_ON(inode->i_data.nrpages); 504 BUG_ON(inode->i_data.nrexceptional); 505 spin_unlock_irq(&inode->i_data.tree_lock); 506 BUG_ON(!list_empty(&inode->i_data.private_list)); 507 BUG_ON(!(inode->i_state & I_FREEING)); 508 BUG_ON(inode->i_state & I_CLEAR); 509 /* don't need i_lock here, no concurrent mods to i_state */ 510 inode->i_state = I_FREEING | I_CLEAR; 511 } 512 EXPORT_SYMBOL(clear_inode); 513 514 /* 515 * Free the inode passed in, removing it from the lists it is still connected 516 * to. We remove any pages still attached to the inode and wait for any IO that 517 * is still in progress before finally destroying the inode. 518 * 519 * An inode must already be marked I_FREEING so that we avoid the inode being 520 * moved back onto lists if we race with other code that manipulates the lists 521 * (e.g. writeback_single_inode). The caller is responsible for setting this. 522 * 523 * An inode must already be removed from the LRU list before being evicted from 524 * the cache. This should occur atomically with setting the I_FREEING state 525 * flag, so no inodes here should ever be on the LRU when being evicted. 526 */ 527 static void evict(struct inode *inode) 528 { 529 const struct super_operations *op = inode->i_sb->s_op; 530 531 BUG_ON(!(inode->i_state & I_FREEING)); 532 BUG_ON(!list_empty(&inode->i_lru)); 533 534 if (!list_empty(&inode->i_io_list)) 535 inode_io_list_del(inode); 536 537 inode_sb_list_del(inode); 538 539 /* 540 * Wait for flusher thread to be done with the inode so that filesystem 541 * does not start destroying it while writeback is still running. Since 542 * the inode has I_FREEING set, flusher thread won't start new work on 543 * the inode. We just have to wait for running writeback to finish. 544 */ 545 inode_wait_for_writeback(inode); 546 547 if (op->evict_inode) { 548 op->evict_inode(inode); 549 } else { 550 truncate_inode_pages_final(&inode->i_data); 551 clear_inode(inode); 552 } 553 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 554 bd_forget(inode); 555 if (S_ISCHR(inode->i_mode) && inode->i_cdev) 556 cd_forget(inode); 557 558 remove_inode_hash(inode); 559 560 spin_lock(&inode->i_lock); 561 wake_up_bit(&inode->i_state, __I_NEW); 562 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); 563 spin_unlock(&inode->i_lock); 564 565 destroy_inode(inode); 566 } 567 568 /* 569 * dispose_list - dispose of the contents of a local list 570 * @head: the head of the list to free 571 * 572 * Dispose-list gets a local list with local inodes in it, so it doesn't 573 * need to worry about list corruption and SMP locks. 
574 */ 575 static void dispose_list(struct list_head *head) 576 { 577 while (!list_empty(head)) { 578 struct inode *inode; 579 580 inode = list_first_entry(head, struct inode, i_lru); 581 list_del_init(&inode->i_lru); 582 583 evict(inode); 584 cond_resched(); 585 } 586 } 587 588 /** 589 * evict_inodes - evict all evictable inodes for a superblock 590 * @sb: superblock to operate on 591 * 592 * Make sure that no inodes with zero refcount are retained. This is 593 * called by superblock shutdown after having MS_ACTIVE flag removed, 594 * so any inode reaching zero refcount during or after that call will 595 * be immediately evicted. 596 */ 597 void evict_inodes(struct super_block *sb) 598 { 599 struct inode *inode, *next; 600 LIST_HEAD(dispose); 601 602 again: 603 spin_lock(&sb->s_inode_list_lock); 604 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 605 if (atomic_read(&inode->i_count)) 606 continue; 607 608 spin_lock(&inode->i_lock); 609 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 610 spin_unlock(&inode->i_lock); 611 continue; 612 } 613 614 inode->i_state |= I_FREEING; 615 inode_lru_list_del(inode); 616 spin_unlock(&inode->i_lock); 617 list_add(&inode->i_lru, &dispose); 618 619 /* 620 * We can have a ton of inodes to evict at unmount time given 621 * enough memory, check to see if we need to go to sleep for a 622 * bit so we don't livelock. 623 */ 624 if (need_resched()) { 625 spin_unlock(&sb->s_inode_list_lock); 626 cond_resched(); 627 dispose_list(&dispose); 628 goto again; 629 } 630 } 631 spin_unlock(&sb->s_inode_list_lock); 632 633 dispose_list(&dispose); 634 } 635 636 /** 637 * invalidate_inodes - attempt to free all inodes on a superblock 638 * @sb: superblock to operate on 639 * @kill_dirty: flag to guide handling of dirty inodes 640 * 641 * Attempts to free all inodes for a given superblock. If there were any 642 * busy inodes return a non-zero value, else zero. 643 * If @kill_dirty is set, discard dirty inodes too, otherwise treat 644 * them as busy. 645 */ 646 int invalidate_inodes(struct super_block *sb, bool kill_dirty) 647 { 648 int busy = 0; 649 struct inode *inode, *next; 650 LIST_HEAD(dispose); 651 652 spin_lock(&sb->s_inode_list_lock); 653 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 654 spin_lock(&inode->i_lock); 655 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 656 spin_unlock(&inode->i_lock); 657 continue; 658 } 659 if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { 660 spin_unlock(&inode->i_lock); 661 busy = 1; 662 continue; 663 } 664 if (atomic_read(&inode->i_count)) { 665 spin_unlock(&inode->i_lock); 666 busy = 1; 667 continue; 668 } 669 670 inode->i_state |= I_FREEING; 671 inode_lru_list_del(inode); 672 spin_unlock(&inode->i_lock); 673 list_add(&inode->i_lru, &dispose); 674 } 675 spin_unlock(&sb->s_inode_list_lock); 676 677 dispose_list(&dispose); 678 679 return busy; 680 } 681 682 /* 683 * Isolate the inode from the LRU in preparation for freeing it. 684 * 685 * Any inodes which are pinned purely because of attached pagecache have their 686 * pagecache removed. If the inode has metadata buffers attached to 687 * mapping->private_list then try to remove them. 688 * 689 * If the inode has the I_REFERENCED flag set, then it means that it has been 690 * used recently - the flag is set in iput_final(). When we encounter such an 691 * inode, clear the flag and move it to the back of the LRU so it gets another 692 * pass through the LRU before it gets reclaimed. 
This is necessary because of 693 * the fact we are doing lazy LRU updates to minimise lock contention so the 694 * LRU does not have strict ordering. Hence we don't want to reclaim inodes 695 * with this flag set because they are the inodes that are out of order. 696 */ 697 static enum lru_status inode_lru_isolate(struct list_head *item, 698 struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) 699 { 700 struct list_head *freeable = arg; 701 struct inode *inode = container_of(item, struct inode, i_lru); 702 703 /* 704 * we are inverting the lru lock/inode->i_lock here, so use a trylock. 705 * If we fail to get the lock, just skip it. 706 */ 707 if (!spin_trylock(&inode->i_lock)) 708 return LRU_SKIP; 709 710 /* 711 * Referenced or dirty inodes are still in use. Give them another pass 712 * through the LRU as we canot reclaim them now. 713 */ 714 if (atomic_read(&inode->i_count) || 715 (inode->i_state & ~I_REFERENCED)) { 716 list_lru_isolate(lru, &inode->i_lru); 717 spin_unlock(&inode->i_lock); 718 this_cpu_dec(nr_unused); 719 return LRU_REMOVED; 720 } 721 722 /* recently referenced inodes get one more pass */ 723 if (inode->i_state & I_REFERENCED) { 724 inode->i_state &= ~I_REFERENCED; 725 spin_unlock(&inode->i_lock); 726 return LRU_ROTATE; 727 } 728 729 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 730 __iget(inode); 731 spin_unlock(&inode->i_lock); 732 spin_unlock(lru_lock); 733 if (remove_inode_buffers(inode)) { 734 unsigned long reap; 735 reap = invalidate_mapping_pages(&inode->i_data, 0, -1); 736 if (current_is_kswapd()) 737 __count_vm_events(KSWAPD_INODESTEAL, reap); 738 else 739 __count_vm_events(PGINODESTEAL, reap); 740 if (current->reclaim_state) 741 current->reclaim_state->reclaimed_slab += reap; 742 } 743 iput(inode); 744 spin_lock(lru_lock); 745 return LRU_RETRY; 746 } 747 748 WARN_ON(inode->i_state & I_NEW); 749 inode->i_state |= I_FREEING; 750 list_lru_isolate_move(lru, &inode->i_lru, freeable); 751 spin_unlock(&inode->i_lock); 752 753 this_cpu_dec(nr_unused); 754 return LRU_REMOVED; 755 } 756 757 /* 758 * Walk the superblock inode LRU for freeable inodes and attempt to free them. 759 * This is called from the superblock shrinker function with a number of inodes 760 * to trim from the LRU. Inodes to be freed are moved to a temporary list and 761 * then are freed outside inode_lock by dispose_list(). 762 */ 763 long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) 764 { 765 LIST_HEAD(freeable); 766 long freed; 767 768 freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, 769 inode_lru_isolate, &freeable); 770 dispose_list(&freeable); 771 return freed; 772 } 773 774 static void __wait_on_freeing_inode(struct inode *inode); 775 /* 776 * Called with the inode lock held. 777 */ 778 static struct inode *find_inode(struct super_block *sb, 779 struct hlist_head *head, 780 int (*test)(struct inode *, void *), 781 void *data) 782 { 783 struct inode *inode = NULL; 784 785 repeat: 786 hlist_for_each_entry(inode, head, i_hash) { 787 if (inode->i_sb != sb) 788 continue; 789 if (!test(inode, data)) 790 continue; 791 spin_lock(&inode->i_lock); 792 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 793 __wait_on_freeing_inode(inode); 794 goto repeat; 795 } 796 __iget(inode); 797 spin_unlock(&inode->i_lock); 798 return inode; 799 } 800 return NULL; 801 } 802 803 /* 804 * find_inode_fast is the fast path version of find_inode, see the comment at 805 * iget_locked for details. 
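 *
 * For context, the caller-side pattern this fast path serves looks roughly
 * like the sketch below (a hypothetical foo_iget(); see the iget_locked()
 * kerneldoc for the authoritative contract):
 *
 *	struct inode *inode = iget_locked(sb, ino);
 *
 *	if (!inode)
 *		return ERR_PTR(-ENOMEM);
 *	if (!(inode->i_state & I_NEW))
 *		return inode;		(already in cache and initialised)
 *	... read the on-disk inode and fill in the fields ...
 *	unlock_new_inode(inode);
 *	return inode;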
 */
static struct inode *find_inode_fast(struct super_block *sb,
				struct hlist_head *head, unsigned long ino)
{
	struct inode *inode = NULL;

repeat:
	hlist_for_each_entry(inode, head, i_hash) {
		if (inode->i_ino != ino)
			continue;
		if (inode->i_sb != sb)
			continue;
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
			__wait_on_freeing_inode(inode);
			goto repeat;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		return inode;
	}
	return NULL;
}

/*
 * Each cpu owns a range of LAST_INO_BATCH numbers.
 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
 * to renew the exhausted range.
 *
 * This does not significantly increase overflow rate because every CPU can
 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
 * overflow rate by 2x, which does not seem too significant.
 *
 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
 * error if st_ino won't fit in target struct field. Use 32bit counter
 * here to attempt to avoid that.
 */
#define LAST_INO_BATCH 1024
static DEFINE_PER_CPU(unsigned int, last_ino);

unsigned int get_next_ino(void)
{
	unsigned int *p = &get_cpu_var(last_ino);
	unsigned int res = *p;

#ifdef CONFIG_SMP
	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
		static atomic_t shared_last_ino;
		int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);

		res = next - LAST_INO_BATCH;
	}
#endif

	res++;
	/* get_next_ino should not provide a 0 inode number */
	if (unlikely(!res))
		res++;
	*p = res;
	put_cpu_var(last_ino);
	return res;
}
EXPORT_SYMBOL(get_next_ino);

/**
 * new_inode_pseudo - obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for the given superblock.
 * The inode will not be chained into the superblock's s_inodes list.
 * This means:
 * - the fs can't be unmounted
 * - quotas, fsnotify and writeback can't work on it
 */
struct inode *new_inode_pseudo(struct super_block *sb)
{
	struct inode *inode = alloc_inode(sb);

	if (inode) {
		spin_lock(&inode->i_lock);
		inode->i_state = 0;
		spin_unlock(&inode->i_lock);
		INIT_LIST_HEAD(&inode->i_sb_list);
	}
	return inode;
}

/**
 * new_inode - obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for the given superblock.  The default gfp_mask
 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
901 * If HIGHMEM pages are unsuitable or it is known that pages allocated 902 * for the page cache are not reclaimable or migratable, 903 * mapping_set_gfp_mask() must be called with suitable flags on the 904 * newly created inode's mapping 905 * 906 */ 907 struct inode *new_inode(struct super_block *sb) 908 { 909 struct inode *inode; 910 911 spin_lock_prefetch(&sb->s_inode_list_lock); 912 913 inode = new_inode_pseudo(sb); 914 if (inode) 915 inode_sb_list_add(inode); 916 return inode; 917 } 918 EXPORT_SYMBOL(new_inode); 919 920 #ifdef CONFIG_DEBUG_LOCK_ALLOC 921 void lockdep_annotate_inode_mutex_key(struct inode *inode) 922 { 923 if (S_ISDIR(inode->i_mode)) { 924 struct file_system_type *type = inode->i_sb->s_type; 925 926 /* Set new key only if filesystem hasn't already changed it */ 927 if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) { 928 /* 929 * ensure nobody is actually holding i_mutex 930 */ 931 mutex_destroy(&inode->i_mutex); 932 mutex_init(&inode->i_mutex); 933 lockdep_set_class(&inode->i_mutex, 934 &type->i_mutex_dir_key); 935 } 936 } 937 } 938 EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); 939 #endif 940 941 /** 942 * unlock_new_inode - clear the I_NEW state and wake up any waiters 943 * @inode: new inode to unlock 944 * 945 * Called when the inode is fully initialised to clear the new state of the 946 * inode and wake up anyone waiting for the inode to finish initialisation. 947 */ 948 void unlock_new_inode(struct inode *inode) 949 { 950 lockdep_annotate_inode_mutex_key(inode); 951 spin_lock(&inode->i_lock); 952 WARN_ON(!(inode->i_state & I_NEW)); 953 inode->i_state &= ~I_NEW; 954 smp_mb(); 955 wake_up_bit(&inode->i_state, __I_NEW); 956 spin_unlock(&inode->i_lock); 957 } 958 EXPORT_SYMBOL(unlock_new_inode); 959 960 /** 961 * lock_two_nondirectories - take two i_mutexes on non-directory objects 962 * 963 * Lock any non-NULL argument that is not a directory. 964 * Zero, one or two objects may be locked by this function. 965 * 966 * @inode1: first inode to lock 967 * @inode2: second inode to lock 968 */ 969 void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) 970 { 971 if (inode1 > inode2) 972 swap(inode1, inode2); 973 974 if (inode1 && !S_ISDIR(inode1->i_mode)) 975 inode_lock(inode1); 976 if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) 977 inode_lock_nested(inode2, I_MUTEX_NONDIR2); 978 } 979 EXPORT_SYMBOL(lock_two_nondirectories); 980 981 /** 982 * unlock_two_nondirectories - release locks from lock_two_nondirectories() 983 * @inode1: first inode to unlock 984 * @inode2: second inode to unlock 985 */ 986 void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) 987 { 988 if (inode1 && !S_ISDIR(inode1->i_mode)) 989 inode_unlock(inode1); 990 if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) 991 inode_unlock(inode2); 992 } 993 EXPORT_SYMBOL(unlock_two_nondirectories); 994 995 /** 996 * iget5_locked - obtain an inode from a mounted file system 997 * @sb: super block of file system 998 * @hashval: hash value (usually inode number) to get 999 * @test: callback used for comparisons between inodes 1000 * @set: callback used to initialize a new struct inode 1001 * @data: opaque data pointer to pass to @test and @set 1002 * 1003 * Search for the inode specified by @hashval and @data in the inode cache, 1004 * and if present it is return it with an increased reference count. 
This is 1005 * a generalized version of iget_locked() for file systems where the inode 1006 * number is not sufficient for unique identification of an inode. 1007 * 1008 * If the inode is not in cache, allocate a new inode and return it locked, 1009 * hashed, and with the I_NEW flag set. The file system gets to fill it in 1010 * before unlocking it via unlock_new_inode(). 1011 * 1012 * Note both @test and @set are called with the inode_hash_lock held, so can't 1013 * sleep. 1014 */ 1015 struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, 1016 int (*test)(struct inode *, void *), 1017 int (*set)(struct inode *, void *), void *data) 1018 { 1019 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1020 struct inode *inode; 1021 1022 spin_lock(&inode_hash_lock); 1023 inode = find_inode(sb, head, test, data); 1024 spin_unlock(&inode_hash_lock); 1025 1026 if (inode) { 1027 wait_on_inode(inode); 1028 return inode; 1029 } 1030 1031 inode = alloc_inode(sb); 1032 if (inode) { 1033 struct inode *old; 1034 1035 spin_lock(&inode_hash_lock); 1036 /* We released the lock, so.. */ 1037 old = find_inode(sb, head, test, data); 1038 if (!old) { 1039 if (set(inode, data)) 1040 goto set_failed; 1041 1042 spin_lock(&inode->i_lock); 1043 inode->i_state = I_NEW; 1044 hlist_add_head(&inode->i_hash, head); 1045 spin_unlock(&inode->i_lock); 1046 inode_sb_list_add(inode); 1047 spin_unlock(&inode_hash_lock); 1048 1049 /* Return the locked inode with I_NEW set, the 1050 * caller is responsible for filling in the contents 1051 */ 1052 return inode; 1053 } 1054 1055 /* 1056 * Uhhuh, somebody else created the same inode under 1057 * us. Use the old inode instead of the one we just 1058 * allocated. 1059 */ 1060 spin_unlock(&inode_hash_lock); 1061 destroy_inode(inode); 1062 inode = old; 1063 wait_on_inode(inode); 1064 } 1065 return inode; 1066 1067 set_failed: 1068 spin_unlock(&inode_hash_lock); 1069 destroy_inode(inode); 1070 return NULL; 1071 } 1072 EXPORT_SYMBOL(iget5_locked); 1073 1074 /** 1075 * iget_locked - obtain an inode from a mounted file system 1076 * @sb: super block of file system 1077 * @ino: inode number to get 1078 * 1079 * Search for the inode specified by @ino in the inode cache and if present 1080 * return it with an increased reference count. This is for file systems 1081 * where the inode number is sufficient for unique identification of an inode. 1082 * 1083 * If the inode is not in cache, allocate a new inode and return it locked, 1084 * hashed, and with the I_NEW flag set. The file system gets to fill it in 1085 * before unlocking it via unlock_new_inode(). 1086 */ 1087 struct inode *iget_locked(struct super_block *sb, unsigned long ino) 1088 { 1089 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1090 struct inode *inode; 1091 1092 spin_lock(&inode_hash_lock); 1093 inode = find_inode_fast(sb, head, ino); 1094 spin_unlock(&inode_hash_lock); 1095 if (inode) { 1096 wait_on_inode(inode); 1097 return inode; 1098 } 1099 1100 inode = alloc_inode(sb); 1101 if (inode) { 1102 struct inode *old; 1103 1104 spin_lock(&inode_hash_lock); 1105 /* We released the lock, so.. 
*/ 1106 old = find_inode_fast(sb, head, ino); 1107 if (!old) { 1108 inode->i_ino = ino; 1109 spin_lock(&inode->i_lock); 1110 inode->i_state = I_NEW; 1111 hlist_add_head(&inode->i_hash, head); 1112 spin_unlock(&inode->i_lock); 1113 inode_sb_list_add(inode); 1114 spin_unlock(&inode_hash_lock); 1115 1116 /* Return the locked inode with I_NEW set, the 1117 * caller is responsible for filling in the contents 1118 */ 1119 return inode; 1120 } 1121 1122 /* 1123 * Uhhuh, somebody else created the same inode under 1124 * us. Use the old inode instead of the one we just 1125 * allocated. 1126 */ 1127 spin_unlock(&inode_hash_lock); 1128 destroy_inode(inode); 1129 inode = old; 1130 wait_on_inode(inode); 1131 } 1132 return inode; 1133 } 1134 EXPORT_SYMBOL(iget_locked); 1135 1136 /* 1137 * search the inode cache for a matching inode number. 1138 * If we find one, then the inode number we are trying to 1139 * allocate is not unique and so we should not use it. 1140 * 1141 * Returns 1 if the inode number is unique, 0 if it is not. 1142 */ 1143 static int test_inode_iunique(struct super_block *sb, unsigned long ino) 1144 { 1145 struct hlist_head *b = inode_hashtable + hash(sb, ino); 1146 struct inode *inode; 1147 1148 spin_lock(&inode_hash_lock); 1149 hlist_for_each_entry(inode, b, i_hash) { 1150 if (inode->i_ino == ino && inode->i_sb == sb) { 1151 spin_unlock(&inode_hash_lock); 1152 return 0; 1153 } 1154 } 1155 spin_unlock(&inode_hash_lock); 1156 1157 return 1; 1158 } 1159 1160 /** 1161 * iunique - get a unique inode number 1162 * @sb: superblock 1163 * @max_reserved: highest reserved inode number 1164 * 1165 * Obtain an inode number that is unique on the system for a given 1166 * superblock. This is used by file systems that have no natural 1167 * permanent inode numbering system. An inode number is returned that 1168 * is higher than the reserved limit but unique. 1169 * 1170 * BUGS: 1171 * With a large number of inodes live on the file system this function 1172 * currently becomes quite slow. 1173 */ 1174 ino_t iunique(struct super_block *sb, ino_t max_reserved) 1175 { 1176 /* 1177 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW 1178 * error if st_ino won't fit in target struct field. Use 32bit counter 1179 * here to attempt to avoid that. 1180 */ 1181 static DEFINE_SPINLOCK(iunique_lock); 1182 static unsigned int counter; 1183 ino_t res; 1184 1185 spin_lock(&iunique_lock); 1186 do { 1187 if (counter <= max_reserved) 1188 counter = max_reserved + 1; 1189 res = counter++; 1190 } while (!test_inode_iunique(sb, res)); 1191 spin_unlock(&iunique_lock); 1192 1193 return res; 1194 } 1195 EXPORT_SYMBOL(iunique); 1196 1197 struct inode *igrab(struct inode *inode) 1198 { 1199 spin_lock(&inode->i_lock); 1200 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { 1201 __iget(inode); 1202 spin_unlock(&inode->i_lock); 1203 } else { 1204 spin_unlock(&inode->i_lock); 1205 /* 1206 * Handle the case where s_op->clear_inode is not been 1207 * called yet, and somebody is calling igrab 1208 * while the inode is getting freed. 1209 */ 1210 inode = NULL; 1211 } 1212 return inode; 1213 } 1214 EXPORT_SYMBOL(igrab); 1215 1216 /** 1217 * ilookup5_nowait - search for an inode in the inode cache 1218 * @sb: super block of file system to search 1219 * @hashval: hash value (usually inode number) to search for 1220 * @test: callback used for comparisons between inodes 1221 * @data: opaque data pointer to pass to @test 1222 * 1223 * Search for the inode specified by @hashval and @data in the inode cache. 
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Note: I_NEW is not waited upon so you have to be very careful what you do
 * with the returned inode.  You probably should be using ilookup5() instead.
 *
 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *), void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
	struct inode *inode;

	spin_lock(&inode_hash_lock);
	inode = find_inode(sb, head, test, data);
	spin_unlock(&inode_hash_lock);

	return inode;
}
EXPORT_SYMBOL(ilookup5_nowait);

/**
 * ilookup5 - search for an inode in the inode cache
 * @sb: super block of file system to search
 * @hashval: hash value (usually inode number) to search for
 * @test: callback used for comparisons between inodes
 * @data: opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if the inode is in the cache, return the inode with an incremented
 * reference count.  Waits on I_NEW before returning the inode.
 *
 * This is a generalized version of ilookup() for file systems where the
 * inode number is not sufficient for unique identification of an inode.
 *
 * Note: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *), void *data)
{
	struct inode *inode = ilookup5_nowait(sb, hashval, test, data);

	if (inode)
		wait_on_inode(inode);
	return inode;
}
EXPORT_SYMBOL(ilookup5);

/**
 * ilookup - search for an inode in the inode cache
 * @sb: super block of file system to search
 * @ino: inode number to search for
 *
 * Search for the inode @ino in the inode cache, and if the inode is in the
 * cache, the inode is returned with an incremented reference count.
 */
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
	struct hlist_head *head = inode_hashtable + hash(sb, ino);
	struct inode *inode;

	spin_lock(&inode_hash_lock);
	inode = find_inode_fast(sb, head, ino);
	spin_unlock(&inode_hash_lock);

	if (inode)
		wait_on_inode(inode);
	return inode;
}
EXPORT_SYMBOL(ilookup);

/**
 * find_inode_nowait - find an inode in the inode cache
 * @sb: super block of file system to search
 * @hashval: hash value (usually inode number) to search for
 * @match: callback used for comparisons between inodes
 * @data: opaque data pointer to pass to @match
 *
 * Search for the inode specified by @hashval and @data in the inode
 * cache, where the helper function @match will return 0 if the inode
 * does not match, 1 if the inode does match, and -1 if the search
 * should be stopped.  The @match function must be responsible for
 * taking the i_lock spin_lock and checking i_state for an inode being
 * freed or being initialized, and incrementing the reference count
 * before returning 1.  It also must not sleep, since it is called with
 * the inode_hash_lock spinlock held.
1312 * 1313 * This is a even more generalized version of ilookup5() when the 1314 * function must never block --- find_inode() can block in 1315 * __wait_on_freeing_inode() --- or when the caller can not increment 1316 * the reference count because the resulting iput() might cause an 1317 * inode eviction. The tradeoff is that the @match funtion must be 1318 * very carefully implemented. 1319 */ 1320 struct inode *find_inode_nowait(struct super_block *sb, 1321 unsigned long hashval, 1322 int (*match)(struct inode *, unsigned long, 1323 void *), 1324 void *data) 1325 { 1326 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1327 struct inode *inode, *ret_inode = NULL; 1328 int mval; 1329 1330 spin_lock(&inode_hash_lock); 1331 hlist_for_each_entry(inode, head, i_hash) { 1332 if (inode->i_sb != sb) 1333 continue; 1334 mval = match(inode, hashval, data); 1335 if (mval == 0) 1336 continue; 1337 if (mval == 1) 1338 ret_inode = inode; 1339 goto out; 1340 } 1341 out: 1342 spin_unlock(&inode_hash_lock); 1343 return ret_inode; 1344 } 1345 EXPORT_SYMBOL(find_inode_nowait); 1346 1347 int insert_inode_locked(struct inode *inode) 1348 { 1349 struct super_block *sb = inode->i_sb; 1350 ino_t ino = inode->i_ino; 1351 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1352 1353 while (1) { 1354 struct inode *old = NULL; 1355 spin_lock(&inode_hash_lock); 1356 hlist_for_each_entry(old, head, i_hash) { 1357 if (old->i_ino != ino) 1358 continue; 1359 if (old->i_sb != sb) 1360 continue; 1361 spin_lock(&old->i_lock); 1362 if (old->i_state & (I_FREEING|I_WILL_FREE)) { 1363 spin_unlock(&old->i_lock); 1364 continue; 1365 } 1366 break; 1367 } 1368 if (likely(!old)) { 1369 spin_lock(&inode->i_lock); 1370 inode->i_state |= I_NEW; 1371 hlist_add_head(&inode->i_hash, head); 1372 spin_unlock(&inode->i_lock); 1373 spin_unlock(&inode_hash_lock); 1374 return 0; 1375 } 1376 __iget(old); 1377 spin_unlock(&old->i_lock); 1378 spin_unlock(&inode_hash_lock); 1379 wait_on_inode(old); 1380 if (unlikely(!inode_unhashed(old))) { 1381 iput(old); 1382 return -EBUSY; 1383 } 1384 iput(old); 1385 } 1386 } 1387 EXPORT_SYMBOL(insert_inode_locked); 1388 1389 int insert_inode_locked4(struct inode *inode, unsigned long hashval, 1390 int (*test)(struct inode *, void *), void *data) 1391 { 1392 struct super_block *sb = inode->i_sb; 1393 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1394 1395 while (1) { 1396 struct inode *old = NULL; 1397 1398 spin_lock(&inode_hash_lock); 1399 hlist_for_each_entry(old, head, i_hash) { 1400 if (old->i_sb != sb) 1401 continue; 1402 if (!test(old, data)) 1403 continue; 1404 spin_lock(&old->i_lock); 1405 if (old->i_state & (I_FREEING|I_WILL_FREE)) { 1406 spin_unlock(&old->i_lock); 1407 continue; 1408 } 1409 break; 1410 } 1411 if (likely(!old)) { 1412 spin_lock(&inode->i_lock); 1413 inode->i_state |= I_NEW; 1414 hlist_add_head(&inode->i_hash, head); 1415 spin_unlock(&inode->i_lock); 1416 spin_unlock(&inode_hash_lock); 1417 return 0; 1418 } 1419 __iget(old); 1420 spin_unlock(&old->i_lock); 1421 spin_unlock(&inode_hash_lock); 1422 wait_on_inode(old); 1423 if (unlikely(!inode_unhashed(old))) { 1424 iput(old); 1425 return -EBUSY; 1426 } 1427 iput(old); 1428 } 1429 } 1430 EXPORT_SYMBOL(insert_inode_locked4); 1431 1432 1433 int generic_delete_inode(struct inode *inode) 1434 { 1435 return 1; 1436 } 1437 EXPORT_SYMBOL(generic_delete_inode); 1438 1439 /* 1440 * Called when we're dropping the last reference 1441 * to an inode. 
1442 * 1443 * Call the FS "drop_inode()" function, defaulting to 1444 * the legacy UNIX filesystem behaviour. If it tells 1445 * us to evict inode, do so. Otherwise, retain inode 1446 * in cache if fs is alive, sync and evict if fs is 1447 * shutting down. 1448 */ 1449 static void iput_final(struct inode *inode) 1450 { 1451 struct super_block *sb = inode->i_sb; 1452 const struct super_operations *op = inode->i_sb->s_op; 1453 int drop; 1454 1455 WARN_ON(inode->i_state & I_NEW); 1456 1457 if (op->drop_inode) 1458 drop = op->drop_inode(inode); 1459 else 1460 drop = generic_drop_inode(inode); 1461 1462 if (!drop && (sb->s_flags & MS_ACTIVE)) { 1463 inode->i_state |= I_REFERENCED; 1464 inode_add_lru(inode); 1465 spin_unlock(&inode->i_lock); 1466 return; 1467 } 1468 1469 if (!drop) { 1470 inode->i_state |= I_WILL_FREE; 1471 spin_unlock(&inode->i_lock); 1472 write_inode_now(inode, 1); 1473 spin_lock(&inode->i_lock); 1474 WARN_ON(inode->i_state & I_NEW); 1475 inode->i_state &= ~I_WILL_FREE; 1476 } 1477 1478 inode->i_state |= I_FREEING; 1479 if (!list_empty(&inode->i_lru)) 1480 inode_lru_list_del(inode); 1481 spin_unlock(&inode->i_lock); 1482 1483 evict(inode); 1484 } 1485 1486 /** 1487 * iput - put an inode 1488 * @inode: inode to put 1489 * 1490 * Puts an inode, dropping its usage count. If the inode use count hits 1491 * zero, the inode is then freed and may also be destroyed. 1492 * 1493 * Consequently, iput() can sleep. 1494 */ 1495 void iput(struct inode *inode) 1496 { 1497 if (!inode) 1498 return; 1499 BUG_ON(inode->i_state & I_CLEAR); 1500 retry: 1501 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { 1502 if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { 1503 atomic_inc(&inode->i_count); 1504 inode->i_state &= ~I_DIRTY_TIME; 1505 spin_unlock(&inode->i_lock); 1506 trace_writeback_lazytime_iput(inode); 1507 mark_inode_dirty_sync(inode); 1508 goto retry; 1509 } 1510 iput_final(inode); 1511 } 1512 } 1513 EXPORT_SYMBOL(iput); 1514 1515 /** 1516 * bmap - find a block number in a file 1517 * @inode: inode of file 1518 * @block: block to find 1519 * 1520 * Returns the block number on the device holding the inode that 1521 * is the disk block number for the block of the file requested. 1522 * That is, asked for block 4 of inode 1 the function will return the 1523 * disk block relative to the disk start that holds that block of the 1524 * file. 1525 */ 1526 sector_t bmap(struct inode *inode, sector_t block) 1527 { 1528 sector_t res = 0; 1529 if (inode->i_mapping->a_ops->bmap) 1530 res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); 1531 return res; 1532 } 1533 EXPORT_SYMBOL(bmap); 1534 1535 /* 1536 * With relative atime, only update atime if the previous atime is 1537 * earlier than either the ctime or mtime or if at least a day has 1538 * passed since the last atime update. 1539 */ 1540 static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, 1541 struct timespec now) 1542 { 1543 1544 if (!(mnt->mnt_flags & MNT_RELATIME)) 1545 return 1; 1546 /* 1547 * Is mtime younger than atime? If yes, update atime: 1548 */ 1549 if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0) 1550 return 1; 1551 /* 1552 * Is ctime younger than atime? If yes, update atime: 1553 */ 1554 if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0) 1555 return 1; 1556 1557 /* 1558 * Is the previous atime value older than a day? 
If yes, 1559 * update atime: 1560 */ 1561 if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) 1562 return 1; 1563 /* 1564 * Good, we can skip the atime update: 1565 */ 1566 return 0; 1567 } 1568 1569 int generic_update_time(struct inode *inode, struct timespec *time, int flags) 1570 { 1571 int iflags = I_DIRTY_TIME; 1572 1573 if (flags & S_ATIME) 1574 inode->i_atime = *time; 1575 if (flags & S_VERSION) 1576 inode_inc_iversion(inode); 1577 if (flags & S_CTIME) 1578 inode->i_ctime = *time; 1579 if (flags & S_MTIME) 1580 inode->i_mtime = *time; 1581 1582 if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION)) 1583 iflags |= I_DIRTY_SYNC; 1584 __mark_inode_dirty(inode, iflags); 1585 return 0; 1586 } 1587 EXPORT_SYMBOL(generic_update_time); 1588 1589 /* 1590 * This does the actual work of updating an inodes time or version. Must have 1591 * had called mnt_want_write() before calling this. 1592 */ 1593 static int update_time(struct inode *inode, struct timespec *time, int flags) 1594 { 1595 int (*update_time)(struct inode *, struct timespec *, int); 1596 1597 update_time = inode->i_op->update_time ? inode->i_op->update_time : 1598 generic_update_time; 1599 1600 return update_time(inode, time, flags); 1601 } 1602 1603 /** 1604 * touch_atime - update the access time 1605 * @path: the &struct path to update 1606 * @inode: inode to update 1607 * 1608 * Update the accessed time on an inode and mark it for writeback. 1609 * This function automatically handles read only file systems and media, 1610 * as well as the "noatime" flag and inode specific "noatime" markers. 1611 */ 1612 bool atime_needs_update(const struct path *path, struct inode *inode) 1613 { 1614 struct vfsmount *mnt = path->mnt; 1615 struct timespec now; 1616 1617 if (inode->i_flags & S_NOATIME) 1618 return false; 1619 if (IS_NOATIME(inode)) 1620 return false; 1621 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) 1622 return false; 1623 1624 if (mnt->mnt_flags & MNT_NOATIME) 1625 return false; 1626 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) 1627 return false; 1628 1629 now = current_fs_time(inode->i_sb); 1630 1631 if (!relatime_need_update(mnt, inode, now)) 1632 return false; 1633 1634 if (timespec_equal(&inode->i_atime, &now)) 1635 return false; 1636 1637 return true; 1638 } 1639 1640 void touch_atime(const struct path *path) 1641 { 1642 struct vfsmount *mnt = path->mnt; 1643 struct inode *inode = d_inode(path->dentry); 1644 struct timespec now; 1645 1646 if (!atime_needs_update(path, inode)) 1647 return; 1648 1649 if (!sb_start_write_trylock(inode->i_sb)) 1650 return; 1651 1652 if (__mnt_want_write(mnt) != 0) 1653 goto skip_update; 1654 /* 1655 * File systems can error out when updating inodes if they need to 1656 * allocate new space to modify an inode (such is the case for 1657 * Btrfs), but since we touch atime while walking down the path we 1658 * really don't care if we failed to update the atime of the file, 1659 * so just ignore the return value. 1660 * We may also fail on filesystems that have the ability to make parts 1661 * of the fs read only, e.g. subvolumes in Btrfs. 
1662 */ 1663 now = current_fs_time(inode->i_sb); 1664 update_time(inode, &now, S_ATIME); 1665 __mnt_drop_write(mnt); 1666 skip_update: 1667 sb_end_write(inode->i_sb); 1668 } 1669 EXPORT_SYMBOL(touch_atime); 1670 1671 /* 1672 * The logic we want is 1673 * 1674 * if suid or (sgid and xgrp) 1675 * remove privs 1676 */ 1677 int should_remove_suid(struct dentry *dentry) 1678 { 1679 umode_t mode = d_inode(dentry)->i_mode; 1680 int kill = 0; 1681 1682 /* suid always must be killed */ 1683 if (unlikely(mode & S_ISUID)) 1684 kill = ATTR_KILL_SUID; 1685 1686 /* 1687 * sgid without any exec bits is just a mandatory locking mark; leave 1688 * it alone. If some exec bits are set, it's a real sgid; kill it. 1689 */ 1690 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) 1691 kill |= ATTR_KILL_SGID; 1692 1693 if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) 1694 return kill; 1695 1696 return 0; 1697 } 1698 EXPORT_SYMBOL(should_remove_suid); 1699 1700 /* 1701 * Return mask of changes for notify_change() that need to be done as a 1702 * response to write or truncate. Return 0 if nothing has to be changed. 1703 * Negative value on error (change should be denied). 1704 */ 1705 int dentry_needs_remove_privs(struct dentry *dentry) 1706 { 1707 struct inode *inode = d_inode(dentry); 1708 int mask = 0; 1709 int ret; 1710 1711 if (IS_NOSEC(inode)) 1712 return 0; 1713 1714 mask = should_remove_suid(dentry); 1715 ret = security_inode_need_killpriv(dentry); 1716 if (ret < 0) 1717 return ret; 1718 if (ret) 1719 mask |= ATTR_KILL_PRIV; 1720 return mask; 1721 } 1722 EXPORT_SYMBOL(dentry_needs_remove_privs); 1723 1724 static int __remove_privs(struct dentry *dentry, int kill) 1725 { 1726 struct iattr newattrs; 1727 1728 newattrs.ia_valid = ATTR_FORCE | kill; 1729 /* 1730 * Note we call this on write, so notify_change will not 1731 * encounter any conflicting delegations: 1732 */ 1733 return notify_change(dentry, &newattrs, NULL); 1734 } 1735 1736 /* 1737 * Remove special file priviledges (suid, capabilities) when file is written 1738 * to or truncated. 1739 */ 1740 int file_remove_privs(struct file *file) 1741 { 1742 struct dentry *dentry = file->f_path.dentry; 1743 struct inode *inode = d_inode(dentry); 1744 int kill; 1745 int error = 0; 1746 1747 /* Fast path for nothing security related */ 1748 if (IS_NOSEC(inode)) 1749 return 0; 1750 1751 kill = file_needs_remove_privs(file); 1752 if (kill < 0) 1753 return kill; 1754 if (kill) 1755 error = __remove_privs(dentry, kill); 1756 if (!error) 1757 inode_has_no_xattr(inode); 1758 1759 return error; 1760 } 1761 EXPORT_SYMBOL(file_remove_privs); 1762 1763 /** 1764 * file_update_time - update mtime and ctime time 1765 * @file: file accessed 1766 * 1767 * Update the mtime and ctime members of an inode and mark the inode 1768 * for writeback. Note that this function is meant exclusively for 1769 * usage in the file write path of filesystems, and filesystems may 1770 * choose to explicitly ignore update via this function with the 1771 * S_NOCMTIME inode flag, e.g. for network filesystem where these 1772 * timestamps are handled by the server. This can return an error for 1773 * file systems who need to allocate space in order to update an inode. 
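 *
 * A hedged sketch of the intended call site (foo_file_write_iter() is a
 * made-up example; filesystems that use generic_file_write_iter() get this
 * handled for them):
 *
 *	static ssize_t foo_file_write_iter(struct kiocb *iocb,
 *					   struct iov_iter *from)
 *	{
 *		struct file *file = iocb->ki_filp;
 *		int err = file_update_time(file);
 *
 *		if (err)
 *			return err;
 *		... carry out the write itself ...
 *	}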
1774 */ 1775 1776 int file_update_time(struct file *file) 1777 { 1778 struct inode *inode = file_inode(file); 1779 struct timespec now; 1780 int sync_it = 0; 1781 int ret; 1782 1783 /* First try to exhaust all avenues to not sync */ 1784 if (IS_NOCMTIME(inode)) 1785 return 0; 1786 1787 now = current_fs_time(inode->i_sb); 1788 if (!timespec_equal(&inode->i_mtime, &now)) 1789 sync_it = S_MTIME; 1790 1791 if (!timespec_equal(&inode->i_ctime, &now)) 1792 sync_it |= S_CTIME; 1793 1794 if (IS_I_VERSION(inode)) 1795 sync_it |= S_VERSION; 1796 1797 if (!sync_it) 1798 return 0; 1799 1800 /* Finally allowed to write? Takes lock. */ 1801 if (__mnt_want_write_file(file)) 1802 return 0; 1803 1804 ret = update_time(inode, &now, sync_it); 1805 __mnt_drop_write_file(file); 1806 1807 return ret; 1808 } 1809 EXPORT_SYMBOL(file_update_time); 1810 1811 int inode_needs_sync(struct inode *inode) 1812 { 1813 if (IS_SYNC(inode)) 1814 return 1; 1815 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) 1816 return 1; 1817 return 0; 1818 } 1819 EXPORT_SYMBOL(inode_needs_sync); 1820 1821 /* 1822 * If we try to find an inode in the inode hash while it is being 1823 * deleted, we have to wait until the filesystem completes its 1824 * deletion before reporting that it isn't found. This function waits 1825 * until the deletion _might_ have completed. Callers are responsible 1826 * to recheck inode state. 1827 * 1828 * It doesn't matter if I_NEW is not set initially, a call to 1829 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list 1830 * will DTRT. 1831 */ 1832 static void __wait_on_freeing_inode(struct inode *inode) 1833 { 1834 wait_queue_head_t *wq; 1835 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 1836 wq = bit_waitqueue(&inode->i_state, __I_NEW); 1837 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 1838 spin_unlock(&inode->i_lock); 1839 spin_unlock(&inode_hash_lock); 1840 schedule(); 1841 finish_wait(wq, &wait.wait); 1842 spin_lock(&inode_hash_lock); 1843 } 1844 1845 static __initdata unsigned long ihash_entries; 1846 static int __init set_ihash_entries(char *str) 1847 { 1848 if (!str) 1849 return 0; 1850 ihash_entries = simple_strtoul(str, &str, 0); 1851 return 1; 1852 } 1853 __setup("ihash_entries=", set_ihash_entries); 1854 1855 /* 1856 * Initialize the waitqueues and inode hash table. 1857 */ 1858 void __init inode_init_early(void) 1859 { 1860 unsigned int loop; 1861 1862 /* If hashes are distributed across NUMA nodes, defer 1863 * hash allocation until vmalloc space is available. 
1864 */ 1865 if (hashdist) 1866 return; 1867 1868 inode_hashtable = 1869 alloc_large_system_hash("Inode-cache", 1870 sizeof(struct hlist_head), 1871 ihash_entries, 1872 14, 1873 HASH_EARLY, 1874 &i_hash_shift, 1875 &i_hash_mask, 1876 0, 1877 0); 1878 1879 for (loop = 0; loop < (1U << i_hash_shift); loop++) 1880 INIT_HLIST_HEAD(&inode_hashtable[loop]); 1881 } 1882 1883 void __init inode_init(void) 1884 { 1885 unsigned int loop; 1886 1887 /* inode slab cache */ 1888 inode_cachep = kmem_cache_create("inode_cache", 1889 sizeof(struct inode), 1890 0, 1891 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 1892 SLAB_MEM_SPREAD|SLAB_ACCOUNT), 1893 init_once); 1894 1895 /* Hash may have been set up in inode_init_early */ 1896 if (!hashdist) 1897 return; 1898 1899 inode_hashtable = 1900 alloc_large_system_hash("Inode-cache", 1901 sizeof(struct hlist_head), 1902 ihash_entries, 1903 14, 1904 0, 1905 &i_hash_shift, 1906 &i_hash_mask, 1907 0, 1908 0); 1909 1910 for (loop = 0; loop < (1U << i_hash_shift); loop++) 1911 INIT_HLIST_HEAD(&inode_hashtable[loop]); 1912 } 1913 1914 void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) 1915 { 1916 inode->i_mode = mode; 1917 if (S_ISCHR(mode)) { 1918 inode->i_fop = &def_chr_fops; 1919 inode->i_rdev = rdev; 1920 } else if (S_ISBLK(mode)) { 1921 inode->i_fop = &def_blk_fops; 1922 inode->i_rdev = rdev; 1923 } else if (S_ISFIFO(mode)) 1924 inode->i_fop = &pipefifo_fops; 1925 else if (S_ISSOCK(mode)) 1926 ; /* leave it no_open_fops */ 1927 else 1928 printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" 1929 " inode %s:%lu\n", mode, inode->i_sb->s_id, 1930 inode->i_ino); 1931 } 1932 EXPORT_SYMBOL(init_special_inode); 1933 1934 /** 1935 * inode_init_owner - Init uid,gid,mode for new inode according to posix standards 1936 * @inode: New inode 1937 * @dir: Directory inode 1938 * @mode: mode of the new inode 1939 */ 1940 void inode_init_owner(struct inode *inode, const struct inode *dir, 1941 umode_t mode) 1942 { 1943 inode->i_uid = current_fsuid(); 1944 if (dir && dir->i_mode & S_ISGID) { 1945 inode->i_gid = dir->i_gid; 1946 if (S_ISDIR(mode)) 1947 mode |= S_ISGID; 1948 } else 1949 inode->i_gid = current_fsgid(); 1950 inode->i_mode = mode; 1951 } 1952 EXPORT_SYMBOL(inode_init_owner); 1953 1954 /** 1955 * inode_owner_or_capable - check current task permissions to inode 1956 * @inode: inode being checked 1957 * 1958 * Return true if current either has CAP_FOWNER in a namespace with the 1959 * inode owner uid mapped, or owns the file. 
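 *
 * Illustrative use, similar in spirit to what flag-changing ioctls commonly
 * do (the surrounding handler is hypothetical):
 *
 *	if (!inode_owner_or_capable(inode))
 *		return -EACCES;
 *	... go on to modify the inode's flags ...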
 */
bool inode_owner_or_capable(const struct inode *inode)
{
	struct user_namespace *ns;

	if (uid_eq(current_fsuid(), inode->i_uid))
		return true;

	ns = current_user_ns();
	if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid))
		return true;
	return false;
}
EXPORT_SYMBOL(inode_owner_or_capable);

/*
 * Direct i/o helper functions
 */
static void __inode_dio_wait(struct inode *inode)
{
	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);

	do {
		prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&inode->i_dio_count))
			schedule();
	} while (atomic_read(&inode->i_dio_count));
	finish_wait(wq, &q.wait);
}

/**
 * inode_dio_wait - wait for outstanding DIO requests to finish
 * @inode: inode to wait for
 *
 * Waits for all pending direct I/O requests to finish so that we can
 * proceed with a truncate or equivalent operation.
 *
 * Must be called under a lock that serializes taking new references
 * to i_dio_count, usually by inode->i_mutex.
 */
void inode_dio_wait(struct inode *inode)
{
	if (atomic_read(&inode->i_dio_count))
		__inode_dio_wait(inode);
}
EXPORT_SYMBOL(inode_dio_wait);

/*
 * inode_set_flags - atomically set some inode flags
 *
 * Note: the caller should be holding i_mutex, or else be sure that
 * they have exclusive access to the inode structure (i.e., while the
 * inode is being instantiated).  The cmpxchg() loop would not be
 * necessary if all code paths that modify i_flags actually followed
 * this rule, but at least one code path does not today, so we use
 * cmpxchg() out of an abundance of caution.
 *
 * In the long run, i_mutex is overkill, and we should probably look
 * at using the i_lock spinlock to protect i_flags, and then make sure
 * it is so documented in include/linux/fs.h and that all code follows
 * the locking convention!!
 */
void inode_set_flags(struct inode *inode, unsigned int flags,
		     unsigned int mask)
{
	unsigned int old_flags, new_flags;

	WARN_ON_ONCE(flags & ~mask);
	do {
		old_flags = ACCESS_ONCE(inode->i_flags);
		new_flags = (old_flags & ~mask) | flags;
	} while (unlikely(cmpxchg(&inode->i_flags, old_flags,
				  new_flags) != old_flags));
}
EXPORT_SYMBOL(inode_set_flags);

void inode_nohighmem(struct inode *inode)
{
	mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
}
EXPORT_SYMBOL(inode_nohighmem);
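/*
 * Usage note (illustrative only, not an API contract): a filesystem that
 * mirrors on-disk flags into i_flags would typically go through
 * inode_set_flags() so that unrelated i_flags bits set elsewhere are
 * preserved.  The FOO_*_FL names below are hypothetical on-disk flags:
 *
 *	unsigned int new_fl = 0;
 *
 *	if (foo_flags & FOO_SYNC_FL)
 *		new_fl |= S_SYNC;
 *	if (foo_flags & FOO_APPEND_FL)
 *		new_fl |= S_APPEND;
 *	inode_set_flags(inode, new_fl, S_SYNC | S_APPEND);
 */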