// SPDX-License-Identifier: GPL-2.0-only
/*
 *	fs/libfs.c
 *	Library for filesystems writers.
 */

#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mount.h>
#include <linux/vfs.h>
#include <linux/quotaops.h>
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/exportfs.h>
#include <linux/iversion.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>
#include <linux/fsnotify.h>
#include <linux/unicode.h>
#include <linux/fscrypt.h>

#include <linux/uaccess.h>

#include "internal.h"

/*
 * simple_getattr - ->getattr for in-memory filesystems.
 *
 * Fills @stat from the inode; the block count is derived from the page
 * cache footprint (nrpages shifted by PAGE_SHIFT - 9, i.e. converted to
 * 512-byte sectors).  The caller's @idmap is deliberately not used:
 * generic_fillattr() is called with nop_mnt_idmap.
 */
int simple_getattr(struct mnt_idmap *idmap, const struct path *path,
		   struct kstat *stat, u32 request_mask,
		   unsigned int query_flags)
{
	struct inode *inode = d_inode(path->dentry);

	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
	stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
	return 0;
}
EXPORT_SYMBOL(simple_getattr);

/*
 * simple_statfs - ->statfs that reports only the magic number, block
 * size and maximum name length; all other fields are left untouched.
 */
int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	buf->f_type = dentry->d_sb->s_magic;
	buf->f_bsize = PAGE_SIZE;
	buf->f_namelen = NAME_MAX;
	return 0;
}
EXPORT_SYMBOL(simple_statfs);

/*
 * Retaining negative dentries for an in-memory filesystem just wastes
 * memory and lookup time: arrange for them to be deleted immediately.
 */
int always_delete_dentry(const struct dentry *dentry)
{
	return 1;
}
EXPORT_SYMBOL(always_delete_dentry);

const struct dentry_operations simple_dentry_operations = {
	.d_delete = always_delete_dentry,
};
EXPORT_SYMBOL(simple_dentry_operations);

/*
 * Lookup the data. This is trivial - if the dentry didn't already
 * exist, we know it is negative. Set d_op to delete negative dentries.
69 */ 70 struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 71 { 72 if (dentry->d_name.len > NAME_MAX) 73 return ERR_PTR(-ENAMETOOLONG); 74 if (!dentry->d_sb->s_d_op) 75 d_set_d_op(dentry, &simple_dentry_operations); 76 d_add(dentry, NULL); 77 return NULL; 78 } 79 EXPORT_SYMBOL(simple_lookup); 80 81 int dcache_dir_open(struct inode *inode, struct file *file) 82 { 83 file->private_data = d_alloc_cursor(file->f_path.dentry); 84 85 return file->private_data ? 0 : -ENOMEM; 86 } 87 EXPORT_SYMBOL(dcache_dir_open); 88 89 int dcache_dir_close(struct inode *inode, struct file *file) 90 { 91 dput(file->private_data); 92 return 0; 93 } 94 EXPORT_SYMBOL(dcache_dir_close); 95 96 /* parent is locked at least shared */ 97 /* 98 * Returns an element of siblings' list. 99 * We are looking for <count>th positive after <p>; if 100 * found, dentry is grabbed and returned to caller. 101 * If no such element exists, NULL is returned. 102 */ 103 static struct dentry *scan_positives(struct dentry *cursor, 104 struct list_head *p, 105 loff_t count, 106 struct dentry *last) 107 { 108 struct dentry *dentry = cursor->d_parent, *found = NULL; 109 110 spin_lock(&dentry->d_lock); 111 while ((p = p->next) != &dentry->d_subdirs) { 112 struct dentry *d = list_entry(p, struct dentry, d_child); 113 // we must at least skip cursors, to avoid livelocks 114 if (d->d_flags & DCACHE_DENTRY_CURSOR) 115 continue; 116 if (simple_positive(d) && !--count) { 117 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); 118 if (simple_positive(d)) 119 found = dget_dlock(d); 120 spin_unlock(&d->d_lock); 121 if (likely(found)) 122 break; 123 count = 1; 124 } 125 if (need_resched()) { 126 list_move(&cursor->d_child, p); 127 p = &cursor->d_child; 128 spin_unlock(&dentry->d_lock); 129 cond_resched(); 130 spin_lock(&dentry->d_lock); 131 } 132 } 133 spin_unlock(&dentry->d_lock); 134 dput(last); 135 return found; 136 } 137 138 loff_t dcache_dir_lseek(struct file *file, loff_t offset, 
int whence) 139 { 140 struct dentry *dentry = file->f_path.dentry; 141 switch (whence) { 142 case 1: 143 offset += file->f_pos; 144 fallthrough; 145 case 0: 146 if (offset >= 0) 147 break; 148 fallthrough; 149 default: 150 return -EINVAL; 151 } 152 if (offset != file->f_pos) { 153 struct dentry *cursor = file->private_data; 154 struct dentry *to = NULL; 155 156 inode_lock_shared(dentry->d_inode); 157 158 if (offset > 2) 159 to = scan_positives(cursor, &dentry->d_subdirs, 160 offset - 2, NULL); 161 spin_lock(&dentry->d_lock); 162 if (to) 163 list_move(&cursor->d_child, &to->d_child); 164 else 165 list_del_init(&cursor->d_child); 166 spin_unlock(&dentry->d_lock); 167 dput(to); 168 169 file->f_pos = offset; 170 171 inode_unlock_shared(dentry->d_inode); 172 } 173 return offset; 174 } 175 EXPORT_SYMBOL(dcache_dir_lseek); 176 177 /* 178 * Directory is locked and all positive dentries in it are safe, since 179 * for ramfs-type trees they can't go away without unlink() or rmdir(), 180 * both impossible due to the lock on directory. 
 */

/*
 * dcache_readdir - ->iterate_shared for dcache-backed directories.
 *
 * Emits "." and "..", then walks the child list from the cursor (or
 * from the list head when pos == 2), emitting every positive dentry.
 * The cursor is left parked after the last entry handed out so the
 * next call resumes from there.
 */
int dcache_readdir(struct file *file, struct dir_context *ctx)
{
	struct dentry *dentry = file->f_path.dentry;
	struct dentry *cursor = file->private_data;
	struct list_head *anchor = &dentry->d_subdirs;
	struct dentry *next = NULL;
	struct list_head *p;

	if (!dir_emit_dots(file, ctx))
		return 0;

	if (ctx->pos == 2)
		p = anchor;		/* start from the first child */
	else if (!list_empty(&cursor->d_child))
		p = &cursor->d_child;	/* resume after the cursor */
	else
		return 0;		/* cursor fell off the list: EOF */

	while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
			      d_inode(next)->i_ino,
			      fs_umode_to_dtype(d_inode(next)->i_mode)))
			break;
		ctx->pos++;
		p = &next->d_child;
	}
	spin_lock(&dentry->d_lock);
	/* park the cursor after the last entry emitted */
	if (next)
		list_move_tail(&cursor->d_child, &next->d_child);
	else
		list_del_init(&cursor->d_child);
	spin_unlock(&dentry->d_lock);
	dput(next);

	return 0;
}
EXPORT_SYMBOL(dcache_readdir);

/* ->read for directories: always fails with -EISDIR */
ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
{
	return -EISDIR;
}
EXPORT_SYMBOL(generic_read_dir);

const struct file_operations simple_dir_operations = {
	.open		= dcache_dir_open,
	.release	= dcache_dir_close,
	.llseek		= dcache_dir_lseek,
	.read		= generic_read_dir,
	.iterate_shared	= dcache_readdir,
	.fsync		= noop_fsync,
};
EXPORT_SYMBOL(simple_dir_operations);

const struct inode_operations simple_dir_inode_operations = {
	.lookup		= simple_lookup,
};
EXPORT_SYMBOL(simple_dir_inode_operations);

/* simple_offset_add() never assigns these to a dentry */
enum {
	DIR_OFFSET_FIRST	= 2,		/* Find first real entry */
	DIR_OFFSET_EOD		= S32_MAX,
};

/* simple_offset_add() allocation range */
enum {
	DIR_OFFSET_MIN	= DIR_OFFSET_FIRST + 1,
	DIR_OFFSET_MAX	= DIR_OFFSET_EOD - 1,
};

/* Stash a directory offset in the dentry's ->d_fsdata */
static void offset_set(struct dentry *dentry, u32 offset)
{
	dentry->d_fsdata = (void *)((uintptr_t)(offset));
}

/* Retrieve the directory offset stored by offset_set(); 0 = unmapped */
static u32 dentry2offset(struct dentry *dentry)
{
	return (u32)((uintptr_t)(dentry->d_fsdata));
}

static struct lock_class_key simple_offset_xa_lock;

/**
 * simple_offset_init - initialize an offset_ctx
 * @octx: directory offset map to be initialized
 *
 */
void simple_offset_init(struct offset_ctx *octx)
{
	xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1);
	lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock);
	octx->next_offset = DIR_OFFSET_MIN;
}

/**
 * simple_offset_add - Add an entry to a directory's offset map
 * @octx: directory offset ctx to be updated
 * @dentry: new dentry being added
 *
 * Returns zero on success. @octx and the dentry offset are updated.
 * Otherwise, a negative errno value is returned.
 */
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
{
	static const struct xa_limit limit = XA_LIMIT(DIR_OFFSET_MIN,
						      DIR_OFFSET_MAX);
	u32 offset;
	int ret;

	/* a non-zero stored offset means the dentry is already mapped */
	if (dentry2offset(dentry) != 0)
		return -EBUSY;

	ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit,
			      &octx->next_offset, GFP_KERNEL);
	if (unlikely(ret < 0))
		/* xarray full: report "no space", not "busy" */
		return ret == -EBUSY ? -ENOSPC : ret;

	offset_set(dentry, offset);
	return 0;
}

/*
 * Store @dentry at the fixed @offset in the map; used by rename to keep
 * the replaced entry's offset stable.
 */
static int simple_offset_replace(struct offset_ctx *octx, struct dentry *dentry,
				 long offset)
{
	void *ret;

	ret = xa_store(&octx->xa, offset, dentry, GFP_KERNEL);
	if (xa_is_err(ret))
		return xa_err(ret);
	offset_set(dentry, offset);
	return 0;
}

/**
 * simple_offset_remove - Remove an entry from a directory's offset map
 * @octx: directory offset ctx to be updated
 * @dentry: dentry being removed
 *
 */
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
{
	u32 offset;

	offset = dentry2offset(dentry);
	if (offset == 0)
		return;		/* never mapped; nothing to do */

	xa_erase(&octx->xa, offset);
	offset_set(dentry, 0);
}

/**
 * simple_offset_rename - handle directory offsets for rename
 * @old_dir: parent directory of source entry
 * @old_dentry: dentry of source entry
 * @new_dir: parent_directory of destination entry
 * @new_dentry: dentry of destination
 *
 * Caller provides appropriate serialization.
 *
 * User space expects the directory offset value of the replaced
 * (new) directory entry to be unchanged after a rename.
 *
 * Returns zero on success, a negative errno value on failure.
348 */ 349 int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, 350 struct inode *new_dir, struct dentry *new_dentry) 351 { 352 struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); 353 struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); 354 long new_offset = dentry2offset(new_dentry); 355 356 simple_offset_remove(old_ctx, old_dentry); 357 358 if (new_offset) { 359 offset_set(new_dentry, 0); 360 return simple_offset_replace(new_ctx, old_dentry, new_offset); 361 } 362 return simple_offset_add(new_ctx, old_dentry); 363 } 364 365 /** 366 * simple_offset_rename_exchange - exchange rename with directory offsets 367 * @old_dir: parent of dentry being moved 368 * @old_dentry: dentry being moved 369 * @new_dir: destination parent 370 * @new_dentry: destination dentry 371 * 372 * This API preserves the directory offset values. Caller provides 373 * appropriate serialization. 374 * 375 * Returns zero on success. Otherwise a negative errno is returned and the 376 * rename is rolled back. 
 */
int simple_offset_rename_exchange(struct inode *old_dir,
				  struct dentry *old_dentry,
				  struct inode *new_dir,
				  struct dentry *new_dentry)
{
	struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
	struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
	u32 old_index = dentry2offset(old_dentry);
	u32 new_index = dentry2offset(new_dentry);
	int ret;

	simple_offset_remove(old_ctx, old_dentry);
	simple_offset_remove(new_ctx, new_dentry);

	/* cross the entries over, each taking the other's old offset */
	ret = simple_offset_replace(new_ctx, old_dentry, new_index);
	if (ret)
		goto out_restore;

	ret = simple_offset_replace(old_ctx, new_dentry, old_index);
	if (ret) {
		simple_offset_remove(new_ctx, old_dentry);
		goto out_restore;
	}

	ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
	if (ret) {
		simple_offset_remove(new_ctx, old_dentry);
		simple_offset_remove(old_ctx, new_dentry);
		goto out_restore;
	}
	return 0;

out_restore:
	/* put both entries back at their original offsets */
	(void)simple_offset_replace(old_ctx, old_dentry, old_index);
	(void)simple_offset_replace(new_ctx, new_dentry, new_index);
	return ret;
}

/**
 * simple_offset_destroy - Release offset map
 * @octx: directory offset ctx that is about to be destroyed
 *
 * During fs teardown (eg. umount), a directory's offset map might still
 * contain entries. xa_destroy() cleans out anything that remains.
 */
void simple_offset_destroy(struct offset_ctx *octx)
{
	xa_destroy(&octx->xa);
}

/**
 * offset_dir_llseek - Advance the read position of a directory descriptor
 * @file: an open directory whose position is to be updated
 * @offset: a byte offset
 * @whence: enumerator describing the starting position for this update
 *
 * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories.
 *
 * Returns the updated read position if successful; otherwise a
 * negative errno is returned and the read position remains unchanged.
 */
static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
{
	switch (whence) {
	case SEEK_CUR:
		offset += file->f_pos;
		fallthrough;
	case SEEK_SET:
		if (offset >= 0)
			break;
		fallthrough;
	default:
		return -EINVAL;
	}

	/* stored offsets are u32, so cap the position at U32_MAX */
	return vfs_setpos(file, offset, U32_MAX);
}

/*
 * Scan @parent's children for a positive dentry and pin it with a
 * reference.  The scan starts at the sibling after @dentry when @next
 * is true, at @dentry itself when it is non-NULL, and otherwise at the
 * first child.  Returns NULL when no positive entry remains.
 */
static struct dentry *find_positive_dentry(struct dentry *parent,
					   struct dentry *dentry,
					   bool next)
{
	struct dentry *found = NULL;

	spin_lock(&parent->d_lock);
	if (next)
		dentry = list_next_entry(dentry, d_child);
	else if (!dentry)
		dentry = list_first_entry_or_null(&parent->d_subdirs,
						  struct dentry, d_child);
	for (; dentry && !list_entry_is_head(dentry, &parent->d_subdirs, d_child);
	     dentry = list_next_entry(dentry, d_child)) {
		if (!simple_positive(dentry))
			continue;
		/* re-check under the child's own lock before pinning */
		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
		if (simple_positive(dentry))
			found = dget_dlock(dentry);
		spin_unlock(&dentry->d_lock);
		if (likely(found))
			break;
	}
	spin_unlock(&parent->d_lock);
	return found;
}

/*
 * Translate a directory position into a pinned positive dentry: the
 * first child for DIR_OFFSET_FIRST, otherwise the entry found by an
 * xarray walk starting at @offset.
 */
static noinline_for_stack struct dentry *
offset_dir_lookup(struct dentry *parent, loff_t offset)
{
	struct inode *inode = d_inode(parent);
	struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
	struct dentry *child, *found = NULL;

	XA_STATE(xas, &octx->xa, offset);

	if (offset == DIR_OFFSET_FIRST)
		found = find_positive_dentry(parent, NULL, false);
	else {
		rcu_read_lock();
		child = xas_next_entry(&xas, DIR_OFFSET_MAX);
		found = find_positive_dentry(parent, child, false);
		rcu_read_unlock();
	}
	return found;
}

/* Emit one directory entry for @dentry into @ctx */
static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len,
			inode->i_ino, fs_umode_to_dtype(inode->i_mode));
}

/*
 * Emit entries from ctx->pos onward.  Leaves ctx->pos at the offset of
 * the entry that stopped the walk (buffer full), or at DIR_OFFSET_EOD
 * when the whole directory has been emitted.
 */
static void offset_iterate_dir(struct file *file, struct dir_context *ctx)
{
	struct dentry *dir = file->f_path.dentry;
	struct dentry *dentry;

	dentry = offset_dir_lookup(dir, ctx->pos);
	if (!dentry)
		goto out_eod;
	while (true) {
		struct dentry *next;

		ctx->pos = dentry2offset(dentry);
		if (!offset_dir_emit(ctx, dentry))
			break;

		next = find_positive_dentry(dir, dentry, true);
		dput(dentry);

		if (!next)
			goto out_eod;
		dentry = next;
	}
	dput(dentry);
	return;

out_eod:
	ctx->pos = DIR_OFFSET_EOD;
}

/**
 * offset_readdir - Emit entries starting at offset @ctx->pos
 * @file: an open directory to iterate over
 * @ctx: directory iteration context
 *
 * Caller must hold @file's i_rwsem to prevent insertion or removal of
 * entries during this call.
 *
 * On entry, @ctx->pos contains an offset that represents the first entry
 * to be read from the directory.
 *
 * The operation continues until there are no more entries to read, or
 * until the ctx->actor indicates there is no more space in the caller's
 * output buffer.
 *
 * On return, @ctx->pos contains an offset that will read the next entry
 * in this directory when offset_readdir() is called again with @ctx.
 * Caller places this value in the d_off field of the last entry in the
 * user's buffer.
559 * 560 * Return values: 561 * %0 - Complete 562 */ 563 static int offset_readdir(struct file *file, struct dir_context *ctx) 564 { 565 struct dentry *dir = file->f_path.dentry; 566 567 lockdep_assert_held(&d_inode(dir)->i_rwsem); 568 569 if (!dir_emit_dots(file, ctx)) 570 return 0; 571 if (ctx->pos != DIR_OFFSET_EOD) 572 offset_iterate_dir(file, ctx); 573 return 0; 574 } 575 576 const struct file_operations simple_offset_dir_operations = { 577 .llseek = offset_dir_llseek, 578 .iterate_shared = offset_readdir, 579 .read = generic_read_dir, 580 .fsync = noop_fsync, 581 }; 582 583 static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) 584 { 585 struct dentry *child = NULL; 586 struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs; 587 588 spin_lock(&parent->d_lock); 589 while ((p = p->next) != &parent->d_subdirs) { 590 struct dentry *d = container_of(p, struct dentry, d_child); 591 if (simple_positive(d)) { 592 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); 593 if (simple_positive(d)) 594 child = dget_dlock(d); 595 spin_unlock(&d->d_lock); 596 if (likely(child)) 597 break; 598 } 599 } 600 spin_unlock(&parent->d_lock); 601 dput(prev); 602 return child; 603 } 604 605 void simple_recursive_removal(struct dentry *dentry, 606 void (*callback)(struct dentry *)) 607 { 608 struct dentry *this = dget(dentry); 609 while (true) { 610 struct dentry *victim = NULL, *child; 611 struct inode *inode = this->d_inode; 612 613 inode_lock(inode); 614 if (d_is_dir(this)) 615 inode->i_flags |= S_DEAD; 616 while ((child = find_next_child(this, victim)) == NULL) { 617 // kill and ascend 618 // update metadata while it's still locked 619 inode_set_ctime_current(inode); 620 clear_nlink(inode); 621 inode_unlock(inode); 622 victim = this; 623 this = this->d_parent; 624 inode = this->d_inode; 625 inode_lock(inode); 626 if (simple_positive(victim)) { 627 d_invalidate(victim); // avoid lost mounts 628 if (d_is_dir(victim)) 629 fsnotify_rmdir(inode, 
victim); 630 else 631 fsnotify_unlink(inode, victim); 632 if (callback) 633 callback(victim); 634 dput(victim); // unpin it 635 } 636 if (victim == dentry) { 637 inode_set_mtime_to_ts(inode, 638 inode_set_ctime_current(inode)); 639 if (d_is_dir(dentry)) 640 drop_nlink(inode); 641 inode_unlock(inode); 642 dput(dentry); 643 return; 644 } 645 } 646 inode_unlock(inode); 647 this = child; 648 } 649 } 650 EXPORT_SYMBOL(simple_recursive_removal); 651 652 static const struct super_operations simple_super_operations = { 653 .statfs = simple_statfs, 654 }; 655 656 static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) 657 { 658 struct pseudo_fs_context *ctx = fc->fs_private; 659 struct inode *root; 660 661 s->s_maxbytes = MAX_LFS_FILESIZE; 662 s->s_blocksize = PAGE_SIZE; 663 s->s_blocksize_bits = PAGE_SHIFT; 664 s->s_magic = ctx->magic; 665 s->s_op = ctx->ops ?: &simple_super_operations; 666 s->s_xattr = ctx->xattr; 667 s->s_time_gran = 1; 668 root = new_inode(s); 669 if (!root) 670 return -ENOMEM; 671 672 /* 673 * since this is the first inode, make it number 1. New inodes created 674 * after this must take care not to collide with it (by passing 675 * max_reserved of 1 to iunique). 
676 */ 677 root->i_ino = 1; 678 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; 679 simple_inode_init_ts(root); 680 s->s_root = d_make_root(root); 681 if (!s->s_root) 682 return -ENOMEM; 683 s->s_d_op = ctx->dops; 684 return 0; 685 } 686 687 static int pseudo_fs_get_tree(struct fs_context *fc) 688 { 689 return get_tree_nodev(fc, pseudo_fs_fill_super); 690 } 691 692 static void pseudo_fs_free(struct fs_context *fc) 693 { 694 kfree(fc->fs_private); 695 } 696 697 static const struct fs_context_operations pseudo_fs_context_ops = { 698 .free = pseudo_fs_free, 699 .get_tree = pseudo_fs_get_tree, 700 }; 701 702 /* 703 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that 704 * will never be mountable) 705 */ 706 struct pseudo_fs_context *init_pseudo(struct fs_context *fc, 707 unsigned long magic) 708 { 709 struct pseudo_fs_context *ctx; 710 711 ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL); 712 if (likely(ctx)) { 713 ctx->magic = magic; 714 fc->fs_private = ctx; 715 fc->ops = &pseudo_fs_context_ops; 716 fc->sb_flags |= SB_NOUSER; 717 fc->global = true; 718 } 719 return ctx; 720 } 721 EXPORT_SYMBOL(init_pseudo); 722 723 int simple_open(struct inode *inode, struct file *file) 724 { 725 if (inode->i_private) 726 file->private_data = inode->i_private; 727 return 0; 728 } 729 EXPORT_SYMBOL(simple_open); 730 731 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 732 { 733 struct inode *inode = d_inode(old_dentry); 734 735 inode_set_mtime_to_ts(dir, 736 inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); 737 inc_nlink(inode); 738 ihold(inode); 739 dget(dentry); 740 d_instantiate(dentry, inode); 741 return 0; 742 } 743 EXPORT_SYMBOL(simple_link); 744 745 int simple_empty(struct dentry *dentry) 746 { 747 struct dentry *child; 748 int ret = 0; 749 750 spin_lock(&dentry->d_lock); 751 list_for_each_entry(child, &dentry->d_subdirs, d_child) { 752 spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); 753 if 
(simple_positive(child)) { 754 spin_unlock(&child->d_lock); 755 goto out; 756 } 757 spin_unlock(&child->d_lock); 758 } 759 ret = 1; 760 out: 761 spin_unlock(&dentry->d_lock); 762 return ret; 763 } 764 EXPORT_SYMBOL(simple_empty); 765 766 int simple_unlink(struct inode *dir, struct dentry *dentry) 767 { 768 struct inode *inode = d_inode(dentry); 769 770 inode_set_mtime_to_ts(dir, 771 inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); 772 drop_nlink(inode); 773 dput(dentry); 774 return 0; 775 } 776 EXPORT_SYMBOL(simple_unlink); 777 778 int simple_rmdir(struct inode *dir, struct dentry *dentry) 779 { 780 if (!simple_empty(dentry)) 781 return -ENOTEMPTY; 782 783 drop_nlink(d_inode(dentry)); 784 simple_unlink(dir, dentry); 785 drop_nlink(dir); 786 return 0; 787 } 788 EXPORT_SYMBOL(simple_rmdir); 789 790 /** 791 * simple_rename_timestamp - update the various inode timestamps for rename 792 * @old_dir: old parent directory 793 * @old_dentry: dentry that is being renamed 794 * @new_dir: new parent directory 795 * @new_dentry: target for rename 796 * 797 * POSIX mandates that the old and new parent directories have their ctime and 798 * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have 799 * their ctime updated. 
 */
void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
			     struct inode *new_dir, struct dentry *new_dentry)
{
	struct inode *newino = d_inode(new_dentry);

	inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
	if (new_dir != old_dir)
		inode_set_mtime_to_ts(new_dir,
				      inode_set_ctime_current(new_dir));
	inode_set_ctime_current(d_inode(old_dentry));
	if (newino)
		inode_set_ctime_current(newino);
}
EXPORT_SYMBOL_GPL(simple_rename_timestamp);

/*
 * simple_rename_exchange - RENAME_EXCHANGE helper for in-memory
 * filesystems.  When a directory and a non-directory swap between two
 * different parents, each parent's link count is adjusted for the
 * directory that moved.
 */
int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
			   struct inode *new_dir, struct dentry *new_dentry)
{
	bool old_is_dir = d_is_dir(old_dentry);
	bool new_is_dir = d_is_dir(new_dentry);

	if (old_dir != new_dir && old_is_dir != new_is_dir) {
		if (old_is_dir) {
			drop_nlink(old_dir);
			inc_nlink(new_dir);
		} else {
			drop_nlink(new_dir);
			inc_nlink(old_dir);
		}
	}
	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
	return 0;
}
EXPORT_SYMBOL_GPL(simple_rename_exchange);

/* ->rename for in-memory filesystems */
int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir,
		  struct dentry *old_dentry, struct inode *new_dir,
		  struct dentry *new_dentry, unsigned int flags)
{
	int they_are_dirs = d_is_dir(old_dentry);

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
		return -EINVAL;

	if (flags & RENAME_EXCHANGE)
		return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);

	if (!simple_empty(new_dentry))
		return -ENOTEMPTY;

	if (d_really_is_positive(new_dentry)) {
		/* target exists and is being replaced */
		simple_unlink(new_dir, new_dentry);
		if (they_are_dirs) {
			drop_nlink(d_inode(new_dentry));
			drop_nlink(old_dir);
		}
	} else if (they_are_dirs) {
		/* a directory is changing parents */
		drop_nlink(old_dir);
		inc_nlink(new_dir);
	}

	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
	return 0;
}
EXPORT_SYMBOL(simple_rename);
/**
 * simple_setattr - setattr for simple filesystem
 * @idmap: idmap of the target mount
 * @dentry: dentry
 * @iattr: iattr structure
 *
 * Returns 0 on success, -error on failure.
 *
 * simple_setattr is a simple ->setattr implementation without a proper
 * implementation of size changes.
 *
 * It can either be used for in-memory filesystems or special files
 * on simple regular filesystems. Anything that needs to change on-disk
 * or wire state on size changes needs its own setattr method.
 */
int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
		   struct iattr *iattr)
{
	struct inode *inode = d_inode(dentry);
	int error;

	error = setattr_prepare(idmap, dentry, iattr);
	if (error)
		return error;

	if (iattr->ia_valid & ATTR_SIZE)
		truncate_setsize(inode, iattr->ia_size);
	setattr_copy(idmap, inode, iattr);
	mark_inode_dirty(inode);
	return 0;
}
EXPORT_SYMBOL(simple_setattr);

/* ->read_folio that serves zeroes; data lives only in the page cache */
static int simple_read_folio(struct file *file, struct folio *folio)
{
	folio_zero_range(folio, 0, folio_size(folio));
	flush_dcache_folio(folio);
	folio_mark_uptodate(folio);
	folio_unlock(folio);
	return 0;
}

/*
 * simple_write_begin - ->write_begin companion to simple_write_end().
 *
 * Grabs (or creates) the folio and, when it is not yet uptodate and
 * only part of it will be written, pre-zeroes the regions outside the
 * write.
 */
int simple_write_begin(struct file *file, struct address_space *mapping,
		       loff_t pos, unsigned len,
		       struct page **pagep, void **fsdata)
{
	struct folio *folio;

	folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN,
				    mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	*pagep = &folio->page;

	if (!folio_test_uptodate(folio) && (len != folio_size(folio))) {
		size_t from = offset_in_folio(folio, pos);

		folio_zero_segments(folio, 0, from,
				    from + len, folio_size(folio));
	}
	return 0;
}
EXPORT_SYMBOL(simple_write_begin);

/**
 * simple_write_end - .write_end helper for non-block-device FSes
 * @file: See .write_end of address_space_operations
 * @mapping: "
 * @pos: "
 * @len: "
 * @copied: "
 * @page: "
 * @fsdata: "
 *
 * simple_write_end does the minimum needed for updating a page after writing is
 * done. It has the same API signature as the .write_end of
 * address_space_operations vector. So it can just be set onto .write_end for
 * FSes that don't need any other processing. i_mutex is assumed to be held.
 * Block based filesystems should use generic_write_end().
 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
 * is not called, so a filesystem that actually does store data in .write_inode
 * should extend on what's done here with a call to mark_inode_dirty() in the
 * case that i_size has changed.
 *
 * Use *ONLY* with simple_read_folio()
 */
static int simple_write_end(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned copied,
			    struct page *page, void *fsdata)
{
	struct folio *folio = page_folio(page);
	struct inode *inode = folio->mapping->host;
	loff_t last_pos = pos + copied;

	/* zero the stale part of the folio if we did a short copy */
	if (!folio_test_uptodate(folio)) {
		if (copied < len) {
			size_t from = offset_in_folio(folio, pos);

			folio_zero_range(folio, from + copied, len - copied);
		}
		folio_mark_uptodate(folio);
	}
	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold the i_mutex.
	 */
	if (last_pos > inode->i_size)
		i_size_write(inode, last_pos);

	folio_mark_dirty(folio);
	folio_unlock(folio);
	folio_put(folio);

	return copied;
}

/*
 * Provides ramfs-style behavior: data in the pagecache, but no writeback.
 */
const struct address_space_operations ram_aops = {
	.read_folio	= simple_read_folio,
	.write_begin	= simple_write_begin,
	.write_end	= simple_write_end,
	.dirty_folio	= noop_dirty_folio,
};
EXPORT_SYMBOL(ram_aops);

/*
 * the inodes created here are not hashed. If you use iunique to generate
 * unique inode values later for this filesystem, then you must take care
 * to pass it an appropriate max_reserved value to avoid collisions.
 */
int simple_fill_super(struct super_block *s, unsigned long magic,
		      const struct tree_descr *files)
{
	struct inode *inode;
	struct dentry *root;
	struct dentry *dentry;
	int i;

	s->s_blocksize = PAGE_SIZE;
	s->s_blocksize_bits = PAGE_SHIFT;
	s->s_magic = magic;
	s->s_op = &simple_super_operations;
	s->s_time_gran = 1;

	inode = new_inode(s);
	if (!inode)
		return -ENOMEM;
	/*
	 * because the root inode is 1, the files array must not contain an
	 * entry at index 1
	 */
	inode->i_ino = 1;
	inode->i_mode = S_IFDIR | 0755;
	simple_inode_init_ts(inode);
	inode->i_op = &simple_dir_inode_operations;
	inode->i_fop = &simple_dir_operations;
	set_nlink(inode, 2);	/* a directory starts with two links */
	root = d_make_root(inode);
	if (!root)
		return -ENOMEM;
	/*
	 * the loop ends at the first entry whose name is the empty string;
	 * entries with a NULL name are skipped but still consume an index
	 */
	for (i = 0; !files->name || files->name[0]; i++, files++) {
		if (!files->name)
			continue;

		/* warn if it tries to conflict with the root inode */
		if (unlikely(i == 1))
			printk(KERN_WARNING "%s: %s passed in a files array"
			       "with an index of 1!\n", __func__,
			       s->s_type->name);

		dentry = d_alloc_name(root, files->name);
		if (!dentry)
			goto out;
		inode = new_inode(s);
		if (!inode) {
			dput(dentry);
			goto out;
		}
		inode->i_mode = S_IFREG | files->mode;
		simple_inode_init_ts(inode);
		inode->i_fop = files->ops;
		inode->i_ino = i;
		d_add(dentry, inode);
	}
	s->s_root = root;
	return 0;
out:
	d_genocide(root);
	shrink_dcache_parent(root);
	dput(root);
	return -ENOMEM;
}
EXPORT_SYMBOL(simple_fill_super);

static DEFINE_SPINLOCK(pin_fs_lock);

/*
 * simple_pin_fs - mount @type once and take a reference for the caller;
 * later calls reuse the existing mount and just bump the counts.
 */
int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
{
	struct vfsmount *mnt = NULL;
	spin_lock(&pin_fs_lock);
	if (unlikely(!*mount)) {
		/* drop the spinlock over the (sleeping) mount attempt */
		spin_unlock(&pin_fs_lock);
		mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
		if (IS_ERR(mnt))
			return PTR_ERR(mnt);
		spin_lock(&pin_fs_lock);
		if (!*mount)
			*mount = mnt;	/* we won the race */
	}
	mntget(*mount);
	++*count;
	spin_unlock(&pin_fs_lock);
	/*
	 * Drop the setup reference from vfs_kern_mount(): the pin keeps
	 * the reference taken by mntget() above, and a mount that lost
	 * the race is destroyed here.  No-op when mnt is NULL.
	 */
	mntput(mnt);
	return 0;
}
EXPORT_SYMBOL(simple_pin_fs);

/* Drop one pin taken by simple_pin_fs(); unpin the mount at zero */
void simple_release_fs(struct vfsmount **mount, int *count)
{
	struct vfsmount *mnt;
	spin_lock(&pin_fs_lock);
	mnt = *mount;
	if (!--*count)
		*mount = NULL;
	spin_unlock(&pin_fs_lock);
	mntput(mnt);
}
EXPORT_SYMBOL(simple_release_fs);

/**
 * simple_read_from_buffer - copy data from the buffer to user space
 * @to: the user space buffer to read to
 * @count: the maximum number of bytes to read
 * @ppos: the current position in the buffer
 * @from: the buffer to read from
 * @available: the size of the buffer
 *
 * The simple_read_from_buffer() function reads up to @count bytes from the
 * buffer @from at offset @ppos into the user space address starting at @to.
 *
 * On success, the number of bytes read is returned and the offset @ppos is
 * advanced by this number, or negative value is returned on error.
1113 **/ 1114 ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, 1115 const void *from, size_t available) 1116 { 1117 loff_t pos = *ppos; 1118 size_t ret; 1119 1120 if (pos < 0) 1121 return -EINVAL; 1122 if (pos >= available || !count) 1123 return 0; 1124 if (count > available - pos) 1125 count = available - pos; 1126 ret = copy_to_user(to, from + pos, count); 1127 if (ret == count) 1128 return -EFAULT; 1129 count -= ret; 1130 *ppos = pos + count; 1131 return count; 1132 } 1133 EXPORT_SYMBOL(simple_read_from_buffer); 1134 1135 /** 1136 * simple_write_to_buffer - copy data from user space to the buffer 1137 * @to: the buffer to write to 1138 * @available: the size of the buffer 1139 * @ppos: the current position in the buffer 1140 * @from: the user space buffer to read from 1141 * @count: the maximum number of bytes to read 1142 * 1143 * The simple_write_to_buffer() function reads up to @count bytes from the user 1144 * space address starting at @from into the buffer @to at offset @ppos. 1145 * 1146 * On success, the number of bytes written is returned and the offset @ppos is 1147 * advanced by this number, or negative value is returned on error. 
1148 **/ 1149 ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, 1150 const void __user *from, size_t count) 1151 { 1152 loff_t pos = *ppos; 1153 size_t res; 1154 1155 if (pos < 0) 1156 return -EINVAL; 1157 if (pos >= available || !count) 1158 return 0; 1159 if (count > available - pos) 1160 count = available - pos; 1161 res = copy_from_user(to + pos, from, count); 1162 if (res == count) 1163 return -EFAULT; 1164 count -= res; 1165 *ppos = pos + count; 1166 return count; 1167 } 1168 EXPORT_SYMBOL(simple_write_to_buffer); 1169 1170 /** 1171 * memory_read_from_buffer - copy data from the buffer 1172 * @to: the kernel space buffer to read to 1173 * @count: the maximum number of bytes to read 1174 * @ppos: the current position in the buffer 1175 * @from: the buffer to read from 1176 * @available: the size of the buffer 1177 * 1178 * The memory_read_from_buffer() function reads up to @count bytes from the 1179 * buffer @from at offset @ppos into the kernel space address starting at @to. 1180 * 1181 * On success, the number of bytes read is returned and the offset @ppos is 1182 * advanced by this number, or negative value is returned on error. 1183 **/ 1184 ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, 1185 const void *from, size_t available) 1186 { 1187 loff_t pos = *ppos; 1188 1189 if (pos < 0) 1190 return -EINVAL; 1191 if (pos >= available) 1192 return 0; 1193 if (count > available - pos) 1194 count = available - pos; 1195 memcpy(to, from + pos, count); 1196 *ppos = pos + count; 1197 1198 return count; 1199 } 1200 EXPORT_SYMBOL(memory_read_from_buffer); 1201 1202 /* 1203 * Transaction based IO. 1204 * The file expects a single write which triggers the transaction, and then 1205 * possibly a read which collects the result - which is stored in a 1206 * file-local buffer. 
 */

/*
 * Publish the transaction reply: the first @n bytes of ar->data become
 * visible to a subsequent simple_transaction_read().
 */
void simple_transaction_set(struct file *file, size_t n)
{
	struct simple_transaction_argresp *ar = file->private_data;

	BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);

	/*
	 * The barrier ensures that ar->size will really remain zero until
	 * ar->data is ready for reading.
	 */
	smp_mb();
	ar->size = n;
}
EXPORT_SYMBOL(simple_transaction_set);

/*
 * Allocate the per-open transaction buffer and copy the request into it.
 * Returns the request data on success, or an ERR_PTR:
 *   -EFBIG   request larger than the transaction limit
 *   -ENOMEM  page allocation failed
 *   -EBUSY   a second write on the same open file
 *   -EFAULT  the user buffer was unreadable
 */
char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
{
	struct simple_transaction_argresp *ar;
	static DEFINE_SPINLOCK(simple_transaction_lock);

	if (size > SIMPLE_TRANSACTION_LIMIT - 1)
		return ERR_PTR(-EFBIG);

	ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL);
	if (!ar)
		return ERR_PTR(-ENOMEM);

	spin_lock(&simple_transaction_lock);

	/* only one write allowed per open */
	if (file->private_data) {
		spin_unlock(&simple_transaction_lock);
		free_page((unsigned long)ar);
		return ERR_PTR(-EBUSY);
	}

	file->private_data = ar;

	spin_unlock(&simple_transaction_lock);

	if (copy_from_user(ar->data, buf, size))
		return ERR_PTR(-EFAULT);

	return ar->data;
}
EXPORT_SYMBOL(simple_transaction_get);

/* Read back the reply; yields 0 bytes until a transaction has been written. */
ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
{
	struct simple_transaction_argresp *ar = file->private_data;

	if (!ar)
		return 0;
	return simple_read_from_buffer(buf, size, pos, ar->data, ar->size);
}
EXPORT_SYMBOL(simple_transaction_read);

/* Release the transaction buffer, if one was ever allocated for this open. */
int simple_transaction_release(struct inode *inode, struct file *file)
{
	free_page((unsigned long)file->private_data);
	return 0;
}
EXPORT_SYMBOL(simple_transaction_release);

/* Simple attribute files */

struct simple_attr {
	int (*get)(void *, u64 *);	/* read hook: produce the value */
	int (*set)(void *, u64);	/* write hook: consume a value */
	char get_buf[24];	/* enough to store a u64 and "\n\0" */
	char set_buf[24];
	void *data;		/* opaque cookie, taken from inode->i_private */
	const char *fmt;	/* format for read operation */
	struct mutex mutex;	/* protects access to these buffers */
};

/* simple_attr_open is called by an actual attribute open file operation
 * to set the attribute specific access operations. */
int simple_attr_open(struct inode *inode, struct file *file,
		     int (*get)(void *, u64 *), int (*set)(void *, u64),
		     const char *fmt)
{
	struct simple_attr *attr;

	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
	if (!attr)
		return -ENOMEM;

	attr->get = get;
	attr->set = set;
	attr->data = inode->i_private;
	attr->fmt = fmt;
	mutex_init(&attr->mutex);

	file->private_data = attr;

	return nonseekable_open(inode, file);
}
EXPORT_SYMBOL_GPL(simple_attr_open);

/* Free the simple_attr allocated by simple_attr_open(). */
int simple_attr_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);
	return 0;
}
EXPORT_SYMBOL_GPL(simple_attr_release);	/* GPL-only?  This?  Really?
 */

/* read from the buffer that is filled with the get function */
ssize_t simple_attr_read(struct file *file, char __user *buf,
			 size_t len, loff_t *ppos)
{
	struct simple_attr *attr;
	size_t size;
	ssize_t ret;

	attr = file->private_data;

	/* a NULL ->get means the attribute is write-only */
	if (!attr->get)
		return -EACCES;

	ret = mutex_lock_interruptible(&attr->mutex);
	if (ret)
		return ret;

	if (*ppos && attr->get_buf[0]) {
		/* continued read: reuse the buffer formatted last time */
		size = strlen(attr->get_buf);
	} else {
		/* first read: fetch the value and format it */
		u64 val;
		ret = attr->get(attr->data, &val);
		if (ret)
			goto out;

		size = scnprintf(attr->get_buf, sizeof(attr->get_buf),
				 attr->fmt, (unsigned long long)val);
	}

	ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size);
out:
	mutex_unlock(&attr->mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(simple_attr_read);

/* interpret the buffer as a number to call the set function with */
static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf,
			  size_t len, loff_t *ppos, bool is_signed)
{
	struct simple_attr *attr;
	unsigned long long val;
	size_t size;
	ssize_t ret;

	attr = file->private_data;
	/* a NULL ->set means the attribute is read-only */
	if (!attr->set)
		return -EACCES;

	ret = mutex_lock_interruptible(&attr->mutex);
	if (ret)
		return ret;

	ret = -EFAULT;
	/* leave room for the terminating NUL */
	size = min(sizeof(attr->set_buf) - 1, len);
	if (copy_from_user(attr->set_buf, buf, size))
		goto out;

	attr->set_buf[size] = '\0';
	if (is_signed)
		ret = kstrtoll(attr->set_buf, 0, &val);
	else
		ret = kstrtoull(attr->set_buf, 0, &val);
	if (ret)
		goto out;
	ret = attr->set(attr->data, val);
	if (ret == 0)
		ret = len; /* on success, claim we got the whole input */
out:
	mutex_unlock(&attr->mutex);
	return ret;
}

ssize_t simple_attr_write(struct file *file, const char __user *buf,
			  size_t len, loff_t *ppos)
{
	return simple_attr_write_xsigned(file, buf, len, ppos, false);
}
EXPORT_SYMBOL_GPL(simple_attr_write);

ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
			  size_t len, loff_t *ppos)
{
	return simple_attr_write_xsigned(file, buf, len, ppos, true);
}
EXPORT_SYMBOL_GPL(simple_attr_write_signed);

/**
 * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
 * @sb:		filesystem to do the file handle conversion on
 * @fid:	file handle to convert
 * @fh_len:	length of the file handle in bytes
 * @fh_type:	type of file handle
 * @get_inode:	filesystem callback to retrieve inode
 *
 * This function decodes @fid as long as it has one of the well-known
 * Linux filehandle types and calls @get_inode on it to retrieve the
 * inode for the object specified in the file handle.
 */
struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
		int fh_len, int fh_type, struct inode *(*get_inode)
			(struct super_block *sb, u64 ino, u32 gen))
{
	struct inode *inode = NULL;

	/* need at least ino + generation (two 32-bit words) */
	if (fh_len < 2)
		return NULL;

	switch (fh_type) {
	case FILEID_INO32_GEN:
	case FILEID_INO32_GEN_PARENT:
		inode = get_inode(sb, fid->i32.ino, fid->i32.gen);
		break;
	}

	/* NOTE(review): an unknown fh_type leaves inode NULL here; relying on
	 * d_obtain_alias() to map that to an error dentry — confirm. */
	return d_obtain_alias(inode);
}
EXPORT_SYMBOL_GPL(generic_fh_to_dentry);

/**
 * generic_fh_to_parent - generic helper for the fh_to_parent export operation
 * @sb:		filesystem to do the file handle conversion on
 * @fid:	file handle to convert
 * @fh_len:	length of the file handle in bytes
 * @fh_type:	type of file handle
 * @get_inode:	filesystem callback to retrieve inode
 *
 * This function decodes @fid as long as it has one of the well-known
 * Linux filehandle types and calls @get_inode on it to retrieve the
 * inode for the _parent_ object specified in the file handle if it
 * is specified in the file handle, or NULL otherwise.
 */
struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
		int fh_len, int fh_type, struct inode *(*get_inode)
			(struct super_block *sb, u64 ino, u32 gen))
{
	struct inode *inode = NULL;

	/* parent info starts at the third 32-bit word of the handle */
	if (fh_len <= 2)
		return NULL;

	switch (fh_type) {
	case FILEID_INO32_GEN_PARENT:
		inode = get_inode(sb, fid->i32.parent_ino,
				  (fh_len > 3 ? fid->i32.parent_gen : 0));
		break;
	}

	return d_obtain_alias(inode);
}
EXPORT_SYMBOL_GPL(generic_fh_to_parent);

/**
 * __generic_file_fsync - generic fsync implementation for simple filesystems
 *
 * @file:	file to synchronize
 * @start:	start offset in bytes
 * @end:	end offset in bytes (inclusive)
 * @datasync:	only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure.
 */
int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err;
	int ret;

	/* write back and wait on the data pages first */
	err = file_write_and_wait_range(file, start, end);
	if (err)
		return err;

	inode_lock(inode);
	ret = sync_mapping_buffers(inode->i_mapping);
	/* inode not dirty at all: metadata sync can be skipped */
	if (!(inode->i_state & I_DIRTY_ALL))
		goto out;
	/* for datasync, only I_DIRTY_DATASYNC state forces a metadata sync */
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		goto out;

	err = sync_inode_metadata(inode, 1);
	if (ret == 0)
		ret = err;	/* keep the first error encountered */

out:
	inode_unlock(inode);
	/* check and advance again to catch errors after syncing out buffers */
	err = file_check_and_advance_wb_err(file);
	if (ret == 0)
		ret = err;
	return ret;
}
EXPORT_SYMBOL(__generic_file_fsync);

/**
 * generic_file_fsync - generic fsync implementation for simple filesystems
 *			with flush
 * @file:	file to synchronize
 * @start:	start offset in bytes
 * @end:	end offset in bytes (inclusive)
 * @datasync:	only synchronize essential metadata if true
 *
 */

int generic_file_fsync(struct file *file, loff_t start, loff_t end,
		       int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err;

	err = __generic_file_fsync(file, start, end, datasync);
	if (err)
		return err;
	/* flush the device write cache so the synced data is durable */
	return blkdev_issue_flush(inode->i_sb->s_bdev);
}
EXPORT_SYMBOL(generic_file_fsync);

/**
 * generic_check_addressable - Check addressability of file system
 * @blocksize_bits:	log of file system block size
 * @num_blocks:		number of blocks in file system
 *
 * Determine whether a file system with @num_blocks blocks (and a
 * block size of 2**@blocksize_bits) is addressable by the sector_t
 * and page cache of the system.  Return 0 if so and -EFBIG otherwise.
1545 */ 1546 int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) 1547 { 1548 u64 last_fs_block = num_blocks - 1; 1549 u64 last_fs_page = 1550 last_fs_block >> (PAGE_SHIFT - blocksize_bits); 1551 1552 if (unlikely(num_blocks == 0)) 1553 return 0; 1554 1555 if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT)) 1556 return -EINVAL; 1557 1558 if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || 1559 (last_fs_page > (pgoff_t)(~0ULL))) { 1560 return -EFBIG; 1561 } 1562 return 0; 1563 } 1564 EXPORT_SYMBOL(generic_check_addressable); 1565 1566 /* 1567 * No-op implementation of ->fsync for in-memory filesystems. 1568 */ 1569 int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) 1570 { 1571 return 0; 1572 } 1573 EXPORT_SYMBOL(noop_fsync); 1574 1575 ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) 1576 { 1577 /* 1578 * iomap based filesystems support direct I/O without need for 1579 * this callback. However, it still needs to be set in 1580 * inode->a_ops so that open/fcntl know that direct I/O is 1581 * generally supported. 1582 */ 1583 return -EINVAL; 1584 } 1585 EXPORT_SYMBOL_GPL(noop_direct_IO); 1586 1587 /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ 1588 void kfree_link(void *p) 1589 { 1590 kfree(p); 1591 } 1592 EXPORT_SYMBOL(kfree_link); 1593 1594 struct inode *alloc_anon_inode(struct super_block *s) 1595 { 1596 static const struct address_space_operations anon_aops = { 1597 .dirty_folio = noop_dirty_folio, 1598 }; 1599 struct inode *inode = new_inode_pseudo(s); 1600 1601 if (!inode) 1602 return ERR_PTR(-ENOMEM); 1603 1604 inode->i_ino = get_next_ino(); 1605 inode->i_mapping->a_ops = &anon_aops; 1606 1607 /* 1608 * Mark the inode dirty from the very beginning, 1609 * that way it will never be moved to the dirty 1610 * list because mark_inode_dirty() will think 1611 * that it already _is_ on the dirty list. 
1612 */ 1613 inode->i_state = I_DIRTY; 1614 inode->i_mode = S_IRUSR | S_IWUSR; 1615 inode->i_uid = current_fsuid(); 1616 inode->i_gid = current_fsgid(); 1617 inode->i_flags |= S_PRIVATE; 1618 simple_inode_init_ts(inode); 1619 return inode; 1620 } 1621 EXPORT_SYMBOL(alloc_anon_inode); 1622 1623 /** 1624 * simple_nosetlease - generic helper for prohibiting leases 1625 * @filp: file pointer 1626 * @arg: type of lease to obtain 1627 * @flp: new lease supplied for insertion 1628 * @priv: private data for lm_setup operation 1629 * 1630 * Generic helper for filesystems that do not wish to allow leases to be set. 1631 * All arguments are ignored and it just returns -EINVAL. 1632 */ 1633 int 1634 simple_nosetlease(struct file *filp, int arg, struct file_lock **flp, 1635 void **priv) 1636 { 1637 return -EINVAL; 1638 } 1639 EXPORT_SYMBOL(simple_nosetlease); 1640 1641 /** 1642 * simple_get_link - generic helper to get the target of "fast" symlinks 1643 * @dentry: not used here 1644 * @inode: the symlink inode 1645 * @done: not used here 1646 * 1647 * Generic helper for filesystems to use for symlink inodes where a pointer to 1648 * the symlink target is stored in ->i_link. NOTE: this isn't normally called, 1649 * since as an optimization the path lookup code uses any non-NULL ->i_link 1650 * directly, without calling ->get_link(). But ->get_link() still must be set, 1651 * to mark the inode_operations as being for a symlink. 1652 * 1653 * Return: the symlink target 1654 */ 1655 const char *simple_get_link(struct dentry *dentry, struct inode *inode, 1656 struct delayed_call *done) 1657 { 1658 return inode->i_link; 1659 } 1660 EXPORT_SYMBOL(simple_get_link); 1661 1662 const struct inode_operations simple_symlink_inode_operations = { 1663 .get_link = simple_get_link, 1664 }; 1665 EXPORT_SYMBOL(simple_symlink_inode_operations); 1666 1667 /* 1668 * Operations for a permanently empty directory. 
 */
/* Always fails: nothing can ever be looked up in a permanently empty dir. */
static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
	return ERR_PTR(-ENOENT);
}

static int empty_dir_getattr(struct mnt_idmap *idmap,
			     const struct path *path, struct kstat *stat,
			     u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = d_inode(path->dentry);
	/* plain attributes only; note the caller-supplied idmap is not used */
	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
	return 0;
}

/* Attribute changes are never permitted on these directories. */
static int empty_dir_setattr(struct mnt_idmap *idmap,
			     struct dentry *dentry, struct iattr *attr)
{
	return -EPERM;
}

static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
{
	return -EOPNOTSUPP;
}

static const struct inode_operations empty_dir_inode_operations = {
	.lookup		= empty_dir_lookup,
	.permission	= generic_permission,
	.setattr	= empty_dir_setattr,
	.getattr	= empty_dir_getattr,
	.listxattr	= empty_dir_listxattr,
};

static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
{
	/* An empty directory has two entries . and .. at offsets 0 and 1 */
	return generic_file_llseek_size(file, offset, whence, 2, 2);
}

/* Emit only "." and ".."; there is never anything else to list. */
static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
{
	dir_emit_dots(file, ctx);
	return 0;
}

static const struct file_operations empty_dir_operations = {
	.llseek		= empty_dir_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= empty_dir_readdir,
	.fsync		= noop_fsync,
};


/* Turn @inode into a permanently empty, root-owned, world-searchable dir. */
void make_empty_dir_inode(struct inode *inode)
{
	set_nlink(inode, 2);
	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
	inode->i_uid = GLOBAL_ROOT_UID;
	inode->i_gid = GLOBAL_ROOT_GID;
	inode->i_rdev = 0;
	inode->i_size = 0;
	inode->i_blkbits = PAGE_SHIFT;
	inode->i_blocks = 0;

	inode->i_op = &empty_dir_inode_operations;
	inode->i_opflags &= ~IOP_XATTR;	/* ->listxattr above says EOPNOTSUPP */
	inode->i_fop = &empty_dir_operations;
}

/* True iff @inode was set up by make_empty_dir_inode() above. */
bool is_empty_dir_inode(struct inode *inode)
{
	return (inode->i_fop == &empty_dir_operations) &&
		(inode->i_op == &empty_dir_inode_operations);
}

#if IS_ENABLED(CONFIG_UNICODE)
/**
 * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems
 * @dentry:	dentry whose name we are checking against
 * @len:	len of name of dentry
 * @str:	str pointer to name of dentry
 * @name:	Name to compare against
 *
 * Return: 0 if names match, 1 if mismatch, or -ERRNO
 */
static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
				const char *str, const struct qstr *name)
{
	const struct dentry *parent = READ_ONCE(dentry->d_parent);
	const struct inode *dir = READ_ONCE(parent->d_inode);
	const struct super_block *sb = dentry->d_sb;
	const struct unicode_map *um = sb->s_encoding;
	struct qstr qstr = QSTR_INIT(str, len);
	char strbuf[DNAME_INLINE_LEN];
	int ret;

	/* only children of casefolded directories compare case-insensitively */
	if (!dir || !IS_CASEFOLDED(dir))
		goto fallback;
	/*
	 * If the dentry name is stored in-line, then it may be concurrently
	 * modified by a rename.  If this happens, the VFS will eventually retry
	 * the lookup, so it doesn't matter what ->d_compare() returns.
	 * However, it's unsafe to call utf8_strncasecmp() with an unstable
	 * string.  Therefore, we have to copy the name into a temporary buffer.
	 */
	if (len <= DNAME_INLINE_LEN - 1) {
		memcpy(strbuf, str, len);
		strbuf[len] = 0;
		qstr.name = strbuf;
		/* prevent compiler from optimizing out the temporary buffer */
		barrier();
	}
	ret = utf8_strncasecmp(um, name, &qstr);
	if (ret >= 0)
		return ret;

	/* invalid UTF-8: strict mode rejects, lax mode falls back to exact */
	if (sb_has_strict_encoding(sb))
		return -EINVAL;
fallback:
	/* exact byte-wise comparison */
	if (len != name->len)
		return 1;
	return !!memcmp(str, name->name, len);
}

/**
 * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
 * @dentry:	dentry of the parent directory
 * @str:	qstr of name whose hash we should fill in
 *
 * Return: 0 if hash was successful or unchanged, and -EINVAL on error
 */
static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
	const struct inode *dir = READ_ONCE(dentry->d_inode);
	struct super_block *sb = dentry->d_sb;
	const struct unicode_map *um = sb->s_encoding;
	int ret = 0;

	if (!dir || !IS_CASEFOLDED(dir))
		return 0;

	ret = utf8_casefold_hash(um, dentry, str);
	if (ret < 0 && sb_has_strict_encoding(sb))
		return -EINVAL;
	return 0;
}

static const struct dentry_operations generic_ci_dentry_ops = {
	.d_hash = generic_ci_d_hash,
	.d_compare = generic_ci_d_compare,
};
#endif

#ifdef CONFIG_FS_ENCRYPTION
static const struct dentry_operations generic_encrypted_dentry_ops = {
	.d_revalidate = fscrypt_d_revalidate,
};
#endif

#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
	.d_hash = generic_ci_d_hash,
	.d_compare = generic_ci_d_compare,
	.d_revalidate = fscrypt_d_revalidate,
};
#endif

/**
 * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry
 * @dentry:	dentry to set ops on
 *
 * Casefolded directories need d_hash and d_compare set, so that the dentries
 * contained in them are handled case-insensitively.  Note that these operations
 * are needed on the parent directory rather than on the dentries in it, and
 * while the casefolding flag can be toggled on and off on an empty directory,
 * dentry_operations can't be changed later.  As a result, if the filesystem has
 * casefolding support enabled at all, we have to give all dentries the
 * casefolding operations even if their inode doesn't have the casefolding flag
 * currently (and thus the casefolding ops would be no-ops for now).
 *
 * Encryption works differently in that the only dentry operation it needs is
 * d_revalidate, which it only needs on dentries that have the no-key name flag.
 * The no-key flag can't be set "later", so we don't have to worry about that.
 *
 * Finally, to maximize compatibility with overlayfs (which isn't compatible
 * with certain dentry operations) and to avoid taking an unnecessary
 * performance hit, we use custom dentry_operations for each possible
 * combination rather than always installing all operations.
 */
void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
{
#ifdef CONFIG_FS_ENCRYPTION
	bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
#endif
#if IS_ENABLED(CONFIG_UNICODE)
	bool needs_ci_ops = dentry->d_sb->s_encoding;
#endif
#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
	if (needs_encrypt_ops && needs_ci_ops) {
		d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
		return;
	}
#endif
#ifdef CONFIG_FS_ENCRYPTION
	if (needs_encrypt_ops) {
		d_set_d_op(dentry, &generic_encrypted_dentry_ops);
		return;
	}
#endif
#if IS_ENABLED(CONFIG_UNICODE)
	if (needs_ci_ops) {
		d_set_d_op(dentry, &generic_ci_dentry_ops);
		return;
	}
#endif
}
EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);

/**
 * inode_maybe_inc_iversion - increments i_version
 * @inode: inode with the i_version that should be updated
 * @force: increment the counter even if it's not necessary?
 *
 * Every time the inode is modified, the i_version field must be seen to have
 * changed by any observer.
 *
 * If "force" is set or the QUERIED flag is set, then ensure that we increment
 * the value, and clear the queried flag.
 *
 * In the common case where neither is set, then we can return "false" without
 * updating i_version.
 *
 * If this function returns false, and no other metadata has changed, then we
 * can avoid logging the metadata.
 */
bool inode_maybe_inc_iversion(struct inode *inode, bool force)
{
	u64 cur, new;

	/*
	 * The i_version field is not strictly ordered with any other inode
	 * information, but the legacy inode_inc_iversion code used a spinlock
	 * to serialize increments.
	 *
	 * Here, we add full memory barriers to ensure that any de-facto
	 * ordering with other info is preserved.
	 *
	 * This barrier pairs with the barrier in inode_query_iversion()
	 */
	smp_mb();
	cur = inode_peek_iversion_raw(inode);
	do {
		/* If flag is clear then we needn't do anything */
		if (!force && !(cur & I_VERSION_QUERIED))
			return false;

		/* Since lowest bit is flag, add 2 to avoid it */
		new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
	} while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
	return true;
}
EXPORT_SYMBOL(inode_maybe_inc_iversion);

/**
 * inode_query_iversion - read i_version for later use
 * @inode: inode from which i_version should be read
 *
 * Read the inode i_version counter.  This should be used by callers that wish
 * to store the returned i_version for later comparison.  This will guarantee
 * that a later query of the i_version will result in a different value if
 * anything has changed.
 *
 * In this implementation, we fetch the current value, set the QUERIED flag and
 * then try to swap it into place with a cmpxchg, if it wasn't already set.  If
 * that fails, we try again with the newly fetched value from the cmpxchg.
 */
u64 inode_query_iversion(struct inode *inode)
{
	u64 cur, new;

	cur = inode_peek_iversion_raw(inode);
	do {
		/* If flag is already set, then no need to swap */
		if (cur & I_VERSION_QUERIED) {
			/*
			 * This barrier (and the implicit barrier in the
			 * cmpxchg below) pairs with the barrier in
			 * inode_maybe_inc_iversion().
			 */
			smp_mb();
			break;
		}

		new = cur | I_VERSION_QUERIED;
	} while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
	/* strip the QUERIED flag bit to recover the counter value */
	return cur >> I_VERSION_QUERIED_SHIFT;
}
EXPORT_SYMBOL(inode_query_iversion);

/*
 * Combine the results of a direct-I/O write that fell back to buffered I/O:
 * @direct_written bytes went out directly, then @buffered_written bytes
 * (ending at iocb->ki_pos) went through the page cache and must be written
 * back and invalidated to preserve the expected O_DIRECT semantics.
 */
ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
		ssize_t direct_written, ssize_t buffered_written)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	loff_t pos = iocb->ki_pos - buffered_written;
	loff_t end = iocb->ki_pos - 1;
	int err;

	/*
	 * If the buffered write fallback returned an error, we want to return
	 * the number of bytes which were written by direct I/O, or the error
	 * code if that was zero.
	 *
	 * Note that this differs from normal direct-io semantics, which will
	 * return -EFOO even if some bytes were written.
	 */
	if (unlikely(buffered_written < 0)) {
		if (direct_written)
			return direct_written;
		return buffered_written;
	}

	/*
	 * We need to ensure that the page cache pages are written to disk and
	 * invalidated to preserve the expected O_DIRECT semantics.
	 */
	err = filemap_write_and_wait_range(mapping, pos, end);
	if (err < 0) {
		/*
		 * We don't know how much we wrote, so just return the number of
		 * bytes which were direct-written
		 */
		iocb->ki_pos -= buffered_written;
		if (direct_written)
			return direct_written;
		return err;
	}
	invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
	return direct_written + buffered_written;
}
EXPORT_SYMBOL_GPL(direct_write_fallback);

/**
 * simple_inode_init_ts - initialize the timestamps for a new inode
 * @inode: inode to be initialized
 *
 * When a new inode is created, most filesystems set the timestamps to the
 * current time.  Add a helper to do this.
2017 */ 2018 struct timespec64 simple_inode_init_ts(struct inode *inode) 2019 { 2020 struct timespec64 ts = inode_set_ctime_current(inode); 2021 2022 inode_set_atime_to_ts(inode, ts); 2023 inode_set_mtime_to_ts(inode, ts); 2024 return ts; 2025 } 2026 EXPORT_SYMBOL(simple_inode_init_ts); 2027