1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * 4 * Copyright (C) 2011 Novell Inc. 5 */ 6 7 #include <linux/fs.h> 8 #include <linux/slab.h> 9 #include <linux/cred.h> 10 #include <linux/xattr.h> 11 #include <linux/posix_acl.h> 12 #include <linux/ratelimit.h> 13 #include <linux/fiemap.h> 14 #include <linux/fileattr.h> 15 #include <linux/security.h> 16 #include "overlayfs.h" 17 18 19 int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, 20 struct iattr *attr) 21 { 22 int err; 23 bool full_copy_up = false; 24 struct dentry *upperdentry; 25 const struct cred *old_cred; 26 27 err = setattr_prepare(&init_user_ns, dentry, attr); 28 if (err) 29 return err; 30 31 err = ovl_want_write(dentry); 32 if (err) 33 goto out; 34 35 if (attr->ia_valid & ATTR_SIZE) { 36 struct inode *realinode = d_inode(ovl_dentry_real(dentry)); 37 38 err = -ETXTBSY; 39 if (atomic_read(&realinode->i_writecount) < 0) 40 goto out_drop_write; 41 42 /* Truncate should trigger data copy up as well */ 43 full_copy_up = true; 44 } 45 46 if (!full_copy_up) 47 err = ovl_copy_up(dentry); 48 else 49 err = ovl_copy_up_with_data(dentry); 50 if (!err) { 51 struct inode *winode = NULL; 52 53 upperdentry = ovl_dentry_upper(dentry); 54 55 if (attr->ia_valid & ATTR_SIZE) { 56 winode = d_inode(upperdentry); 57 err = get_write_access(winode); 58 if (err) 59 goto out_drop_write; 60 } 61 62 if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) 63 attr->ia_valid &= ~ATTR_MODE; 64 65 /* 66 * We might have to translate ovl file into real file object 67 * once use cases emerge. For now, simply don't let underlying 68 * filesystem rely on attr->ia_file 69 */ 70 attr->ia_valid &= ~ATTR_FILE; 71 72 /* 73 * If open(O_TRUNC) is done, VFS calls ->setattr with ATTR_OPEN 74 * set. Overlayfs does not pass O_TRUNC flag to underlying 75 * filesystem during open -> do not pass ATTR_OPEN. This 76 * disables optimization in fuse which assumes open(O_TRUNC) 77 * already set file size to 0. But we never passed O_TRUNC to 78 * fuse. So by clearing ATTR_OPEN, fuse will be forced to send 79 * setattr request to server. 80 */ 81 attr->ia_valid &= ~ATTR_OPEN; 82 83 inode_lock(upperdentry->d_inode); 84 old_cred = ovl_override_creds(dentry->d_sb); 85 err = notify_change(&init_user_ns, upperdentry, attr, NULL); 86 revert_creds(old_cred); 87 if (!err) 88 ovl_copyattr(upperdentry->d_inode, dentry->d_inode); 89 inode_unlock(upperdentry->d_inode); 90 91 if (winode) 92 put_write_access(winode); 93 } 94 out_drop_write: 95 ovl_drop_write(dentry); 96 out: 97 return err; 98 } 99 100 static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) 101 { 102 bool samefs = ovl_same_fs(dentry->d_sb); 103 unsigned int xinobits = ovl_xino_bits(dentry->d_sb); 104 unsigned int xinoshift = 64 - xinobits; 105 106 if (samefs) { 107 /* 108 * When all layers are on the same fs, all real inode 109 * number are unique, so we use the overlay st_dev, 110 * which is friendly to du -x. 111 */ 112 stat->dev = dentry->d_sb->s_dev; 113 return; 114 } else if (xinobits) { 115 /* 116 * All inode numbers of underlying fs should not be using the 117 * high xinobits, so we use high xinobits to partition the 118 * overlay st_ino address space. The high bits holds the fsid 119 * (upper fsid is 0). The lowest xinobit is reserved for mapping 120 * the non-persistent inode numbers range in case of overflow. 121 * This way all overlay inode numbers are unique and use the 122 * overlay st_dev. 123 */ 124 if (likely(!(stat->ino >> xinoshift))) { 125 stat->ino |= ((u64)fsid) << (xinoshift + 1); 126 stat->dev = dentry->d_sb->s_dev; 127 return; 128 } else if (ovl_xino_warn(dentry->d_sb)) { 129 pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n", 130 dentry, stat->ino, xinobits); 131 } 132 } 133 134 /* The inode could not be mapped to a unified st_ino address space */ 135 if (S_ISDIR(dentry->d_inode->i_mode)) { 136 /* 137 * Always use the overlay st_dev for directories, so 'find 138 * -xdev' will scan the entire overlay mount and won't cross the 139 * overlay mount boundaries. 140 * 141 * If not all layers are on the same fs the pair {real st_ino; 142 * overlay st_dev} is not unique, so use the non persistent 143 * overlay st_ino for directories. 144 */ 145 stat->dev = dentry->d_sb->s_dev; 146 stat->ino = dentry->d_inode->i_ino; 147 } else { 148 /* 149 * For non-samefs setup, if we cannot map all layers st_ino 150 * to a unified address space, we need to make sure that st_dev 151 * is unique per underlying fs, so we use the unique anonymous 152 * bdev assigned to the underlying fs. 153 */ 154 stat->dev = OVL_FS(dentry->d_sb)->fs[fsid].pseudo_dev; 155 } 156 } 157 158 int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, 159 struct kstat *stat, u32 request_mask, unsigned int flags) 160 { 161 struct dentry *dentry = path->dentry; 162 enum ovl_path_type type; 163 struct path realpath; 164 const struct cred *old_cred; 165 bool is_dir = S_ISDIR(dentry->d_inode->i_mode); 166 int fsid = 0; 167 int err; 168 bool metacopy_blocks = false; 169 170 metacopy_blocks = ovl_is_metacopy_dentry(dentry); 171 172 type = ovl_path_real(dentry, &realpath); 173 old_cred = ovl_override_creds(dentry->d_sb); 174 err = vfs_getattr(&realpath, stat, request_mask, flags); 175 if (err) 176 goto out; 177 178 /* 179 * For non-dir or same fs, we use st_ino of the copy up origin. 180 * This guaranties constant st_dev/st_ino across copy up. 181 * With xino feature and non-samefs, we use st_ino of the copy up 182 * origin masked with high bits that represent the layer id. 183 * 184 * If lower filesystem supports NFS file handles, this also guaranties 185 * persistent st_ino across mount cycle. 186 */ 187 if (!is_dir || ovl_same_dev(dentry->d_sb)) { 188 if (!OVL_TYPE_UPPER(type)) { 189 fsid = ovl_layer_lower(dentry)->fsid; 190 } else if (OVL_TYPE_ORIGIN(type)) { 191 struct kstat lowerstat; 192 u32 lowermask = STATX_INO | STATX_BLOCKS | 193 (!is_dir ? STATX_NLINK : 0); 194 195 ovl_path_lower(dentry, &realpath); 196 err = vfs_getattr(&realpath, &lowerstat, 197 lowermask, flags); 198 if (err) 199 goto out; 200 201 /* 202 * Lower hardlinks may be broken on copy up to different 203 * upper files, so we cannot use the lower origin st_ino 204 * for those different files, even for the same fs case. 205 * 206 * Similarly, several redirected dirs can point to the 207 * same dir on a lower layer. With the "verify_lower" 208 * feature, we do not use the lower origin st_ino, if 209 * we haven't verified that this redirect is unique. 210 * 211 * With inodes index enabled, it is safe to use st_ino 212 * of an indexed origin. The index validates that the 213 * upper hardlink is not broken and that a redirected 214 * dir is the only redirect to that origin. 215 */ 216 if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || 217 (!ovl_verify_lower(dentry->d_sb) && 218 (is_dir || lowerstat.nlink == 1))) { 219 fsid = ovl_layer_lower(dentry)->fsid; 220 stat->ino = lowerstat.ino; 221 } 222 223 /* 224 * If we are querying a metacopy dentry and lower 225 * dentry is data dentry, then use the blocks we 226 * queried just now. We don't have to do additional 227 * vfs_getattr(). If lower itself is metacopy, then 228 * additional vfs_getattr() is unavoidable. 229 */ 230 if (metacopy_blocks && 231 realpath.dentry == ovl_dentry_lowerdata(dentry)) { 232 stat->blocks = lowerstat.blocks; 233 metacopy_blocks = false; 234 } 235 } 236 237 if (metacopy_blocks) { 238 /* 239 * If lower is not same as lowerdata or if there was 240 * no origin on upper, we can end up here. 241 */ 242 struct kstat lowerdatastat; 243 u32 lowermask = STATX_BLOCKS; 244 245 ovl_path_lowerdata(dentry, &realpath); 246 err = vfs_getattr(&realpath, &lowerdatastat, 247 lowermask, flags); 248 if (err) 249 goto out; 250 stat->blocks = lowerdatastat.blocks; 251 } 252 } 253 254 ovl_map_dev_ino(dentry, stat, fsid); 255 256 /* 257 * It's probably not worth it to count subdirs to get the 258 * correct link count. nlink=1 seems to pacify 'find' and 259 * other utilities. 260 */ 261 if (is_dir && OVL_TYPE_MERGE(type)) 262 stat->nlink = 1; 263 264 /* 265 * Return the overlay inode nlinks for indexed upper inodes. 266 * Overlay inode nlink counts the union of the upper hardlinks 267 * and non-covered lower hardlinks. It does not include the upper 268 * index hardlink. 269 */ 270 if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry))) 271 stat->nlink = dentry->d_inode->i_nlink; 272 273 out: 274 revert_creds(old_cred); 275 276 return err; 277 } 278 279 int ovl_permission(struct user_namespace *mnt_userns, 280 struct inode *inode, int mask) 281 { 282 struct inode *upperinode = ovl_inode_upper(inode); 283 struct inode *realinode = upperinode ?: ovl_inode_lower(inode); 284 const struct cred *old_cred; 285 int err; 286 287 /* Careful in RCU walk mode */ 288 if (!realinode) { 289 WARN_ON(!(mask & MAY_NOT_BLOCK)); 290 return -ECHILD; 291 } 292 293 /* 294 * Check overlay inode with the creds of task and underlying inode 295 * with creds of mounter 296 */ 297 err = generic_permission(&init_user_ns, inode, mask); 298 if (err) 299 return err; 300 301 old_cred = ovl_override_creds(inode->i_sb); 302 if (!upperinode && 303 !special_file(realinode->i_mode) && mask & MAY_WRITE) { 304 mask &= ~(MAY_WRITE | MAY_APPEND); 305 /* Make sure mounter can read file for copy up later */ 306 mask |= MAY_READ; 307 } 308 err = inode_permission(&init_user_ns, realinode, mask); 309 revert_creds(old_cred); 310 311 return err; 312 } 313 314 static const char *ovl_get_link(struct dentry *dentry, 315 struct inode *inode, 316 struct delayed_call *done) 317 { 318 const struct cred *old_cred; 319 const char *p; 320 321 if (!dentry) 322 return ERR_PTR(-ECHILD); 323 324 old_cred = ovl_override_creds(dentry->d_sb); 325 p = vfs_get_link(ovl_dentry_real(dentry), done); 326 revert_creds(old_cred); 327 return p; 328 } 329 330 bool ovl_is_private_xattr(struct super_block *sb, const char *name) 331 { 332 struct ovl_fs *ofs = sb->s_fs_info; 333 334 if (ofs->config.userxattr) 335 return strncmp(name, OVL_XATTR_USER_PREFIX, 336 sizeof(OVL_XATTR_USER_PREFIX) - 1) == 0; 337 else 338 return strncmp(name, OVL_XATTR_TRUSTED_PREFIX, 339 sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) == 0; 340 } 341 342 int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, 343 const void *value, size_t size, int flags) 344 { 345 int err; 346 struct dentry *upperdentry = ovl_i_dentry_upper(inode); 347 struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); 348 const struct cred *old_cred; 349 350 err = ovl_want_write(dentry); 351 if (err) 352 goto out; 353 354 if (!value && !upperdentry) { 355 old_cred = ovl_override_creds(dentry->d_sb); 356 err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0); 357 revert_creds(old_cred); 358 if (err < 0) 359 goto out_drop_write; 360 } 361 362 if (!upperdentry) { 363 err = ovl_copy_up(dentry); 364 if (err) 365 goto out_drop_write; 366 367 realdentry = ovl_dentry_upper(dentry); 368 } 369 370 old_cred = ovl_override_creds(dentry->d_sb); 371 if (value) 372 err = vfs_setxattr(&init_user_ns, realdentry, name, value, size, 373 flags); 374 else { 375 WARN_ON(flags != XATTR_REPLACE); 376 err = vfs_removexattr(&init_user_ns, realdentry, name); 377 } 378 revert_creds(old_cred); 379 380 /* copy c/mtime */ 381 ovl_copyattr(d_inode(realdentry), inode); 382 383 out_drop_write: 384 ovl_drop_write(dentry); 385 out: 386 return err; 387 } 388 389 int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, 390 void *value, size_t size) 391 { 392 ssize_t res; 393 const struct cred *old_cred; 394 struct dentry *realdentry = 395 ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry); 396 397 old_cred = ovl_override_creds(dentry->d_sb); 398 res = vfs_getxattr(&init_user_ns, realdentry, name, value, size); 399 revert_creds(old_cred); 400 return res; 401 } 402 403 static bool ovl_can_list(struct super_block *sb, const char *s) 404 { 405 /* Never list private (.overlay) */ 406 if (ovl_is_private_xattr(sb, s)) 407 return false; 408 409 /* List all non-trusted xattrs */ 410 if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) 411 return true; 412 413 /* list other trusted for superuser only */ 414 return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); 415 } 416 417 ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) 418 { 419 struct dentry *realdentry = ovl_dentry_real(dentry); 420 ssize_t res; 421 size_t len; 422 char *s; 423 const struct cred *old_cred; 424 425 old_cred = ovl_override_creds(dentry->d_sb); 426 res = vfs_listxattr(realdentry, list, size); 427 revert_creds(old_cred); 428 if (res <= 0 || size == 0) 429 return res; 430 431 /* filter out private xattrs */ 432 for (s = list, len = res; len;) { 433 size_t slen = strnlen(s, len) + 1; 434 435 /* underlying fs providing us with an broken xattr list? */ 436 if (WARN_ON(slen > len)) 437 return -EIO; 438 439 len -= slen; 440 if (!ovl_can_list(dentry->d_sb, s)) { 441 res -= slen; 442 memmove(s, s + slen, len); 443 } else { 444 s += slen; 445 } 446 } 447 448 return res; 449 } 450 451 struct posix_acl *ovl_get_acl(struct inode *inode, int type) 452 { 453 struct inode *realinode = ovl_inode_real(inode); 454 const struct cred *old_cred; 455 struct posix_acl *acl; 456 457 if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) 458 return NULL; 459 460 old_cred = ovl_override_creds(inode->i_sb); 461 acl = get_acl(realinode, type); 462 revert_creds(old_cred); 463 464 return acl; 465 } 466 467 int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) 468 { 469 if (flags & S_ATIME) { 470 struct ovl_fs *ofs = inode->i_sb->s_fs_info; 471 struct path upperpath = { 472 .mnt = ovl_upper_mnt(ofs), 473 .dentry = ovl_upperdentry_dereference(OVL_I(inode)), 474 }; 475 476 if (upperpath.dentry) { 477 touch_atime(&upperpath); 478 inode->i_atime = d_inode(upperpath.dentry)->i_atime; 479 } 480 } 481 return 0; 482 } 483 484 static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 485 u64 start, u64 len) 486 { 487 int err; 488 struct inode *realinode = ovl_inode_realdata(inode); 489 const struct cred *old_cred; 490 491 if (!realinode->i_op->fiemap) 492 return -EOPNOTSUPP; 493 494 old_cred = ovl_override_creds(inode->i_sb); 495 err = realinode->i_op->fiemap(realinode, fieinfo, start, len); 496 revert_creds(old_cred); 497 498 return err; 499 } 500 501 /* 502 * Work around the fact that security_file_ioctl() takes a file argument. 503 * Introducing security_inode_fileattr_get/set() hooks would solve this issue 504 * properly. 505 */ 506 static int ovl_security_fileattr(struct dentry *dentry, struct fileattr *fa, 507 bool set) 508 { 509 struct path realpath; 510 struct file *file; 511 unsigned int cmd; 512 int err; 513 514 ovl_path_real(dentry, &realpath); 515 file = dentry_open(&realpath, O_RDONLY, current_cred()); 516 if (IS_ERR(file)) 517 return PTR_ERR(file); 518 519 if (set) 520 cmd = fa->fsx_valid ? FS_IOC_FSSETXATTR : FS_IOC_SETFLAGS; 521 else 522 cmd = fa->fsx_valid ? FS_IOC_FSGETXATTR : FS_IOC_GETFLAGS; 523 524 err = security_file_ioctl(file, cmd, 0); 525 fput(file); 526 527 return err; 528 } 529 530 int ovl_fileattr_set(struct user_namespace *mnt_userns, 531 struct dentry *dentry, struct fileattr *fa) 532 { 533 struct inode *inode = d_inode(dentry); 534 struct dentry *upperdentry; 535 const struct cred *old_cred; 536 int err; 537 538 err = ovl_want_write(dentry); 539 if (err) 540 goto out; 541 542 err = ovl_copy_up(dentry); 543 if (!err) { 544 upperdentry = ovl_dentry_upper(dentry); 545 546 old_cred = ovl_override_creds(inode->i_sb); 547 err = ovl_security_fileattr(dentry, fa, true); 548 if (!err) 549 err = vfs_fileattr_set(&init_user_ns, upperdentry, fa); 550 revert_creds(old_cred); 551 ovl_copyflags(ovl_inode_real(inode), inode); 552 } 553 ovl_drop_write(dentry); 554 out: 555 return err; 556 } 557 558 int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa) 559 { 560 struct inode *inode = d_inode(dentry); 561 struct dentry *realdentry = ovl_dentry_real(dentry); 562 const struct cred *old_cred; 563 int err; 564 565 old_cred = ovl_override_creds(inode->i_sb); 566 err = ovl_security_fileattr(dentry, fa, false); 567 if (!err) 568 err = vfs_fileattr_get(realdentry, fa); 569 revert_creds(old_cred); 570 571 return err; 572 } 573 574 static const struct inode_operations ovl_file_inode_operations = { 575 .setattr = ovl_setattr, 576 .permission = ovl_permission, 577 .getattr = ovl_getattr, 578 .listxattr = ovl_listxattr, 579 .get_acl = ovl_get_acl, 580 .update_time = ovl_update_time, 581 .fiemap = ovl_fiemap, 582 .fileattr_get = ovl_fileattr_get, 583 .fileattr_set = ovl_fileattr_set, 584 }; 585 586 static const struct inode_operations ovl_symlink_inode_operations = { 587 .setattr = ovl_setattr, 588 .get_link = ovl_get_link, 589 .getattr = ovl_getattr, 590 .listxattr = ovl_listxattr, 591 .update_time = ovl_update_time, 592 }; 593 594 static const struct inode_operations ovl_special_inode_operations = { 595 .setattr = ovl_setattr, 596 .permission = ovl_permission, 597 .getattr = ovl_getattr, 598 .listxattr = ovl_listxattr, 599 .get_acl = ovl_get_acl, 600 .update_time = ovl_update_time, 601 }; 602 603 static const struct address_space_operations ovl_aops = { 604 /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ 605 .direct_IO = noop_direct_IO, 606 }; 607 608 /* 609 * It is possible to stack overlayfs instance on top of another 610 * overlayfs instance as lower layer. We need to annotate the 611 * stackable i_mutex locks according to stack level of the super 612 * block instance. An overlayfs instance can never be in stack 613 * depth 0 (there is always a real fs below it). An overlayfs 614 * inode lock will use the lockdep annotation ovl_i_mutex_key[depth]. 615 * 616 * For example, here is a snip from /proc/lockdep_chains after 617 * dir_iterate of nested overlayfs: 618 * 619 * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2) 620 * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1) 621 * [...] &type->i_mutex_dir_key (stack_depth=0) 622 * 623 * Locking order w.r.t ovl_want_write() is important for nested overlayfs. 624 * 625 * This chain is valid: 626 * - inode->i_rwsem (inode_lock[2]) 627 * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) 628 * - OVL_I(inode)->lock (ovl_inode_lock[2]) 629 * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) 630 * 631 * And this chain is valid: 632 * - inode->i_rwsem (inode_lock[2]) 633 * - OVL_I(inode)->lock (ovl_inode_lock[2]) 634 * - lowerinode->i_rwsem (inode_lock[1]) 635 * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) 636 * 637 * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is 638 * held, because it is in reverse order of the non-nested case using the same 639 * upper fs: 640 * - inode->i_rwsem (inode_lock[1]) 641 * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) 642 * - OVL_I(inode)->lock (ovl_inode_lock[1]) 643 */ 644 #define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH 645 646 static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) 647 { 648 #ifdef CONFIG_LOCKDEP 649 static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING]; 650 static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING]; 651 static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING]; 652 653 int depth = inode->i_sb->s_stack_depth - 1; 654 655 if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING)) 656 depth = 0; 657 658 if (S_ISDIR(inode->i_mode)) 659 lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]); 660 else 661 lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]); 662 663 lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]); 664 #endif 665 } 666 667 static void ovl_next_ino(struct inode *inode) 668 { 669 struct ovl_fs *ofs = inode->i_sb->s_fs_info; 670 671 inode->i_ino = atomic_long_inc_return(&ofs->last_ino); 672 if (unlikely(!inode->i_ino)) 673 inode->i_ino = atomic_long_inc_return(&ofs->last_ino); 674 } 675 676 static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid) 677 { 678 int xinobits = ovl_xino_bits(inode->i_sb); 679 unsigned int xinoshift = 64 - xinobits; 680 681 /* 682 * When d_ino is consistent with st_ino (samefs or i_ino has enough 683 * bits to encode layer), set the same value used for st_ino to i_ino, 684 * so inode number exposed via /proc/locks and a like will be 685 * consistent with d_ino and st_ino values. An i_ino value inconsistent 686 * with d_ino also causes nfsd readdirplus to fail. 687 */ 688 inode->i_ino = ino; 689 if (ovl_same_fs(inode->i_sb)) { 690 return; 691 } else if (xinobits && likely(!(ino >> xinoshift))) { 692 inode->i_ino |= (unsigned long)fsid << (xinoshift + 1); 693 return; 694 } 695 696 /* 697 * For directory inodes on non-samefs with xino disabled or xino 698 * overflow, we allocate a non-persistent inode number, to be used for 699 * resolving st_ino collisions in ovl_map_dev_ino(). 700 * 701 * To avoid ino collision with legitimate xino values from upper 702 * layer (fsid 0), use the lowest xinobit to map the non 703 * persistent inode numbers to the unified st_ino address space. 704 */ 705 if (S_ISDIR(inode->i_mode)) { 706 ovl_next_ino(inode); 707 if (xinobits) { 708 inode->i_ino &= ~0UL >> xinobits; 709 inode->i_ino |= 1UL << xinoshift; 710 } 711 } 712 } 713 714 void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, 715 unsigned long ino, int fsid) 716 { 717 struct inode *realinode; 718 719 if (oip->upperdentry) 720 OVL_I(inode)->__upperdentry = oip->upperdentry; 721 if (oip->lowerpath && oip->lowerpath->dentry) 722 OVL_I(inode)->lower = igrab(d_inode(oip->lowerpath->dentry)); 723 if (oip->lowerdata) 724 OVL_I(inode)->lowerdata = igrab(d_inode(oip->lowerdata)); 725 726 realinode = ovl_inode_real(inode); 727 ovl_copyattr(realinode, inode); 728 ovl_copyflags(realinode, inode); 729 ovl_map_ino(inode, ino, fsid); 730 } 731 732 static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) 733 { 734 inode->i_mode = mode; 735 inode->i_flags |= S_NOCMTIME; 736 #ifdef CONFIG_FS_POSIX_ACL 737 inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; 738 #endif 739 740 ovl_lockdep_annotate_inode_mutex_key(inode); 741 742 switch (mode & S_IFMT) { 743 case S_IFREG: 744 inode->i_op = &ovl_file_inode_operations; 745 inode->i_fop = &ovl_file_operations; 746 inode->i_mapping->a_ops = &ovl_aops; 747 break; 748 749 case S_IFDIR: 750 inode->i_op = &ovl_dir_inode_operations; 751 inode->i_fop = &ovl_dir_operations; 752 break; 753 754 case S_IFLNK: 755 inode->i_op = &ovl_symlink_inode_operations; 756 break; 757 758 default: 759 inode->i_op = &ovl_special_inode_operations; 760 init_special_inode(inode, mode, rdev); 761 break; 762 } 763 } 764 765 /* 766 * With inodes index enabled, an overlay inode nlink counts the union of upper 767 * hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure 768 * upper inode, the following nlink modifying operations can happen: 769 * 770 * 1. Lower hardlink copy up 771 * 2. Upper hardlink created, unlinked or renamed over 772 * 3. Lower hardlink whiteout or renamed over 773 * 774 * For the first, copy up case, the union nlink does not change, whether the 775 * operation succeeds or fails, but the upper inode nlink may change. 776 * Therefore, before copy up, we store the union nlink value relative to the 777 * lower inode nlink in the index inode xattr .overlay.nlink. 778 * 779 * For the second, upper hardlink case, the union nlink should be incremented 780 * or decremented IFF the operation succeeds, aligned with nlink change of the 781 * upper inode. Therefore, before link/unlink/rename, we store the union nlink 782 * value relative to the upper inode nlink in the index inode. 783 * 784 * For the last, lower cover up case, we simplify things by preceding the 785 * whiteout or cover up with copy up. This makes sure that there is an index 786 * upper inode where the nlink xattr can be stored before the copied up upper 787 * entry is unlink. 788 */ 789 #define OVL_NLINK_ADD_UPPER (1 << 0) 790 791 /* 792 * On-disk format for indexed nlink: 793 * 794 * nlink relative to the upper inode - "U[+-]NUM" 795 * nlink relative to the lower inode - "L[+-]NUM" 796 */ 797 798 static int ovl_set_nlink_common(struct dentry *dentry, 799 struct dentry *realdentry, const char *format) 800 { 801 struct inode *inode = d_inode(dentry); 802 struct inode *realinode = d_inode(realdentry); 803 char buf[13]; 804 int len; 805 806 len = snprintf(buf, sizeof(buf), format, 807 (int) (inode->i_nlink - realinode->i_nlink)); 808 809 if (WARN_ON(len >= sizeof(buf))) 810 return -EIO; 811 812 return ovl_do_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry), 813 OVL_XATTR_NLINK, buf, len); 814 } 815 816 int ovl_set_nlink_upper(struct dentry *dentry) 817 { 818 return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i"); 819 } 820 821 int ovl_set_nlink_lower(struct dentry *dentry) 822 { 823 return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i"); 824 } 825 826 unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, 827 struct dentry *upperdentry, 828 unsigned int fallback) 829 { 830 int nlink_diff; 831 int nlink; 832 char buf[13]; 833 int err; 834 835 if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1) 836 return fallback; 837 838 err = ovl_do_getxattr(ofs, upperdentry, OVL_XATTR_NLINK, 839 &buf, sizeof(buf) - 1); 840 if (err < 0) 841 goto fail; 842 843 buf[err] = '\0'; 844 if ((buf[0] != 'L' && buf[0] != 'U') || 845 (buf[1] != '+' && buf[1] != '-')) 846 goto fail; 847 848 err = kstrtoint(buf + 1, 10, &nlink_diff); 849 if (err < 0) 850 goto fail; 851 852 nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink; 853 nlink += nlink_diff; 854 855 if (nlink <= 0) 856 goto fail; 857 858 return nlink; 859 860 fail: 861 pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n", 862 upperdentry, err); 863 return fallback; 864 } 865 866 struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) 867 { 868 struct inode *inode; 869 870 inode = new_inode(sb); 871 if (inode) 872 ovl_fill_inode(inode, mode, rdev); 873 874 return inode; 875 } 876 877 static int ovl_inode_test(struct inode *inode, void *data) 878 { 879 return inode->i_private == data; 880 } 881 882 static int ovl_inode_set(struct inode *inode, void *data) 883 { 884 inode->i_private = data; 885 return 0; 886 } 887 888 static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry, 889 struct dentry *upperdentry, bool strict) 890 { 891 /* 892 * For directories, @strict verify from lookup path performs consistency 893 * checks, so NULL lower/upper in dentry must match NULL lower/upper in 894 * inode. Non @strict verify from NFS handle decode path passes NULL for 895 * 'unknown' lower/upper. 896 */ 897 if (S_ISDIR(inode->i_mode) && strict) { 898 /* Real lower dir moved to upper layer under us? */ 899 if (!lowerdentry && ovl_inode_lower(inode)) 900 return false; 901 902 /* Lookup of an uncovered redirect origin? */ 903 if (!upperdentry && ovl_inode_upper(inode)) 904 return false; 905 } 906 907 /* 908 * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL. 909 * This happens when finding a copied up overlay inode for a renamed 910 * or hardlinked overlay dentry and lower dentry cannot be followed 911 * by origin because lower fs does not support file handles. 912 */ 913 if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry)) 914 return false; 915 916 /* 917 * Allow non-NULL __upperdentry in inode even if upperdentry is NULL. 918 * This happens when finding a lower alias for a copied up hard link. 919 */ 920 if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry)) 921 return false; 922 923 return true; 924 } 925 926 struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, 927 bool is_upper) 928 { 929 struct inode *inode, *key = d_inode(real); 930 931 inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key); 932 if (!inode) 933 return NULL; 934 935 if (!ovl_verify_inode(inode, is_upper ? NULL : real, 936 is_upper ? real : NULL, false)) { 937 iput(inode); 938 return ERR_PTR(-ESTALE); 939 } 940 941 return inode; 942 } 943 944 bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir) 945 { 946 struct inode *key = d_inode(dir); 947 struct inode *trap; 948 bool res; 949 950 trap = ilookup5(sb, (unsigned long) key, ovl_inode_test, key); 951 if (!trap) 952 return false; 953 954 res = IS_DEADDIR(trap) && !ovl_inode_upper(trap) && 955 !ovl_inode_lower(trap); 956 957 iput(trap); 958 return res; 959 } 960 961 /* 962 * Create an inode cache entry for layer root dir, that will intentionally 963 * fail ovl_verify_inode(), so any lookup that will find some layer root 964 * will fail. 965 */ 966 struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir) 967 { 968 struct inode *key = d_inode(dir); 969 struct inode *trap; 970 971 if (!d_is_dir(dir)) 972 return ERR_PTR(-ENOTDIR); 973 974 trap = iget5_locked(sb, (unsigned long) key, ovl_inode_test, 975 ovl_inode_set, key); 976 if (!trap) 977 return ERR_PTR(-ENOMEM); 978 979 if (!(trap->i_state & I_NEW)) { 980 /* Conflicting layer roots? */ 981 iput(trap); 982 return ERR_PTR(-ELOOP); 983 } 984 985 trap->i_mode = S_IFDIR; 986 trap->i_flags = S_DEAD; 987 unlock_new_inode(trap); 988 989 return trap; 990 } 991 992 /* 993 * Does overlay inode need to be hashed by lower inode? 994 */ 995 static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, 996 struct dentry *lower, bool index) 997 { 998 struct ovl_fs *ofs = sb->s_fs_info; 999 1000 /* No, if pure upper */ 1001 if (!lower) 1002 return false; 1003 1004 /* Yes, if already indexed */ 1005 if (index) 1006 return true; 1007 1008 /* Yes, if won't be copied up */ 1009 if (!ovl_upper_mnt(ofs)) 1010 return true; 1011 1012 /* No, if lower hardlink is or will be broken on copy up */ 1013 if ((upper || !ovl_indexdir(sb)) && 1014 !d_is_dir(lower) && d_inode(lower)->i_nlink > 1) 1015 return false; 1016 1017 /* No, if non-indexed upper with NFS export */ 1018 if (sb->s_export_op && upper) 1019 return false; 1020 1021 /* Otherwise, hash by lower inode for fsnotify */ 1022 return true; 1023 } 1024 1025 static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode, 1026 struct inode *key) 1027 { 1028 return newinode ? inode_insert5(newinode, (unsigned long) key, 1029 ovl_inode_test, ovl_inode_set, key) : 1030 iget5_locked(sb, (unsigned long) key, 1031 ovl_inode_test, ovl_inode_set, key); 1032 } 1033 1034 struct inode *ovl_get_inode(struct super_block *sb, 1035 struct ovl_inode_params *oip) 1036 { 1037 struct ovl_fs *ofs = OVL_FS(sb); 1038 struct dentry *upperdentry = oip->upperdentry; 1039 struct ovl_path *lowerpath = oip->lowerpath; 1040 struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; 1041 struct inode *inode; 1042 struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; 1043 bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, 1044 oip->index); 1045 int fsid = bylower ? lowerpath->layer->fsid : 0; 1046 bool is_dir; 1047 unsigned long ino = 0; 1048 int err = oip->newinode ? -EEXIST : -ENOMEM; 1049 1050 if (!realinode) 1051 realinode = d_inode(lowerdentry); 1052 1053 /* 1054 * Copy up origin (lower) may exist for non-indexed upper, but we must 1055 * not use lower as hash key if this is a broken hardlink. 1056 */ 1057 is_dir = S_ISDIR(realinode->i_mode); 1058 if (upperdentry || bylower) { 1059 struct inode *key = d_inode(bylower ? lowerdentry : 1060 upperdentry); 1061 unsigned int nlink = is_dir ? 1 : realinode->i_nlink; 1062 1063 inode = ovl_iget5(sb, oip->newinode, key); 1064 if (!inode) 1065 goto out_err; 1066 if (!(inode->i_state & I_NEW)) { 1067 /* 1068 * Verify that the underlying files stored in the inode 1069 * match those in the dentry. 1070 */ 1071 if (!ovl_verify_inode(inode, lowerdentry, upperdentry, 1072 true)) { 1073 iput(inode); 1074 err = -ESTALE; 1075 goto out_err; 1076 } 1077 1078 dput(upperdentry); 1079 kfree(oip->redirect); 1080 goto out; 1081 } 1082 1083 /* Recalculate nlink for non-dir due to indexing */ 1084 if (!is_dir) 1085 nlink = ovl_get_nlink(ofs, lowerdentry, upperdentry, 1086 nlink); 1087 set_nlink(inode, nlink); 1088 ino = key->i_ino; 1089 } else { 1090 /* Lower hardlink that will be broken on copy up */ 1091 inode = new_inode(sb); 1092 if (!inode) { 1093 err = -ENOMEM; 1094 goto out_err; 1095 } 1096 ino = realinode->i_ino; 1097 fsid = lowerpath->layer->fsid; 1098 } 1099 ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); 1100 ovl_inode_init(inode, oip, ino, fsid); 1101 1102 if (upperdentry && ovl_is_impuredir(sb, upperdentry)) 1103 ovl_set_flag(OVL_IMPURE, inode); 1104 1105 if (oip->index) 1106 ovl_set_flag(OVL_INDEX, inode); 1107 1108 OVL_I(inode)->redirect = oip->redirect; 1109 1110 if (bylower) 1111 ovl_set_flag(OVL_CONST_INO, inode); 1112 1113 /* Check for non-merge dir that may have whiteouts */ 1114 if (is_dir) { 1115 if (((upperdentry && lowerdentry) || oip->numlower > 1) || 1116 ovl_check_origin_xattr(ofs, upperdentry ?: lowerdentry)) { 1117 ovl_set_flag(OVL_WHITEOUTS, inode); 1118 } 1119 } 1120 1121 if (inode->i_state & I_NEW) 1122 unlock_new_inode(inode); 1123 out: 1124 return inode; 1125 1126 out_err: 1127 pr_warn_ratelimited("failed to get inode (%i)\n", err); 1128 inode = ERR_PTR(err); 1129 goto out; 1130 } 1131