// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright (C) 2011 Novell Inc.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/ratelimit.h>
#include <linux/fiemap.h>
#include "overlayfs.h"


int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
		struct iattr *attr)
{
	int err;
	bool full_copy_up = false;
	struct dentry *upperdentry;
	const struct cred *old_cred;

	err = setattr_prepare(&init_user_ns, dentry, attr);
	if (err)
		return err;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	if (attr->ia_valid & ATTR_SIZE) {
		struct inode *realinode = d_inode(ovl_dentry_real(dentry));

		err = -ETXTBSY;
		if (atomic_read(&realinode->i_writecount) < 0)
			goto out_drop_write;

		/* Truncate should trigger data copy up as well */
		full_copy_up = true;
	}

	if (!full_copy_up)
		err = ovl_copy_up(dentry);
	else
		err = ovl_copy_up_with_data(dentry);
	if (!err) {
		struct inode *winode = NULL;

		upperdentry = ovl_dentry_upper(dentry);

		if (attr->ia_valid & ATTR_SIZE) {
			winode = d_inode(upperdentry);
			err = get_write_access(winode);
			if (err)
				goto out_drop_write;
		}

		if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
			attr->ia_valid &= ~ATTR_MODE;

		/*
		 * We might have to translate the ovl file into a real file
		 * object once use cases emerge.  For now, simply don't let
		 * the underlying filesystem rely on attr->ia_file.
		 */
		attr->ia_valid &= ~ATTR_FILE;

		/*
		 * If open(O_TRUNC) is done, VFS calls ->setattr with ATTR_OPEN
		 * set.  Overlayfs does not pass the O_TRUNC flag to the
		 * underlying filesystem during open -> do not pass ATTR_OPEN.
		 * This disables an optimization in fuse which assumes that
		 * open(O_TRUNC) already set the file size to 0.  But we never
		 * passed O_TRUNC to fuse, so by clearing ATTR_OPEN, fuse will
		 * be forced to send a setattr request to the server.
		 */
		attr->ia_valid &= ~ATTR_OPEN;

		inode_lock(upperdentry->d_inode);
		old_cred = ovl_override_creds(dentry->d_sb);
		err = notify_change(&init_user_ns, upperdentry, attr, NULL);
		revert_creds(old_cred);
		if (!err)
			ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
		inode_unlock(upperdentry->d_inode);

		if (winode)
			put_write_access(winode);
	}
out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}

static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
{
	bool samefs = ovl_same_fs(dentry->d_sb);
	unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
	unsigned int xinoshift = 64 - xinobits;

	if (samefs) {
		/*
		 * When all layers are on the same fs, all real inode
		 * numbers are unique, so we use the overlay st_dev,
		 * which is friendly to du -x.
		 */
		stat->dev = dentry->d_sb->s_dev;
		return;
	} else if (xinobits) {
		/*
		 * Inode numbers of the underlying fs should not be using the
		 * high xinobits, so we use the high xinobits to partition the
		 * overlay st_ino address space.  The high bits hold the fsid
		 * (upper fsid is 0).  The lowest xinobit is reserved for
		 * mapping the non-persistent inode numbers range in case of
		 * overflow.  This way all overlay inode numbers are unique
		 * and use the overlay st_dev.
		 */
		if (likely(!(stat->ino >> xinoshift))) {
			stat->ino |= ((u64)fsid) << (xinoshift + 1);
			stat->dev = dentry->d_sb->s_dev;
			return;
		} else if (ovl_xino_warn(dentry->d_sb)) {
			pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
					    dentry, stat->ino, xinobits);
		}
	}

	/* The inode could not be mapped to a unified st_ino address space */
	if (S_ISDIR(dentry->d_inode->i_mode)) {
		/*
		 * Always use the overlay st_dev for directories, so 'find
		 * -xdev' will scan the entire overlay mount and won't cross
		 * the overlay mount boundaries.
		 *
		 * If not all layers are on the same fs the pair {real st_ino;
		 * overlay st_dev} is not unique, so use the non-persistent
		 * overlay st_ino for directories.
		 */
		stat->dev = dentry->d_sb->s_dev;
		stat->ino = dentry->d_inode->i_ino;
	} else {
		/*
		 * For non-samefs setup, if we cannot map all layers st_ino
		 * to a unified address space, we need to make sure that st_dev
		 * is unique per underlying fs, so we use the unique anonymous
		 * bdev assigned to the underlying fs.
		 */
		stat->dev = OVL_FS(dentry->d_sb)->fs[fsid].pseudo_dev;
	}
}
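
/*
 * A worked example of the xino mapping above (the numbers are illustrative,
 * not from the original source): with xinobits == 8, and therefore
 * xinoshift == 56, a lower layer with fsid 2 and a real inode number 1000
 * is reported as
 *
 *	st_ino = 1000 | ((u64)2 << 57)
 *
 * i.e. the fsid in the top 7 bits, while bit 56 (the lowest xino bit) stays
 * reserved for the non-persistent inode number range handled by
 * ovl_map_ino() below.
 */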

int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
		struct kstat *stat, u32 request_mask, unsigned int flags)
{
	struct dentry *dentry = path->dentry;
	enum ovl_path_type type;
	struct path realpath;
	const struct cred *old_cred;
	bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
	int fsid = 0;
	int err;
	bool metacopy_blocks = false;

	metacopy_blocks = ovl_is_metacopy_dentry(dentry);

	type = ovl_path_real(dentry, &realpath);
	old_cred = ovl_override_creds(dentry->d_sb);
	err = vfs_getattr(&realpath, stat, request_mask, flags);
	if (err)
		goto out;

	/*
	 * For non-dir or same fs, we use st_ino of the copy up origin.
	 * This guarantees constant st_dev/st_ino across copy up.
	 * With the xino feature and non-samefs, we use st_ino of the copy up
	 * origin masked with high bits that represent the layer id.
	 *
	 * If the lower filesystem supports NFS file handles, this also
	 * guarantees persistent st_ino across mount cycles.
	 */
	if (!is_dir || ovl_same_dev(dentry->d_sb)) {
		if (!OVL_TYPE_UPPER(type)) {
			fsid = ovl_layer_lower(dentry)->fsid;
		} else if (OVL_TYPE_ORIGIN(type)) {
			struct kstat lowerstat;
			u32 lowermask = STATX_INO | STATX_BLOCKS |
					(!is_dir ? STATX_NLINK : 0);

			ovl_path_lower(dentry, &realpath);
			err = vfs_getattr(&realpath, &lowerstat,
					  lowermask, flags);
			if (err)
				goto out;

			/*
			 * Lower hardlinks may be broken on copy up to different
			 * upper files, so we cannot use the lower origin st_ino
			 * for those different files, even for the same fs case.
			 *
			 * Similarly, several redirected dirs can point to the
			 * same dir on a lower layer.  With the "verify_lower"
			 * feature, we do not use the lower origin st_ino, if
			 * we haven't verified that this redirect is unique.
			 *
			 * With inodes index enabled, it is safe to use st_ino
			 * of an indexed origin.  The index validates that the
			 * upper hardlink is not broken and that a redirected
			 * dir is the only redirect to that origin.
			 */
			if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
			    (!ovl_verify_lower(dentry->d_sb) &&
			     (is_dir || lowerstat.nlink == 1))) {
				fsid = ovl_layer_lower(dentry)->fsid;
				stat->ino = lowerstat.ino;
			}

			/*
			 * If we are querying a metacopy dentry and the lower
			 * dentry is a data dentry, then use the blocks we
			 * queried just now.  We don't have to do an additional
			 * vfs_getattr().  If lower itself is metacopy, then
			 * an additional vfs_getattr() is unavoidable.
			 */
			if (metacopy_blocks &&
			    realpath.dentry == ovl_dentry_lowerdata(dentry)) {
				stat->blocks = lowerstat.blocks;
				metacopy_blocks = false;
			}
		}

		if (metacopy_blocks) {
			/*
			 * If lower is not the same as lowerdata or if there
			 * was no origin on upper, we can end up here.
			 */
			struct kstat lowerdatastat;
			u32 lowermask = STATX_BLOCKS;

			ovl_path_lowerdata(dentry, &realpath);
			err = vfs_getattr(&realpath, &lowerdatastat,
					  lowermask, flags);
			if (err)
				goto out;
			stat->blocks = lowerdatastat.blocks;
		}
	}

	ovl_map_dev_ino(dentry, stat, fsid);

	/*
	 * It's probably not worth it to count subdirs to get the
	 * correct link count.  nlink=1 seems to pacify 'find' and
	 * other utilities.
	 */
	if (is_dir && OVL_TYPE_MERGE(type))
		stat->nlink = 1;

	/*
	 * Return the overlay inode nlink for indexed upper inodes.
	 * Overlay inode nlink counts the union of the upper hardlinks
	 * and non-covered lower hardlinks.  It does not include the upper
	 * index hardlink.
	 */
	if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry)))
		stat->nlink = dentry->d_inode->i_nlink;

out:
	revert_creds(old_cred);

	return err;
}

int ovl_permission(struct user_namespace *mnt_userns,
		   struct inode *inode, int mask)
{
	struct inode *upperinode = ovl_inode_upper(inode);
	struct inode *realinode = upperinode ?: ovl_inode_lower(inode);
	const struct cred *old_cred;
	int err;

	/* Careful in RCU walk mode */
	if (!realinode) {
		WARN_ON(!(mask & MAY_NOT_BLOCK));
		return -ECHILD;
	}

	/*
	 * Check the overlay inode with the creds of the task and the
	 * underlying inode with the creds of the mounter.
	 */
	err = generic_permission(&init_user_ns, inode, mask);
	if (err)
		return err;

	old_cred = ovl_override_creds(inode->i_sb);
	if (!upperinode &&
	    !special_file(realinode->i_mode) && mask & MAY_WRITE) {
		mask &= ~(MAY_WRITE | MAY_APPEND);
		/* Make sure mounter can read file for copy up later */
		mask |= MAY_READ;
	}
	err = inode_permission(&init_user_ns, realinode, mask);
	revert_creds(old_cred);

	return err;
}

static const char *ovl_get_link(struct dentry *dentry,
				struct inode *inode,
				struct delayed_call *done)
{
	const struct cred *old_cred;
	const char *p;

	if (!dentry)
		return ERR_PTR(-ECHILD);

	old_cred = ovl_override_creds(dentry->d_sb);
	p = vfs_get_link(ovl_dentry_real(dentry), done);
	revert_creds(old_cred);
	return p;
}

bool ovl_is_private_xattr(struct super_block *sb, const char *name)
{
	struct ovl_fs *ofs = sb->s_fs_info;

	if (ofs->config.userxattr)
		return strncmp(name, OVL_XATTR_USER_PREFIX,
			       sizeof(OVL_XATTR_USER_PREFIX) - 1) == 0;
	else
		return strncmp(name, OVL_XATTR_TRUSTED_PREFIX,
			       sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) == 0;
}
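
/*
 * For illustration, assuming the usual prefix definitions in overlayfs.h
 * (OVL_XATTR_TRUSTED_PREFIX == "trusted.overlay." and OVL_XATTR_USER_PREFIX
 * == "user.overlay."): on a mount without the "userxattr" option a name like
 * "trusted.overlay.origin" is private and filtered out by ovl_listxattr()
 * below, whereas with "userxattr" the "user.overlay.*" names are the private
 * ones instead.
 */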

int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
		  const void *value, size_t size, int flags)
{
	int err;
	struct dentry *upperdentry = ovl_i_dentry_upper(inode);
	struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
	const struct cred *old_cred;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	if (!value && !upperdentry) {
		old_cred = ovl_override_creds(dentry->d_sb);
		err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0);
		revert_creds(old_cred);
		if (err < 0)
			goto out_drop_write;
	}

	if (!upperdentry) {
		err = ovl_copy_up(dentry);
		if (err)
			goto out_drop_write;

		realdentry = ovl_dentry_upper(dentry);
	}

	old_cred = ovl_override_creds(dentry->d_sb);
	if (value)
		err = vfs_setxattr(&init_user_ns, realdentry, name, value, size,
				   flags);
	else {
		WARN_ON(flags != XATTR_REPLACE);
		err = vfs_removexattr(&init_user_ns, realdentry, name);
	}
	revert_creds(old_cred);

	/* copy c/mtime */
	ovl_copyattr(d_inode(realdentry), inode);

out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}

int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
		  void *value, size_t size)
{
	ssize_t res;
	const struct cred *old_cred;
	struct dentry *realdentry =
		ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry);

	old_cred = ovl_override_creds(dentry->d_sb);
	res = vfs_getxattr(&init_user_ns, realdentry, name, value, size);
	revert_creds(old_cred);
	return res;
}

static bool ovl_can_list(struct super_block *sb, const char *s)
{
	/* Never list private (.overlay) xattrs */
	if (ovl_is_private_xattr(sb, s))
		return false;

	/* List all non-trusted xattrs */
	if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
		return true;

	/* List other trusted xattrs for superuser only */
	return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
}

ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
	struct dentry *realdentry = ovl_dentry_real(dentry);
	ssize_t res;
	size_t len;
	char *s;
	const struct cred *old_cred;

	old_cred = ovl_override_creds(dentry->d_sb);
	res = vfs_listxattr(realdentry, list, size);
	revert_creds(old_cred);
	if (res <= 0 || size == 0)
		return res;

	/* filter out private xattrs */
	for (s = list, len = res; len;) {
		size_t slen = strnlen(s, len) + 1;

		/* Underlying fs providing us with a broken xattr list? */
		if (WARN_ON(slen > len))
			return -EIO;

		len -= slen;
		if (!ovl_can_list(dentry->d_sb, s)) {
			res -= slen;
			memmove(s, s + slen, len);
		} else {
			s += slen;
		}
	}

	return res;
}

struct posix_acl *ovl_get_acl(struct inode *inode, int type)
{
	struct inode *realinode = ovl_inode_real(inode);
	const struct cred *old_cred;
	struct posix_acl *acl;

	if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
		return NULL;

	old_cred = ovl_override_creds(inode->i_sb);
	acl = get_acl(realinode, type);
	revert_creds(old_cred);

	return acl;
}

int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags)
{
	if (flags & S_ATIME) {
		struct ovl_fs *ofs = inode->i_sb->s_fs_info;
		struct path upperpath = {
			.mnt = ovl_upper_mnt(ofs),
			.dentry = ovl_upperdentry_dereference(OVL_I(inode)),
		};

		if (upperpath.dentry) {
			touch_atime(&upperpath);
			inode->i_atime = d_inode(upperpath.dentry)->i_atime;
		}
	}
	return 0;
}

static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		      u64 start, u64 len)
{
	int err;
	struct inode *realinode = ovl_inode_realdata(inode);
	const struct cred *old_cred;

	if (!realinode->i_op->fiemap)
		return -EOPNOTSUPP;

	old_cred = ovl_override_creds(inode->i_sb);
	err = realinode->i_op->fiemap(realinode, fieinfo, start, len);
	revert_creds(old_cred);

	return err;
}

static const struct inode_operations ovl_file_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.get_acl	= ovl_get_acl,
	.update_time	= ovl_update_time,
	.fiemap		= ovl_fiemap,
};

static const struct inode_operations ovl_symlink_inode_operations = {
	.setattr	= ovl_setattr,
	.get_link	= ovl_get_link,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.update_time	= ovl_update_time,
};

static const struct inode_operations ovl_special_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.get_acl	= ovl_get_acl,
	.update_time	= ovl_update_time,
};

static const struct address_space_operations ovl_aops = {
	/* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
	.direct_IO	= noop_direct_IO,
};

/*
 * It is possible to stack an overlayfs instance on top of another
 * overlayfs instance as a lower layer.  We need to annotate the
 * stackable i_mutex locks according to the stack level of the super
 * block instance.  An overlayfs instance can never be in stack
 * depth 0 (there is always a real fs below it).  An overlayfs
 * inode lock will use the lockdep annotation ovl_i_mutex_key[depth].
 *
 * For example, here is a snip from /proc/lockdep_chains after
 * dir_iterate of nested overlayfs:
 *
 * [...] &ovl_i_mutex_dir_key[depth]   (stack_depth=2)
 * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
 * [...] &type->i_mutex_dir_key        (stack_depth=0)
 *
 * Locking order w.r.t. ovl_want_write() is important for nested overlayfs.
 *
 * This chain is valid:
 * - inode->i_rwsem		(inode_lock[2])
 * - upper_mnt->mnt_sb->s_writers	(ovl_want_write[0])
 * - OVL_I(inode)->lock		(ovl_inode_lock[2])
 * - OVL_I(lowerinode)->lock	(ovl_inode_lock[1])
 *
 * And this chain is valid:
 * - inode->i_rwsem		(inode_lock[2])
 * - OVL_I(inode)->lock		(ovl_inode_lock[2])
 * - lowerinode->i_rwsem	(inode_lock[1])
 * - OVL_I(lowerinode)->lock	(ovl_inode_lock[1])
 *
 * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is
 * held, because it is in reverse order of the non-nested case using the same
 * upper fs:
 * - inode->i_rwsem		(inode_lock[1])
 * - upper_mnt->mnt_sb->s_writers	(ovl_want_write[0])
 * - OVL_I(inode)->lock		(ovl_inode_lock[1])
 */
#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH

static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
	static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
	static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
	static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING];

	int depth = inode->i_sb->s_stack_depth - 1;

	if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING))
		depth = 0;

	if (S_ISDIR(inode->i_mode))
		lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
	else
		lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);

	lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]);
#endif
}

static void ovl_next_ino(struct inode *inode)
{
	struct ovl_fs *ofs = inode->i_sb->s_fs_info;

	inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
	if (unlikely(!inode->i_ino))
		inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
}

static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid)
{
	int xinobits = ovl_xino_bits(inode->i_sb);
	unsigned int xinoshift = 64 - xinobits;

	/*
	 * When d_ino is consistent with st_ino (samefs or i_ino has enough
	 * bits to encode the layer), set the same value used for st_ino to
	 * i_ino, so inode numbers exposed via /proc/locks and the like will
	 * be consistent with the d_ino and st_ino values.  An i_ino value
	 * inconsistent with d_ino also causes nfsd readdirplus to fail.
	 */
	inode->i_ino = ino;
	if (ovl_same_fs(inode->i_sb)) {
		return;
	} else if (xinobits && likely(!(ino >> xinoshift))) {
		inode->i_ino |= (unsigned long)fsid << (xinoshift + 1);
		return;
	}

	/*
	 * For directory inodes on non-samefs with xino disabled or xino
	 * overflow, we allocate a non-persistent inode number, to be used for
	 * resolving st_ino collisions in ovl_map_dev_ino().
	 *
	 * To avoid ino collisions with legitimate xino values from the upper
	 * layer (fsid 0), use the lowest xinobit to map the non-persistent
	 * inode numbers to the unified st_ino address space.
	 */
	if (S_ISDIR(inode->i_mode)) {
		ovl_next_ino(inode);
		if (xinobits) {
			inode->i_ino &= ~0UL >> xinobits;
			inode->i_ino |= 1UL << xinoshift;
		}
	}
}
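
/*
 * A worked example of the mapping above (illustrative numbers): on a 64-bit
 * kernel with xinobits == 8 (xinoshift == 56), a non-persistent inode number
 * N allocated by ovl_next_ino() becomes
 *
 *	i_ino = (N & (~0UL >> 8)) | (1UL << 56)
 *
 * i.e. the low 56 bits of N with bit 56 set.  Assuming, as noted above, that
 * the underlying filesystems do not use the high xino bits, this range
 * cannot collide with real upper inode numbers (fsid 0, bit 56 clear) or
 * with xino-mapped lower inode numbers (fsid encoded above bit 56).
 */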

void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
		    unsigned long ino, int fsid)
{
	struct inode *realinode;

	if (oip->upperdentry)
		OVL_I(inode)->__upperdentry = oip->upperdentry;
	if (oip->lowerpath && oip->lowerpath->dentry)
		OVL_I(inode)->lower = igrab(d_inode(oip->lowerpath->dentry));
	if (oip->lowerdata)
		OVL_I(inode)->lowerdata = igrab(d_inode(oip->lowerdata));

	realinode = ovl_inode_real(inode);
	ovl_copyattr(realinode, inode);
	ovl_copyflags(realinode, inode);
	ovl_map_ino(inode, ino, fsid);
}

static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
	inode->i_mode = mode;
	inode->i_flags |= S_NOCMTIME;
#ifdef CONFIG_FS_POSIX_ACL
	inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
#endif

	ovl_lockdep_annotate_inode_mutex_key(inode);

	switch (mode & S_IFMT) {
	case S_IFREG:
		inode->i_op = &ovl_file_inode_operations;
		inode->i_fop = &ovl_file_operations;
		inode->i_mapping->a_ops = &ovl_aops;
		break;

	case S_IFDIR:
		inode->i_op = &ovl_dir_inode_operations;
		inode->i_fop = &ovl_dir_operations;
		break;

	case S_IFLNK:
		inode->i_op = &ovl_symlink_inode_operations;
		break;

	default:
		inode->i_op = &ovl_special_inode_operations;
		init_special_inode(inode, mode, rdev);
		break;
	}
}

/*
 * With inodes index enabled, an overlay inode nlink counts the union of upper
 * hardlinks and non-covered lower hardlinks.  During the lifetime of a
 * non-pure upper inode, the following nlink modifying operations can happen:
 *
 * 1. Lower hardlink copy up
 * 2. Upper hardlink created, unlinked or renamed over
 * 3. Lower hardlink whiteout or renamed over
 *
 * For the first, copy up case, the union nlink does not change, whether the
 * operation succeeds or fails, but the upper inode nlink may change.
 * Therefore, before copy up, we store the union nlink value relative to the
 * lower inode nlink in the index inode xattr .overlay.nlink.
 *
 * For the second, upper hardlink case, the union nlink should be incremented
 * or decremented IFF the operation succeeds, aligned with the nlink change of
 * the upper inode.  Therefore, before link/unlink/rename, we store the union
 * nlink value relative to the upper inode nlink in the index inode.
 *
 * For the last, lower cover up case, we simplify things by preceding the
 * whiteout or cover up with copy up.  This makes sure that there is an index
 * upper inode where the nlink xattr can be stored before the copied up upper
 * entry is unlinked.
 */
#define OVL_NLINK_ADD_UPPER	(1 << 0)

/*
 * On-disk format for indexed nlink:
 *
 * nlink relative to the upper inode - "U[+-]NUM"
 * nlink relative to the lower inode - "L[+-]NUM"
 */

static int ovl_set_nlink_common(struct dentry *dentry,
				struct dentry *realdentry, const char *format)
{
	struct inode *inode = d_inode(dentry);
	struct inode *realinode = d_inode(realdentry);
	char buf[13];
	int len;

	len = snprintf(buf, sizeof(buf), format,
		       (int) (inode->i_nlink - realinode->i_nlink));

	if (WARN_ON(len >= sizeof(buf)))
		return -EIO;

	return ovl_do_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry),
			       OVL_XATTR_NLINK, buf, len);
}

int ovl_set_nlink_upper(struct dentry *dentry)
{
	return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i");
}

int ovl_set_nlink_lower(struct dentry *dentry)
{
	return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i");
}

unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
			   struct dentry *upperdentry,
			   unsigned int fallback)
{
	int nlink_diff;
	int nlink;
	char buf[13];
	int err;

	if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1)
		return fallback;

	err = ovl_do_getxattr(ofs, upperdentry, OVL_XATTR_NLINK,
			      &buf, sizeof(buf) - 1);
	if (err < 0)
		goto fail;

	buf[err] = '\0';
	if ((buf[0] != 'L' && buf[0] != 'U') ||
	    (buf[1] != '+' && buf[1] != '-'))
		goto fail;

	err = kstrtoint(buf + 1, 10, &nlink_diff);
	if (err < 0)
		goto fail;

	nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink;
	nlink += nlink_diff;

	if (nlink <= 0)
		goto fail;

	return nlink;

fail:
	pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n",
			    upperdentry, err);
	return fallback;
}
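
/*
 * For illustration (hypothetical values): an index nlink xattr of "U+1"
 * means the union nlink is one greater than the current nlink of the upper
 * inode, so ovl_get_nlink() above returns the upper inode's i_nlink + 1;
 * "L-1" means one less than the lower inode's nlink.  Because the stored
 * value is relative, recomputing from the current nlink keeps the union
 * count consistent whether or not the pending operation completed.
 */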

struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (inode)
		ovl_fill_inode(inode, mode, rdev);

	return inode;
}

static int ovl_inode_test(struct inode *inode, void *data)
{
	return inode->i_private == data;
}

static int ovl_inode_set(struct inode *inode, void *data)
{
	inode->i_private = data;
	return 0;
}

static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
			     struct dentry *upperdentry, bool strict)
{
	/*
	 * For directories, @strict verify from the lookup path performs
	 * consistency checks, so NULL lower/upper in the dentry must match
	 * NULL lower/upper in the inode.  Non @strict verify from the NFS
	 * handle decode path passes NULL for 'unknown' lower/upper.
	 */
	if (S_ISDIR(inode->i_mode) && strict) {
		/* Real lower dir moved to upper layer under us? */
		if (!lowerdentry && ovl_inode_lower(inode))
			return false;

		/* Lookup of an uncovered redirect origin? */
		if (!upperdentry && ovl_inode_upper(inode))
			return false;
	}

	/*
	 * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
	 * This happens when finding a copied up overlay inode for a renamed
	 * or hardlinked overlay dentry and the lower dentry cannot be followed
	 * by origin because the lower fs does not support file handles.
	 */
	if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
		return false;

	/*
	 * Allow non-NULL __upperdentry in the inode even if upperdentry is
	 * NULL.  This happens when finding a lower alias for a copied up
	 * hard link.
	 */
	if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry))
		return false;

	return true;
}

struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
			       bool is_upper)
{
	struct inode *inode, *key = d_inode(real);

	inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
	if (!inode)
		return NULL;

	if (!ovl_verify_inode(inode, is_upper ? NULL : real,
			      is_upper ? real : NULL, false)) {
		iput(inode);
		return ERR_PTR(-ESTALE);
	}

	return inode;
}

bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir)
{
	struct inode *key = d_inode(dir);
	struct inode *trap;
	bool res;

	trap = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
	if (!trap)
		return false;

	res = IS_DEADDIR(trap) && !ovl_inode_upper(trap) &&
	      !ovl_inode_lower(trap);

	iput(trap);
	return res;
}

/*
 * Create an inode cache entry for a layer root dir, that will intentionally
 * fail ovl_verify_inode(), so any lookup that finds some layer root will
 * fail.
 */
struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir)
{
	struct inode *key = d_inode(dir);
	struct inode *trap;

	if (!d_is_dir(dir))
		return ERR_PTR(-ENOTDIR);

	trap = iget5_locked(sb, (unsigned long) key, ovl_inode_test,
			    ovl_inode_set, key);
	if (!trap)
		return ERR_PTR(-ENOMEM);

	if (!(trap->i_state & I_NEW)) {
		/* Conflicting layer roots? */
		iput(trap);
		return ERR_PTR(-ELOOP);
	}

	trap->i_mode = S_IFDIR;
	trap->i_flags = S_DEAD;
	unlock_new_inode(trap);

	return trap;
}

/*
 * Does the overlay inode need to be hashed by the lower inode?
 */
static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
			     struct dentry *lower, bool index)
{
	struct ovl_fs *ofs = sb->s_fs_info;

	/* No, if pure upper */
	if (!lower)
		return false;

	/* Yes, if already indexed */
	if (index)
		return true;

	/* Yes, if won't be copied up */
	if (!ovl_upper_mnt(ofs))
		return true;

	/* No, if lower hardlink is or will be broken on copy up */
	if ((upper || !ovl_indexdir(sb)) &&
	    !d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
		return false;

	/* No, if non-indexed upper with NFS export */
	if (sb->s_export_op && upper)
		return false;

	/* Otherwise, hash by lower inode for fsnotify */
	return true;
}

static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode,
			       struct inode *key)
{
	return newinode ? inode_insert5(newinode, (unsigned long) key,
					ovl_inode_test, ovl_inode_set, key) :
			  iget5_locked(sb, (unsigned long) key,
				       ovl_inode_test, ovl_inode_set, key);
}

struct inode *ovl_get_inode(struct super_block *sb,
			    struct ovl_inode_params *oip)
{
	struct ovl_fs *ofs = OVL_FS(sb);
	struct dentry *upperdentry = oip->upperdentry;
	struct ovl_path *lowerpath = oip->lowerpath;
	struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
	struct inode *inode;
	struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
	bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
					oip->index);
	int fsid = bylower ? lowerpath->layer->fsid : 0;
	bool is_dir;
	unsigned long ino = 0;
	int err = oip->newinode ? -EEXIST : -ENOMEM;

	if (!realinode)
		realinode = d_inode(lowerdentry);

	/*
	 * Copy up origin (lower) may exist for non-indexed upper, but we must
	 * not use lower as hash key if this is a broken hardlink.
	 */
	is_dir = S_ISDIR(realinode->i_mode);
	if (upperdentry || bylower) {
		struct inode *key = d_inode(bylower ? lowerdentry :
					    upperdentry);
		unsigned int nlink = is_dir ? 1 : realinode->i_nlink;

		inode = ovl_iget5(sb, oip->newinode, key);
		if (!inode)
			goto out_err;
		if (!(inode->i_state & I_NEW)) {
			/*
			 * Verify that the underlying files stored in the inode
			 * match those in the dentry.
			 */
			if (!ovl_verify_inode(inode, lowerdentry, upperdentry,
					      true)) {
				iput(inode);
				err = -ESTALE;
				goto out_err;
			}

			dput(upperdentry);
			kfree(oip->redirect);
			goto out;
		}

		/* Recalculate nlink for non-dir due to indexing */
		if (!is_dir)
			nlink = ovl_get_nlink(ofs, lowerdentry, upperdentry,
					      nlink);
		set_nlink(inode, nlink);
		ino = key->i_ino;
	} else {
		/* Lower hardlink that will be broken on copy up */
		inode = new_inode(sb);
		if (!inode) {
			err = -ENOMEM;
			goto out_err;
		}
		ino = realinode->i_ino;
		fsid = lowerpath->layer->fsid;
	}
	ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
	ovl_inode_init(inode, oip, ino, fsid);

	if (upperdentry && ovl_is_impuredir(sb, upperdentry))
		ovl_set_flag(OVL_IMPURE, inode);

	if (oip->index)
		ovl_set_flag(OVL_INDEX, inode);

	OVL_I(inode)->redirect = oip->redirect;

	if (bylower)
		ovl_set_flag(OVL_CONST_INO, inode);

	/* Check for non-merge dir that may have whiteouts */
	if (is_dir) {
		if (((upperdentry && lowerdentry) || oip->numlower > 1) ||
		    ovl_check_origin_xattr(ofs, upperdentry ?: lowerdentry)) {
			ovl_set_flag(OVL_WHITEOUTS, inode);
		}
	}

	if (inode->i_state & I_NEW)
		unlock_new_inode(inode);
out:
	return inode;

out_err:
	pr_warn_ratelimited("failed to get inode (%i)\n", err);
	inode = ERR_PTR(err);
	goto out;
}