// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock LSM - Filesystem management and hooks
 *
 * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
 * Copyright © 2018-2020 ANSSI
 * Copyright © 2021-2022 Microsoft Corporation
 */

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/bits.h>
#include <linux/compiler_types.h>
#include <linux/dcache.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/limits.h>
#include <linux/list.h>
#include <linux/lsm_hooks.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/path.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/types.h>
#include <linux/wait_bit.h>
#include <linux/workqueue.h>
#include <uapi/linux/landlock.h>

#include "common.h"
#include "cred.h"
#include "fs.h"
#include "limits.h"
#include "object.h"
#include "ruleset.h"
#include "setup.h"

/* Underlying object management */

/*
 * Unties @object from its underlying inode and drops the inode reference that
 * get_inode_object() took with ihold().
 *
 * Entered with @object->lock held (see the __releases() annotation); the lock
 * is released on every return path.
 */
static void release_inode(struct landlock_object *const object)
	__releases(object->lock)
{
	struct inode *const inode = object->underobj;
	struct super_block *sb;

	/* Nothing to do if the inode was already untied from @object. */
	if (!inode) {
		spin_unlock(&object->lock);
		return;
	}

	/*
	 * Protects against concurrent use by hook_sb_delete() of the reference
	 * to the underlying inode.
	 */
	object->underobj = NULL;
	/*
	 * Makes sure that if the filesystem is concurrently unmounted,
	 * hook_sb_delete() will wait for us to finish iput().
	 */
	sb = inode->i_sb;
	atomic_long_inc(&landlock_superblock(sb)->inode_refs);
	spin_unlock(&object->lock);
	/*
	 * Because object->underobj was not NULL, hook_sb_delete() and
	 * get_inode_object() guarantee that it is safe to reset
	 * landlock_inode(inode)->object while it is not NULL.  It is therefore
	 * not necessary to lock inode->i_lock.
	 */
	rcu_assign_pointer(landlock_inode(inode)->object, NULL);
	/*
	 * Now, new rules can safely be tied to @inode with get_inode_object().
	 */

	iput(inode);
	/* Wakes up hook_sb_delete() once the last pending iput() is done. */
	if (atomic_long_dec_and_test(&landlock_superblock(sb)->inode_refs))
		wake_up_var(&landlock_superblock(sb)->inode_refs);
}

/* Release callback used for every object created by get_inode_object(). */
static const struct landlock_object_underops landlock_fs_underops = {
	.release = release_inode
};

/* Ruleset management */

/*
 * Returns the Landlock object tied to @inode, creating and attaching a new one
 * if there is none yet.
 *
 * When an object already exists, its usage counter is incremented before it is
 * returned.  Returns the ERR_PTR() from landlock_create_object() on failure.
 */
static struct landlock_object *get_inode_object(struct inode *const inode)
{
	struct landlock_object *object, *new_object;
	struct landlock_inode_security *inode_sec = landlock_inode(inode);

	rcu_read_lock();
retry:
	object = rcu_dereference(inode_sec->object);
	if (object) {
		if (likely(refcount_inc_not_zero(&object->usage))) {
			rcu_read_unlock();
			return object;
		}
		/*
		 * We are racing with release_inode(), the object is going
		 * away.  Wait for release_inode(), then retry.
		 */
		spin_lock(&object->lock);
		spin_unlock(&object->lock);
		goto retry;
	}
	rcu_read_unlock();

	/*
	 * If there is no object tied to @inode, then create a new one (without
	 * holding any locks).
	 */
	new_object = landlock_create_object(&landlock_fs_underops, inode);
	if (IS_ERR(new_object))
		return new_object;

	/*
	 * Protects against concurrent calls to get_inode_object() or
	 * hook_sb_delete().
	 */
	spin_lock(&inode->i_lock);
	if (unlikely(rcu_access_pointer(inode_sec->object))) {
		/* Someone else just created the object, bail out and retry. */
		spin_unlock(&inode->i_lock);
		kfree(new_object);

		/* The retry label expects the RCU read lock to be held. */
		rcu_read_lock();
		goto retry;
	}

	/*
	 * @inode will be released by hook_sb_delete() on its superblock
	 * shutdown, or by release_inode() when no more ruleset references the
	 * related object.
	 */
	ihold(inode);
	rcu_assign_pointer(inode_sec->object, new_object);
	spin_unlock(&inode->i_lock);
	return new_object;
}

/* All access rights that can be tied to files. */
/* clang-format off */
#define ACCESS_FILE ( \
	LANDLOCK_ACCESS_FS_EXECUTE | \
	LANDLOCK_ACCESS_FS_WRITE_FILE | \
	LANDLOCK_ACCESS_FS_READ_FILE)
/* clang-format on */

/*
 * Adds to @ruleset a rule granting @access_rights on the inode behind @path.
 *
 * @path: Should have been checked by get_path_from_fd().
 *
 * Returns 0 on success, -EINVAL if @path is not a directory and
 * @access_rights contains rights outside ACCESS_FILE or if @ruleset does not
 * have exactly one layer, or the error from get_inode_object() or
 * landlock_insert_rule().
 */
int landlock_append_fs_rule(struct landlock_ruleset *const ruleset,
			    const struct path *const path,
			    access_mask_t access_rights)
{
	int err;
	struct landlock_object *object;

	/* Files only get access rights that make sense. */
	if (!d_is_dir(path->dentry) &&
	    (access_rights | ACCESS_FILE) != ACCESS_FILE)
		return -EINVAL;
	if (WARN_ON_ONCE(ruleset->num_layers != 1))
		return -EINVAL;

	/* Transforms relative access rights to absolute ones. */
	access_rights |= LANDLOCK_MASK_ACCESS_FS & ~ruleset->fs_access_masks[0];
	object = get_inode_object(d_backing_inode(path->dentry));
	if (IS_ERR(object))
		return PTR_ERR(object);
	mutex_lock(&ruleset->lock);
	err = landlock_insert_rule(ruleset, object, access_rights);
	mutex_unlock(&ruleset->lock);
	/*
	 * No need to check for an error because landlock_insert_rule()
	 * increments the refcount for the new object if needed.
	 */
	landlock_put_object(object);
	return err;
}

/* Access-control management */

/*
 * The lifetime of the returned rule is tied to @domain.
 *
 * Returns NULL if no rule is found or if @dentry is negative.
 */
static inline const struct landlock_rule *
find_rule(const struct landlock_ruleset *const domain,
	  const struct dentry *const dentry)
{
	const struct landlock_rule *rule;
	const struct inode *inode;

	/* Ignores nonexistent leaves. */
	if (d_is_negative(dentry))
		return NULL;

	inode = d_backing_inode(dentry);
	/* The object pointer is only dereferenced under the RCU read lock. */
	rcu_read_lock();
	rule = landlock_find_rule(
		domain, rcu_dereference(landlock_inode(inode)->object));
	rcu_read_unlock();
	return rule;
}

/*
 * @layer_masks is read and may be updated according to the access request and
 * the matching rule.
 *
 * Returns true if the request is allowed (i.e. relevant layer masks for the
 * request are empty).  An empty @access_request or a NULL @layer_masks is
 * always allowed; otherwise a NULL @rule never is.
 */
static inline bool
unmask_layers(const struct landlock_rule *const rule,
	      const access_mask_t access_request,
	      layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
{
	size_t layer_level;

	if (!access_request || !layer_masks)
		return true;
	if (!rule)
		return false;

	/*
	 * An access is granted if, for each policy layer, at least one rule
	 * encountered on the pathwalk grants the requested access,
	 * regardless of its position in the layer stack.  We must then check
	 * the remaining layers for each inode, from the first added layer to
	 * the last one.  When there are multiple requested accesses, for each
	 * policy layer, the full set of requested accesses may not be granted
	 * by only one rule, but by the union (binary OR) of multiple rules.
	 * E.g. /a/b <execute> + /a <read> => /a/b <execute + read>
	 */
	for (layer_level = 0; layer_level < rule->num_layers; layer_level++) {
		const struct landlock_layer *const layer =
			&rule->layers[layer_level];
		/* layer->level is 1-based, mask bits are 0-based. */
		const layer_mask_t layer_bit = BIT_ULL(layer->level - 1);
		/* for_each_set_bit() needs an unsigned long to point to. */
		const unsigned long access_req = access_request;
		unsigned long access_bit;
		bool is_empty;

		/*
		 * Records in @layer_masks which layer grants access to each
		 * requested access.
		 */
		is_empty = true;
		for_each_set_bit(access_bit, &access_req,
				 ARRAY_SIZE(*layer_masks)) {
			if (layer->access & BIT_ULL(access_bit))
				(*layer_masks)[access_bit] &= ~layer_bit;
			is_empty = is_empty && !(*layer_masks)[access_bit];
		}
		if (is_empty)
			return true;
	}
	return false;
}

/*
 * Allows access to pseudo filesystems that will never be mountable (e.g.
 * sockfs, pipefs), but can still be reachable through
 * /proc/<pid>/fd/<file-descriptor>
 */
static inline bool is_nouser_or_private(const struct dentry *dentry)
{
	return (dentry->d_sb->s_flags & SB_NOUSER) ||
	       (d_is_positive(dentry) &&
		unlikely(IS_PRIVATE(d_backing_inode(dentry))));
}

/*
 * Returns the union of the access rights handled by at least one layer of
 * @domain.
 */
static inline access_mask_t
get_handled_accesses(const struct landlock_ruleset *const domain)
{
	access_mask_t access_dom = 0;
	unsigned long access_bit;

	for (access_bit = 0; access_bit < LANDLOCK_NUM_ACCESS_FS;
	     access_bit++) {
		size_t layer_level;

		for (layer_level = 0; layer_level < domain->num_layers;
		     layer_level++) {
			if (domain->fs_access_masks[layer_level] &
			    BIT_ULL(access_bit)) {
				access_dom |= BIT_ULL(access_bit);
				/* One handling layer is enough for this bit. */
				break;
			}
		}
	}
	return access_dom;
}

/*
 * Resets @layer_masks then, for each access right of @access_request, sets the
 * bit of every @domain layer that handles this access right.
 *
 * Returns the subset of @access_request that is handled by at least one layer
 * (0 if @access_request is empty).
 */
static inline access_mask_t
init_layer_masks(const struct landlock_ruleset *const domain,
		 const access_mask_t access_request,
		 layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
{
	access_mask_t handled_accesses = 0;
	size_t layer_level;

	memset(layer_masks, 0, sizeof(*layer_masks));
	/* An empty access request can happen because of O_WRONLY | O_RDWR. */
	if (!access_request)
		return 0;

	/* Saves all handled accesses per layer. */
	for (layer_level = 0; layer_level < domain->num_layers; layer_level++) {
		const unsigned long access_req = access_request;
		unsigned long access_bit;

		for_each_set_bit(access_bit, &access_req,
				 ARRAY_SIZE(*layer_masks)) {
			if (domain->fs_access_masks[layer_level] &
			    BIT_ULL(access_bit)) {
				(*layer_masks)[access_bit] |=
					BIT_ULL(layer_level);
				handled_accesses |= BIT_ULL(access_bit);
			}
		}
	}
	return handled_accesses;
}

/*
 * Check that a destination file hierarchy has more restrictions than a source
 * file hierarchy.  This is only used for link and rename actions.
 *
 * @layer_masks_child2: Optional child masks.
 *
 * Returns false as soon as one access right would be less restricted after
 * the move, true otherwise.
 */
static inline bool no_more_access(
	const layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS],
	const layer_mask_t (*const layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS],
	const bool child1_is_directory,
	const layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS],
	const layer_mask_t (*const layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS],
	const bool child2_is_directory)
{
	unsigned long access_bit;

	for (access_bit = 0; access_bit < ARRAY_SIZE(*layer_masks_parent2);
	     access_bit++) {
		/* Ignores accesses that only make sense for directories. */
		const bool is_file_access =
			!!(BIT_ULL(access_bit) & ACCESS_FILE);

		if (child1_is_directory || is_file_access) {
			/*
			 * Checks if the destination restrictions are a
			 * superset of the source ones (i.e. inherited access
			 * rights without child exceptions):
			 * restrictions(parent2) >= restrictions(child1)
			 */
			if ((((*layer_masks_parent1)[access_bit] &
			      (*layer_masks_child1)[access_bit]) |
			     (*layer_masks_parent2)[access_bit]) !=
			    (*layer_masks_parent2)[access_bit])
				return false;
		}

		if (!layer_masks_child2)
			continue;
		if (child2_is_directory || is_file_access) {
			/*
			 * Checks inverted restrictions for RENAME_EXCHANGE:
			 * restrictions(parent1) >= restrictions(child2)
			 */
			if ((((*layer_masks_parent2)[access_bit] &
			      (*layer_masks_child2)[access_bit]) |
			     (*layer_masks_parent1)[access_bit]) !=
			    (*layer_masks_parent1)[access_bit])
				return false;
		}
	}
	return true;
}

/*
 * Removes @layer_masks accesses that are not requested.
 *
 * Returns true if the request is allowed, false otherwise.
 */
static inline bool
scope_to_request(const access_mask_t access_request,
		 layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
{
	const unsigned long access_req = access_request;
	unsigned long access_bit;

	if (WARN_ON_ONCE(!layer_masks))
		return true;

	for_each_clear_bit(access_bit, &access_req, ARRAY_SIZE(*layer_masks))
		(*layer_masks)[access_bit] = 0;
	/* Allowed if and only if no layer bit remains in the whole matrix. */
	return !memchr_inv(layer_masks, 0, sizeof(*layer_masks));
}

/*
 * Returns true if there is at least one access right different than
 * LANDLOCK_ACCESS_FS_REFER that is still denied in @layer_masks.  Returns
 * false if @layer_masks is NULL.
 */
static inline bool
is_eacces(const layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS],
	  const access_mask_t access_request)
{
	unsigned long access_bit;
	/* LANDLOCK_ACCESS_FS_REFER alone must return -EXDEV. */
	const unsigned long access_check = access_request &
					   ~LANDLOCK_ACCESS_FS_REFER;

	if (!layer_masks)
		return false;

	for_each_set_bit(access_bit, &access_check, ARRAY_SIZE(*layer_masks)) {
		if ((*layer_masks)[access_bit])
			return true;
	}
	return false;
}

/**
 * check_access_path_dual - Check accesses for requests with a common path
 *
 * @domain: Domain to check against.
 * @path: File hierarchy to walk through.
 * @access_request_parent1: Accesses to check, once @layer_masks_parent1 is
 *     equal to @layer_masks_parent2 (if any).  This is tied to the unique
 *     requested path for most actions, or the source in case of a refer action
 *     (i.e. rename or link), or the source and destination in case of
 *     RENAME_EXCHANGE.
 * @layer_masks_parent1: Pointer to a matrix of layer masks per access
 *     masks, identifying the layers that forbid a specific access.  Bits from
 *     this matrix can be unset according to the @path walk.  An empty matrix
 *     means that @domain allows all possible Landlock accesses (i.e. not only
 *     those identified by @access_request_parent1).  This matrix can
 *     initially refer to domain layer masks and, when the accesses for the
 *     destination and source are the same, to requested layer masks.
 * @dentry_child1: Dentry to the initial child of the parent1 path.  This
 *     pointer must be NULL for non-refer actions (i.e. not link nor rename).
 * @access_request_parent2: Similar to @access_request_parent1 but for a
 *     request involving a source and a destination.  This refers to the
 *     destination, except in case of RENAME_EXCHANGE where it also refers to
 *     the source.  Must be set to 0 when using a simple path request.
 * @layer_masks_parent2: Similar to @layer_masks_parent1 but for a refer
 *     action.  This must be NULL otherwise.
 * @dentry_child2: Dentry to the initial child of the parent2 path.
 *     This pointer is only set for RENAME_EXCHANGE actions and must be NULL
 *     otherwise.
 *
 * This helper first checks that the destination has a superset of restrictions
 * compared to the source (if any) for a common path.  Because of
 * RENAME_EXCHANGE actions, source and destinations may be swapped.  It then
 * checks that the collected accesses and the remaining ones are enough to
 * allow the request.
 *
 * Returns:
 * - 0 if the access request is granted;
 * - -EACCES if it is denied because of access right other than
 *   LANDLOCK_ACCESS_FS_REFER;
 * - -EXDEV if the renaming or linking would be a privilege escalation
 *   (according to each layered policies), or if LANDLOCK_ACCESS_FS_REFER is
 *   not allowed by the source or the destination.
 */
static int check_access_path_dual(
	const struct landlock_ruleset *const domain,
	const struct path *const path,
	const access_mask_t access_request_parent1,
	layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS],
	const struct dentry *const dentry_child1,
	const access_mask_t access_request_parent2,
	layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS],
	const struct dentry *const dentry_child2)
{
	bool allowed_parent1 = false, allowed_parent2 = false, is_dom_check,
	     child1_is_directory = true, child2_is_directory = true;
	struct path walker_path;
	access_mask_t access_masked_parent1, access_masked_parent2;
	/* Backing storage for the optional child matrices declared below. */
	layer_mask_t _layer_masks_child1[LANDLOCK_NUM_ACCESS_FS],
		_layer_masks_child2[LANDLOCK_NUM_ACCESS_FS];
	layer_mask_t(*layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS] = NULL,
	(*layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS] = NULL;

	/* Nothing requested: nothing to deny. */
	if (!access_request_parent1 && !access_request_parent2)
		return 0;
	if (WARN_ON_ONCE(!domain || !path))
		return 0;
	if (is_nouser_or_private(path->dentry))
		return 0;
	if (WARN_ON_ONCE(domain->num_layers < 1 || !layer_masks_parent1))
		return -EACCES;

	if (unlikely(layer_masks_parent2)) {
		if (WARN_ON_ONCE(!dentry_child1))
			return -EACCES;
		/*
		 * For a double request, first check for potential privilege
		 * escalation by looking at domain handled accesses (which are
		 * a superset of the meaningful requested accesses).
		 */
		access_masked_parent1 = access_masked_parent2 =
			get_handled_accesses(domain);
		is_dom_check = true;
	} else {
		if (WARN_ON_ONCE(dentry_child1 || dentry_child2))
			return -EACCES;
		/* For a simple request, only check for requested accesses. */
		access_masked_parent1 = access_request_parent1;
		access_masked_parent2 = access_request_parent2;
		is_dom_check = false;
	}

	/*
	 * Collects the accesses granted by rules tied directly to each child;
	 * they are compared with the parent matrices by no_more_access().
	 */
	if (unlikely(dentry_child1)) {
		unmask_layers(find_rule(domain, dentry_child1),
			      init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
					       &_layer_masks_child1),
			      &_layer_masks_child1);
		layer_masks_child1 = &_layer_masks_child1;
		child1_is_directory = d_is_dir(dentry_child1);
	}
	if (unlikely(dentry_child2)) {
		unmask_layers(find_rule(domain, dentry_child2),
			      init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
					       &_layer_masks_child2),
			      &_layer_masks_child2);
		layer_masks_child2 = &_layer_masks_child2;
		child2_is_directory = d_is_dir(dentry_child2);
	}

	/* Works on a private, referenced copy of @path; put after the walk. */
	walker_path = *path;
	path_get(&walker_path);
	/*
	 * We need to walk through all the hierarchy to not miss any relevant
	 * restriction.
	 */
	while (true) {
		struct dentry *parent_dentry;
		const struct landlock_rule *rule;

		/*
		 * If at least all accesses allowed on the destination are
		 * already allowed on the source, respectively if there are at
		 * least as many restrictions on the destination as on the
		 * source, then we can safely refer files from the source to
		 * the destination without risking a privilege escalation.
		 * This also applies in the case of RENAME_EXCHANGE, which
		 * implies checks in both directions.  This is crucial for
		 * standalone multilayered security policies.  Furthermore,
		 * this helps prevent policy writers from shooting themselves
		 * in the foot.
		 */
		if (unlikely(is_dom_check &&
			     no_more_access(
				     layer_masks_parent1, layer_masks_child1,
				     child1_is_directory, layer_masks_parent2,
				     layer_masks_child2,
				     child2_is_directory))) {
			allowed_parent1 = scope_to_request(
				access_request_parent1, layer_masks_parent1);
			allowed_parent2 = scope_to_request(
				access_request_parent2, layer_masks_parent2);

			/* Stops when all accesses are granted. */
			if (allowed_parent1 && allowed_parent2)
				break;

			/*
			 * Now, downgrades the remaining checks from domain
			 * handled accesses to requested accesses.
			 */
			is_dom_check = false;
			access_masked_parent1 = access_request_parent1;
			access_masked_parent2 = access_request_parent2;
		}

		rule = find_rule(domain, walker_path.dentry);
		allowed_parent1 = unmask_layers(rule, access_masked_parent1,
						layer_masks_parent1);
		allowed_parent2 = unmask_layers(rule, access_masked_parent2,
						layer_masks_parent2);

		/* Stops when a rule from each layer grants access. */
		if (allowed_parent1 && allowed_parent2)
			break;

jump_up:
		if (walker_path.dentry == walker_path.mnt->mnt_root) {
			if (follow_up(&walker_path)) {
				/* Ignores hidden mount points. */
				goto jump_up;
			} else {
				/*
				 * Stops at the real root.  Denies access
				 * because not all layers have granted access.
				 */
				break;
			}
		}
		if (unlikely(IS_ROOT(walker_path.dentry))) {
			/*
			 * Stops at disconnected root directories.  Only allows
			 * access to internal filesystems (e.g. nsfs, which is
			 * reachable through /proc/<pid>/ns/<namespace>).
			 */
			allowed_parent1 = allowed_parent2 =
				!!(walker_path.mnt->mnt_flags & MNT_INTERNAL);
			break;
		}
		/* Moves one level up, keeping exactly one dentry reference. */
		parent_dentry = dget_parent(walker_path.dentry);
		dput(walker_path.dentry);
		walker_path.dentry = parent_dentry;
	}
	path_put(&walker_path);

	if (allowed_parent1 && allowed_parent2)
		return 0;

	/*
	 * This prioritizes EACCES over EXDEV for all actions, including
	 * renames with RENAME_EXCHANGE.
	 */
	if (likely(is_eacces(layer_masks_parent1, access_request_parent1) ||
		   is_eacces(layer_masks_parent2, access_request_parent2)))
		return -EACCES;

	/*
	 * Gracefully forbids reparenting if the destination directory
	 * hierarchy is not a superset of restrictions of the source directory
	 * hierarchy, or if LANDLOCK_ACCESS_FS_REFER is not allowed by the
	 * source or the destination.
	 */
	return -EXDEV;
}

/*
 * Checks a simple (single path) access request against @domain.
 *
 * Only the part of @access_request handled by @domain is checked.  Returns the
 * result of check_access_path_dual().
 */
static inline int check_access_path(const struct landlock_ruleset *const domain,
				    const struct path *const path,
				    access_mask_t access_request)
{
	layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};

	access_request = init_layer_masks(domain, access_request, &layer_masks);
	return check_access_path_dual(domain, path, access_request,
				      &layer_masks, NULL, 0, NULL, NULL);
}

/*
 * Same as check_access_path() but against the domain of the current task.
 *
 * Returns 0 if the current task has no Landlock domain.
 */
static inline int current_check_access_path(const struct path *const path,
					    const access_mask_t access_request)
{
	const struct landlock_ruleset *const dom =
		landlock_get_current_domain();

	if (!dom)
		return 0;
	return check_access_path(dom, path, access_request);
}

/*
 * Maps the file type bits of @mode to the LANDLOCK_ACCESS_FS_MAKE_* right
 * required to create such a file.
 *
 * Warns once and returns 0 for an unknown file type.
 */
static inline access_mask_t get_mode_access(const umode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFLNK:
		return LANDLOCK_ACCESS_FS_MAKE_SYM;
	case 0:
		/* A zero mode translates to S_IFREG. */
	case S_IFREG:
		return LANDLOCK_ACCESS_FS_MAKE_REG;
	case S_IFDIR:
		return LANDLOCK_ACCESS_FS_MAKE_DIR;
	case S_IFCHR:
		return LANDLOCK_ACCESS_FS_MAKE_CHAR;
	case S_IFBLK:
		return LANDLOCK_ACCESS_FS_MAKE_BLOCK;
	case S_IFIFO:
		return LANDLOCK_ACCESS_FS_MAKE_FIFO;
	case S_IFSOCK:
		return LANDLOCK_ACCESS_FS_MAKE_SOCK;
	default:
		WARN_ON_ONCE(1);
		return 0;
	}
}

/*
 * Returns the access right required to remove @dentry according to its type,
 * or 0 if @dentry is negative (i.e. there is nothing to remove).
 */
static inline access_mask_t maybe_remove(const struct dentry *const dentry)
{
	if (d_is_negative(dentry))
		return 0;
	return d_is_dir(dentry) ? LANDLOCK_ACCESS_FS_REMOVE_DIR :
				  LANDLOCK_ACCESS_FS_REMOVE_FILE;
}

/**
 * collect_domain_accesses - Walk through a file path and collect accesses
 *
 * @domain: Domain to check against.
 * @mnt_root: Last directory to check.
 * @dir: Directory to start the walk from.
 * @layer_masks_dom: Where to store the collected accesses.
 *
 * This helper is useful to begin a path walk from the @dir directory to a
 * @mnt_root directory used as a mount point.  This mount point is the common
 * ancestor between the source and the destination of a renamed and linked
 * file.  While walking from @dir to @mnt_root, we record all the domain's
 * allowed accesses in @layer_masks_dom.
 *
 * This is similar to check_access_path_dual() but much simpler because it only
 * handles walking on the same mount point and only checks one set of accesses.
 *
 * Returns:
 * - true if all the domain access rights are allowed for @dir;
 * - false if the walk reached @mnt_root.
714 */ 715 static bool collect_domain_accesses( 716 const struct landlock_ruleset *const domain, 717 const struct dentry *const mnt_root, struct dentry *dir, 718 layer_mask_t (*const layer_masks_dom)[LANDLOCK_NUM_ACCESS_FS]) 719 { 720 unsigned long access_dom; 721 bool ret = false; 722 723 if (WARN_ON_ONCE(!domain || !mnt_root || !dir || !layer_masks_dom)) 724 return true; 725 if (is_nouser_or_private(dir)) 726 return true; 727 728 access_dom = init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS, 729 layer_masks_dom); 730 731 dget(dir); 732 while (true) { 733 struct dentry *parent_dentry; 734 735 /* Gets all layers allowing all domain accesses. */ 736 if (unmask_layers(find_rule(domain, dir), access_dom, 737 layer_masks_dom)) { 738 /* 739 * Stops when all handled accesses are allowed by at 740 * least one rule in each layer. 741 */ 742 ret = true; 743 break; 744 } 745 746 /* We should not reach a root other than @mnt_root. */ 747 if (dir == mnt_root || WARN_ON_ONCE(IS_ROOT(dir))) 748 break; 749 750 parent_dentry = dget_parent(dir); 751 dput(dir); 752 dir = parent_dentry; 753 } 754 dput(dir); 755 return ret; 756 } 757 758 /** 759 * current_check_refer_path - Check if a rename or link action is allowed 760 * 761 * @old_dentry: File or directory requested to be moved or linked. 762 * @new_dir: Destination parent directory. 763 * @new_dentry: Destination file or directory. 764 * @removable: Sets to true if it is a rename operation. 765 * @exchange: Sets to true if it is a rename operation with RENAME_EXCHANGE. 766 * 767 * Because of its unprivileged constraints, Landlock relies on file hierarchies 768 * (and not only inodes) to tie access rights to files. Being able to link or 769 * rename a file hierarchy brings some challenges. Indeed, moving or linking a 770 * file (i.e. creating a new reference to an inode) can have an impact on the 771 * actions allowed for a set of files if it would change its parent directory 772 * (i.e. reparenting). 
773 * 774 * To avoid trivial access right bypasses, Landlock first checks if the file or 775 * directory requested to be moved would gain new access rights inherited from 776 * its new hierarchy. Before returning any error, Landlock then checks that 777 * the parent source hierarchy and the destination hierarchy would allow the 778 * link or rename action. If it is not the case, an error with EACCES is 779 * returned to inform user space that there is no way to remove or create the 780 * requested source file type. If it should be allowed but the new inherited 781 * access rights would be greater than the source access rights, then the 782 * kernel returns an error with EXDEV. Prioritizing EACCES over EXDEV enables 783 * user space to abort the whole operation if there is no way to do it, or to 784 * manually copy the source to the destination if this remains allowed, e.g. 785 * because file creation is allowed on the destination directory but not direct 786 * linking. 787 * 788 * To achieve this goal, the kernel needs to compare two file hierarchies: the 789 * one identifying the source file or directory (including itself), and the 790 * destination one. This can be seen as a multilayer partial ordering problem. 791 * The kernel walks through these paths and collects in a matrix the access 792 * rights that are denied per layer. These matrices are then compared to see 793 * if the destination one has more (or the same) restrictions as the source 794 * one. If this is the case, the requested action will not return EXDEV, which 795 * doesn't mean the action is allowed. The parent hierarchy of the source 796 * (i.e. parent directory), and the destination hierarchy must also be checked 797 * to verify that they explicitly allow such action (i.e. referencing, 798 * creation and potentially removal rights). The kernel implementation is then 799 * required to rely on potentially four matrices of access rights: one for the 800 * source file or directory (i.e. 
the child), a potentially other one for the 801 * other source/destination (in case of RENAME_EXCHANGE), one for the source 802 * parent hierarchy and a last one for the destination hierarchy. These 803 * ephemeral matrices take some space on the stack, which limits the number of 804 * layers to a deemed reasonable number: 16. 805 * 806 * Returns: 807 * - 0 if access is allowed; 808 * - -EXDEV if @old_dentry would inherit new access rights from @new_dir; 809 * - -EACCES if file removal or creation is denied. 810 */ 811 static int current_check_refer_path(struct dentry *const old_dentry, 812 const struct path *const new_dir, 813 struct dentry *const new_dentry, 814 const bool removable, const bool exchange) 815 { 816 const struct landlock_ruleset *const dom = 817 landlock_get_current_domain(); 818 bool allow_parent1, allow_parent2; 819 access_mask_t access_request_parent1, access_request_parent2; 820 struct path mnt_dir; 821 layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS], 822 layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS]; 823 824 if (!dom) 825 return 0; 826 if (WARN_ON_ONCE(dom->num_layers < 1)) 827 return -EACCES; 828 if (unlikely(d_is_negative(old_dentry))) 829 return -ENOENT; 830 if (exchange) { 831 if (unlikely(d_is_negative(new_dentry))) 832 return -ENOENT; 833 access_request_parent1 = 834 get_mode_access(d_backing_inode(new_dentry)->i_mode); 835 } else { 836 access_request_parent1 = 0; 837 } 838 access_request_parent2 = 839 get_mode_access(d_backing_inode(old_dentry)->i_mode); 840 if (removable) { 841 access_request_parent1 |= maybe_remove(old_dentry); 842 access_request_parent2 |= maybe_remove(new_dentry); 843 } 844 845 /* The mount points are the same for old and new paths, cf. EXDEV. */ 846 if (old_dentry->d_parent == new_dir->dentry) { 847 /* 848 * The LANDLOCK_ACCESS_FS_REFER access right is not required 849 * for same-directory referer (i.e. no reparenting). 
850 */ 851 access_request_parent1 = init_layer_masks( 852 dom, access_request_parent1 | access_request_parent2, 853 &layer_masks_parent1); 854 return check_access_path_dual(dom, new_dir, 855 access_request_parent1, 856 &layer_masks_parent1, NULL, 0, 857 NULL, NULL); 858 } 859 860 /* Backward compatibility: no reparenting support. */ 861 if (!(get_handled_accesses(dom) & LANDLOCK_ACCESS_FS_REFER)) 862 return -EXDEV; 863 864 access_request_parent1 |= LANDLOCK_ACCESS_FS_REFER; 865 access_request_parent2 |= LANDLOCK_ACCESS_FS_REFER; 866 867 /* Saves the common mount point. */ 868 mnt_dir.mnt = new_dir->mnt; 869 mnt_dir.dentry = new_dir->mnt->mnt_root; 870 871 /* new_dir->dentry is equal to new_dentry->d_parent */ 872 allow_parent1 = collect_domain_accesses(dom, mnt_dir.dentry, 873 old_dentry->d_parent, 874 &layer_masks_parent1); 875 allow_parent2 = collect_domain_accesses( 876 dom, mnt_dir.dentry, new_dir->dentry, &layer_masks_parent2); 877 878 if (allow_parent1 && allow_parent2) 879 return 0; 880 881 /* 882 * To be able to compare source and destination domain access rights, 883 * take into account the @old_dentry access rights aggregated with its 884 * parent access rights. This will be useful to compare with the 885 * destination parent access rights. 886 */ 887 return check_access_path_dual(dom, &mnt_dir, access_request_parent1, 888 &layer_masks_parent1, old_dentry, 889 access_request_parent2, 890 &layer_masks_parent2, 891 exchange ? new_dentry : NULL); 892 } 893 894 /* Inode hooks */ 895 896 static void hook_inode_free_security(struct inode *const inode) 897 { 898 /* 899 * All inodes must already have been untied from their object by 900 * release_inode() or hook_sb_delete(). 901 */ 902 WARN_ON_ONCE(landlock_inode(inode)->object); 903 } 904 905 /* Super-block hooks */ 906 907 /* 908 * Release the inodes used in a security policy. 909 * 910 * Cf. 
 * fsnotify_unmount_inodes() and invalidate_inodes()
 *
 * Walks the inode list of @sb and, for each inode still tied to a Landlock
 * object, unties it and drops the reference taken by get_inode_object(), then
 * waits for any release_inode() still in the middle of its own iput().
 */
static void hook_sb_delete(struct super_block *const sb)
{
	struct inode *inode, *prev_inode = NULL;

	if (!landlock_initialized)
		return;

	spin_lock(&sb->s_inode_list_lock);
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		struct landlock_object *object;

		/* Only handles referenced inodes. */
		if (!atomic_read(&inode->i_count))
			continue;

		/*
		 * Protects against concurrent modification of inode (e.g.
		 * from get_inode_object()).
		 */
		spin_lock(&inode->i_lock);
		/*
		 * Checks I_FREEING and I_WILL_FREE  to protect against a race
		 * condition when release_inode() just called iput(), which
		 * could lead to a NULL dereference of inode->security or a
		 * second call to iput() for the same Landlock object.  Also
		 * checks I_NEW because such inode cannot be tied to an object.
		 */
		if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
			spin_unlock(&inode->i_lock);
			continue;
		}

		/* The RCU read lock is held until @object->lock is released. */
		rcu_read_lock();
		object = rcu_dereference(landlock_inode(inode)->object);
		if (!object) {
			rcu_read_unlock();
			spin_unlock(&inode->i_lock);
			continue;
		}
		/* Keeps a reference to this inode until the next loop walk. */
		__iget(inode);
		spin_unlock(&inode->i_lock);

		/*
		 * If there is no concurrent release_inode() ongoing, then we
		 * are in charge of calling iput() on this inode, otherwise we
		 * will just wait for it to finish.
		 */
		spin_lock(&object->lock);
		if (object->underobj == inode) {
			object->underobj = NULL;
			spin_unlock(&object->lock);
			rcu_read_unlock();

			/*
			 * Because object->underobj was not NULL,
			 * release_inode() and get_inode_object() guarantee
			 * that it is safe to reset
			 * landlock_inode(inode)->object while it is not NULL.
			 * It is therefore not necessary to lock inode->i_lock.
			 */
			rcu_assign_pointer(landlock_inode(inode)->object, NULL);
			/*
			 * At this point, we own the ihold() reference that was
			 * originally set up by get_inode_object() and the
			 * __iget() reference that we just set in this loop
			 * walk.  Therefore the following call to iput() will
			 * not sleep nor drop the inode because there are now
			 * at least two references to it.
			 */
			iput(inode);
		} else {
			spin_unlock(&object->lock);
			rcu_read_unlock();
		}

		if (prev_inode) {
			/*
			 * At this point, we still own the __iget() reference
			 * that we just set in this loop walk.  Therefore we
			 * can drop the list lock and know that the inode won't
			 * disappear from under us until the next loop walk.
			 */
			spin_unlock(&sb->s_inode_list_lock);
			/*
			 * We can now actually put the inode reference from the
			 * previous loop walk, which is not needed anymore.
			 */
			iput(prev_inode);
			cond_resched();
			spin_lock(&sb->s_inode_list_lock);
		}
		prev_inode = inode;
	}
	spin_unlock(&sb->s_inode_list_lock);

	/* Puts the inode reference from the last loop walk, if any. */
	if (prev_inode)
		iput(prev_inode);
	/* Waits for pending iput() in release_inode(). */
	wait_var_event(&landlock_superblock(sb)->inode_refs,
		       !atomic_long_read(&landlock_superblock(sb)->inode_refs));
}

/*
 * Because a Landlock security policy is defined according to the filesystem
 * topology (i.e. the mount namespace), changing it may grant access to files
 * not previously allowed.
 *
 * To make it simple, deny any filesystem topology modification by landlocked
 * processes.  Non-landlocked processes may still change the namespace of a
 * landlocked process, but this kind of threat must be handled by a system-wide
 * access-control security policy.
1025 * 1026 * This could be lifted in the future if Landlock can safely handle mount 1027 * namespace updates requested by a landlocked process. Indeed, we could 1028 * update the current domain (which is currently read-only) by taking into 1029 * account the accesses of the source and the destination of a new mount point. 1030 * However, it would also require to make all the child domains dynamically 1031 * inherit these new constraints. Anyway, for backward compatibility reasons, 1032 * a dedicated user space option would be required (e.g. as a ruleset flag). 1033 */ 1034 static int hook_sb_mount(const char *const dev_name, 1035 const struct path *const path, const char *const type, 1036 const unsigned long flags, void *const data) 1037 { 1038 if (!landlock_get_current_domain()) 1039 return 0; 1040 return -EPERM; 1041 } 1042 1043 static int hook_move_mount(const struct path *const from_path, 1044 const struct path *const to_path) 1045 { 1046 if (!landlock_get_current_domain()) 1047 return 0; 1048 return -EPERM; 1049 } 1050 1051 /* 1052 * Removing a mount point may reveal a previously hidden file hierarchy, which 1053 * may then grant access to files, which may have previously been forbidden. 1054 */ 1055 static int hook_sb_umount(struct vfsmount *const mnt, const int flags) 1056 { 1057 if (!landlock_get_current_domain()) 1058 return 0; 1059 return -EPERM; 1060 } 1061 1062 static int hook_sb_remount(struct super_block *const sb, void *const mnt_opts) 1063 { 1064 if (!landlock_get_current_domain()) 1065 return 0; 1066 return -EPERM; 1067 } 1068 1069 /* 1070 * pivot_root(2), like mount(2), changes the current mount namespace. It must 1071 * then be forbidden for a landlocked process. 1072 * 1073 * However, chroot(2) may be allowed because it only changes the relative root 1074 * directory of the current process. Moreover, it can be used to restrict the 1075 * view of the filesystem. 
1076 */ 1077 static int hook_sb_pivotroot(const struct path *const old_path, 1078 const struct path *const new_path) 1079 { 1080 if (!landlock_get_current_domain()) 1081 return 0; 1082 return -EPERM; 1083 } 1084 1085 /* Path hooks */ 1086 1087 static int hook_path_link(struct dentry *const old_dentry, 1088 const struct path *const new_dir, 1089 struct dentry *const new_dentry) 1090 { 1091 return current_check_refer_path(old_dentry, new_dir, new_dentry, false, 1092 false); 1093 } 1094 1095 static int hook_path_rename(const struct path *const old_dir, 1096 struct dentry *const old_dentry, 1097 const struct path *const new_dir, 1098 struct dentry *const new_dentry, 1099 const unsigned int flags) 1100 { 1101 /* old_dir refers to old_dentry->d_parent and new_dir->mnt */ 1102 return current_check_refer_path(old_dentry, new_dir, new_dentry, true, 1103 !!(flags & RENAME_EXCHANGE)); 1104 } 1105 1106 static int hook_path_mkdir(const struct path *const dir, 1107 struct dentry *const dentry, const umode_t mode) 1108 { 1109 return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_DIR); 1110 } 1111 1112 static int hook_path_mknod(const struct path *const dir, 1113 struct dentry *const dentry, const umode_t mode, 1114 const unsigned int dev) 1115 { 1116 const struct landlock_ruleset *const dom = 1117 landlock_get_current_domain(); 1118 1119 if (!dom) 1120 return 0; 1121 return check_access_path(dom, dir, get_mode_access(mode)); 1122 } 1123 1124 static int hook_path_symlink(const struct path *const dir, 1125 struct dentry *const dentry, 1126 const char *const old_name) 1127 { 1128 return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_SYM); 1129 } 1130 1131 static int hook_path_unlink(const struct path *const dir, 1132 struct dentry *const dentry) 1133 { 1134 return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_FILE); 1135 } 1136 1137 static int hook_path_rmdir(const struct path *const dir, 1138 struct dentry *const dentry) 1139 { 1140 return 
current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_DIR); 1141 } 1142 1143 /* File hooks */ 1144 1145 static inline access_mask_t get_file_access(const struct file *const file) 1146 { 1147 access_mask_t access = 0; 1148 1149 if (file->f_mode & FMODE_READ) { 1150 /* A directory can only be opened in read mode. */ 1151 if (S_ISDIR(file_inode(file)->i_mode)) 1152 return LANDLOCK_ACCESS_FS_READ_DIR; 1153 access = LANDLOCK_ACCESS_FS_READ_FILE; 1154 } 1155 if (file->f_mode & FMODE_WRITE) 1156 access |= LANDLOCK_ACCESS_FS_WRITE_FILE; 1157 /* __FMODE_EXEC is indeed part of f_flags, not f_mode. */ 1158 if (file->f_flags & __FMODE_EXEC) 1159 access |= LANDLOCK_ACCESS_FS_EXECUTE; 1160 return access; 1161 } 1162 1163 static int hook_file_open(struct file *const file) 1164 { 1165 const struct landlock_ruleset *const dom = 1166 landlock_get_current_domain(); 1167 1168 if (!dom) 1169 return 0; 1170 /* 1171 * Because a file may be opened with O_PATH, get_file_access() may 1172 * return 0. This case will be handled with a future Landlock 1173 * evolution. 
1174 */ 1175 return check_access_path(dom, &file->f_path, get_file_access(file)); 1176 } 1177 1178 static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = { 1179 LSM_HOOK_INIT(inode_free_security, hook_inode_free_security), 1180 1181 LSM_HOOK_INIT(sb_delete, hook_sb_delete), 1182 LSM_HOOK_INIT(sb_mount, hook_sb_mount), 1183 LSM_HOOK_INIT(move_mount, hook_move_mount), 1184 LSM_HOOK_INIT(sb_umount, hook_sb_umount), 1185 LSM_HOOK_INIT(sb_remount, hook_sb_remount), 1186 LSM_HOOK_INIT(sb_pivotroot, hook_sb_pivotroot), 1187 1188 LSM_HOOK_INIT(path_link, hook_path_link), 1189 LSM_HOOK_INIT(path_rename, hook_path_rename), 1190 LSM_HOOK_INIT(path_mkdir, hook_path_mkdir), 1191 LSM_HOOK_INIT(path_mknod, hook_path_mknod), 1192 LSM_HOOK_INIT(path_symlink, hook_path_symlink), 1193 LSM_HOOK_INIT(path_unlink, hook_path_unlink), 1194 LSM_HOOK_INIT(path_rmdir, hook_path_rmdir), 1195 1196 LSM_HOOK_INIT(file_open, hook_file_open), 1197 }; 1198 1199 __init void landlock_add_fs_hooks(void) 1200 { 1201 security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), 1202 LANDLOCK_NAME); 1203 } 1204