1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/fanotify.h> 3 #include <linux/fcntl.h> 4 #include <linux/fdtable.h> 5 #include <linux/file.h> 6 #include <linux/fs.h> 7 #include <linux/anon_inodes.h> 8 #include <linux/fsnotify_backend.h> 9 #include <linux/init.h> 10 #include <linux/mount.h> 11 #include <linux/namei.h> 12 #include <linux/poll.h> 13 #include <linux/security.h> 14 #include <linux/syscalls.h> 15 #include <linux/slab.h> 16 #include <linux/types.h> 17 #include <linux/uaccess.h> 18 #include <linux/compat.h> 19 #include <linux/sched/signal.h> 20 #include <linux/memcontrol.h> 21 #include <linux/statfs.h> 22 #include <linux/exportfs.h> 23 24 #include <asm/ioctls.h> 25 26 #include "../../mount.h" 27 #include "../fdinfo.h" 28 #include "fanotify.h" 29 30 #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 31 #define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192 32 #define FANOTIFY_DEFAULT_MAX_GROUPS 128 33 #define FANOTIFY_DEFAULT_FEE_POOL_SIZE 32 34 35 /* 36 * Legacy fanotify marks limits (8192) is per group and we introduced a tunable 37 * limit of marks per user, similar to inotify. Effectively, the legacy limit 38 * of fanotify marks per user is <max marks per group> * <max groups per user>. 39 * This default limit (1M) also happens to match the increased limit of inotify 40 * max_user_watches since v5.10. 41 */ 42 #define FANOTIFY_DEFAULT_MAX_USER_MARKS \ 43 (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS) 44 45 /* 46 * Most of the memory cost of adding an inode mark is pinning the marked inode. 47 * The size of the filesystem inode struct is not uniform across filesystems, 48 * so double the size of a VFS inode is used as a conservative approximation. 
49 */ 50 #define INODE_MARK_COST (2 * sizeof(struct inode)) 51 52 /* configurable via /proc/sys/fs/fanotify/ */ 53 static int fanotify_max_queued_events __read_mostly; 54 55 #ifdef CONFIG_SYSCTL 56 57 #include <linux/sysctl.h> 58 59 static long ft_zero = 0; 60 static long ft_int_max = INT_MAX; 61 62 static struct ctl_table fanotify_table[] = { 63 { 64 .procname = "max_user_groups", 65 .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], 66 .maxlen = sizeof(long), 67 .mode = 0644, 68 .proc_handler = proc_doulongvec_minmax, 69 .extra1 = &ft_zero, 70 .extra2 = &ft_int_max, 71 }, 72 { 73 .procname = "max_user_marks", 74 .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS], 75 .maxlen = sizeof(long), 76 .mode = 0644, 77 .proc_handler = proc_doulongvec_minmax, 78 .extra1 = &ft_zero, 79 .extra2 = &ft_int_max, 80 }, 81 { 82 .procname = "max_queued_events", 83 .data = &fanotify_max_queued_events, 84 .maxlen = sizeof(int), 85 .mode = 0644, 86 .proc_handler = proc_dointvec_minmax, 87 .extra1 = SYSCTL_ZERO 88 }, 89 { } 90 }; 91 92 static void __init fanotify_sysctls_init(void) 93 { 94 register_sysctl("fs/fanotify", fanotify_table); 95 } 96 #else 97 #define fanotify_sysctls_init() do { } while (0) 98 #endif /* CONFIG_SYSCTL */ 99 100 /* 101 * All flags that may be specified in parameter event_f_flags of fanotify_init. 102 * 103 * Internal and external open flags are stored together in field f_flags of 104 * struct file. Only external open flags shall be allowed in event_f_flags. 105 * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be 106 * excluded. 
107 */ 108 #define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \ 109 O_ACCMODE | O_APPEND | O_NONBLOCK | \ 110 __O_SYNC | O_DSYNC | O_CLOEXEC | \ 111 O_LARGEFILE | O_NOATIME ) 112 113 extern const struct fsnotify_ops fanotify_fsnotify_ops; 114 115 struct kmem_cache *fanotify_mark_cache __read_mostly; 116 struct kmem_cache *fanotify_fid_event_cachep __read_mostly; 117 struct kmem_cache *fanotify_path_event_cachep __read_mostly; 118 struct kmem_cache *fanotify_perm_event_cachep __read_mostly; 119 120 #define FANOTIFY_EVENT_ALIGN 4 121 #define FANOTIFY_FID_INFO_HDR_LEN \ 122 (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) 123 #define FANOTIFY_PIDFD_INFO_HDR_LEN \ 124 sizeof(struct fanotify_event_info_pidfd) 125 #define FANOTIFY_ERROR_INFO_LEN \ 126 (sizeof(struct fanotify_event_info_error)) 127 128 static int fanotify_fid_info_len(int fh_len, int name_len) 129 { 130 int info_len = fh_len; 131 132 if (name_len) 133 info_len += name_len + 1; 134 135 return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len, 136 FANOTIFY_EVENT_ALIGN); 137 } 138 139 /* FAN_RENAME may have one or two dir+name info records */ 140 static int fanotify_dir_name_info_len(struct fanotify_event *event) 141 { 142 struct fanotify_info *info = fanotify_event_info(event); 143 int dir_fh_len = fanotify_event_dir_fh_len(event); 144 int dir2_fh_len = fanotify_event_dir2_fh_len(event); 145 int info_len = 0; 146 147 if (dir_fh_len) 148 info_len += fanotify_fid_info_len(dir_fh_len, 149 info->name_len); 150 if (dir2_fh_len) 151 info_len += fanotify_fid_info_len(dir2_fh_len, 152 info->name2_len); 153 154 return info_len; 155 } 156 157 static size_t fanotify_event_len(unsigned int info_mode, 158 struct fanotify_event *event) 159 { 160 size_t event_len = FAN_EVENT_METADATA_LEN; 161 struct fanotify_info *info; 162 int fh_len; 163 int dot_len = 0; 164 165 if (!info_mode) 166 return event_len; 167 168 if (fanotify_is_error_event(event->mask)) 169 event_len += FANOTIFY_ERROR_INFO_LEN; 170 171 info = 
fanotify_event_info(event); 172 173 if (fanotify_event_has_any_dir_fh(event)) { 174 event_len += fanotify_dir_name_info_len(event); 175 } else if ((info_mode & FAN_REPORT_NAME) && 176 (event->mask & FAN_ONDIR)) { 177 /* 178 * With group flag FAN_REPORT_NAME, if name was not recorded in 179 * event on a directory, we will report the name ".". 180 */ 181 dot_len = 1; 182 } 183 184 if (info_mode & FAN_REPORT_PIDFD) 185 event_len += FANOTIFY_PIDFD_INFO_HDR_LEN; 186 187 if (fanotify_event_has_object_fh(event)) { 188 fh_len = fanotify_event_object_fh_len(event); 189 event_len += fanotify_fid_info_len(fh_len, dot_len); 190 } 191 192 return event_len; 193 } 194 195 /* 196 * Remove an hashed event from merge hash table. 197 */ 198 static void fanotify_unhash_event(struct fsnotify_group *group, 199 struct fanotify_event *event) 200 { 201 assert_spin_locked(&group->notification_lock); 202 203 pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, 204 group, event, fanotify_event_hash_bucket(group, event)); 205 206 if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list))) 207 return; 208 209 hlist_del_init(&event->merge_list); 210 } 211 212 /* 213 * Get an fanotify notification event if one exists and is small 214 * enough to fit in "count". Return an error pointer if the count 215 * is not large enough. When permission event is dequeued, its state is 216 * updated accordingly. 
217 */ 218 static struct fanotify_event *get_one_event(struct fsnotify_group *group, 219 size_t count) 220 { 221 size_t event_size; 222 struct fanotify_event *event = NULL; 223 struct fsnotify_event *fsn_event; 224 unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); 225 226 pr_debug("%s: group=%p count=%zd\n", __func__, group, count); 227 228 spin_lock(&group->notification_lock); 229 fsn_event = fsnotify_peek_first_event(group); 230 if (!fsn_event) 231 goto out; 232 233 event = FANOTIFY_E(fsn_event); 234 event_size = fanotify_event_len(info_mode, event); 235 236 if (event_size > count) { 237 event = ERR_PTR(-EINVAL); 238 goto out; 239 } 240 241 /* 242 * Held the notification_lock the whole time, so this is the 243 * same event we peeked above. 244 */ 245 fsnotify_remove_first_event(group); 246 if (fanotify_is_perm_event(event->mask)) 247 FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED; 248 if (fanotify_is_hashed_event(event->mask)) 249 fanotify_unhash_event(group, event); 250 out: 251 spin_unlock(&group->notification_lock); 252 return event; 253 } 254 255 static int create_fd(struct fsnotify_group *group, struct path *path, 256 struct file **file) 257 { 258 int client_fd; 259 struct file *new_file; 260 261 client_fd = get_unused_fd_flags(group->fanotify_data.f_flags); 262 if (client_fd < 0) 263 return client_fd; 264 265 /* 266 * we need a new file handle for the userspace program so it can read even if it was 267 * originally opened O_WRONLY. 268 */ 269 new_file = dentry_open(path, 270 group->fanotify_data.f_flags | FMODE_NONOTIFY, 271 current_cred()); 272 if (IS_ERR(new_file)) { 273 /* 274 * we still send an event even if we can't open the file. this 275 * can happen when say tasks are gone and we try to open their 276 * /proc files or we try to open a WRONLY file like in sysfs 277 * we just send the errno to userspace since there isn't much 278 * else we can do. 
279 */ 280 put_unused_fd(client_fd); 281 client_fd = PTR_ERR(new_file); 282 } else { 283 *file = new_file; 284 } 285 286 return client_fd; 287 } 288 289 /* 290 * Finish processing of permission event by setting it to ANSWERED state and 291 * drop group->notification_lock. 292 */ 293 static void finish_permission_event(struct fsnotify_group *group, 294 struct fanotify_perm_event *event, 295 unsigned int response) 296 __releases(&group->notification_lock) 297 { 298 bool destroy = false; 299 300 assert_spin_locked(&group->notification_lock); 301 event->response = response; 302 if (event->state == FAN_EVENT_CANCELED) 303 destroy = true; 304 else 305 event->state = FAN_EVENT_ANSWERED; 306 spin_unlock(&group->notification_lock); 307 if (destroy) 308 fsnotify_destroy_event(group, &event->fae.fse); 309 } 310 311 static int process_access_response(struct fsnotify_group *group, 312 struct fanotify_response *response_struct) 313 { 314 struct fanotify_perm_event *event; 315 int fd = response_struct->fd; 316 int response = response_struct->response; 317 318 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, 319 fd, response); 320 /* 321 * make sure the response is valid, if invalid we do nothing and either 322 * userspace can send a valid response or we will clean it up after the 323 * timeout 324 */ 325 switch (response & ~FAN_AUDIT) { 326 case FAN_ALLOW: 327 case FAN_DENY: 328 break; 329 default: 330 return -EINVAL; 331 } 332 333 if (fd < 0) 334 return -EINVAL; 335 336 if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) 337 return -EINVAL; 338 339 spin_lock(&group->notification_lock); 340 list_for_each_entry(event, &group->fanotify_data.access_list, 341 fae.fse.list) { 342 if (event->fd != fd) 343 continue; 344 345 list_del_init(&event->fae.fse.list); 346 finish_permission_event(group, event, response); 347 wake_up(&group->fanotify_data.access_waitq); 348 return 0; 349 } 350 spin_unlock(&group->notification_lock); 351 352 return -ENOENT; 353 } 
354 355 static size_t copy_error_info_to_user(struct fanotify_event *event, 356 char __user *buf, int count) 357 { 358 struct fanotify_event_info_error info = { }; 359 struct fanotify_error_event *fee = FANOTIFY_EE(event); 360 361 info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR; 362 info.hdr.len = FANOTIFY_ERROR_INFO_LEN; 363 364 if (WARN_ON(count < info.hdr.len)) 365 return -EFAULT; 366 367 info.error = fee->error; 368 info.error_count = fee->err_count; 369 370 if (copy_to_user(buf, &info, sizeof(info))) 371 return -EFAULT; 372 373 return info.hdr.len; 374 } 375 376 static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, 377 int info_type, const char *name, 378 size_t name_len, 379 char __user *buf, size_t count) 380 { 381 struct fanotify_event_info_fid info = { }; 382 struct file_handle handle = { }; 383 unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf; 384 size_t fh_len = fh ? fh->len : 0; 385 size_t info_len = fanotify_fid_info_len(fh_len, name_len); 386 size_t len = info_len; 387 388 pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n", 389 __func__, fh_len, name_len, info_len, count); 390 391 if (WARN_ON_ONCE(len < sizeof(info) || len > count)) 392 return -EFAULT; 393 394 /* 395 * Copy event info fid header followed by variable sized file handle 396 * and optionally followed by variable sized filename. 
397 */ 398 switch (info_type) { 399 case FAN_EVENT_INFO_TYPE_FID: 400 case FAN_EVENT_INFO_TYPE_DFID: 401 if (WARN_ON_ONCE(name_len)) 402 return -EFAULT; 403 break; 404 case FAN_EVENT_INFO_TYPE_DFID_NAME: 405 case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME: 406 case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME: 407 if (WARN_ON_ONCE(!name || !name_len)) 408 return -EFAULT; 409 break; 410 default: 411 return -EFAULT; 412 } 413 414 info.hdr.info_type = info_type; 415 info.hdr.len = len; 416 info.fsid = *fsid; 417 if (copy_to_user(buf, &info, sizeof(info))) 418 return -EFAULT; 419 420 buf += sizeof(info); 421 len -= sizeof(info); 422 if (WARN_ON_ONCE(len < sizeof(handle))) 423 return -EFAULT; 424 425 handle.handle_type = fh->type; 426 handle.handle_bytes = fh_len; 427 428 /* Mangle handle_type for bad file_handle */ 429 if (!fh_len) 430 handle.handle_type = FILEID_INVALID; 431 432 if (copy_to_user(buf, &handle, sizeof(handle))) 433 return -EFAULT; 434 435 buf += sizeof(handle); 436 len -= sizeof(handle); 437 if (WARN_ON_ONCE(len < fh_len)) 438 return -EFAULT; 439 440 /* 441 * For an inline fh and inline file name, copy through stack to exclude 442 * the copy from usercopy hardening protections. 
443 */ 444 fh_buf = fanotify_fh_buf(fh); 445 if (fh_len <= FANOTIFY_INLINE_FH_LEN) { 446 memcpy(bounce, fh_buf, fh_len); 447 fh_buf = bounce; 448 } 449 if (copy_to_user(buf, fh_buf, fh_len)) 450 return -EFAULT; 451 452 buf += fh_len; 453 len -= fh_len; 454 455 if (name_len) { 456 /* Copy the filename with terminating null */ 457 name_len++; 458 if (WARN_ON_ONCE(len < name_len)) 459 return -EFAULT; 460 461 if (copy_to_user(buf, name, name_len)) 462 return -EFAULT; 463 464 buf += name_len; 465 len -= name_len; 466 } 467 468 /* Pad with 0's */ 469 WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN); 470 if (len > 0 && clear_user(buf, len)) 471 return -EFAULT; 472 473 return info_len; 474 } 475 476 static int copy_pidfd_info_to_user(int pidfd, 477 char __user *buf, 478 size_t count) 479 { 480 struct fanotify_event_info_pidfd info = { }; 481 size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; 482 483 if (WARN_ON_ONCE(info_len > count)) 484 return -EFAULT; 485 486 info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; 487 info.hdr.len = info_len; 488 info.pidfd = pidfd; 489 490 if (copy_to_user(buf, &info, info_len)) 491 return -EFAULT; 492 493 return info_len; 494 } 495 496 static int copy_info_records_to_user(struct fanotify_event *event, 497 struct fanotify_info *info, 498 unsigned int info_mode, int pidfd, 499 char __user *buf, size_t count) 500 { 501 int ret, total_bytes = 0, info_type = 0; 502 unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS; 503 unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; 504 505 /* 506 * Event info records order is as follows: 507 * 1. dir fid + name 508 * 2. (optional) new dir fid + new name 509 * 3. (optional) child fid 510 */ 511 if (fanotify_event_has_dir_fh(event)) { 512 info_type = info->name_len ? 
FAN_EVENT_INFO_TYPE_DFID_NAME : 513 FAN_EVENT_INFO_TYPE_DFID; 514 515 /* FAN_RENAME uses special info types */ 516 if (event->mask & FAN_RENAME) 517 info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME; 518 519 ret = copy_fid_info_to_user(fanotify_event_fsid(event), 520 fanotify_info_dir_fh(info), 521 info_type, 522 fanotify_info_name(info), 523 info->name_len, buf, count); 524 if (ret < 0) 525 return ret; 526 527 buf += ret; 528 count -= ret; 529 total_bytes += ret; 530 } 531 532 /* New dir fid+name may be reported in addition to old dir fid+name */ 533 if (fanotify_event_has_dir2_fh(event)) { 534 info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME; 535 ret = copy_fid_info_to_user(fanotify_event_fsid(event), 536 fanotify_info_dir2_fh(info), 537 info_type, 538 fanotify_info_name2(info), 539 info->name2_len, buf, count); 540 if (ret < 0) 541 return ret; 542 543 buf += ret; 544 count -= ret; 545 total_bytes += ret; 546 } 547 548 if (fanotify_event_has_object_fh(event)) { 549 const char *dot = NULL; 550 int dot_len = 0; 551 552 if (fid_mode == FAN_REPORT_FID || info_type) { 553 /* 554 * With only group flag FAN_REPORT_FID only type FID is 555 * reported. Second info record type is always FID. 556 */ 557 info_type = FAN_EVENT_INFO_TYPE_FID; 558 } else if ((fid_mode & FAN_REPORT_NAME) && 559 (event->mask & FAN_ONDIR)) { 560 /* 561 * With group flag FAN_REPORT_NAME, if name was not 562 * recorded in an event on a directory, report the name 563 * "." with info type DFID_NAME. 564 */ 565 info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; 566 dot = "."; 567 dot_len = 1; 568 } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || 569 (event->mask & FAN_ONDIR)) { 570 /* 571 * With group flag FAN_REPORT_DIR_FID, a single info 572 * record has type DFID for directory entry modification 573 * event and for event on a directory. 
574 */ 575 info_type = FAN_EVENT_INFO_TYPE_DFID; 576 } else { 577 /* 578 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, 579 * a single info record has type FID for event on a 580 * non-directory, when there is no directory to report. 581 * For example, on FAN_DELETE_SELF event. 582 */ 583 info_type = FAN_EVENT_INFO_TYPE_FID; 584 } 585 586 ret = copy_fid_info_to_user(fanotify_event_fsid(event), 587 fanotify_event_object_fh(event), 588 info_type, dot, dot_len, 589 buf, count); 590 if (ret < 0) 591 return ret; 592 593 buf += ret; 594 count -= ret; 595 total_bytes += ret; 596 } 597 598 if (pidfd_mode) { 599 ret = copy_pidfd_info_to_user(pidfd, buf, count); 600 if (ret < 0) 601 return ret; 602 603 buf += ret; 604 count -= ret; 605 total_bytes += ret; 606 } 607 608 if (fanotify_is_error_event(event->mask)) { 609 ret = copy_error_info_to_user(event, buf, count); 610 if (ret < 0) 611 return ret; 612 buf += ret; 613 count -= ret; 614 total_bytes += ret; 615 } 616 617 return total_bytes; 618 } 619 620 static ssize_t copy_event_to_user(struct fsnotify_group *group, 621 struct fanotify_event *event, 622 char __user *buf, size_t count) 623 { 624 struct fanotify_event_metadata metadata; 625 struct path *path = fanotify_event_path(event); 626 struct fanotify_info *info = fanotify_event_info(event); 627 unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); 628 unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; 629 struct file *f = NULL; 630 int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD; 631 632 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 633 634 metadata.event_len = fanotify_event_len(info_mode, event); 635 metadata.metadata_len = FAN_EVENT_METADATA_LEN; 636 metadata.vers = FANOTIFY_METADATA_VERSION; 637 metadata.reserved = 0; 638 metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; 639 metadata.pid = pid_vnr(event->pid); 640 /* 641 * For an unprivileged listener, event->pid can be used to identify the 642 * events generated by the 
listener process itself, without disclosing 643 * the pids of other processes. 644 */ 645 if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && 646 task_tgid(current) != event->pid) 647 metadata.pid = 0; 648 649 /* 650 * For now, fid mode is required for an unprivileged listener and 651 * fid mode does not report fd in events. Keep this check anyway 652 * for safety in case fid mode requirement is relaxed in the future 653 * to allow unprivileged listener to get events with no fd and no fid. 654 */ 655 if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && 656 path && path->mnt && path->dentry) { 657 fd = create_fd(group, path, &f); 658 if (fd < 0) 659 return fd; 660 } 661 metadata.fd = fd; 662 663 if (pidfd_mode) { 664 /* 665 * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual 666 * exclusion is ever lifted. At the time of incoporating pidfd 667 * support within fanotify, the pidfd API only supported the 668 * creation of pidfds for thread-group leaders. 669 */ 670 WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID)); 671 672 /* 673 * The PIDTYPE_TGID check for an event->pid is performed 674 * preemptively in an attempt to catch out cases where the event 675 * listener reads events after the event generating process has 676 * already terminated. Report FAN_NOPIDFD to the event listener 677 * in those cases, with all other pidfd creation errors being 678 * reported as FAN_EPIDFD. 679 */ 680 if (metadata.pid == 0 || 681 !pid_has_task(event->pid, PIDTYPE_TGID)) { 682 pidfd = FAN_NOPIDFD; 683 } else { 684 pidfd = pidfd_create(event->pid, 0); 685 if (pidfd < 0) 686 pidfd = FAN_EPIDFD; 687 } 688 } 689 690 ret = -EFAULT; 691 /* 692 * Sanity check copy size in case get_one_event() and 693 * event_len sizes ever get out of sync. 
694 */ 695 if (WARN_ON_ONCE(metadata.event_len > count)) 696 goto out_close_fd; 697 698 if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN)) 699 goto out_close_fd; 700 701 buf += FAN_EVENT_METADATA_LEN; 702 count -= FAN_EVENT_METADATA_LEN; 703 704 if (fanotify_is_perm_event(event->mask)) 705 FANOTIFY_PERM(event)->fd = fd; 706 707 if (f) 708 fd_install(fd, f); 709 710 if (info_mode) { 711 ret = copy_info_records_to_user(event, info, info_mode, pidfd, 712 buf, count); 713 if (ret < 0) 714 goto out_close_fd; 715 } 716 717 return metadata.event_len; 718 719 out_close_fd: 720 if (fd != FAN_NOFD) { 721 put_unused_fd(fd); 722 fput(f); 723 } 724 725 if (pidfd >= 0) 726 close_fd(pidfd); 727 728 return ret; 729 } 730 731 /* intofiy userspace file descriptor functions */ 732 static __poll_t fanotify_poll(struct file *file, poll_table *wait) 733 { 734 struct fsnotify_group *group = file->private_data; 735 __poll_t ret = 0; 736 737 poll_wait(file, &group->notification_waitq, wait); 738 spin_lock(&group->notification_lock); 739 if (!fsnotify_notify_queue_is_empty(group)) 740 ret = EPOLLIN | EPOLLRDNORM; 741 spin_unlock(&group->notification_lock); 742 743 return ret; 744 } 745 746 static ssize_t fanotify_read(struct file *file, char __user *buf, 747 size_t count, loff_t *pos) 748 { 749 struct fsnotify_group *group; 750 struct fanotify_event *event; 751 char __user *start; 752 int ret; 753 DEFINE_WAIT_FUNC(wait, woken_wake_function); 754 755 start = buf; 756 group = file->private_data; 757 758 pr_debug("%s: group=%p\n", __func__, group); 759 760 add_wait_queue(&group->notification_waitq, &wait); 761 while (1) { 762 /* 763 * User can supply arbitrarily large buffer. Avoid softlockups 764 * in case there are lots of available events. 
765 */ 766 cond_resched(); 767 event = get_one_event(group, count); 768 if (IS_ERR(event)) { 769 ret = PTR_ERR(event); 770 break; 771 } 772 773 if (!event) { 774 ret = -EAGAIN; 775 if (file->f_flags & O_NONBLOCK) 776 break; 777 778 ret = -ERESTARTSYS; 779 if (signal_pending(current)) 780 break; 781 782 if (start != buf) 783 break; 784 785 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 786 continue; 787 } 788 789 ret = copy_event_to_user(group, event, buf, count); 790 if (unlikely(ret == -EOPENSTALE)) { 791 /* 792 * We cannot report events with stale fd so drop it. 793 * Setting ret to 0 will continue the event loop and 794 * do the right thing if there are no more events to 795 * read (i.e. return bytes read, -EAGAIN or wait). 796 */ 797 ret = 0; 798 } 799 800 /* 801 * Permission events get queued to wait for response. Other 802 * events can be destroyed now. 803 */ 804 if (!fanotify_is_perm_event(event->mask)) { 805 fsnotify_destroy_event(group, &event->fse); 806 } else { 807 if (ret <= 0) { 808 spin_lock(&group->notification_lock); 809 finish_permission_event(group, 810 FANOTIFY_PERM(event), FAN_DENY); 811 wake_up(&group->fanotify_data.access_waitq); 812 } else { 813 spin_lock(&group->notification_lock); 814 list_add_tail(&event->fse.list, 815 &group->fanotify_data.access_list); 816 spin_unlock(&group->notification_lock); 817 } 818 } 819 if (ret < 0) 820 break; 821 buf += ret; 822 count -= ret; 823 } 824 remove_wait_queue(&group->notification_waitq, &wait); 825 826 if (start != buf && ret != -EFAULT) 827 ret = buf - start; 828 return ret; 829 } 830 831 static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) 832 { 833 struct fanotify_response response = { .fd = -1, .response = -1 }; 834 struct fsnotify_group *group; 835 int ret; 836 837 if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) 838 return -EINVAL; 839 840 group = file->private_data; 841 842 if (count < sizeof(response)) 843 return -EINVAL; 844 
845 count = sizeof(response); 846 847 pr_debug("%s: group=%p count=%zu\n", __func__, group, count); 848 849 if (copy_from_user(&response, buf, count)) 850 return -EFAULT; 851 852 ret = process_access_response(group, &response); 853 if (ret < 0) 854 count = ret; 855 856 return count; 857 } 858 859 static int fanotify_release(struct inode *ignored, struct file *file) 860 { 861 struct fsnotify_group *group = file->private_data; 862 struct fsnotify_event *fsn_event; 863 864 /* 865 * Stop new events from arriving in the notification queue. since 866 * userspace cannot use fanotify fd anymore, no event can enter or 867 * leave access_list by now either. 868 */ 869 fsnotify_group_stop_queueing(group); 870 871 /* 872 * Process all permission events on access_list and notification queue 873 * and simulate reply from userspace. 874 */ 875 spin_lock(&group->notification_lock); 876 while (!list_empty(&group->fanotify_data.access_list)) { 877 struct fanotify_perm_event *event; 878 879 event = list_first_entry(&group->fanotify_data.access_list, 880 struct fanotify_perm_event, fae.fse.list); 881 list_del_init(&event->fae.fse.list); 882 finish_permission_event(group, event, FAN_ALLOW); 883 spin_lock(&group->notification_lock); 884 } 885 886 /* 887 * Destroy all non-permission events. For permission events just 888 * dequeue them and set the response. They will be freed once the 889 * response is consumed and fanotify_get_response() returns. 
890 */ 891 while ((fsn_event = fsnotify_remove_first_event(group))) { 892 struct fanotify_event *event = FANOTIFY_E(fsn_event); 893 894 if (!(event->mask & FANOTIFY_PERM_EVENTS)) { 895 spin_unlock(&group->notification_lock); 896 fsnotify_destroy_event(group, fsn_event); 897 } else { 898 finish_permission_event(group, FANOTIFY_PERM(event), 899 FAN_ALLOW); 900 } 901 spin_lock(&group->notification_lock); 902 } 903 spin_unlock(&group->notification_lock); 904 905 /* Response for all permission events it set, wakeup waiters */ 906 wake_up(&group->fanotify_data.access_waitq); 907 908 /* matches the fanotify_init->fsnotify_alloc_group */ 909 fsnotify_destroy_group(group); 910 911 return 0; 912 } 913 914 static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 915 { 916 struct fsnotify_group *group; 917 struct fsnotify_event *fsn_event; 918 void __user *p; 919 int ret = -ENOTTY; 920 size_t send_len = 0; 921 922 group = file->private_data; 923 924 p = (void __user *) arg; 925 926 switch (cmd) { 927 case FIONREAD: 928 spin_lock(&group->notification_lock); 929 list_for_each_entry(fsn_event, &group->notification_list, list) 930 send_len += FAN_EVENT_METADATA_LEN; 931 spin_unlock(&group->notification_lock); 932 ret = put_user(send_len, (int __user *) p); 933 break; 934 } 935 936 return ret; 937 } 938 939 static const struct file_operations fanotify_fops = { 940 .show_fdinfo = fanotify_show_fdinfo, 941 .poll = fanotify_poll, 942 .read = fanotify_read, 943 .write = fanotify_write, 944 .fasync = NULL, 945 .release = fanotify_release, 946 .unlocked_ioctl = fanotify_ioctl, 947 .compat_ioctl = compat_ptr_ioctl, 948 .llseek = noop_llseek, 949 }; 950 951 static int fanotify_find_path(int dfd, const char __user *filename, 952 struct path *path, unsigned int flags, __u64 mask, 953 unsigned int obj_type) 954 { 955 int ret; 956 957 pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__, 958 dfd, filename, flags); 959 960 if (filename == NULL) { 961 struct fd f = 
fdget(dfd); 962 963 ret = -EBADF; 964 if (!f.file) 965 goto out; 966 967 ret = -ENOTDIR; 968 if ((flags & FAN_MARK_ONLYDIR) && 969 !(S_ISDIR(file_inode(f.file)->i_mode))) { 970 fdput(f); 971 goto out; 972 } 973 974 *path = f.file->f_path; 975 path_get(path); 976 fdput(f); 977 } else { 978 unsigned int lookup_flags = 0; 979 980 if (!(flags & FAN_MARK_DONT_FOLLOW)) 981 lookup_flags |= LOOKUP_FOLLOW; 982 if (flags & FAN_MARK_ONLYDIR) 983 lookup_flags |= LOOKUP_DIRECTORY; 984 985 ret = user_path_at(dfd, filename, lookup_flags, path); 986 if (ret) 987 goto out; 988 } 989 990 /* you can only watch an inode if you have read permissions on it */ 991 ret = path_permission(path, MAY_READ); 992 if (ret) { 993 path_put(path); 994 goto out; 995 } 996 997 ret = security_path_notify(path, mask, obj_type); 998 if (ret) 999 path_put(path); 1000 1001 out: 1002 return ret; 1003 } 1004 1005 static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, 1006 __u32 mask, unsigned int flags, 1007 __u32 umask, int *destroy) 1008 { 1009 __u32 oldmask = 0; 1010 1011 /* umask bits cannot be removed by user */ 1012 mask &= ~umask; 1013 spin_lock(&fsn_mark->lock); 1014 if (!(flags & FAN_MARK_IGNORED_MASK)) { 1015 oldmask = fsn_mark->mask; 1016 fsn_mark->mask &= ~mask; 1017 } else { 1018 fsn_mark->ignored_mask &= ~mask; 1019 } 1020 /* 1021 * We need to keep the mark around even if remaining mask cannot 1022 * result in any events (e.g. mask == FAN_ONDIR) to support incremenal 1023 * changes to the mask. 1024 * Destroy mark when only umask bits remain. 
1025 */ 1026 *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask); 1027 spin_unlock(&fsn_mark->lock); 1028 1029 return mask & oldmask; 1030 } 1031 1032 static int fanotify_remove_mark(struct fsnotify_group *group, 1033 fsnotify_connp_t *connp, __u32 mask, 1034 unsigned int flags, __u32 umask) 1035 { 1036 struct fsnotify_mark *fsn_mark = NULL; 1037 __u32 removed; 1038 int destroy_mark; 1039 1040 mutex_lock(&group->mark_mutex); 1041 fsn_mark = fsnotify_find_mark(connp, group); 1042 if (!fsn_mark) { 1043 mutex_unlock(&group->mark_mutex); 1044 return -ENOENT; 1045 } 1046 1047 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, 1048 umask, &destroy_mark); 1049 if (removed & fsnotify_conn_mask(fsn_mark->connector)) 1050 fsnotify_recalc_mask(fsn_mark->connector); 1051 if (destroy_mark) 1052 fsnotify_detach_mark(fsn_mark); 1053 mutex_unlock(&group->mark_mutex); 1054 if (destroy_mark) 1055 fsnotify_free_mark(fsn_mark); 1056 1057 /* matches the fsnotify_find_mark() */ 1058 fsnotify_put_mark(fsn_mark); 1059 return 0; 1060 } 1061 1062 static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, 1063 struct vfsmount *mnt, __u32 mask, 1064 unsigned int flags, __u32 umask) 1065 { 1066 return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, 1067 mask, flags, umask); 1068 } 1069 1070 static int fanotify_remove_sb_mark(struct fsnotify_group *group, 1071 struct super_block *sb, __u32 mask, 1072 unsigned int flags, __u32 umask) 1073 { 1074 return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, 1075 flags, umask); 1076 } 1077 1078 static int fanotify_remove_inode_mark(struct fsnotify_group *group, 1079 struct inode *inode, __u32 mask, 1080 unsigned int flags, __u32 umask) 1081 { 1082 return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask, 1083 flags, umask); 1084 } 1085 1086 static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, 1087 __u32 mask, 1088 unsigned int flags) 1089 { 1090 __u32 oldmask = 
-1; 1091 1092 spin_lock(&fsn_mark->lock); 1093 if (!(flags & FAN_MARK_IGNORED_MASK)) { 1094 oldmask = fsn_mark->mask; 1095 fsn_mark->mask |= mask; 1096 } else { 1097 fsn_mark->ignored_mask |= mask; 1098 if (flags & FAN_MARK_IGNORED_SURV_MODIFY) 1099 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; 1100 } 1101 spin_unlock(&fsn_mark->lock); 1102 1103 return mask & ~oldmask; 1104 } 1105 1106 static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, 1107 fsnotify_connp_t *connp, 1108 unsigned int obj_type, 1109 __kernel_fsid_t *fsid) 1110 { 1111 struct ucounts *ucounts = group->fanotify_data.ucounts; 1112 struct fsnotify_mark *mark; 1113 int ret; 1114 1115 /* 1116 * Enforce per user marks limits per user in all containing user ns. 1117 * A group with FAN_UNLIMITED_MARKS does not contribute to mark count 1118 * in the limited groups account. 1119 */ 1120 if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) && 1121 !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) 1122 return ERR_PTR(-ENOSPC); 1123 1124 mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 1125 if (!mark) { 1126 ret = -ENOMEM; 1127 goto out_dec_ucounts; 1128 } 1129 1130 fsnotify_init_mark(mark, group); 1131 ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid); 1132 if (ret) { 1133 fsnotify_put_mark(mark); 1134 goto out_dec_ucounts; 1135 } 1136 1137 return mark; 1138 1139 out_dec_ucounts: 1140 if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) 1141 dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS); 1142 return ERR_PTR(ret); 1143 } 1144 1145 static int fanotify_group_init_error_pool(struct fsnotify_group *group) 1146 { 1147 if (mempool_initialized(&group->fanotify_data.error_events_pool)) 1148 return 0; 1149 1150 return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool, 1151 FANOTIFY_DEFAULT_FEE_POOL_SIZE, 1152 sizeof(struct fanotify_error_event)); 1153 } 1154 1155 static int fanotify_add_mark(struct fsnotify_group *group, 1156 fsnotify_connp_t 
			     *connp, unsigned int obj_type,
			     __u32 mask, unsigned int flags,
			     __kernel_fsid_t *fsid)
{
	struct fsnotify_mark *fsn_mark;
	__u32 added;
	int ret = 0;

	mutex_lock(&group->mark_mutex);
	/* Reuse an existing mark on this object or create a fresh one. */
	fsn_mark = fsnotify_find_mark(connp, group);
	if (!fsn_mark) {
		fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fsid);
		if (IS_ERR(fsn_mark)) {
			mutex_unlock(&group->mark_mutex);
			return PTR_ERR(fsn_mark);
		}
	}

	/*
	 * Error events are pre-allocated per group, only if strictly
	 * needed (i.e. FAN_FS_ERROR was requested).
	 */
	if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
		ret = fanotify_group_init_error_pool(group);
		if (ret)
			goto out;
	}

	/* Recompute the connector's combined mask only if new bits appeared. */
	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
	if (added & ~fsnotify_conn_mask(fsn_mark->connector))
		fsnotify_recalc_mask(fsn_mark->connector);

out:
	mutex_unlock(&group->mark_mutex);

	/* Drop the ref from fsnotify_find_mark()/fanotify_add_new_mark(). */
	fsnotify_put_mark(fsn_mark);
	return ret;
}

/* Add/extend a mark on a mount point (object type: vfsmount). */
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
				      struct vfsmount *mnt, __u32 mask,
				      unsigned int flags, __kernel_fsid_t *fsid)
{
	return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
				 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
}

/* Add/extend a mark on a whole filesystem (object type: super block). */
static int fanotify_add_sb_mark(struct fsnotify_group *group,
				struct super_block *sb, __u32 mask,
				unsigned int flags, __kernel_fsid_t *fsid)
{
	return fanotify_add_mark(group, &sb->s_fsnotify_marks,
				 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
}

/* Add/extend a mark on a single inode. */
static int fanotify_add_inode_mark(struct fsnotify_group *group,
				   struct inode *inode, __u32 mask,
				   unsigned int flags, __kernel_fsid_t *fsid)
{
	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);

	/*
	 * If some other task has this inode open for write we should not add
	 * an ignored mark, unless
	 * that ignored mark is supposed to survive
	 * modification changes anyway.
	 */
	if ((flags & FAN_MARK_IGNORED_MASK) &&
	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
	    inode_is_open_for_write(inode))
		return 0;

	return fanotify_add_mark(group, &inode->i_fsnotify_marks,
				 FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
}

/* Allocate the group's single pre-allocated queue-overflow event. */
static struct fsnotify_event *fanotify_alloc_overflow_event(void)
{
	struct fanotify_event *oevent;

	oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
	if (!oevent)
		return NULL;

	fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
	oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;

	return &oevent->fse;
}

/* Allocate and init the hash table used to merge similar queued events. */
static struct hlist_head *fanotify_alloc_merge_hash(void)
{
	struct hlist_head *hash;

	hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
		       GFP_KERNEL_ACCOUNT);
	if (!hash)
		return NULL;

	__hash_init(hash, FANOTIFY_HTABLE_SIZE);

	return hash;
}

/* fanotify syscalls */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
	struct fsnotify_group *group;
	int f_flags, fd;
	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
	unsigned int class = flags & FANOTIFY_CLASS_BITS;
	unsigned int internal_flags = 0;

	pr_debug("%s: flags=%x event_f_flags=%x\n",
		 __func__, flags, event_f_flags);

	if (!capable(CAP_SYS_ADMIN)) {
		/*
		 * An unprivileged user can setup an fanotify group with
		 * limited functionality - an unprivileged group is limited to
		 * notification events with file handles and it cannot use
		 * unlimited queue/marks.
		 */
		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
			return -EPERM;

		/*
		 * Setting the internal flag FANOTIFY_UNPRIV on the group
		 * prevents setting mount/filesystem marks on this group and
		 * prevents reporting pid and open fd in events.
		 */
		internal_flags |= FANOTIFY_UNPRIV;
	}

#ifdef CONFIG_AUDITSYSCALL
	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
#else
	if (flags & ~FANOTIFY_INIT_FLAGS)
#endif
		return -EINVAL;

	/*
	 * A pidfd can only be returned for a thread-group leader; thus
	 * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
	 * exclusive.
	 */
	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
		return -EINVAL;

	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
		return -EINVAL;

	/* Event fds must be opened read-only, write-only or read-write. */
	switch (event_f_flags & O_ACCMODE) {
	case O_RDONLY:
	case O_RDWR:
	case O_WRONLY:
		break;
	default:
		return -EINVAL;
	}

	/* fid reporting modes are only allowed with the plain NOTIF class. */
	if (fid_mode && class != FAN_CLASS_NOTIF)
		return -EINVAL;

	/*
	 * Child name is reported with parent fid so requires dir fid.
	 * We can report both child fid and dir fid with or without name.
	 */
	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
		return -EINVAL;

	/*
	 * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID
	 * and is used as an indication to report both dir and child fid on all
	 * dirent events.
	 */
	if ((fid_mode & FAN_REPORT_TARGET_FID) &&
	    (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
		return -EINVAL;

	f_flags = O_RDWR | FMODE_NONOTIFY;
	if (flags & FAN_CLOEXEC)
		f_flags |= O_CLOEXEC;
	if (flags & FAN_NONBLOCK)
		f_flags |= O_NONBLOCK;

	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
	group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
	if (IS_ERR(group)) {
		return PTR_ERR(group);
	}

	/* Enforce groups limits per user in all containing user ns */
	group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
						  current_euid(),
						  UCOUNT_FANOTIFY_GROUPS);
	if (!group->fanotify_data.ucounts) {
		fd = -EMFILE;
		goto out_destroy_group;
	}

	group->fanotify_data.flags = flags | internal_flags;
	group->memcg = get_mem_cgroup_from_mm(current->mm);

	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
	if (!group->fanotify_data.merge_hash) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	group->overflow_event = fanotify_alloc_overflow_event();
	if (unlikely(!group->overflow_event)) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	if (force_o_largefile())
		event_f_flags |= O_LARGEFILE;
	group->fanotify_data.f_flags = event_f_flags;
	init_waitqueue_head(&group->fanotify_data.access_waitq);
	INIT_LIST_HEAD(&group->fanotify_data.access_list);
	/* Map the notification class onto the group's event priority. */
	switch (class) {
	case FAN_CLASS_NOTIF:
		group->priority = FS_PRIO_0;
		break;
	case FAN_CLASS_CONTENT:
		group->priority = FS_PRIO_1;
		break;
	case FAN_CLASS_PRE_CONTENT:
		group->priority = FS_PRIO_2;
		break;
	default:
		fd = -EINVAL;
		goto out_destroy_group;
	}

	/* Unlimited queue/marks and audit enablement require privilege. */
	if (flags & FAN_UNLIMITED_QUEUE) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
		group->max_events = UINT_MAX;
	} else {
		group->max_events = fanotify_max_queued_events;
	}

	if (flags & FAN_UNLIMITED_MARKS) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
	}

	if (flags & FAN_ENABLE_AUDIT) {
		fd = -EPERM;
		if (!capable(CAP_AUDIT_WRITE))
			goto out_destroy_group;
	}

	fd =
anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
	if (fd < 0)
		goto out_destroy_group;

	return fd;

out_destroy_group:
	/* Also releases ucounts/merge_hash/overflow_event set up above. */
	fsnotify_destroy_group(group);
	return fd;
}

/*
 * Check that @dentry's filesystem reports a usable non-zero fsid that also
 * matches the fsid of the sb root (i.e. dentry is not on a subvolume with
 * its own fsid).  On success fills *fsid and returns 0; otherwise returns
 * -ENODEV, -EXDEV or vfs_get_fsid()'s error.
 */
static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
{
	__kernel_fsid_t root_fsid;
	int err;

	/*
	 * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
	 */
	err = vfs_get_fsid(dentry, fsid);
	if (err)
		return err;

	if (!fsid->val[0] && !fsid->val[1])
		return -ENODEV;

	/*
	 * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
	 * which uses a different fsid than sb root.
	 */
	err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
	if (err)
		return err;

	if (root_fsid.val[0] != fsid->val[0] ||
	    root_fsid.val[1] != fsid->val[1])
		return -EXDEV;

	return 0;
}

/* Check if filesystem can encode a unique fid */
static int fanotify_test_fid(struct dentry *dentry)
{
	/*
	 * We need to make sure that the file system supports at least
	 * encoding a file handle so user can use name_to_handle_at() to
	 * compare fid returned with event to the file handle of watched
	 * objects. However, name_to_handle_at() requires that the
	 * filesystem also supports decoding file handles.
	 */
	if (!dentry->d_sb->s_export_op ||
	    !dentry->d_sb->s_export_op->fh_to_dentry)
		return -EOPNOTSUPP;

	return 0;
}

/*
 * Reject event masks that cannot be safely watched on @path's filesystem.
 * Currently this only refuses permission events on filesystems flagged
 * FS_DISALLOW_NOTIFY_PERM (see comment below for the deadlock rationale).
 */
static int fanotify_events_supported(struct path *path, __u64 mask)
{
	/*
	 * Some filesystems such as 'proc' acquire unusual locks when opening
	 * files.  For them fanotify permission events have high chances of
	 * deadlocking the system - open done when reporting fanotify event
	 * blocks on this "unusual" lock while another process holding the lock
	 * waits for fanotify permission event to be answered.  Just disallow
	 * permission events for such filesystems.
	 */
	if (mask & FANOTIFY_PERM_EVENTS &&
	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
		return -EINVAL;
	return 0;
}

/* Common implementation behind the fanotify_mark(2) syscall entry points. */
static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
			    int dfd, const char __user *pathname)
{
	struct inode *inode = NULL;
	struct vfsmount *mnt = NULL;
	struct fsnotify_group *group;
	struct fd f;
	struct path path;
	__kernel_fsid_t __fsid, *fsid = NULL;
	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	bool ignored = flags & FAN_MARK_IGNORED_MASK;
	unsigned int obj_type, fid_mode;
	u32 umask = 0;
	int ret;

	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
		 __func__, fanotify_fd, flags, dfd, pathname, mask);

	/* we only use the lower 32 bits as of right now.
 */
	if (upper_32_bits(mask))
		return -EINVAL;

	if (flags & ~FANOTIFY_MARK_FLAGS)
		return -EINVAL;

	/* Translate the mark-type flag into an fsnotify object type. */
	switch (mark_type) {
	case FAN_MARK_INODE:
		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
		break;
	case FAN_MARK_MOUNT:
		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
		break;
	case FAN_MARK_FILESYSTEM:
		obj_type = FSNOTIFY_OBJ_TYPE_SB;
		break;
	default:
		return -EINVAL;
	}

	/* Exactly one of ADD/REMOVE/FLUSH must be requested. */
	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
	case FAN_MARK_ADD:
	case FAN_MARK_REMOVE:
		if (!mask)
			return -EINVAL;
		break;
	case FAN_MARK_FLUSH:
		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
		valid_mask |= FANOTIFY_PERM_EVENTS;

	if (mask & ~valid_mask)
		return -EINVAL;

	/* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
	if (ignored)
		mask &= ~FANOTIFY_EVENT_FLAGS;

	f = fdget(fanotify_fd);
	if (unlikely(!f.file))
		return -EBADF;

	/* verify that this is indeed an fanotify instance */
	ret = -EINVAL;
	if (unlikely(f.file->f_op != &fanotify_fops))
		goto fput_and_out;
	group = f.file->private_data;

	/*
	 * An unprivileged user is not allowed to setup mount nor filesystem
	 * marks. This also includes setting up such marks by a group that
	 * was initialized by an unprivileged user.
	 */
	ret = -EPERM;
	if ((!capable(CAP_SYS_ADMIN) ||
	     FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
	    mark_type != FAN_MARK_INODE)
		goto fput_and_out;

	/*
	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not
	 * allowed to set permissions events.
	 */
	ret = -EINVAL;
	if (mask & FANOTIFY_PERM_EVENTS &&
	    group->priority == FS_PRIO_0)
		goto fput_and_out;

	/* FAN_FS_ERROR may only be watched on a whole filesystem. */
	if (mask & FAN_FS_ERROR &&
	    mark_type != FAN_MARK_FILESYSTEM)
		goto fput_and_out;

	/*
	 * Events that do not carry enough information to report
	 * event->fd require a group that supports reporting fid. Those
	 * events are not supported on a mount mark, because they do not
	 * carry enough information (i.e. path) to be filtered by mount
	 * point.
	 */
	fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
	if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
	    (!fid_mode || mark_type == FAN_MARK_MOUNT))
		goto fput_and_out;

	/*
	 * FAN_RENAME uses special info type records to report the old and
	 * new parent+name. Reporting only old and new parent id is less
	 * useful and was not implemented.
	 */
	if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
		goto fput_and_out;

	if (flags & FAN_MARK_FLUSH) {
		ret = 0;
		if (mark_type == FAN_MARK_MOUNT)
			fsnotify_clear_vfsmount_marks_by_group(group);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			fsnotify_clear_sb_marks_by_group(group);
		else
			fsnotify_clear_inode_marks_by_group(group);
		goto fput_and_out;
	}

	ret = fanotify_find_path(dfd, pathname, &path, flags,
				 (mask & ALL_FSNOTIFY_EVENTS), obj_type);
	if (ret)
		goto fput_and_out;

	if (flags & FAN_MARK_ADD) {
		ret = fanotify_events_supported(&path, mask);
		if (ret)
			goto path_put_and_out;
	}

	/* fid modes need a stable, root-matching fsid and fh encoding. */
	if (fid_mode) {
		ret = fanotify_test_fsid(path.dentry, &__fsid);
		if (ret)
			goto path_put_and_out;

		ret = fanotify_test_fid(path.dentry);
		if (ret)
			goto path_put_and_out;

		fsid = &__fsid;
	}

	/* inode held in place by reference to path; group by fget on fd */
	if (mark_type == FAN_MARK_INODE)
		inode =
path.dentry->d_inode;
	else
		mnt = path.mnt;

	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
	if (mnt || !S_ISDIR(inode->i_mode)) {
		mask &= ~FAN_EVENT_ON_CHILD;
		umask = FAN_EVENT_ON_CHILD;
		/*
		 * If group needs to report parent fid, register for getting
		 * events with parent/name info for non-directory.
		 */
		if ((fid_mode & FAN_REPORT_DIR_FID) &&
		    (flags & FAN_MARK_ADD) && !ignored)
			mask |= FAN_EVENT_ON_CHILD;
	}

	/* create/update an inode mark */
	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
	case FAN_MARK_ADD:
		if (mark_type == FAN_MARK_MOUNT)
			ret = fanotify_add_vfsmount_mark(group, mnt, mask,
							 flags, fsid);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
						   flags, fsid);
		else
			ret = fanotify_add_inode_mark(group, inode, mask,
						      flags, fsid);
		break;
	case FAN_MARK_REMOVE:
		if (mark_type == FAN_MARK_MOUNT)
			ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
							    flags, umask);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
						      flags, umask);
		else
			ret = fanotify_remove_inode_mark(group, inode, mask,
							 flags, umask);
		break;
	default:
		ret = -EINVAL;
	}

path_put_and_out:
	path_put(&path);
fput_and_out:
	fdput(f);
	return ret;
}

#ifndef CONFIG_ARCH_SPLIT_ARG64
SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
		__u64, mask, int, dfd,
		const char __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
}
#endif

#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
/* 32-bit ABI variant: the 64-bit mask arrives as two 32-bit halves. */
SYSCALL32_DEFINE6(fanotify_mark,
		  int, fanotify_fd, unsigned int, flags,
		  SC_ARG64(mask), int, dfd,
		  const char __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask),
				dfd, pathname);
}
#endif

/*
 * fanotify_user_setup - Our initialization function. Note that we cannot return
 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
 * must result in panic().
 */
static int __init fanotify_user_setup(void)
{
	struct sysinfo si;
	int max_marks;

	si_meminfo(&si);
	/*
	 * Allow up to 1% of addressable memory to be accounted for per user
	 * marks limited to the range [8192, 1048576]. mount and sb marks are
	 * a lot cheaper than inode marks, but there is no reason for a user
	 * to have many of those, so calculate by the cost of inode marks.
	 */
	max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
		    INODE_MARK_COST;
	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
			  FANOTIFY_DEFAULT_MAX_USER_MARKS);

	/* Compile-time sanity checks on the UAPI flag sets. */
	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);

	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
					 SLAB_PANIC|SLAB_ACCOUNT);
	fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
					       SLAB_PANIC);
	fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
						SLAB_PANIC);
	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
		fanotify_perm_event_cachep =
			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
	}

	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
		FANOTIFY_DEFAULT_MAX_GROUPS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
	fanotify_sysctls_init();

	return 0;
}
device_initcall(fanotify_user_setup);