1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/fanotify.h> 3 #include <linux/fcntl.h> 4 #include <linux/fdtable.h> 5 #include <linux/file.h> 6 #include <linux/fs.h> 7 #include <linux/anon_inodes.h> 8 #include <linux/fsnotify_backend.h> 9 #include <linux/init.h> 10 #include <linux/mount.h> 11 #include <linux/namei.h> 12 #include <linux/poll.h> 13 #include <linux/security.h> 14 #include <linux/syscalls.h> 15 #include <linux/slab.h> 16 #include <linux/types.h> 17 #include <linux/uaccess.h> 18 #include <linux/compat.h> 19 #include <linux/sched/signal.h> 20 #include <linux/memcontrol.h> 21 #include <linux/statfs.h> 22 #include <linux/exportfs.h> 23 24 #include <asm/ioctls.h> 25 26 #include "../../mount.h" 27 #include "../fdinfo.h" 28 #include "fanotify.h" 29 30 #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 31 #define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192 32 #define FANOTIFY_DEFAULT_MAX_GROUPS 128 33 34 /* 35 * Legacy fanotify marks limits (8192) is per group and we introduced a tunable 36 * limit of marks per user, similar to inotify. Effectively, the legacy limit 37 * of fanotify marks per user is <max marks per group> * <max groups per user>. 38 * This default limit (1M) also happens to match the increased limit of inotify 39 * max_user_watches since v5.10. 40 */ 41 #define FANOTIFY_DEFAULT_MAX_USER_MARKS \ 42 (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS) 43 44 /* 45 * Most of the memory cost of adding an inode mark is pinning the marked inode. 46 * The size of the filesystem inode struct is not uniform across filesystems, 47 * so double the size of a VFS inode is used as a conservative approximation. 48 */ 49 #define INODE_MARK_COST (2 * sizeof(struct inode)) 50 51 /* configurable via /proc/sys/fs/fanotify/ */ 52 static int fanotify_max_queued_events __read_mostly; 53 54 #ifdef CONFIG_SYSCTL 55 56 #include <linux/sysctl.h> 57 58 static long ft_zero = 0; 59 static long ft_int_max = INT_MAX; 60 61 struct ctl_table fanotify_table[] = { 62 { 63 .procname = "max_user_groups", 64 .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], 65 .maxlen = sizeof(long), 66 .mode = 0644, 67 .proc_handler = proc_doulongvec_minmax, 68 .extra1 = &ft_zero, 69 .extra2 = &ft_int_max, 70 }, 71 { 72 .procname = "max_user_marks", 73 .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS], 74 .maxlen = sizeof(long), 75 .mode = 0644, 76 .proc_handler = proc_doulongvec_minmax, 77 .extra1 = &ft_zero, 78 .extra2 = &ft_int_max, 79 }, 80 { 81 .procname = "max_queued_events", 82 .data = &fanotify_max_queued_events, 83 .maxlen = sizeof(int), 84 .mode = 0644, 85 .proc_handler = proc_dointvec_minmax, 86 .extra1 = SYSCTL_ZERO 87 }, 88 { } 89 }; 90 #endif /* CONFIG_SYSCTL */ 91 92 /* 93 * All flags that may be specified in parameter event_f_flags of fanotify_init. 94 * 95 * Internal and external open flags are stored together in field f_flags of 96 * struct file. Only external open flags shall be allowed in event_f_flags. 97 * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be 98 * excluded. 99 */ 100 #define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \ 101 O_ACCMODE | O_APPEND | O_NONBLOCK | \ 102 __O_SYNC | O_DSYNC | O_CLOEXEC | \ 103 O_LARGEFILE | O_NOATIME ) 104 105 extern const struct fsnotify_ops fanotify_fsnotify_ops; 106 107 struct kmem_cache *fanotify_mark_cache __read_mostly; 108 struct kmem_cache *fanotify_fid_event_cachep __read_mostly; 109 struct kmem_cache *fanotify_path_event_cachep __read_mostly; 110 struct kmem_cache *fanotify_perm_event_cachep __read_mostly; 111 112 #define FANOTIFY_EVENT_ALIGN 4 113 #define FANOTIFY_FID_INFO_HDR_LEN \ 114 (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) 115 #define FANOTIFY_PIDFD_INFO_HDR_LEN \ 116 sizeof(struct fanotify_event_info_pidfd) 117 118 static int fanotify_fid_info_len(int fh_len, int name_len) 119 { 120 int info_len = fh_len; 121 122 if (name_len) 123 info_len += name_len + 1; 124 125 return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len, 126 FANOTIFY_EVENT_ALIGN); 127 } 128 129 static int fanotify_event_info_len(unsigned int info_mode, 130 struct fanotify_event *event) 131 { 132 struct fanotify_info *info = fanotify_event_info(event); 133 int dir_fh_len = fanotify_event_dir_fh_len(event); 134 int fh_len = fanotify_event_object_fh_len(event); 135 int info_len = 0; 136 int dot_len = 0; 137 138 if (dir_fh_len) { 139 info_len += fanotify_fid_info_len(dir_fh_len, info->name_len); 140 } else if ((info_mode & FAN_REPORT_NAME) && 141 (event->mask & FAN_ONDIR)) { 142 /* 143 * With group flag FAN_REPORT_NAME, if name was not recorded in 144 * event on a directory, we will report the name ".". 145 */ 146 dot_len = 1; 147 } 148 149 if (info_mode & FAN_REPORT_PIDFD) 150 info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; 151 152 if (fh_len) 153 info_len += fanotify_fid_info_len(fh_len, dot_len); 154 155 return info_len; 156 } 157 158 /* 159 * Remove an hashed event from merge hash table. 160 */ 161 static void fanotify_unhash_event(struct fsnotify_group *group, 162 struct fanotify_event *event) 163 { 164 assert_spin_locked(&group->notification_lock); 165 166 pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, 167 group, event, fanotify_event_hash_bucket(group, event)); 168 169 if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list))) 170 return; 171 172 hlist_del_init(&event->merge_list); 173 } 174 175 /* 176 * Get an fanotify notification event if one exists and is small 177 * enough to fit in "count". Return an error pointer if the count 178 * is not large enough. When permission event is dequeued, its state is 179 * updated accordingly. 180 */ 181 static struct fanotify_event *get_one_event(struct fsnotify_group *group, 182 size_t count) 183 { 184 size_t event_size = FAN_EVENT_METADATA_LEN; 185 struct fanotify_event *event = NULL; 186 struct fsnotify_event *fsn_event; 187 unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); 188 189 pr_debug("%s: group=%p count=%zd\n", __func__, group, count); 190 191 spin_lock(&group->notification_lock); 192 fsn_event = fsnotify_peek_first_event(group); 193 if (!fsn_event) 194 goto out; 195 196 event = FANOTIFY_E(fsn_event); 197 if (info_mode) 198 event_size += fanotify_event_info_len(info_mode, event); 199 200 if (event_size > count) { 201 event = ERR_PTR(-EINVAL); 202 goto out; 203 } 204 205 /* 206 * Held the notification_lock the whole time, so this is the 207 * same event we peeked above. 208 */ 209 fsnotify_remove_first_event(group); 210 if (fanotify_is_perm_event(event->mask)) 211 FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED; 212 if (fanotify_is_hashed_event(event->mask)) 213 fanotify_unhash_event(group, event); 214 out: 215 spin_unlock(&group->notification_lock); 216 return event; 217 } 218 219 static int create_fd(struct fsnotify_group *group, struct path *path, 220 struct file **file) 221 { 222 int client_fd; 223 struct file *new_file; 224 225 client_fd = get_unused_fd_flags(group->fanotify_data.f_flags); 226 if (client_fd < 0) 227 return client_fd; 228 229 /* 230 * we need a new file handle for the userspace program so it can read even if it was 231 * originally opened O_WRONLY. 232 */ 233 new_file = dentry_open(path, 234 group->fanotify_data.f_flags | FMODE_NONOTIFY, 235 current_cred()); 236 if (IS_ERR(new_file)) { 237 /* 238 * we still send an event even if we can't open the file. this 239 * can happen when say tasks are gone and we try to open their 240 * /proc files or we try to open a WRONLY file like in sysfs 241 * we just send the errno to userspace since there isn't much 242 * else we can do. 243 */ 244 put_unused_fd(client_fd); 245 client_fd = PTR_ERR(new_file); 246 } else { 247 *file = new_file; 248 } 249 250 return client_fd; 251 } 252 253 /* 254 * Finish processing of permission event by setting it to ANSWERED state and 255 * drop group->notification_lock. 256 */ 257 static void finish_permission_event(struct fsnotify_group *group, 258 struct fanotify_perm_event *event, 259 unsigned int response) 260 __releases(&group->notification_lock) 261 { 262 bool destroy = false; 263 264 assert_spin_locked(&group->notification_lock); 265 event->response = response; 266 if (event->state == FAN_EVENT_CANCELED) 267 destroy = true; 268 else 269 event->state = FAN_EVENT_ANSWERED; 270 spin_unlock(&group->notification_lock); 271 if (destroy) 272 fsnotify_destroy_event(group, &event->fae.fse); 273 } 274 275 static int process_access_response(struct fsnotify_group *group, 276 struct fanotify_response *response_struct) 277 { 278 struct fanotify_perm_event *event; 279 int fd = response_struct->fd; 280 int response = response_struct->response; 281 282 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, 283 fd, response); 284 /* 285 * make sure the response is valid, if invalid we do nothing and either 286 * userspace can send a valid response or we will clean it up after the 287 * timeout 288 */ 289 switch (response & ~FAN_AUDIT) { 290 case FAN_ALLOW: 291 case FAN_DENY: 292 break; 293 default: 294 return -EINVAL; 295 } 296 297 if (fd < 0) 298 return -EINVAL; 299 300 if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) 301 return -EINVAL; 302 303 spin_lock(&group->notification_lock); 304 list_for_each_entry(event, &group->fanotify_data.access_list, 305 fae.fse.list) { 306 if (event->fd != fd) 307 continue; 308 309 list_del_init(&event->fae.fse.list); 310 finish_permission_event(group, event, response); 311 wake_up(&group->fanotify_data.access_waitq); 312 return 0; 313 } 314 spin_unlock(&group->notification_lock); 315 316 return -ENOENT; 317 } 318 319 static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, 320 int info_type, const char *name, 321 size_t name_len, 322 char __user *buf, size_t count) 323 { 324 struct fanotify_event_info_fid info = { }; 325 struct file_handle handle = { }; 326 unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf; 327 size_t fh_len = fh ? fh->len : 0; 328 size_t info_len = fanotify_fid_info_len(fh_len, name_len); 329 size_t len = info_len; 330 331 pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n", 332 __func__, fh_len, name_len, info_len, count); 333 334 if (!fh_len) 335 return 0; 336 337 if (WARN_ON_ONCE(len < sizeof(info) || len > count)) 338 return -EFAULT; 339 340 /* 341 * Copy event info fid header followed by variable sized file handle 342 * and optionally followed by variable sized filename. 343 */ 344 switch (info_type) { 345 case FAN_EVENT_INFO_TYPE_FID: 346 case FAN_EVENT_INFO_TYPE_DFID: 347 if (WARN_ON_ONCE(name_len)) 348 return -EFAULT; 349 break; 350 case FAN_EVENT_INFO_TYPE_DFID_NAME: 351 if (WARN_ON_ONCE(!name || !name_len)) 352 return -EFAULT; 353 break; 354 default: 355 return -EFAULT; 356 } 357 358 info.hdr.info_type = info_type; 359 info.hdr.len = len; 360 info.fsid = *fsid; 361 if (copy_to_user(buf, &info, sizeof(info))) 362 return -EFAULT; 363 364 buf += sizeof(info); 365 len -= sizeof(info); 366 if (WARN_ON_ONCE(len < sizeof(handle))) 367 return -EFAULT; 368 369 handle.handle_type = fh->type; 370 handle.handle_bytes = fh_len; 371 if (copy_to_user(buf, &handle, sizeof(handle))) 372 return -EFAULT; 373 374 buf += sizeof(handle); 375 len -= sizeof(handle); 376 if (WARN_ON_ONCE(len < fh_len)) 377 return -EFAULT; 378 379 /* 380 * For an inline fh and inline file name, copy through stack to exclude 381 * the copy from usercopy hardening protections. 382 */ 383 fh_buf = fanotify_fh_buf(fh); 384 if (fh_len <= FANOTIFY_INLINE_FH_LEN) { 385 memcpy(bounce, fh_buf, fh_len); 386 fh_buf = bounce; 387 } 388 if (copy_to_user(buf, fh_buf, fh_len)) 389 return -EFAULT; 390 391 buf += fh_len; 392 len -= fh_len; 393 394 if (name_len) { 395 /* Copy the filename with terminating null */ 396 name_len++; 397 if (WARN_ON_ONCE(len < name_len)) 398 return -EFAULT; 399 400 if (copy_to_user(buf, name, name_len)) 401 return -EFAULT; 402 403 buf += name_len; 404 len -= name_len; 405 } 406 407 /* Pad with 0's */ 408 WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN); 409 if (len > 0 && clear_user(buf, len)) 410 return -EFAULT; 411 412 return info_len; 413 } 414 415 static int copy_pidfd_info_to_user(int pidfd, 416 char __user *buf, 417 size_t count) 418 { 419 struct fanotify_event_info_pidfd info = { }; 420 size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; 421 422 if (WARN_ON_ONCE(info_len > count)) 423 return -EFAULT; 424 425 info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; 426 info.hdr.len = info_len; 427 info.pidfd = pidfd; 428 429 if (copy_to_user(buf, &info, info_len)) 430 return -EFAULT; 431 432 return info_len; 433 } 434 435 static int copy_info_records_to_user(struct fanotify_event *event, 436 struct fanotify_info *info, 437 unsigned int info_mode, int pidfd, 438 char __user *buf, size_t count) 439 { 440 int ret, total_bytes = 0, info_type = 0; 441 unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS; 442 unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; 443 444 /* 445 * Event info records order is as follows: dir fid + name, child fid. 446 */ 447 if (fanotify_event_dir_fh_len(event)) { 448 info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : 449 FAN_EVENT_INFO_TYPE_DFID; 450 ret = copy_fid_info_to_user(fanotify_event_fsid(event), 451 fanotify_info_dir_fh(info), 452 info_type, 453 fanotify_info_name(info), 454 info->name_len, buf, count); 455 if (ret < 0) 456 return ret; 457 458 buf += ret; 459 count -= ret; 460 total_bytes += ret; 461 } 462 463 if (fanotify_event_object_fh_len(event)) { 464 const char *dot = NULL; 465 int dot_len = 0; 466 467 if (fid_mode == FAN_REPORT_FID || info_type) { 468 /* 469 * With only group flag FAN_REPORT_FID only type FID is 470 * reported. Second info record type is always FID. 471 */ 472 info_type = FAN_EVENT_INFO_TYPE_FID; 473 } else if ((fid_mode & FAN_REPORT_NAME) && 474 (event->mask & FAN_ONDIR)) { 475 /* 476 * With group flag FAN_REPORT_NAME, if name was not 477 * recorded in an event on a directory, report the name 478 * "." with info type DFID_NAME. 479 */ 480 info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; 481 dot = "."; 482 dot_len = 1; 483 } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || 484 (event->mask & FAN_ONDIR)) { 485 /* 486 * With group flag FAN_REPORT_DIR_FID, a single info 487 * record has type DFID for directory entry modification 488 * event and for event on a directory. 489 */ 490 info_type = FAN_EVENT_INFO_TYPE_DFID; 491 } else { 492 /* 493 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, 494 * a single info record has type FID for event on a 495 * non-directory, when there is no directory to report. 496 * For example, on FAN_DELETE_SELF event. 497 */ 498 info_type = FAN_EVENT_INFO_TYPE_FID; 499 } 500 501 ret = copy_fid_info_to_user(fanotify_event_fsid(event), 502 fanotify_event_object_fh(event), 503 info_type, dot, dot_len, 504 buf, count); 505 if (ret < 0) 506 return ret; 507 508 buf += ret; 509 count -= ret; 510 total_bytes += ret; 511 } 512 513 if (pidfd_mode) { 514 ret = copy_pidfd_info_to_user(pidfd, buf, count); 515 if (ret < 0) 516 return ret; 517 518 buf += ret; 519 count -= ret; 520 total_bytes += ret; 521 } 522 523 return total_bytes; 524 } 525 526 static ssize_t copy_event_to_user(struct fsnotify_group *group, 527 struct fanotify_event *event, 528 char __user *buf, size_t count) 529 { 530 struct fanotify_event_metadata metadata; 531 struct path *path = fanotify_event_path(event); 532 struct fanotify_info *info = fanotify_event_info(event); 533 unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); 534 unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; 535 struct file *f = NULL; 536 int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD; 537 538 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 539 540 metadata.event_len = FAN_EVENT_METADATA_LEN + 541 fanotify_event_info_len(info_mode, event); 542 metadata.metadata_len = FAN_EVENT_METADATA_LEN; 543 metadata.vers = FANOTIFY_METADATA_VERSION; 544 metadata.reserved = 0; 545 metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; 546 metadata.pid = pid_vnr(event->pid); 547 /* 548 * For an unprivileged listener, event->pid can be used to identify the 549 * events generated by the listener process itself, without disclosing 550 * the pids of other processes. 551 */ 552 if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && 553 task_tgid(current) != event->pid) 554 metadata.pid = 0; 555 556 /* 557 * For now, fid mode is required for an unprivileged listener and 558 * fid mode does not report fd in events. Keep this check anyway 559 * for safety in case fid mode requirement is relaxed in the future 560 * to allow unprivileged listener to get events with no fd and no fid. 561 */ 562 if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && 563 path && path->mnt && path->dentry) { 564 fd = create_fd(group, path, &f); 565 if (fd < 0) 566 return fd; 567 } 568 metadata.fd = fd; 569 570 if (pidfd_mode) { 571 /* 572 * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual 573 * exclusion is ever lifted. At the time of incoporating pidfd 574 * support within fanotify, the pidfd API only supported the 575 * creation of pidfds for thread-group leaders. 576 */ 577 WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID)); 578 579 /* 580 * The PIDTYPE_TGID check for an event->pid is performed 581 * preemptively in an attempt to catch out cases where the event 582 * listener reads events after the event generating process has 583 * already terminated. Report FAN_NOPIDFD to the event listener 584 * in those cases, with all other pidfd creation errors being 585 * reported as FAN_EPIDFD. 586 */ 587 if (metadata.pid == 0 || 588 !pid_has_task(event->pid, PIDTYPE_TGID)) { 589 pidfd = FAN_NOPIDFD; 590 } else { 591 pidfd = pidfd_create(event->pid, 0); 592 if (pidfd < 0) 593 pidfd = FAN_EPIDFD; 594 } 595 } 596 597 ret = -EFAULT; 598 /* 599 * Sanity check copy size in case get_one_event() and 600 * event_len sizes ever get out of sync. 601 */ 602 if (WARN_ON_ONCE(metadata.event_len > count)) 603 goto out_close_fd; 604 605 if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN)) 606 goto out_close_fd; 607 608 buf += FAN_EVENT_METADATA_LEN; 609 count -= FAN_EVENT_METADATA_LEN; 610 611 if (fanotify_is_perm_event(event->mask)) 612 FANOTIFY_PERM(event)->fd = fd; 613 614 if (f) 615 fd_install(fd, f); 616 617 if (info_mode) { 618 ret = copy_info_records_to_user(event, info, info_mode, pidfd, 619 buf, count); 620 if (ret < 0) 621 goto out_close_fd; 622 } 623 624 return metadata.event_len; 625 626 out_close_fd: 627 if (fd != FAN_NOFD) { 628 put_unused_fd(fd); 629 fput(f); 630 } 631 632 if (pidfd >= 0) 633 close_fd(pidfd); 634 635 return ret; 636 } 637 638 /* intofiy userspace file descriptor functions */ 639 static __poll_t fanotify_poll(struct file *file, poll_table *wait) 640 { 641 struct fsnotify_group *group = file->private_data; 642 __poll_t ret = 0; 643 644 poll_wait(file, &group->notification_waitq, wait); 645 spin_lock(&group->notification_lock); 646 if (!fsnotify_notify_queue_is_empty(group)) 647 ret = EPOLLIN | EPOLLRDNORM; 648 spin_unlock(&group->notification_lock); 649 650 return ret; 651 } 652 653 static ssize_t fanotify_read(struct file *file, char __user *buf, 654 size_t count, loff_t *pos) 655 { 656 struct fsnotify_group *group; 657 struct fanotify_event *event; 658 char __user *start; 659 int ret; 660 DEFINE_WAIT_FUNC(wait, woken_wake_function); 661 662 start = buf; 663 group = file->private_data; 664 665 pr_debug("%s: group=%p\n", __func__, group); 666 667 add_wait_queue(&group->notification_waitq, &wait); 668 while (1) { 669 /* 670 * User can supply arbitrarily large buffer. Avoid softlockups 671 * in case there are lots of available events. 672 */ 673 cond_resched(); 674 event = get_one_event(group, count); 675 if (IS_ERR(event)) { 676 ret = PTR_ERR(event); 677 break; 678 } 679 680 if (!event) { 681 ret = -EAGAIN; 682 if (file->f_flags & O_NONBLOCK) 683 break; 684 685 ret = -ERESTARTSYS; 686 if (signal_pending(current)) 687 break; 688 689 if (start != buf) 690 break; 691 692 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 693 continue; 694 } 695 696 ret = copy_event_to_user(group, event, buf, count); 697 if (unlikely(ret == -EOPENSTALE)) { 698 /* 699 * We cannot report events with stale fd so drop it. 700 * Setting ret to 0 will continue the event loop and 701 * do the right thing if there are no more events to 702 * read (i.e. return bytes read, -EAGAIN or wait). 703 */ 704 ret = 0; 705 } 706 707 /* 708 * Permission events get queued to wait for response. Other 709 * events can be destroyed now. 710 */ 711 if (!fanotify_is_perm_event(event->mask)) { 712 fsnotify_destroy_event(group, &event->fse); 713 } else { 714 if (ret <= 0) { 715 spin_lock(&group->notification_lock); 716 finish_permission_event(group, 717 FANOTIFY_PERM(event), FAN_DENY); 718 wake_up(&group->fanotify_data.access_waitq); 719 } else { 720 spin_lock(&group->notification_lock); 721 list_add_tail(&event->fse.list, 722 &group->fanotify_data.access_list); 723 spin_unlock(&group->notification_lock); 724 } 725 } 726 if (ret < 0) 727 break; 728 buf += ret; 729 count -= ret; 730 } 731 remove_wait_queue(&group->notification_waitq, &wait); 732 733 if (start != buf && ret != -EFAULT) 734 ret = buf - start; 735 return ret; 736 } 737 738 static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) 739 { 740 struct fanotify_response response = { .fd = -1, .response = -1 }; 741 struct fsnotify_group *group; 742 int ret; 743 744 if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) 745 return -EINVAL; 746 747 group = file->private_data; 748 749 if (count < sizeof(response)) 750 return -EINVAL; 751 752 count = sizeof(response); 753 754 pr_debug("%s: group=%p count=%zu\n", __func__, group, count); 755 756 if (copy_from_user(&response, buf, count)) 757 return -EFAULT; 758 759 ret = process_access_response(group, &response); 760 if (ret < 0) 761 count = ret; 762 763 return count; 764 } 765 766 static int fanotify_release(struct inode *ignored, struct file *file) 767 { 768 struct fsnotify_group *group = file->private_data; 769 struct fsnotify_event *fsn_event; 770 771 /* 772 * Stop new events from arriving in the notification queue. since 773 * userspace cannot use fanotify fd anymore, no event can enter or 774 * leave access_list by now either. 775 */ 776 fsnotify_group_stop_queueing(group); 777 778 /* 779 * Process all permission events on access_list and notification queue 780 * and simulate reply from userspace. 781 */ 782 spin_lock(&group->notification_lock); 783 while (!list_empty(&group->fanotify_data.access_list)) { 784 struct fanotify_perm_event *event; 785 786 event = list_first_entry(&group->fanotify_data.access_list, 787 struct fanotify_perm_event, fae.fse.list); 788 list_del_init(&event->fae.fse.list); 789 finish_permission_event(group, event, FAN_ALLOW); 790 spin_lock(&group->notification_lock); 791 } 792 793 /* 794 * Destroy all non-permission events. For permission events just 795 * dequeue them and set the response. They will be freed once the 796 * response is consumed and fanotify_get_response() returns. 797 */ 798 while ((fsn_event = fsnotify_remove_first_event(group))) { 799 struct fanotify_event *event = FANOTIFY_E(fsn_event); 800 801 if (!(event->mask & FANOTIFY_PERM_EVENTS)) { 802 spin_unlock(&group->notification_lock); 803 fsnotify_destroy_event(group, fsn_event); 804 } else { 805 finish_permission_event(group, FANOTIFY_PERM(event), 806 FAN_ALLOW); 807 } 808 spin_lock(&group->notification_lock); 809 } 810 spin_unlock(&group->notification_lock); 811 812 /* Response for all permission events it set, wakeup waiters */ 813 wake_up(&group->fanotify_data.access_waitq); 814 815 /* matches the fanotify_init->fsnotify_alloc_group */ 816 fsnotify_destroy_group(group); 817 818 return 0; 819 } 820 821 static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 822 { 823 struct fsnotify_group *group; 824 struct fsnotify_event *fsn_event; 825 void __user *p; 826 int ret = -ENOTTY; 827 size_t send_len = 0; 828 829 group = file->private_data; 830 831 p = (void __user *) arg; 832 833 switch (cmd) { 834 case FIONREAD: 835 spin_lock(&group->notification_lock); 836 list_for_each_entry(fsn_event, &group->notification_list, list) 837 send_len += FAN_EVENT_METADATA_LEN; 838 spin_unlock(&group->notification_lock); 839 ret = put_user(send_len, (int __user *) p); 840 break; 841 } 842 843 return ret; 844 } 845 846 static const struct file_operations fanotify_fops = { 847 .show_fdinfo = fanotify_show_fdinfo, 848 .poll = fanotify_poll, 849 .read = fanotify_read, 850 .write = fanotify_write, 851 .fasync = NULL, 852 .release = fanotify_release, 853 .unlocked_ioctl = fanotify_ioctl, 854 .compat_ioctl = compat_ptr_ioctl, 855 .llseek = noop_llseek, 856 }; 857 858 static int fanotify_find_path(int dfd, const char __user *filename, 859 struct path *path, unsigned int flags, __u64 mask, 860 unsigned int obj_type) 861 { 862 int ret; 863 864 pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__, 865 dfd, filename, flags); 866 867 if (filename == NULL) { 868 struct fd f = fdget(dfd); 869 870 ret = -EBADF; 871 if (!f.file) 872 goto out; 873 874 ret = -ENOTDIR; 875 if ((flags & FAN_MARK_ONLYDIR) && 876 !(S_ISDIR(file_inode(f.file)->i_mode))) { 877 fdput(f); 878 goto out; 879 } 880 881 *path = f.file->f_path; 882 path_get(path); 883 fdput(f); 884 } else { 885 unsigned int lookup_flags = 0; 886 887 if (!(flags & FAN_MARK_DONT_FOLLOW)) 888 lookup_flags |= LOOKUP_FOLLOW; 889 if (flags & FAN_MARK_ONLYDIR) 890 lookup_flags |= LOOKUP_DIRECTORY; 891 892 ret = user_path_at(dfd, filename, lookup_flags, path); 893 if (ret) 894 goto out; 895 } 896 897 /* you can only watch an inode if you have read permissions on it */ 898 ret = path_permission(path, MAY_READ); 899 if (ret) { 900 path_put(path); 901 goto out; 902 } 903 904 ret = security_path_notify(path, mask, obj_type); 905 if (ret) 906 path_put(path); 907 908 out: 909 return ret; 910 } 911 912 static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, 913 __u32 mask, unsigned int flags, 914 __u32 umask, int *destroy) 915 { 916 __u32 oldmask = 0; 917 918 /* umask bits cannot be removed by user */ 919 mask &= ~umask; 920 spin_lock(&fsn_mark->lock); 921 if (!(flags & FAN_MARK_IGNORED_MASK)) { 922 oldmask = fsn_mark->mask; 923 fsn_mark->mask &= ~mask; 924 } else { 925 fsn_mark->ignored_mask &= ~mask; 926 } 927 /* 928 * We need to keep the mark around even if remaining mask cannot 929 * result in any events (e.g. mask == FAN_ONDIR) to support incremenal 930 * changes to the mask. 931 * Destroy mark when only umask bits remain. 932 */ 933 *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask); 934 spin_unlock(&fsn_mark->lock); 935 936 return mask & oldmask; 937 } 938 939 static int fanotify_remove_mark(struct fsnotify_group *group, 940 fsnotify_connp_t *connp, __u32 mask, 941 unsigned int flags, __u32 umask) 942 { 943 struct fsnotify_mark *fsn_mark = NULL; 944 __u32 removed; 945 int destroy_mark; 946 947 mutex_lock(&group->mark_mutex); 948 fsn_mark = fsnotify_find_mark(connp, group); 949 if (!fsn_mark) { 950 mutex_unlock(&group->mark_mutex); 951 return -ENOENT; 952 } 953 954 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, 955 umask, &destroy_mark); 956 if (removed & fsnotify_conn_mask(fsn_mark->connector)) 957 fsnotify_recalc_mask(fsn_mark->connector); 958 if (destroy_mark) 959 fsnotify_detach_mark(fsn_mark); 960 mutex_unlock(&group->mark_mutex); 961 if (destroy_mark) 962 fsnotify_free_mark(fsn_mark); 963 964 /* matches the fsnotify_find_mark() */ 965 fsnotify_put_mark(fsn_mark); 966 return 0; 967 } 968 969 static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, 970 struct vfsmount *mnt, __u32 mask, 971 unsigned int flags, __u32 umask) 972 { 973 return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, 974 mask, flags, umask); 975 } 976 977 static int fanotify_remove_sb_mark(struct fsnotify_group *group, 978 struct super_block *sb, __u32 mask, 979 unsigned int flags, __u32 umask) 980 { 981 return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, 982 flags, umask); 983 } 984 985 static int fanotify_remove_inode_mark(struct fsnotify_group *group, 986 struct inode *inode, __u32 mask, 987 unsigned int flags, __u32 umask) 988 { 989 return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask, 990 flags, umask); 991 } 992 993 static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, 994 __u32 mask, 995 unsigned int flags) 996 { 997 __u32 oldmask = -1; 998 999 spin_lock(&fsn_mark->lock); 1000 if (!(flags & FAN_MARK_IGNORED_MASK)) { 1001 oldmask = fsn_mark->mask; 1002 fsn_mark->mask |= mask; 1003 } else { 1004 fsn_mark->ignored_mask |= mask; 1005 if (flags & FAN_MARK_IGNORED_SURV_MODIFY) 1006 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; 1007 } 1008 spin_unlock(&fsn_mark->lock); 1009 1010 return mask & ~oldmask; 1011 } 1012 1013 static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, 1014 fsnotify_connp_t *connp, 1015 unsigned int type, 1016 __kernel_fsid_t *fsid) 1017 { 1018 struct ucounts *ucounts = group->fanotify_data.ucounts; 1019 struct fsnotify_mark *mark; 1020 int ret; 1021 1022 /* 1023 * Enforce per user marks limits per user in all containing user ns. 1024 * A group with FAN_UNLIMITED_MARKS does not contribute to mark count 1025 * in the limited groups account. 1026 */ 1027 if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) && 1028 !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) 1029 return ERR_PTR(-ENOSPC); 1030 1031 mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 1032 if (!mark) { 1033 ret = -ENOMEM; 1034 goto out_dec_ucounts; 1035 } 1036 1037 fsnotify_init_mark(mark, group); 1038 ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid); 1039 if (ret) { 1040 fsnotify_put_mark(mark); 1041 goto out_dec_ucounts; 1042 } 1043 1044 return mark; 1045 1046 out_dec_ucounts: 1047 if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) 1048 dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS); 1049 return ERR_PTR(ret); 1050 } 1051 1052 1053 static int fanotify_add_mark(struct fsnotify_group *group, 1054 fsnotify_connp_t *connp, unsigned int type, 1055 __u32 mask, unsigned int flags, 1056 __kernel_fsid_t *fsid) 1057 { 1058 struct fsnotify_mark *fsn_mark; 1059 __u32 added; 1060 1061 mutex_lock(&group->mark_mutex); 1062 fsn_mark = fsnotify_find_mark(connp, group); 1063 if (!fsn_mark) { 1064 fsn_mark = fanotify_add_new_mark(group, connp, type, fsid); 1065 if (IS_ERR(fsn_mark)) { 1066 mutex_unlock(&group->mark_mutex); 1067 return PTR_ERR(fsn_mark); 1068 } 1069 } 1070 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 1071 if (added & ~fsnotify_conn_mask(fsn_mark->connector)) 1072 fsnotify_recalc_mask(fsn_mark->connector); 1073 mutex_unlock(&group->mark_mutex); 1074 1075 fsnotify_put_mark(fsn_mark); 1076 return 0; 1077 } 1078 1079 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, 1080 struct vfsmount *mnt, __u32 mask, 1081 unsigned int flags, __kernel_fsid_t *fsid) 1082 { 1083 return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, 1084 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid); 1085 } 1086 1087 static int fanotify_add_sb_mark(struct fsnotify_group *group, 1088 struct super_block *sb, __u32 mask, 1089 unsigned int flags, __kernel_fsid_t *fsid) 1090 { 1091 return fanotify_add_mark(group, &sb->s_fsnotify_marks, 1092 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid); 1093 } 1094 1095 static int fanotify_add_inode_mark(struct fsnotify_group *group, 1096 struct inode *inode, __u32 mask, 1097 unsigned int flags, __kernel_fsid_t *fsid) 1098 { 1099 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); 1100 1101 /* 1102 * If some other task has this inode open for write we should not add 1103 * an ignored mark, unless that ignored mark is supposed to survive 1104 * modification changes anyway. 1105 */ 1106 if ((flags & FAN_MARK_IGNORED_MASK) && 1107 !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && 1108 inode_is_open_for_write(inode)) 1109 return 0; 1110 1111 return fanotify_add_mark(group, &inode->i_fsnotify_marks, 1112 FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid); 1113 } 1114 1115 static struct fsnotify_event *fanotify_alloc_overflow_event(void) 1116 { 1117 struct fanotify_event *oevent; 1118 1119 oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT); 1120 if (!oevent) 1121 return NULL; 1122 1123 fanotify_init_event(oevent, 0, FS_Q_OVERFLOW); 1124 oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW; 1125 1126 return &oevent->fse; 1127 } 1128 1129 static struct hlist_head *fanotify_alloc_merge_hash(void) 1130 { 1131 struct hlist_head *hash; 1132 1133 hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS, 1134 GFP_KERNEL_ACCOUNT); 1135 if (!hash) 1136 return NULL; 1137 1138 __hash_init(hash, FANOTIFY_HTABLE_SIZE); 1139 1140 return hash; 1141 } 1142 1143 /* fanotify syscalls */ 1144 SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) 1145 { 1146 struct fsnotify_group *group; 1147 int f_flags, fd; 1148 unsigned int fid_mode = flags & FANOTIFY_FID_BITS; 1149 unsigned int class = flags & FANOTIFY_CLASS_BITS; 1150 unsigned int internal_flags = 0; 1151 1152 pr_debug("%s: flags=%x event_f_flags=%x\n", 1153 __func__, flags, event_f_flags); 1154 1155 if (!capable(CAP_SYS_ADMIN)) { 1156 /* 1157 * An unprivileged user can setup an fanotify group with 1158 * limited functionality - an unprivileged group is limited to 1159 * notification events with file handles and it cannot use 1160 * unlimited queue/marks. 1161 */ 1162 if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode) 1163 return -EPERM; 1164 1165 /* 1166 * Setting the internal flag FANOTIFY_UNPRIV on the group 1167 * prevents setting mount/filesystem marks on this group and 1168 * prevents reporting pid and open fd in events. 1169 */ 1170 internal_flags |= FANOTIFY_UNPRIV; 1171 } 1172 1173 #ifdef CONFIG_AUDITSYSCALL 1174 if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT)) 1175 #else 1176 if (flags & ~FANOTIFY_INIT_FLAGS) 1177 #endif 1178 return -EINVAL; 1179 1180 /* 1181 * A pidfd can only be returned for a thread-group leader; thus 1182 * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually 1183 * exclusive. 1184 */ 1185 if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) 1186 return -EINVAL; 1187 1188 if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) 1189 return -EINVAL; 1190 1191 switch (event_f_flags & O_ACCMODE) { 1192 case O_RDONLY: 1193 case O_RDWR: 1194 case O_WRONLY: 1195 break; 1196 default: 1197 return -EINVAL; 1198 } 1199 1200 if (fid_mode && class != FAN_CLASS_NOTIF) 1201 return -EINVAL; 1202 1203 /* 1204 * Child name is reported with parent fid so requires dir fid. 1205 * We can report both child fid and dir fid with or without name. 1206 */ 1207 if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID)) 1208 return -EINVAL; 1209 1210 f_flags = O_RDWR | FMODE_NONOTIFY; 1211 if (flags & FAN_CLOEXEC) 1212 f_flags |= O_CLOEXEC; 1213 if (flags & FAN_NONBLOCK) 1214 f_flags |= O_NONBLOCK; 1215 1216 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ 1217 group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops); 1218 if (IS_ERR(group)) { 1219 return PTR_ERR(group); 1220 } 1221 1222 /* Enforce groups limits per user in all containing user ns */ 1223 group->fanotify_data.ucounts = inc_ucount(current_user_ns(), 1224 current_euid(), 1225 UCOUNT_FANOTIFY_GROUPS); 1226 if (!group->fanotify_data.ucounts) { 1227 fd = -EMFILE; 1228 goto out_destroy_group; 1229 } 1230 1231 group->fanotify_data.flags = flags | internal_flags; 1232 group->memcg = get_mem_cgroup_from_mm(current->mm); 1233 1234 group->fanotify_data.merge_hash = fanotify_alloc_merge_hash(); 1235 if (!group->fanotify_data.merge_hash) { 1236 fd = -ENOMEM; 1237 goto out_destroy_group; 1238 } 1239 1240 group->overflow_event = fanotify_alloc_overflow_event(); 1241 if (unlikely(!group->overflow_event)) { 1242 fd = -ENOMEM; 1243 goto out_destroy_group; 1244 } 1245 1246 if (force_o_largefile()) 1247 event_f_flags |= O_LARGEFILE; 1248 group->fanotify_data.f_flags = event_f_flags; 1249 init_waitqueue_head(&group->fanotify_data.access_waitq); 1250 INIT_LIST_HEAD(&group->fanotify_data.access_list); 1251 switch (class) { 1252 case FAN_CLASS_NOTIF: 1253 group->priority = FS_PRIO_0; 1254 break; 1255 case FAN_CLASS_CONTENT: 1256 group->priority = FS_PRIO_1; 1257 break; 1258 case FAN_CLASS_PRE_CONTENT: 1259 group->priority = FS_PRIO_2; 1260 break; 1261 default: 1262 fd = -EINVAL; 1263 goto out_destroy_group; 1264 } 1265 1266 if (flags & FAN_UNLIMITED_QUEUE) { 1267 fd = -EPERM; 1268 if (!capable(CAP_SYS_ADMIN)) 1269 goto out_destroy_group; 1270 group->max_events = UINT_MAX; 1271 } else { 1272 group->max_events = fanotify_max_queued_events; 1273 } 1274 1275 if (flags & FAN_UNLIMITED_MARKS) { 1276 fd = -EPERM; 1277 if (!capable(CAP_SYS_ADMIN)) 1278 goto out_destroy_group; 1279 } 1280 1281 if (flags & FAN_ENABLE_AUDIT) { 1282 fd = -EPERM; 1283 if (!capable(CAP_AUDIT_WRITE)) 1284 goto out_destroy_group; 1285 } 1286 1287 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); 1288 if (fd < 0) 1289 goto out_destroy_group; 1290 1291 return fd; 1292 1293 out_destroy_group: 1294 fsnotify_destroy_group(group); 1295 return fd; 1296 } 1297 1298 /* Check if filesystem can encode a unique fid */ 1299 static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) 1300 { 1301 __kernel_fsid_t root_fsid; 1302 int err; 1303 1304 /* 1305 * Make sure path is not in filesystem with zero fsid (e.g. tmpfs). 1306 */ 1307 err = vfs_get_fsid(path->dentry, fsid); 1308 if (err) 1309 return err; 1310 1311 if (!fsid->val[0] && !fsid->val[1]) 1312 return -ENODEV; 1313 1314 /* 1315 * Make sure path is not inside a filesystem subvolume (e.g. btrfs) 1316 * which uses a different fsid than sb root. 1317 */ 1318 err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid); 1319 if (err) 1320 return err; 1321 1322 if (root_fsid.val[0] != fsid->val[0] || 1323 root_fsid.val[1] != fsid->val[1]) 1324 return -EXDEV; 1325 1326 /* 1327 * We need to make sure that the file system supports at least 1328 * encoding a file handle so user can use name_to_handle_at() to 1329 * compare fid returned with event to the file handle of watched 1330 * objects. However, name_to_handle_at() requires that the 1331 * filesystem also supports decoding file handles. 1332 */ 1333 if (!path->dentry->d_sb->s_export_op || 1334 !path->dentry->d_sb->s_export_op->fh_to_dentry) 1335 return -EOPNOTSUPP; 1336 1337 return 0; 1338 } 1339 1340 static int fanotify_events_supported(struct path *path, __u64 mask) 1341 { 1342 /* 1343 * Some filesystems such as 'proc' acquire unusual locks when opening 1344 * files. For them fanotify permission events have high chances of 1345 * deadlocking the system - open done when reporting fanotify event 1346 * blocks on this "unusual" lock while another process holding the lock 1347 * waits for fanotify permission event to be answered. Just disallow 1348 * permission events for such filesystems. 1349 */ 1350 if (mask & FANOTIFY_PERM_EVENTS && 1351 path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM) 1352 return -EINVAL; 1353 return 0; 1354 } 1355 1356 static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, 1357 int dfd, const char __user *pathname) 1358 { 1359 struct inode *inode = NULL; 1360 struct vfsmount *mnt = NULL; 1361 struct fsnotify_group *group; 1362 struct fd f; 1363 struct path path; 1364 __kernel_fsid_t __fsid, *fsid = NULL; 1365 u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; 1366 unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; 1367 bool ignored = flags & FAN_MARK_IGNORED_MASK; 1368 unsigned int obj_type, fid_mode; 1369 u32 umask = 0; 1370 int ret; 1371 1372 pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", 1373 __func__, fanotify_fd, flags, dfd, pathname, mask); 1374 1375 /* we only use the lower 32 bits as of right now. */ 1376 if (upper_32_bits(mask)) 1377 return -EINVAL; 1378 1379 if (flags & ~FANOTIFY_MARK_FLAGS) 1380 return -EINVAL; 1381 1382 switch (mark_type) { 1383 case FAN_MARK_INODE: 1384 obj_type = FSNOTIFY_OBJ_TYPE_INODE; 1385 break; 1386 case FAN_MARK_MOUNT: 1387 obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT; 1388 break; 1389 case FAN_MARK_FILESYSTEM: 1390 obj_type = FSNOTIFY_OBJ_TYPE_SB; 1391 break; 1392 default: 1393 return -EINVAL; 1394 } 1395 1396 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { 1397 case FAN_MARK_ADD: 1398 case FAN_MARK_REMOVE: 1399 if (!mask) 1400 return -EINVAL; 1401 break; 1402 case FAN_MARK_FLUSH: 1403 if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH)) 1404 return -EINVAL; 1405 break; 1406 default: 1407 return -EINVAL; 1408 } 1409 1410 if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) 1411 valid_mask |= FANOTIFY_PERM_EVENTS; 1412 1413 if (mask & ~valid_mask) 1414 return -EINVAL; 1415 1416 /* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */ 1417 if (ignored) 1418 mask &= ~FANOTIFY_EVENT_FLAGS; 1419 1420 f = fdget(fanotify_fd); 1421 if (unlikely(!f.file)) 1422 return -EBADF; 1423 1424 /* verify that this is indeed an fanotify instance */ 1425 ret = -EINVAL; 1426 if (unlikely(f.file->f_op != &fanotify_fops)) 1427 goto fput_and_out; 1428 group = f.file->private_data; 1429 1430 /* 1431 * An unprivileged user is not allowed to setup mount nor filesystem 1432 * marks. This also includes setting up such marks by a group that 1433 * was initialized by an unprivileged user. 1434 */ 1435 ret = -EPERM; 1436 if ((!capable(CAP_SYS_ADMIN) || 1437 FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) && 1438 mark_type != FAN_MARK_INODE) 1439 goto fput_and_out; 1440 1441 /* 1442 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not 1443 * allowed to set permissions events. 1444 */ 1445 ret = -EINVAL; 1446 if (mask & FANOTIFY_PERM_EVENTS && 1447 group->priority == FS_PRIO_0) 1448 goto fput_and_out; 1449 1450 /* 1451 * Events with data type inode do not carry enough information to report 1452 * event->fd, so we do not allow setting a mask for inode events unless 1453 * group supports reporting fid. 1454 * inode events are not supported on a mount mark, because they do not 1455 * carry enough information (i.e. path) to be filtered by mount point. 1456 */ 1457 fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); 1458 if (mask & FANOTIFY_INODE_EVENTS && 1459 (!fid_mode || mark_type == FAN_MARK_MOUNT)) 1460 goto fput_and_out; 1461 1462 if (flags & FAN_MARK_FLUSH) { 1463 ret = 0; 1464 if (mark_type == FAN_MARK_MOUNT) 1465 fsnotify_clear_vfsmount_marks_by_group(group); 1466 else if (mark_type == FAN_MARK_FILESYSTEM) 1467 fsnotify_clear_sb_marks_by_group(group); 1468 else 1469 fsnotify_clear_inode_marks_by_group(group); 1470 goto fput_and_out; 1471 } 1472 1473 ret = fanotify_find_path(dfd, pathname, &path, flags, 1474 (mask & ALL_FSNOTIFY_EVENTS), obj_type); 1475 if (ret) 1476 goto fput_and_out; 1477 1478 if (flags & FAN_MARK_ADD) { 1479 ret = fanotify_events_supported(&path, mask); 1480 if (ret) 1481 goto path_put_and_out; 1482 } 1483 1484 if (fid_mode) { 1485 ret = fanotify_test_fid(&path, &__fsid); 1486 if (ret) 1487 goto path_put_and_out; 1488 1489 fsid = &__fsid; 1490 } 1491 1492 /* inode held in place by reference to path; group by fget on fd */ 1493 if (mark_type == FAN_MARK_INODE) 1494 inode = path.dentry->d_inode; 1495 else 1496 mnt = path.mnt; 1497 1498 /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ 1499 if (mnt || !S_ISDIR(inode->i_mode)) { 1500 mask &= ~FAN_EVENT_ON_CHILD; 1501 umask = FAN_EVENT_ON_CHILD; 1502 /* 1503 * If group needs to report parent fid, register for getting 1504 * events with parent/name info for non-directory. 1505 */ 1506 if ((fid_mode & FAN_REPORT_DIR_FID) && 1507 (flags & FAN_MARK_ADD) && !ignored) 1508 mask |= FAN_EVENT_ON_CHILD; 1509 } 1510 1511 /* create/update an inode mark */ 1512 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { 1513 case FAN_MARK_ADD: 1514 if (mark_type == FAN_MARK_MOUNT) 1515 ret = fanotify_add_vfsmount_mark(group, mnt, mask, 1516 flags, fsid); 1517 else if (mark_type == FAN_MARK_FILESYSTEM) 1518 ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, 1519 flags, fsid); 1520 else 1521 ret = fanotify_add_inode_mark(group, inode, mask, 1522 flags, fsid); 1523 break; 1524 case FAN_MARK_REMOVE: 1525 if (mark_type == FAN_MARK_MOUNT) 1526 ret = fanotify_remove_vfsmount_mark(group, mnt, mask, 1527 flags, umask); 1528 else if (mark_type == FAN_MARK_FILESYSTEM) 1529 ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, 1530 flags, umask); 1531 else 1532 ret = fanotify_remove_inode_mark(group, inode, mask, 1533 flags, umask); 1534 break; 1535 default: 1536 ret = -EINVAL; 1537 } 1538 1539 path_put_and_out: 1540 path_put(&path); 1541 fput_and_out: 1542 fdput(f); 1543 return ret; 1544 } 1545 1546 #ifndef CONFIG_ARCH_SPLIT_ARG64 1547 SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, 1548 __u64, mask, int, dfd, 1549 const char __user *, pathname) 1550 { 1551 return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname); 1552 } 1553 #endif 1554 1555 #if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT) 1556 SYSCALL32_DEFINE6(fanotify_mark, 1557 int, fanotify_fd, unsigned int, flags, 1558 SC_ARG64(mask), int, dfd, 1559 const char __user *, pathname) 1560 { 1561 return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask), 1562 dfd, pathname); 1563 } 1564 #endif 1565 1566 /* 1567 * fanotify_user_setup - Our initialization function. Note that we cannot return 1568 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 1569 * must result in panic(). 1570 */ 1571 static int __init fanotify_user_setup(void) 1572 { 1573 struct sysinfo si; 1574 int max_marks; 1575 1576 si_meminfo(&si); 1577 /* 1578 * Allow up to 1% of addressable memory to be accounted for per user 1579 * marks limited to the range [8192, 1048576]. mount and sb marks are 1580 * a lot cheaper than inode marks, but there is no reason for a user 1581 * to have many of those, so calculate by the cost of inode marks. 1582 */ 1583 max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) / 1584 INODE_MARK_COST; 1585 max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS, 1586 FANOTIFY_DEFAULT_MAX_USER_MARKS); 1587 1588 BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); 1589 BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11); 1590 BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); 1591 1592 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, 1593 SLAB_PANIC|SLAB_ACCOUNT); 1594 fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event, 1595 SLAB_PANIC); 1596 fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event, 1597 SLAB_PANIC); 1598 if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { 1599 fanotify_perm_event_cachep = 1600 KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); 1601 } 1602 1603 fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS; 1604 init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = 1605 FANOTIFY_DEFAULT_MAX_GROUPS; 1606 init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks; 1607 1608 return 0; 1609 } 1610 device_initcall(fanotify_user_setup); 1611