// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
#include <linux/fcntl.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/fsnotify_backend.h>
#include <linux/init.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/memcontrol.h>
#include <linux/statfs.h>
#include <linux/exportfs.h>

#include <asm/ioctls.h>

#include "../../mount.h"
#include "../fdinfo.h"
#include "fanotify.h"

#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
#define FANOTIFY_DEFAULT_MAX_GROUPS	128
#define FANOTIFY_DEFAULT_FEE_POOL_SIZE	32

/*
 * The legacy fanotify marks limit (8192) is per group and we introduced a
 * tunable limit of marks per user, similar to inotify.  Effectively, the
 * legacy limit of fanotify marks per user is <max marks per group> *
 * <max groups per user>.  This default limit (1M) also happens to match the
 * increased limit of inotify max_user_watches since v5.10.
 */
#define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)

/*
 * Most of the memory cost of adding an inode mark is pinning the marked inode.
 * The size of the filesystem inode struct is not uniform across filesystems,
 * so double the size of a VFS inode is used as a conservative approximation.
 */
#define INODE_MARK_COST	(2 * sizeof(struct inode))

/* configurable via /proc/sys/fs/fanotify/ */
static int fanotify_max_queued_events __read_mostly;

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long ft_zero = 0;
static long ft_int_max = INT_MAX;

static struct ctl_table fanotify_table[] = {
	{
		.procname	= "max_user_groups",
		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		.procname	= "max_user_marks",
		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		.procname	= "max_queued_events",
		.data		= &fanotify_max_queued_events,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO
	},
	{ }
};

static void __init fanotify_sysctls_init(void)
{
	register_sysctl("fs/fanotify", fanotify_table);
}
#else
#define fanotify_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
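
/*
 * With CONFIG_SYSCTL, the knobs above are exposed under
 * /proc/sys/fs/fanotify/ as max_user_groups, max_user_marks and
 * max_queued_events, so the defaults set up in fanotify_user_setup()
 * below can be tuned at runtime by a privileged user.
 */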

/*
 * All flags that may be specified in parameter event_f_flags of fanotify_init.
 *
 * Internal and external open flags are stored together in field f_flags of
 * struct file. Only external open flags shall be allowed in event_f_flags.
 * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
 * excluded.
 */
#define	FANOTIFY_INIT_ALL_EVENT_F_BITS				( \
		O_ACCMODE	| O_APPEND	| O_NONBLOCK	| \
		__O_SYNC	| O_DSYNC	| O_CLOEXEC     | \
		O_LARGEFILE	| O_NOATIME	)

extern const struct fsnotify_ops fanotify_fsnotify_ops;

struct kmem_cache *fanotify_mark_cache __read_mostly;
struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
struct kmem_cache *fanotify_path_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;

#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
#define FANOTIFY_PIDFD_INFO_HDR_LEN \
	sizeof(struct fanotify_event_info_pidfd)
#define FANOTIFY_ERROR_INFO_LEN \
	(sizeof(struct fanotify_event_info_error))

static int fanotify_fid_info_len(int fh_len, int name_len)
{
	int info_len = fh_len;

	if (name_len)
		info_len += name_len + 1;

	return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
		       FANOTIFY_EVENT_ALIGN);
}

/* FAN_RENAME may have one or two dir+name info records */
static int fanotify_dir_name_info_len(struct fanotify_event *event)
{
	struct fanotify_info *info = fanotify_event_info(event);
	int dir_fh_len = fanotify_event_dir_fh_len(event);
	int dir2_fh_len = fanotify_event_dir2_fh_len(event);
	int info_len = 0;

	if (dir_fh_len)
		info_len += fanotify_fid_info_len(dir_fh_len,
						  info->name_len);
	if (dir2_fh_len)
		info_len += fanotify_fid_info_len(dir2_fh_len,
						  info->name2_len);

	return info_len;
}

static size_t fanotify_event_len(unsigned int info_mode,
				 struct fanotify_event *event)
{
	size_t event_len = FAN_EVENT_METADATA_LEN;
	struct fanotify_info *info;
	int fh_len;
	int dot_len = 0;

	if (!info_mode)
		return event_len;

	if (fanotify_is_error_event(event->mask))
		event_len += FANOTIFY_ERROR_INFO_LEN;

	info = fanotify_event_info(event);

	if (fanotify_event_has_any_dir_fh(event)) {
		event_len += fanotify_dir_name_info_len(event);
	} else if ((info_mode & FAN_REPORT_NAME) &&
		   (event->mask & FAN_ONDIR)) {
		/*
		 * With group flag FAN_REPORT_NAME, if a name was not recorded
		 * in an event on a directory, we will report the name ".".
		 */
		dot_len = 1;
	}

	if (info_mode & FAN_REPORT_PIDFD)
		event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;

	if (fanotify_event_has_object_fh(event)) {
		fh_len = fanotify_event_object_fh_len(event);
		event_len += fanotify_fid_info_len(fh_len, dot_len);
	}

	return event_len;
}
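
/*
 * Worked example (illustrative; struct sizes taken from the uapi headers
 * on a typical 64-bit build): a FAN_REPORT_FID event carrying a single
 * 8-byte object file handle and no name is
 *
 *	FAN_EVENT_METADATA_LEN (24) +
 *	roundup(FANOTIFY_FID_INFO_HDR_LEN (20) + 8, FANOTIFY_EVENT_ALIGN)
 *
 * = 24 + 28 = 52 bytes in the user buffer.
 */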

/*
 * Remove a hashed event from the merge hash table.
 */
static void fanotify_unhash_event(struct fsnotify_group *group,
				  struct fanotify_event *event)
{
	assert_spin_locked(&group->notification_lock);

	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
		 group, event, fanotify_event_hash_bucket(group, event));

	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
		return;

	hlist_del_init(&event->merge_list);
}

/*
 * Get a fanotify notification event if one exists and is small
 * enough to fit in "count". Return an error pointer if the count
 * is not large enough. When a permission event is dequeued, its state is
 * updated accordingly.
 */
static struct fanotify_event *get_one_event(struct fsnotify_group *group,
					    size_t count)
{
	size_t event_size;
	struct fanotify_event *event = NULL;
	struct fsnotify_event *fsn_event;
	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);

	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);

	spin_lock(&group->notification_lock);
	fsn_event = fsnotify_peek_first_event(group);
	if (!fsn_event)
		goto out;

	event = FANOTIFY_E(fsn_event);
	event_size = fanotify_event_len(info_mode, event);

	if (event_size > count) {
		event = ERR_PTR(-EINVAL);
		goto out;
	}

	/*
	 * We held the notification_lock the whole time, so this is the
	 * same event we peeked above.
	 */
	fsnotify_remove_first_event(group);
	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
	if (fanotify_is_hashed_event(event->mask))
		fanotify_unhash_event(group, event);
out:
	spin_unlock(&group->notification_lock);
	return event;
}

static int create_fd(struct fsnotify_group *group, struct path *path,
		     struct file **file)
{
	int client_fd;
	struct file *new_file;

	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
	if (client_fd < 0)
		return client_fd;

	/*
	 * We need a new file handle for the userspace program so it can read
	 * even if the file was originally opened O_WRONLY.
	 */
	new_file = dentry_open(path,
			       group->fanotify_data.f_flags | FMODE_NONOTIFY,
			       current_cred());
	if (IS_ERR(new_file)) {
		/*
		 * We still send an event even if we can't open the file.
		 * This can happen when, say, tasks are gone and we try to
		 * open their /proc files, or when we try to open a WRONLY
		 * file like in sysfs.  We just send the errno to userspace
		 * since there isn't much else we can do.
		 */
		put_unused_fd(client_fd);
		client_fd = PTR_ERR(new_file);
	} else {
		*file = new_file;
	}

	return client_fd;
}

/*
 * Finish processing of a permission event by setting it to the ANSWERED
 * state and dropping group->notification_lock.
 */
static void finish_permission_event(struct fsnotify_group *group,
				    struct fanotify_perm_event *event,
				    unsigned int response)
				    __releases(&group->notification_lock)
{
	bool destroy = false;

	assert_spin_locked(&group->notification_lock);
	event->response = response;
	if (event->state == FAN_EVENT_CANCELED)
		destroy = true;
	else
		event->state = FAN_EVENT_ANSWERED;
	spin_unlock(&group->notification_lock);
	if (destroy)
		fsnotify_destroy_event(group, &event->fae.fse);
}

static int process_access_response(struct fsnotify_group *group,
				   struct fanotify_response *response_struct)
{
	struct fanotify_perm_event *event;
	int fd = response_struct->fd;
	int response = response_struct->response;

	pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
		 fd, response);
	/*
	 * Make sure the response is valid.  If it is invalid, we do nothing;
	 * either userspace can send a valid response or we will clean it up
	 * after the timeout.
	 */
	switch (response & ~FAN_AUDIT) {
	case FAN_ALLOW:
	case FAN_DENY:
		break;
	default:
		return -EINVAL;
	}

	if (fd < 0)
		return -EINVAL;

	if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
		return -EINVAL;

	spin_lock(&group->notification_lock);
	list_for_each_entry(event, &group->fanotify_data.access_list,
			    fae.fse.list) {
		if (event->fd != fd)
			continue;

		list_del_init(&event->fae.fse.list);
		finish_permission_event(group, event, response);
		wake_up(&group->fanotify_data.access_waitq);
		return 0;
	}
	spin_unlock(&group->notification_lock);

	return -ENOENT;
}
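
/*
 * A minimal sketch of the userspace side of the protocol handled above
 * (illustrative only; assumes <sys/fanotify.h> and a permission event
 * already read into "metadata"; handle_error() is a hypothetical helper):
 *
 *	struct fanotify_response resp = {
 *		.fd = metadata->fd,
 *		.response = FAN_ALLOW,	// or FAN_DENY, optionally | FAN_AUDIT
 *	};
 *
 *	if (write(fanotify_fd, &resp, sizeof(resp)) != sizeof(resp))
 *		handle_error();		// e.g. ENOENT: no such pending event
 */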

static size_t copy_error_info_to_user(struct fanotify_event *event,
				      char __user *buf, int count)
{
	struct fanotify_event_info_error info = { };
	struct fanotify_error_event *fee = FANOTIFY_EE(event);

	info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
	info.hdr.len = FANOTIFY_ERROR_INFO_LEN;

	if (WARN_ON(count < info.hdr.len))
		return -EFAULT;

	info.error = fee->error;
	info.error_count = fee->err_count;

	if (copy_to_user(buf, &info, sizeof(info)))
		return -EFAULT;

	return info.hdr.len;
}

static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
				 int info_type, const char *name,
				 size_t name_len,
				 char __user *buf, size_t count)
{
	struct fanotify_event_info_fid info = { };
	struct file_handle handle = { };
	unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
	size_t fh_len = fh ? fh->len : 0;
	size_t info_len = fanotify_fid_info_len(fh_len, name_len);
	size_t len = info_len;

	pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
		 __func__, fh_len, name_len, info_len, count);

	if (WARN_ON_ONCE(len < sizeof(info) || len > count))
		return -EFAULT;

	/*
	 * Copy event info fid header followed by variable sized file handle
	 * and optionally followed by variable sized filename.
	 */
	switch (info_type) {
	case FAN_EVENT_INFO_TYPE_FID:
	case FAN_EVENT_INFO_TYPE_DFID:
		if (WARN_ON_ONCE(name_len))
			return -EFAULT;
		break;
	case FAN_EVENT_INFO_TYPE_DFID_NAME:
	case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME:
	case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME:
		if (WARN_ON_ONCE(!name || !name_len))
			return -EFAULT;
		break;
	default:
		return -EFAULT;
	}

	info.hdr.info_type = info_type;
	info.hdr.len = len;
	info.fsid = *fsid;
	if (copy_to_user(buf, &info, sizeof(info)))
		return -EFAULT;

	buf += sizeof(info);
	len -= sizeof(info);
	if (WARN_ON_ONCE(len < sizeof(handle)))
		return -EFAULT;

	handle.handle_type = fh->type;
	handle.handle_bytes = fh_len;

	/* Mangle handle_type for bad file_handle */
	if (!fh_len)
		handle.handle_type = FILEID_INVALID;

	if (copy_to_user(buf, &handle, sizeof(handle)))
		return -EFAULT;

	buf += sizeof(handle);
	len -= sizeof(handle);
	if (WARN_ON_ONCE(len < fh_len))
		return -EFAULT;

	/*
	 * For an inline fh and inline file name, copy through stack to exclude
	 * the copy from usercopy hardening protections.
	 */
	fh_buf = fanotify_fh_buf(fh);
	if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
		memcpy(bounce, fh_buf, fh_len);
		fh_buf = bounce;
	}
	if (copy_to_user(buf, fh_buf, fh_len))
		return -EFAULT;

	buf += fh_len;
	len -= fh_len;

	if (name_len) {
		/* Copy the filename with terminating null */
		name_len++;
		if (WARN_ON_ONCE(len < name_len))
			return -EFAULT;

		if (copy_to_user(buf, name, name_len))
			return -EFAULT;

		buf += name_len;
		len -= name_len;
	}

	/* Pad with 0's */
	WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
	if (len > 0 && clear_user(buf, len))
		return -EFAULT;

	return info_len;
}
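
/*
 * The info record produced above reaches userspace laid out as follows
 * (illustrative):
 *
 *	struct fanotify_event_info_fid	info;	// hdr.len covers the record
 *	struct file_handle		handle;	// handle_bytes, handle_type
 *	unsigned char			f_handle[handle.handle_bytes];
 *	char				name[];	// optional, NUL-terminated
 *	// zero padding up to info.hdr.len (FANOTIFY_EVENT_ALIGN aligned)
 */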

static int copy_pidfd_info_to_user(int pidfd,
				   char __user *buf,
				   size_t count)
{
	struct fanotify_event_info_pidfd info = { };
	size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;

	if (WARN_ON_ONCE(info_len > count))
		return -EFAULT;

	info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
	info.hdr.len = info_len;
	info.pidfd = pidfd;

	if (copy_to_user(buf, &info, info_len))
		return -EFAULT;

	return info_len;
}

static int copy_info_records_to_user(struct fanotify_event *event,
				     struct fanotify_info *info,
				     unsigned int info_mode, int pidfd,
				     char __user *buf, size_t count)
{
	int ret, total_bytes = 0, info_type = 0;
	unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;

	/*
	 * Event info records order is as follows:
	 * 1. dir fid + name
	 * 2. (optional) new dir fid + new name
	 * 3. (optional) child fid
	 */
	if (fanotify_event_has_dir_fh(event)) {
		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
					     FAN_EVENT_INFO_TYPE_DFID;

		/* FAN_RENAME uses special info types */
		if (event->mask & FAN_RENAME)
			info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME;

		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_info_dir_fh(info),
					    info_type,
					    fanotify_info_name(info),
					    info->name_len, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	/* New dir fid+name may be reported in addition to old dir fid+name */
	if (fanotify_event_has_dir2_fh(event)) {
		info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME;
		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_info_dir2_fh(info),
					    info_type,
					    fanotify_info_name2(info),
					    info->name2_len, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_event_has_object_fh(event)) {
		const char *dot = NULL;
		int dot_len = 0;

		if (fid_mode == FAN_REPORT_FID || info_type) {
			/*
			 * With only group flag FAN_REPORT_FID only type FID is
			 * reported. Second info record type is always FID.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		} else if ((fid_mode & FAN_REPORT_NAME) &&
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_NAME, if name was not
			 * recorded in an event on a directory, report the name
			 * "." with info type DFID_NAME.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
			dot = ".";
			dot_len = 1;
		} else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_DIR_FID, a single info
			 * record has type DFID for directory entry modification
			 * event and for event on a directory.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID;
		} else {
			/*
			 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
			 * a single info record has type FID for event on a
			 * non-directory, when there is no directory to report.
			 * For example, on FAN_DELETE_SELF event.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		}

		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_event_object_fh(event),
					    info_type, dot, dot_len,
					    buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (pidfd_mode) {
		ret = copy_pidfd_info_to_user(pidfd, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_is_error_event(event->mask)) {
		ret = copy_error_info_to_user(event, buf, count);
		if (ret < 0)
			return ret;
		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	return total_bytes;
}

static ssize_t copy_event_to_user(struct fsnotify_group *group,
				  struct fanotify_event *event,
				  char __user *buf, size_t count)
{
	struct fanotify_event_metadata metadata;
	struct path *path = fanotify_event_path(event);
	struct fanotify_info *info = fanotify_event_info(event);
	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
	struct file *f = NULL;
	int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD;

	pr_debug("%s: group=%p event=%p\n", __func__, group, event);

	metadata.event_len = fanotify_event_len(info_mode, event);
	metadata.metadata_len = FAN_EVENT_METADATA_LEN;
	metadata.vers = FANOTIFY_METADATA_VERSION;
	metadata.reserved = 0;
	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
	metadata.pid = pid_vnr(event->pid);
	/*
	 * For an unprivileged listener, event->pid can be used to identify the
	 * events generated by the listener process itself, without disclosing
	 * the pids of other processes.
	 */
	if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
	    task_tgid(current) != event->pid)
		metadata.pid = 0;

	/*
	 * For now, fid mode is required for an unprivileged listener and
	 * fid mode does not report fd in events.  Keep this check anyway
	 * for safety in case fid mode requirement is relaxed in the future
	 * to allow unprivileged listener to get events with no fd and no fid.
	 */
	if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
	    path && path->mnt && path->dentry) {
		fd = create_fd(group, path, &f);
		if (fd < 0)
			return fd;
	}
	metadata.fd = fd;

	if (pidfd_mode) {
		/*
		 * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
		 * exclusion is ever lifted. At the time of incorporating pidfd
		 * support within fanotify, the pidfd API only supported the
		 * creation of pidfds for thread-group leaders.
		 */
		WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));

		/*
		 * The PIDTYPE_TGID check for an event->pid is performed
		 * preemptively in an attempt to catch out cases where the event
		 * listener reads events after the event generating process has
		 * already terminated. Report FAN_NOPIDFD to the event listener
		 * in those cases, with all other pidfd creation errors being
		 * reported as FAN_EPIDFD.
		 */
		if (metadata.pid == 0 ||
		    !pid_has_task(event->pid, PIDTYPE_TGID)) {
			pidfd = FAN_NOPIDFD;
		} else {
			pidfd = pidfd_create(event->pid, 0);
			if (pidfd < 0)
				pidfd = FAN_EPIDFD;
		}
	}

	ret = -EFAULT;
	/*
	 * Sanity check copy size in case get_one_event() and
	 * event_len sizes ever get out of sync.
	 */
	if (WARN_ON_ONCE(metadata.event_len > count))
		goto out_close_fd;

	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
		goto out_close_fd;

	buf += FAN_EVENT_METADATA_LEN;
	count -= FAN_EVENT_METADATA_LEN;

	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PERM(event)->fd = fd;

	if (f)
		fd_install(fd, f);

	if (info_mode) {
		ret = copy_info_records_to_user(event, info, info_mode, pidfd,
						buf, count);
		if (ret < 0)
			goto out_close_fd;
	}

	return metadata.event_len;

out_close_fd:
	if (fd != FAN_NOFD) {
		put_unused_fd(fd);
		fput(f);
	}

	if (pidfd >= 0)
		close_fd(pidfd);

	return ret;
}

/* fanotify userspace file descriptor functions */
static __poll_t fanotify_poll(struct file *file, poll_table *wait)
{
	struct fsnotify_group *group = file->private_data;
	__poll_t ret = 0;

	poll_wait(file, &group->notification_waitq, wait);
	spin_lock(&group->notification_lock);
	if (!fsnotify_notify_queue_is_empty(group))
		ret = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&group->notification_lock);

	return ret;
}
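
/*
 * Userspace sketch (illustrative): a fanotify fd can be multiplexed with
 * poll/epoll; POLLIN is signalled while the notification queue is
 * non-empty:
 *
 *	struct pollfd pfd = { .fd = fanotify_fd, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *		read_events(fanotify_fd);	// hypothetical helper
 */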

static ssize_t fanotify_read(struct file *file, char __user *buf,
			     size_t count, loff_t *pos)
{
	struct fsnotify_group *group;
	struct fanotify_event *event;
	char __user *start;
	int ret;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	start = buf;
	group = file->private_data;

	pr_debug("%s: group=%p\n", __func__, group);

	add_wait_queue(&group->notification_waitq, &wait);
	while (1) {
		/*
		 * User can supply arbitrarily large buffer. Avoid softlockups
		 * in case there are lots of available events.
		 */
		cond_resched();
		event = get_one_event(group, count);
		if (IS_ERR(event)) {
			ret = PTR_ERR(event);
			break;
		}

		if (!event) {
			ret = -EAGAIN;
			if (file->f_flags & O_NONBLOCK)
				break;

			ret = -ERESTARTSYS;
			if (signal_pending(current))
				break;

			if (start != buf)
				break;

			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
			continue;
		}

		ret = copy_event_to_user(group, event, buf, count);
		if (unlikely(ret == -EOPENSTALE)) {
			/*
			 * We cannot report events with stale fd so drop it.
			 * Setting ret to 0 will continue the event loop and
			 * do the right thing if there are no more events to
			 * read (i.e. return bytes read, -EAGAIN or wait).
			 */
			ret = 0;
		}

		/*
		 * Permission events get queued to wait for response.  Other
		 * events can be destroyed now.
		 */
		if (!fanotify_is_perm_event(event->mask)) {
			fsnotify_destroy_event(group, &event->fse);
		} else {
			if (ret <= 0) {
				spin_lock(&group->notification_lock);
				finish_permission_event(group,
					FANOTIFY_PERM(event), FAN_DENY);
				wake_up(&group->fanotify_data.access_waitq);
			} else {
				spin_lock(&group->notification_lock);
				list_add_tail(&event->fse.list,
					&group->fanotify_data.access_list);
				spin_unlock(&group->notification_lock);
			}
		}
		if (ret < 0)
			break;
		buf += ret;
		count -= ret;
	}
	remove_wait_queue(&group->notification_waitq, &wait);

	if (start != buf && ret != -EFAULT)
		ret = buf - start;
	return ret;
}
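
/*
 * A minimal userspace read loop matching the semantics above (illustrative
 * only; <sys/fanotify.h> provides the FAN_EVENT_OK/FAN_EVENT_NEXT iteration
 * macros, consume_event() is a hypothetical helper):
 *
 *	char buf[4096];
 *	ssize_t len = read(fanotify_fd, buf, sizeof(buf));
 *	struct fanotify_event_metadata *md;
 *
 *	for (md = (void *)buf; FAN_EVENT_OK(md, len);
 *	     md = FAN_EVENT_NEXT(md, len)) {
 *		if (md->vers != FANOTIFY_METADATA_VERSION)
 *			abort();	// incompatible kernel/headers
 *		consume_event(md);
 *		if (md->fd >= 0)
 *			close(md->fd);
 *	}
 */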

static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	struct fanotify_response response = { .fd = -1, .response = -1 };
	struct fsnotify_group *group;
	int ret;

	if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
		return -EINVAL;

	group = file->private_data;

	if (count < sizeof(response))
		return -EINVAL;

	count = sizeof(response);

	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);

	if (copy_from_user(&response, buf, count))
		return -EFAULT;

	ret = process_access_response(group, &response);
	if (ret < 0)
		count = ret;

	return count;
}

static int fanotify_release(struct inode *ignored, struct file *file)
{
	struct fsnotify_group *group = file->private_data;
	struct fsnotify_event *fsn_event;

	/*
	 * Stop new events from arriving in the notification queue.  Since
	 * userspace cannot use the fanotify fd anymore, no event can enter or
	 * leave access_list by now either.
	 */
	fsnotify_group_stop_queueing(group);

	/*
	 * Process all permission events on access_list and notification queue
	 * and simulate reply from userspace.
	 */
	spin_lock(&group->notification_lock);
	while (!list_empty(&group->fanotify_data.access_list)) {
		struct fanotify_perm_event *event;

		event = list_first_entry(&group->fanotify_data.access_list,
				struct fanotify_perm_event, fae.fse.list);
		list_del_init(&event->fae.fse.list);
		finish_permission_event(group, event, FAN_ALLOW);
		spin_lock(&group->notification_lock);
	}

	/*
	 * Destroy all non-permission events. For permission events just
	 * dequeue them and set the response. They will be freed once the
	 * response is consumed and fanotify_get_response() returns.
	 */
	while ((fsn_event = fsnotify_remove_first_event(group))) {
		struct fanotify_event *event = FANOTIFY_E(fsn_event);

		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
			spin_unlock(&group->notification_lock);
			fsnotify_destroy_event(group, fsn_event);
		} else {
			finish_permission_event(group, FANOTIFY_PERM(event),
						FAN_ALLOW);
		}
		spin_lock(&group->notification_lock);
	}
	spin_unlock(&group->notification_lock);

	/* The response for all permission events is set, wake up waiters */
	wake_up(&group->fanotify_data.access_waitq);

	/* matches the fanotify_init->fsnotify_alloc_user_group */
	fsnotify_destroy_group(group);

	return 0;
}

static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct fsnotify_group *group;
	struct fsnotify_event *fsn_event;
	void __user *p;
	int ret = -ENOTTY;
	size_t send_len = 0;

	group = file->private_data;

	p = (void __user *) arg;

	switch (cmd) {
	case FIONREAD:
		spin_lock(&group->notification_lock);
		list_for_each_entry(fsn_event, &group->notification_list, list)
			send_len += FAN_EVENT_METADATA_LEN;
		spin_unlock(&group->notification_lock);
		ret = put_user(send_len, (int __user *) p);
		break;
	}

	return ret;
}

static const struct file_operations fanotify_fops = {
	.show_fdinfo	= fanotify_show_fdinfo,
	.poll		= fanotify_poll,
	.read		= fanotify_read,
	.write		= fanotify_write,
	.fasync		= NULL,
	.release	= fanotify_release,
	.unlocked_ioctl	= fanotify_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};

static int fanotify_find_path(int dfd, const char __user *filename,
			      struct path *path, unsigned int flags, __u64 mask,
			      unsigned int obj_type)
{
	int ret;

	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
		 dfd, filename, flags);

	if (filename == NULL) {
		struct fd f = fdget(dfd);

		ret = -EBADF;
		if (!f.file)
			goto out;

		ret = -ENOTDIR;
		if ((flags & FAN_MARK_ONLYDIR) &&
		    !(S_ISDIR(file_inode(f.file)->i_mode))) {
			fdput(f);
			goto out;
		}

		*path = f.file->f_path;
		path_get(path);
		fdput(f);
	} else {
		unsigned int lookup_flags = 0;

		if (!(flags & FAN_MARK_DONT_FOLLOW))
			lookup_flags |= LOOKUP_FOLLOW;
		if (flags & FAN_MARK_ONLYDIR)
			lookup_flags |= LOOKUP_DIRECTORY;

		ret = user_path_at(dfd, filename, lookup_flags, path);
		if (ret)
			goto out;
	}

	/* you can only watch an inode if you have read permissions on it */
	ret = path_permission(path, MAY_READ);
	if (ret) {
		path_put(path);
		goto out;
	}

	ret = security_path_notify(path, mask, obj_type);
	if (ret)
		path_put(path);

out:
	return ret;
}

static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
					    __u32 mask, unsigned int flags,
					    __u32 umask, int *destroy)
{
	__u32 oldmask = 0;

	/* umask bits cannot be removed by user */
	mask &= ~umask;
	spin_lock(&fsn_mark->lock);
	if (!(flags & FAN_MARK_IGNORED_MASK)) {
		oldmask = fsn_mark->mask;
		fsn_mark->mask &= ~mask;
	} else {
		fsn_mark->ignored_mask &= ~mask;
	}
	/*
	 * We need to keep the mark around even if the remaining mask cannot
	 * result in any events (e.g. mask == FAN_ONDIR) to support incremental
	 * changes to the mask.
	 * Destroy the mark when only umask bits remain.
	 */
	*destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
	spin_unlock(&fsn_mark->lock);

	return mask & oldmask;
}

static int fanotify_remove_mark(struct fsnotify_group *group,
				fsnotify_connp_t *connp, __u32 mask,
				unsigned int flags, __u32 umask)
{
	struct fsnotify_mark *fsn_mark = NULL;
	__u32 removed;
	int destroy_mark;

	mutex_lock(&group->mark_mutex);
	fsn_mark = fsnotify_find_mark(connp, group);
	if (!fsn_mark) {
		mutex_unlock(&group->mark_mutex);
		return -ENOENT;
	}

	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
						 umask, &destroy_mark);
	if (removed & fsnotify_conn_mask(fsn_mark->connector))
		fsnotify_recalc_mask(fsn_mark->connector);
	if (destroy_mark)
		fsnotify_detach_mark(fsn_mark);
	mutex_unlock(&group->mark_mutex);
	if (destroy_mark)
		fsnotify_free_mark(fsn_mark);

	/* matches the fsnotify_find_mark() */
	fsnotify_put_mark(fsn_mark);
	return 0;
}

static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
					 struct vfsmount *mnt, __u32 mask,
					 unsigned int flags, __u32 umask)
{
	return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
				    mask, flags, umask);
}

static int fanotify_remove_sb_mark(struct fsnotify_group *group,
				   struct super_block *sb, __u32 mask,
				   unsigned int flags, __u32 umask)
{
	return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask,
				    flags, umask);
}

static int fanotify_remove_inode_mark(struct fsnotify_group *group,
				      struct inode *inode, __u32 mask,
				      unsigned int flags, __u32 umask)
{
	return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask,
				    flags, umask);
}

static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
				       __u32 mask,
				       unsigned int flags)
{
	__u32 oldmask = -1;

	spin_lock(&fsn_mark->lock);
	if (!(flags & FAN_MARK_IGNORED_MASK)) {
		oldmask = fsn_mark->mask;
		fsn_mark->mask |= mask;
	} else {
		fsn_mark->ignored_mask |= mask;
		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
	}
	spin_unlock(&fsn_mark->lock);

	return mask & ~oldmask;
}

static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
						   fsnotify_connp_t *connp,
						   unsigned int obj_type,
						   __kernel_fsid_t *fsid)
{
	struct ucounts *ucounts = group->fanotify_data.ucounts;
	struct fsnotify_mark *mark;
	int ret;

	/*
	 * Enforce the per-user marks limit in all containing user namespaces.
	 * A group with FAN_UNLIMITED_MARKS does not contribute to the mark
	 * count in the limited groups account.
	 */
	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
		return ERR_PTR(-ENOSPC);

	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
	if (!mark) {
		ret = -ENOMEM;
		goto out_dec_ucounts;
	}

	fsnotify_init_mark(mark, group);
	ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid);
	if (ret) {
		fsnotify_put_mark(mark);
		goto out_dec_ucounts;
	}

	return mark;

out_dec_ucounts:
	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
		dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
	return ERR_PTR(ret);
}

static int fanotify_group_init_error_pool(struct fsnotify_group *group)
{
	if (mempool_initialized(&group->fanotify_data.error_events_pool))
		return 0;

	return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
					 FANOTIFY_DEFAULT_FEE_POOL_SIZE,
					 sizeof(struct fanotify_error_event));
}

static int fanotify_add_mark(struct fsnotify_group *group,
			     fsnotify_connp_t *connp, unsigned int obj_type,
			     __u32 mask, unsigned int flags,
			     __kernel_fsid_t *fsid)
{
	struct fsnotify_mark *fsn_mark;
	__u32 added;
	int ret = 0;

	mutex_lock(&group->mark_mutex);
	fsn_mark = fsnotify_find_mark(connp, group);
	if (!fsn_mark) {
		fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fsid);
		if (IS_ERR(fsn_mark)) {
			mutex_unlock(&group->mark_mutex);
			return PTR_ERR(fsn_mark);
		}
	}

	/*
	 * Error events are pre-allocated per group, only if strictly
	 * needed (i.e. FAN_FS_ERROR was requested).
	 */
	if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
		ret = fanotify_group_init_error_pool(group);
		if (ret)
			goto out;
	}

	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
	if (added & ~fsnotify_conn_mask(fsn_mark->connector))
		fsnotify_recalc_mask(fsn_mark->connector);

out:
	mutex_unlock(&group->mark_mutex);

	fsnotify_put_mark(fsn_mark);
	return ret;
}

static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
				      struct vfsmount *mnt, __u32 mask,
				      unsigned int flags, __kernel_fsid_t *fsid)
{
	return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
				 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
}

static int fanotify_add_sb_mark(struct fsnotify_group *group,
				struct super_block *sb, __u32 mask,
				unsigned int flags, __kernel_fsid_t *fsid)
{
	return fanotify_add_mark(group, &sb->s_fsnotify_marks,
				 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
}

static int fanotify_add_inode_mark(struct fsnotify_group *group,
				   struct inode *inode, __u32 mask,
				   unsigned int flags, __kernel_fsid_t *fsid)
{
	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);

	/*
	 * If some other task has this inode open for write we should not add
	 * an ignored mark, unless that ignored mark is supposed to survive
	 * modification changes anyway.
	 */
	if ((flags & FAN_MARK_IGNORED_MASK) &&
	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
	    inode_is_open_for_write(inode))
		return 0;

	return fanotify_add_mark(group, &inode->i_fsnotify_marks,
				 FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
}

static struct fsnotify_event *fanotify_alloc_overflow_event(void)
{
	struct fanotify_event *oevent;

	oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
	if (!oevent)
		return NULL;

	fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
	oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;

	return &oevent->fse;
}

static struct hlist_head *fanotify_alloc_merge_hash(void)
{
	struct hlist_head *hash;

	hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
		       GFP_KERNEL_ACCOUNT);
	if (!hash)
		return NULL;

	__hash_init(hash, FANOTIFY_HTABLE_SIZE);

	return hash;
}

/* fanotify syscalls */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
	struct fsnotify_group *group;
	int f_flags, fd;
	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
	unsigned int class = flags & FANOTIFY_CLASS_BITS;
	unsigned int internal_flags = 0;

	pr_debug("%s: flags=%x event_f_flags=%x\n",
		 __func__, flags, event_f_flags);

	if (!capable(CAP_SYS_ADMIN)) {
		/*
		 * An unprivileged user can set up a fanotify group with
		 * limited functionality - an unprivileged group is limited to
		 * notification events with file handles and it cannot use
		 * unlimited queue/marks.
		 */
		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
			return -EPERM;

		/*
		 * Setting the internal flag FANOTIFY_UNPRIV on the group
		 * prevents setting mount/filesystem marks on this group and
		 * prevents reporting pid and open fd in events.
		 */
		internal_flags |= FANOTIFY_UNPRIV;
	}

#ifdef CONFIG_AUDITSYSCALL
	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
#else
	if (flags & ~FANOTIFY_INIT_FLAGS)
#endif
		return -EINVAL;

	/*
	 * A pidfd can only be returned for a thread-group leader; thus
	 * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
	 * exclusive.
	 */
	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
		return -EINVAL;

	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
		return -EINVAL;

	switch (event_f_flags & O_ACCMODE) {
	case O_RDONLY:
	case O_RDWR:
	case O_WRONLY:
		break;
	default:
		return -EINVAL;
	}

	if (fid_mode && class != FAN_CLASS_NOTIF)
		return -EINVAL;

	/*
	 * Child name is reported with parent fid so requires dir fid.
	 * We can report both child fid and dir fid with or without name.
	 */
	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
		return -EINVAL;

	/*
	 * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID
	 * and is used as an indication to report both dir and child fid on all
	 * dirent events.
	 */
	if ((fid_mode & FAN_REPORT_TARGET_FID) &&
	    (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
		return -EINVAL;

	f_flags = O_RDWR | FMODE_NONOTIFY;
	if (flags & FAN_CLOEXEC)
		f_flags |= O_CLOEXEC;
	if (flags & FAN_NONBLOCK)
		f_flags |= O_NONBLOCK;

	/* fsnotify_alloc_user_group takes a ref.  Dropped in fanotify_release */
	group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
	if (IS_ERR(group))
		return PTR_ERR(group);

	/* Enforce group limits per user in all containing user namespaces */
	group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
						  current_euid(),
						  UCOUNT_FANOTIFY_GROUPS);
	if (!group->fanotify_data.ucounts) {
		fd = -EMFILE;
		goto out_destroy_group;
	}

	group->fanotify_data.flags = flags | internal_flags;
	group->memcg = get_mem_cgroup_from_mm(current->mm);

	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
	if (!group->fanotify_data.merge_hash) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	group->overflow_event = fanotify_alloc_overflow_event();
	if (unlikely(!group->overflow_event)) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	if (force_o_largefile())
		event_f_flags |= O_LARGEFILE;
	group->fanotify_data.f_flags = event_f_flags;
	init_waitqueue_head(&group->fanotify_data.access_waitq);
	INIT_LIST_HEAD(&group->fanotify_data.access_list);
	switch (class) {
	case FAN_CLASS_NOTIF:
		group->priority = FS_PRIO_0;
		break;
	case FAN_CLASS_CONTENT:
		group->priority = FS_PRIO_1;
		break;
	case FAN_CLASS_PRE_CONTENT:
		group->priority = FS_PRIO_2;
		break;
	default:
		fd = -EINVAL;
		goto out_destroy_group;
	}

	if (flags & FAN_UNLIMITED_QUEUE) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
		group->max_events = UINT_MAX;
	} else {
		group->max_events = fanotify_max_queued_events;
	}

	if (flags & FAN_UNLIMITED_MARKS) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
	}

	if (flags & FAN_ENABLE_AUDIT) {
		fd = -EPERM;
		if (!capable(CAP_AUDIT_WRITE))
			goto out_destroy_group;
	}

	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
	if (fd < 0)
		goto out_destroy_group;

	return fd;

out_destroy_group:
	fsnotify_destroy_group(group);
	return fd;
}
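
/*
 * Typical usage of the syscall above (userspace sketch, illustrative only;
 * error handling omitted):
 *
 *	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_CLOEXEC |
 *			       FAN_REPORT_FID, O_RDONLY);
 *
 * FAN_REPORT_FID makes events carry file handles instead of open fds and,
 * per the checks above, a fid mode is also the only mode permitted without
 * CAP_SYS_ADMIN.
 */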

static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
{
	__kernel_fsid_t root_fsid;
	int err;

	/*
	 * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
	 */
	err = vfs_get_fsid(dentry, fsid);
	if (err)
		return err;

	if (!fsid->val[0] && !fsid->val[1])
		return -ENODEV;

	/*
	 * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
	 * which uses a different fsid than sb root.
	 */
	err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
	if (err)
		return err;

	if (root_fsid.val[0] != fsid->val[0] ||
	    root_fsid.val[1] != fsid->val[1])
		return -EXDEV;

	return 0;
}

/* Check if filesystem can encode a unique fid */
static int fanotify_test_fid(struct dentry *dentry)
{
	/*
	 * We need to make sure that the file system supports at least
	 * encoding a file handle so user can use name_to_handle_at() to
	 * compare fid returned with event to the file handle of watched
	 * objects. However, name_to_handle_at() requires that the
	 * filesystem also supports decoding file handles.
	 */
	if (!dentry->d_sb->s_export_op ||
	    !dentry->d_sb->s_export_op->fh_to_dentry)
		return -EOPNOTSUPP;

	return 0;
}
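
/*
 * Userspace relies on this, e.g. by comparing the fid reported in an event
 * against the handle of a watched object (sketch; per name_to_handle_at(2),
 * MAX_HANDLE_SZ comes from <fcntl.h>, error handling omitted):
 *
 *	struct {
 *		struct file_handle h;
 *		unsigned char fh[MAX_HANDLE_SZ];
 *	} buf = { .h.handle_bytes = MAX_HANDLE_SZ };
 *	int mount_id;
 *
 *	name_to_handle_at(AT_FDCWD, "/watched/file", &buf.h, &mount_id, 0);
 *	// compare buf.h against the file_handle that follows
 *	// struct fanotify_event_info_fid in the event buffer
 */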

static int fanotify_events_supported(struct path *path, __u64 mask)
{
	/*
	 * Some filesystems such as 'proc' acquire unusual locks when opening
	 * files. For them fanotify permission events have high chances of
	 * deadlocking the system - open done when reporting fanotify event
	 * blocks on this "unusual" lock while another process holding the lock
	 * waits for fanotify permission event to be answered. Just disallow
	 * permission events for such filesystems.
	 */
	if (mask & FANOTIFY_PERM_EVENTS &&
	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
		return -EINVAL;
	return 0;
}

static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
			    int dfd, const char  __user *pathname)
{
	struct inode *inode = NULL;
	struct vfsmount *mnt = NULL;
	struct fsnotify_group *group;
	struct fd f;
	struct path path;
	__kernel_fsid_t __fsid, *fsid = NULL;
	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	bool ignored = flags & FAN_MARK_IGNORED_MASK;
	unsigned int obj_type, fid_mode;
	u32 umask = 0;
	int ret;

	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
		 __func__, fanotify_fd, flags, dfd, pathname, mask);

	/* we only use the lower 32 bits as of right now. */
	if (upper_32_bits(mask))
		return -EINVAL;

	if (flags & ~FANOTIFY_MARK_FLAGS)
		return -EINVAL;

	switch (mark_type) {
	case FAN_MARK_INODE:
		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
		break;
	case FAN_MARK_MOUNT:
		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
		break;
	case FAN_MARK_FILESYSTEM:
		obj_type = FSNOTIFY_OBJ_TYPE_SB;
		break;
	default:
		return -EINVAL;
	}

	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
	case FAN_MARK_ADD:
	case FAN_MARK_REMOVE:
		if (!mask)
			return -EINVAL;
		break;
	case FAN_MARK_FLUSH:
		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
		valid_mask |= FANOTIFY_PERM_EVENTS;

	if (mask & ~valid_mask)
		return -EINVAL;

	/* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
	if (ignored)
		mask &= ~FANOTIFY_EVENT_FLAGS;

	f = fdget(fanotify_fd);
	if (unlikely(!f.file))
		return -EBADF;

	/* verify that this is indeed a fanotify instance */
	ret = -EINVAL;
	if (unlikely(f.file->f_op != &fanotify_fops))
		goto fput_and_out;
	group = f.file->private_data;

	/*
	 * An unprivileged user is not allowed to set up mount or filesystem
	 * marks.  This also includes setting up such marks by a group that
	 * was initialized by an unprivileged user.
	 */
	ret = -EPERM;
	if ((!capable(CAP_SYS_ADMIN) ||
	     FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
	    mark_type != FAN_MARK_INODE)
		goto fput_and_out;

	/*
	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
	 * allowed to set permission events.
	 */
	ret = -EINVAL;
	if (mask & FANOTIFY_PERM_EVENTS &&
	    group->priority == FS_PRIO_0)
		goto fput_and_out;

	if (mask & FAN_FS_ERROR &&
	    mark_type != FAN_MARK_FILESYSTEM)
		goto fput_and_out;

	/*
	 * Events that do not carry enough information to report
	 * event->fd require a group that supports reporting fid.  Those
	 * events are not supported on a mount mark, because they do not
	 * carry enough information (i.e. path) to be filtered by mount
	 * point.
	 */
	fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
	if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
	    (!fid_mode || mark_type == FAN_MARK_MOUNT))
		goto fput_and_out;

	/*
	 * FAN_RENAME uses special info type records to report the old and
	 * new parent+name.  Reporting only old and new parent id is less
	 * useful and was not implemented.
	 */
	if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
		goto fput_and_out;

	if (flags & FAN_MARK_FLUSH) {
		ret = 0;
		if (mark_type == FAN_MARK_MOUNT)
			fsnotify_clear_vfsmount_marks_by_group(group);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			fsnotify_clear_sb_marks_by_group(group);
		else
			fsnotify_clear_inode_marks_by_group(group);
		goto fput_and_out;
	}

	ret = fanotify_find_path(dfd, pathname, &path, flags,
			(mask & ALL_FSNOTIFY_EVENTS), obj_type);
	if (ret)
		goto fput_and_out;

	if (flags & FAN_MARK_ADD) {
		ret = fanotify_events_supported(&path, mask);
		if (ret)
			goto path_put_and_out;
	}

	if (fid_mode) {
		ret = fanotify_test_fsid(path.dentry, &__fsid);
		if (ret)
			goto path_put_and_out;

		ret = fanotify_test_fid(path.dentry);
		if (ret)
			goto path_put_and_out;

		fsid = &__fsid;
	}

	/* inode held in place by reference to path; group by fget on fd */
	if (mark_type == FAN_MARK_INODE)
		inode = path.dentry->d_inode;
	else
		mnt = path.mnt;

	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
	if (mnt || !S_ISDIR(inode->i_mode)) {
		mask &= ~FAN_EVENT_ON_CHILD;
		umask = FAN_EVENT_ON_CHILD;
		/*
		 * If group needs to report parent fid, register for getting
		 * events with parent/name info for non-directory.
		 */
		if ((fid_mode & FAN_REPORT_DIR_FID) &&
		    (flags & FAN_MARK_ADD) && !ignored)
			mask |= FAN_EVENT_ON_CHILD;
	}

	/* create/update or remove the mark on the found object */
	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
	case FAN_MARK_ADD:
		if (mark_type == FAN_MARK_MOUNT)
			ret = fanotify_add_vfsmount_mark(group, mnt, mask,
							 flags, fsid);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
						   flags, fsid);
		else
			ret = fanotify_add_inode_mark(group, inode, mask,
						      flags, fsid);
		break;
	case FAN_MARK_REMOVE:
		if (mark_type == FAN_MARK_MOUNT)
			ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
							    flags, umask);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
						      flags, umask);
		else
			ret = fanotify_remove_inode_mark(group, inode, mask,
							 flags, umask);
		break;
	default:
		ret = -EINVAL;
	}

path_put_and_out:
	path_put(&path);
fput_and_out:
	fdput(f);
	return ret;
}
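
/*
 * Userspace counterpart of the above (illustrative sketch; pairs with the
 * fanotify_init() example earlier, error handling omitted):
 *
 *	fanotify_mark(fd, FAN_MARK_ADD,
 *		      FAN_CREATE | FAN_DELETE | FAN_ONDIR | FAN_EVENT_ON_CHILD,
 *		      AT_FDCWD, "/tmp");
 *
 * With an inode mark on a directory, FAN_EVENT_ON_CHILD additionally
 * requests events that occur on the directory's immediate children.
 */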

#ifndef CONFIG_ARCH_SPLIT_ARG64
SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
			      __u64, mask, int, dfd,
			      const char  __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
}
#endif

#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
SYSCALL32_DEFINE6(fanotify_mark,
				int, fanotify_fd, unsigned int, flags,
				SC_ARG64(mask), int, dfd,
				const char  __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask),
				dfd, pathname);
}
#endif

/*
 * fanotify_user_setup - Our initialization function.  Note that we cannot return
 * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
 * must result in panic().
 */
static int __init fanotify_user_setup(void)
{
	struct sysinfo si;
	int max_marks;

	si_meminfo(&si);
	/*
	 * Allow up to 1% of addressable memory to be accounted for per user
	 * marks limited to the range [8192, 1048576]. mount and sb marks are
	 * a lot cheaper than inode marks, but there is no reason for a user
	 * to have many of those, so calculate by the cost of inode marks.
	 */
	max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
		    INODE_MARK_COST;
	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
				     FANOTIFY_DEFAULT_MAX_USER_MARKS);

	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);

	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
					 SLAB_PANIC|SLAB_ACCOUNT);
	fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
					       SLAB_PANIC);
	fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
						SLAB_PANIC);
	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
		fanotify_perm_event_cachep =
			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
	}

	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
					FANOTIFY_DEFAULT_MAX_GROUPS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
	fanotify_sysctls_init();

	return 0;
}
device_initcall(fanotify_user_setup);