1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/fanotify.h>
3 #include <linux/fcntl.h>
4 #include <linux/fdtable.h>
5 #include <linux/file.h>
6 #include <linux/fs.h>
7 #include <linux/anon_inodes.h>
8 #include <linux/fsnotify_backend.h>
9 #include <linux/init.h>
10 #include <linux/mount.h>
11 #include <linux/namei.h>
12 #include <linux/poll.h>
13 #include <linux/security.h>
14 #include <linux/syscalls.h>
15 #include <linux/slab.h>
16 #include <linux/types.h>
17 #include <linux/uaccess.h>
18 #include <linux/compat.h>
19 #include <linux/sched/signal.h>
20 #include <linux/memcontrol.h>
21 #include <linux/statfs.h>
22 #include <linux/exportfs.h>
23 
24 #include <asm/ioctls.h>
25 
26 #include "../../mount.h"
27 #include "../fdinfo.h"
28 #include "fanotify.h"
29 
30 #define FANOTIFY_DEFAULT_MAX_EVENTS	16384
31 #define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
32 #define FANOTIFY_DEFAULT_MAX_GROUPS	128
33 #define FANOTIFY_DEFAULT_FEE_POOL_SIZE	32
34 
35 /*
36  * Legacy fanotify marks limits (8192) is per group and we introduced a tunable
37  * limit of marks per user, similar to inotify.  Effectively, the legacy limit
38  * of fanotify marks per user is <max marks per group> * <max groups per user>.
39  * This default limit (1M) also happens to match the increased limit of inotify
40  * max_user_watches since v5.10.
41  */
42 #define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
43 	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)
44 
45 /*
46  * Most of the memory cost of adding an inode mark is pinning the marked inode.
47  * The size of the filesystem inode struct is not uniform across filesystems,
48  * so double the size of a VFS inode is used as a conservative approximation.
49  */
50 #define INODE_MARK_COST	(2 * sizeof(struct inode))
51 
52 /* configurable via /proc/sys/fs/fanotify/ */
53 static int fanotify_max_queued_events __read_mostly;
54 
55 #ifdef CONFIG_SYSCTL
56 
57 #include <linux/sysctl.h>
58 
59 static long ft_zero = 0;
60 static long ft_int_max = INT_MAX;
61 
/*
 * Sysctl knobs under /proc/sys/fs/fanotify/.  The per-user limits
 * (max_user_groups, max_user_marks) live in init_user_ns ucounts and are
 * clamped to [0, INT_MAX] via the long min/max handlers; max_queued_events
 * is a plain int with a zero lower bound.
 */
struct ctl_table fanotify_table[] = {
	{
		.procname	= "max_user_groups",
		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		.procname	= "max_user_marks",
		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		.procname	= "max_queued_events",
		.data		= &fanotify_max_queued_events,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO
	},
	{ }	/* sentinel */
};
91 #endif /* CONFIG_SYSCTL */
92 
93 /*
94  * All flags that may be specified in parameter event_f_flags of fanotify_init.
95  *
96  * Internal and external open flags are stored together in field f_flags of
97  * struct file. Only external open flags shall be allowed in event_f_flags.
98  * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
99  * excluded.
100  */
101 #define	FANOTIFY_INIT_ALL_EVENT_F_BITS				( \
102 		O_ACCMODE	| O_APPEND	| O_NONBLOCK	| \
103 		__O_SYNC	| O_DSYNC	| O_CLOEXEC     | \
104 		O_LARGEFILE	| O_NOATIME	)
105 
106 extern const struct fsnotify_ops fanotify_fsnotify_ops;
107 
108 struct kmem_cache *fanotify_mark_cache __read_mostly;
109 struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
110 struct kmem_cache *fanotify_path_event_cachep __read_mostly;
111 struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
112 
113 #define FANOTIFY_EVENT_ALIGN 4
114 #define FANOTIFY_FID_INFO_HDR_LEN \
115 	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
116 #define FANOTIFY_PIDFD_INFO_HDR_LEN \
117 	sizeof(struct fanotify_event_info_pidfd)
118 #define FANOTIFY_ERROR_INFO_LEN \
119 	(sizeof(struct fanotify_event_info_error))
120 
121 static int fanotify_fid_info_len(int fh_len, int name_len)
122 {
123 	int info_len = fh_len;
124 
125 	if (name_len)
126 		info_len += name_len + 1;
127 
128 	return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
129 		       FANOTIFY_EVENT_ALIGN);
130 }
131 
132 /* FAN_RENAME may have one or two dir+name info records */
133 static int fanotify_dir_name_info_len(struct fanotify_event *event)
134 {
135 	struct fanotify_info *info = fanotify_event_info(event);
136 	int dir_fh_len = fanotify_event_dir_fh_len(event);
137 	int dir2_fh_len = fanotify_event_dir2_fh_len(event);
138 	int info_len = 0;
139 
140 	if (dir_fh_len)
141 		info_len += fanotify_fid_info_len(dir_fh_len,
142 						  info->name_len);
143 	if (dir2_fh_len)
144 		info_len += fanotify_fid_info_len(dir2_fh_len,
145 						  info->name2_len);
146 
147 	return info_len;
148 }
149 
150 static size_t fanotify_event_len(unsigned int info_mode,
151 				 struct fanotify_event *event)
152 {
153 	size_t event_len = FAN_EVENT_METADATA_LEN;
154 	struct fanotify_info *info;
155 	int fh_len;
156 	int dot_len = 0;
157 
158 	if (!info_mode)
159 		return event_len;
160 
161 	if (fanotify_is_error_event(event->mask))
162 		event_len += FANOTIFY_ERROR_INFO_LEN;
163 
164 	info = fanotify_event_info(event);
165 
166 	if (fanotify_event_has_any_dir_fh(event)) {
167 		event_len += fanotify_dir_name_info_len(event);
168 	} else if ((info_mode & FAN_REPORT_NAME) &&
169 		   (event->mask & FAN_ONDIR)) {
170 		/*
171 		 * With group flag FAN_REPORT_NAME, if name was not recorded in
172 		 * event on a directory, we will report the name ".".
173 		 */
174 		dot_len = 1;
175 	}
176 
177 	if (info_mode & FAN_REPORT_PIDFD)
178 		event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
179 
180 	if (fanotify_event_has_object_fh(event)) {
181 		fh_len = fanotify_event_object_fh_len(event);
182 		event_len += fanotify_fid_info_len(fh_len, dot_len);
183 	}
184 
185 	return event_len;
186 }
187 
188 /*
189  * Remove an hashed event from merge hash table.
190  */
static void fanotify_unhash_event(struct fsnotify_group *group,
				  struct fanotify_event *event)
{
	/* Merge hash is protected by the notification lock */
	assert_spin_locked(&group->notification_lock);

	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
		 group, event, fanotify_event_hash_bucket(group, event));

	/* Already unhashed would indicate a bookkeeping bug; bail out */
	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
		return;

	hlist_del_init(&event->merge_list);
}
204 
205 /*
206  * Get an fanotify notification event if one exists and is small
207  * enough to fit in "count". Return an error pointer if the count
208  * is not large enough. When permission event is dequeued, its state is
209  * updated accordingly.
210  */
static struct fanotify_event *get_one_event(struct fsnotify_group *group,
					    size_t count)
{
	size_t event_size;
	struct fanotify_event *event = NULL;
	struct fsnotify_event *fsn_event;
	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);

	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);

	spin_lock(&group->notification_lock);
	fsn_event = fsnotify_peek_first_event(group);
	if (!fsn_event)
		goto out;	/* queue empty: NULL tells the caller to wait */

	event = FANOTIFY_E(fsn_event);
	/* Full length including info records for this group's info modes */
	event_size = fanotify_event_len(info_mode, event);

	if (event_size > count) {
		/* Buffer cannot hold even one whole event */
		event = ERR_PTR(-EINVAL);
		goto out;
	}

	/*
	 * Held the notification_lock the whole time, so this is the
	 * same event we peeked above.
	 */
	fsnotify_remove_first_event(group);
	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
	if (fanotify_is_hashed_event(event->mask))
		fanotify_unhash_event(group, event);
out:
	spin_unlock(&group->notification_lock);
	return event;
}
247 
248 static int create_fd(struct fsnotify_group *group, struct path *path,
249 		     struct file **file)
250 {
251 	int client_fd;
252 	struct file *new_file;
253 
254 	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
255 	if (client_fd < 0)
256 		return client_fd;
257 
258 	/*
259 	 * we need a new file handle for the userspace program so it can read even if it was
260 	 * originally opened O_WRONLY.
261 	 */
262 	new_file = dentry_open(path,
263 			       group->fanotify_data.f_flags | FMODE_NONOTIFY,
264 			       current_cred());
265 	if (IS_ERR(new_file)) {
266 		/*
267 		 * we still send an event even if we can't open the file.  this
268 		 * can happen when say tasks are gone and we try to open their
269 		 * /proc files or we try to open a WRONLY file like in sysfs
270 		 * we just send the errno to userspace since there isn't much
271 		 * else we can do.
272 		 */
273 		put_unused_fd(client_fd);
274 		client_fd = PTR_ERR(new_file);
275 	} else {
276 		*file = new_file;
277 	}
278 
279 	return client_fd;
280 }
281 
282 /*
283  * Finish processing of permission event by setting it to ANSWERED state and
284  * drop group->notification_lock.
285  */
static void finish_permission_event(struct fsnotify_group *group,
				    struct fanotify_perm_event *event,
				    unsigned int response)
				    __releases(&group->notification_lock)
{
	bool destroy = false;

	/* Entered with the lock held; we always drop it before returning */
	assert_spin_locked(&group->notification_lock);
	event->response = response;
	/*
	 * If the event was already canceled (listener fd closed before an
	 * answer arrived) nobody will consume the response, so the event is
	 * destroyed here instead of being marked answered.
	 */
	if (event->state == FAN_EVENT_CANCELED)
		destroy = true;
	else
		event->state = FAN_EVENT_ANSWERED;
	spin_unlock(&group->notification_lock);
	if (destroy)
		fsnotify_destroy_event(group, &event->fae.fse);
}
303 
/*
 * Handle a fanotify_response written by userspace: find the matching
 * permission event on the access_list and record the verdict.
 * Returns 0 on success, -EINVAL for a malformed response, -ENOENT when
 * no pending event matches the fd.
 */
static int process_access_response(struct fsnotify_group *group,
				   struct fanotify_response *response_struct)
{
	struct fanotify_perm_event *event;
	int fd = response_struct->fd;
	int response = response_struct->response;

	pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
		 fd, response);
	/*
	 * make sure the response is valid, if invalid we do nothing and either
	 * userspace can send a valid response or we will clean it up after the
	 * timeout
	 */
	switch (response & ~FAN_AUDIT) {
	case FAN_ALLOW:
	case FAN_DENY:
		break;
	default:
		return -EINVAL;
	}

	if (fd < 0)
		return -EINVAL;

	/* FAN_AUDIT is only valid if the group was created with it enabled */
	if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
		return -EINVAL;

	spin_lock(&group->notification_lock);
	list_for_each_entry(event, &group->fanotify_data.access_list,
			    fae.fse.list) {
		if (event->fd != fd)
			continue;

		list_del_init(&event->fae.fse.list);
		/* finish_permission_event() drops notification_lock */
		finish_permission_event(group, event, response);
		wake_up(&group->fanotify_data.access_waitq);
		return 0;
	}
	spin_unlock(&group->notification_lock);

	return -ENOENT;
}
347 
348 static size_t copy_error_info_to_user(struct fanotify_event *event,
349 				      char __user *buf, int count)
350 {
351 	struct fanotify_event_info_error info = { };
352 	struct fanotify_error_event *fee = FANOTIFY_EE(event);
353 
354 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
355 	info.hdr.len = FANOTIFY_ERROR_INFO_LEN;
356 
357 	if (WARN_ON(count < info.hdr.len))
358 		return -EFAULT;
359 
360 	info.error = fee->error;
361 	info.error_count = fee->err_count;
362 
363 	if (copy_to_user(buf, &info, sizeof(info)))
364 		return -EFAULT;
365 
366 	return info.hdr.len;
367 }
368 
369 static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
370 				 int info_type, const char *name,
371 				 size_t name_len,
372 				 char __user *buf, size_t count)
373 {
374 	struct fanotify_event_info_fid info = { };
375 	struct file_handle handle = { };
376 	unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
377 	size_t fh_len = fh ? fh->len : 0;
378 	size_t info_len = fanotify_fid_info_len(fh_len, name_len);
379 	size_t len = info_len;
380 
381 	pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
382 		 __func__, fh_len, name_len, info_len, count);
383 
384 	if (WARN_ON_ONCE(len < sizeof(info) || len > count))
385 		return -EFAULT;
386 
387 	/*
388 	 * Copy event info fid header followed by variable sized file handle
389 	 * and optionally followed by variable sized filename.
390 	 */
391 	switch (info_type) {
392 	case FAN_EVENT_INFO_TYPE_FID:
393 	case FAN_EVENT_INFO_TYPE_DFID:
394 		if (WARN_ON_ONCE(name_len))
395 			return -EFAULT;
396 		break;
397 	case FAN_EVENT_INFO_TYPE_DFID_NAME:
398 	case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME:
399 	case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME:
400 		if (WARN_ON_ONCE(!name || !name_len))
401 			return -EFAULT;
402 		break;
403 	default:
404 		return -EFAULT;
405 	}
406 
407 	info.hdr.info_type = info_type;
408 	info.hdr.len = len;
409 	info.fsid = *fsid;
410 	if (copy_to_user(buf, &info, sizeof(info)))
411 		return -EFAULT;
412 
413 	buf += sizeof(info);
414 	len -= sizeof(info);
415 	if (WARN_ON_ONCE(len < sizeof(handle)))
416 		return -EFAULT;
417 
418 	handle.handle_type = fh->type;
419 	handle.handle_bytes = fh_len;
420 
421 	/* Mangle handle_type for bad file_handle */
422 	if (!fh_len)
423 		handle.handle_type = FILEID_INVALID;
424 
425 	if (copy_to_user(buf, &handle, sizeof(handle)))
426 		return -EFAULT;
427 
428 	buf += sizeof(handle);
429 	len -= sizeof(handle);
430 	if (WARN_ON_ONCE(len < fh_len))
431 		return -EFAULT;
432 
433 	/*
434 	 * For an inline fh and inline file name, copy through stack to exclude
435 	 * the copy from usercopy hardening protections.
436 	 */
437 	fh_buf = fanotify_fh_buf(fh);
438 	if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
439 		memcpy(bounce, fh_buf, fh_len);
440 		fh_buf = bounce;
441 	}
442 	if (copy_to_user(buf, fh_buf, fh_len))
443 		return -EFAULT;
444 
445 	buf += fh_len;
446 	len -= fh_len;
447 
448 	if (name_len) {
449 		/* Copy the filename with terminating null */
450 		name_len++;
451 		if (WARN_ON_ONCE(len < name_len))
452 			return -EFAULT;
453 
454 		if (copy_to_user(buf, name, name_len))
455 			return -EFAULT;
456 
457 		buf += name_len;
458 		len -= name_len;
459 	}
460 
461 	/* Pad with 0's */
462 	WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
463 	if (len > 0 && clear_user(buf, len))
464 		return -EFAULT;
465 
466 	return info_len;
467 }
468 
469 static int copy_pidfd_info_to_user(int pidfd,
470 				   char __user *buf,
471 				   size_t count)
472 {
473 	struct fanotify_event_info_pidfd info = { };
474 	size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
475 
476 	if (WARN_ON_ONCE(info_len > count))
477 		return -EFAULT;
478 
479 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
480 	info.hdr.len = info_len;
481 	info.pidfd = pidfd;
482 
483 	if (copy_to_user(buf, &info, info_len))
484 		return -EFAULT;
485 
486 	return info_len;
487 }
488 
/*
 * Copy all info records that follow the metadata header for @event.
 * Returns the total number of bytes written or a negative error.
 */
static int copy_info_records_to_user(struct fanotify_event *event,
				     struct fanotify_info *info,
				     unsigned int info_mode, int pidfd,
				     char __user *buf, size_t count)
{
	int ret, total_bytes = 0, info_type = 0;
	unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;

	/*
	 * Event info records order is as follows:
	 * 1. dir fid + name
	 * 2. (optional) new dir fid + new name
	 * 3. (optional) child fid
	 */
	if (fanotify_event_has_dir_fh(event)) {
		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
					     FAN_EVENT_INFO_TYPE_DFID;

		/* FAN_RENAME uses special info types */
		if (event->mask & FAN_RENAME)
			info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME;

		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_info_dir_fh(info),
					    info_type,
					    fanotify_info_name(info),
					    info->name_len, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	/* New dir fid+name may be reported in addition to old dir fid+name */
	if (fanotify_event_has_dir2_fh(event)) {
		info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME;
		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_info_dir2_fh(info),
					    info_type,
					    fanotify_info_name2(info),
					    info->name2_len, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_event_has_object_fh(event)) {
		const char *dot = NULL;
		int dot_len = 0;

		if (fid_mode == FAN_REPORT_FID || info_type) {
			/*
			 * With only group flag FAN_REPORT_FID only type FID is
			 * reported. Second info record type is always FID.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		} else if ((fid_mode & FAN_REPORT_NAME) &&
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_NAME, if name was not
			 * recorded in an event on a directory, report the name
			 * "." with info type DFID_NAME.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
			dot = ".";
			dot_len = 1;
		} else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_DIR_FID, a single info
			 * record has type DFID for directory entry modification
			 * event and for event on a directory.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID;
		} else {
			/*
			 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
			 * a single info record has type FID for event on a
			 * non-directory, when there is no directory to report.
			 * For example, on FAN_DELETE_SELF event.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		}

		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_event_object_fh(event),
					    info_type, dot, dot_len,
					    buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	/* Optional pidfd record when the group uses FAN_REPORT_PIDFD */
	if (pidfd_mode) {
		ret = copy_pidfd_info_to_user(pidfd, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	/* Optional error record for FS_ERROR events */
	if (fanotify_is_error_event(event->mask)) {
		ret = copy_error_info_to_user(event, buf, count);
		if (ret < 0)
			return ret;
		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	return total_bytes;
}
612 
613 static ssize_t copy_event_to_user(struct fsnotify_group *group,
614 				  struct fanotify_event *event,
615 				  char __user *buf, size_t count)
616 {
617 	struct fanotify_event_metadata metadata;
618 	struct path *path = fanotify_event_path(event);
619 	struct fanotify_info *info = fanotify_event_info(event);
620 	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
621 	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
622 	struct file *f = NULL;
623 	int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD;
624 
625 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
626 
627 	metadata.event_len = fanotify_event_len(info_mode, event);
628 	metadata.metadata_len = FAN_EVENT_METADATA_LEN;
629 	metadata.vers = FANOTIFY_METADATA_VERSION;
630 	metadata.reserved = 0;
631 	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
632 	metadata.pid = pid_vnr(event->pid);
633 	/*
634 	 * For an unprivileged listener, event->pid can be used to identify the
635 	 * events generated by the listener process itself, without disclosing
636 	 * the pids of other processes.
637 	 */
638 	if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
639 	    task_tgid(current) != event->pid)
640 		metadata.pid = 0;
641 
642 	/*
643 	 * For now, fid mode is required for an unprivileged listener and
644 	 * fid mode does not report fd in events.  Keep this check anyway
645 	 * for safety in case fid mode requirement is relaxed in the future
646 	 * to allow unprivileged listener to get events with no fd and no fid.
647 	 */
648 	if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
649 	    path && path->mnt && path->dentry) {
650 		fd = create_fd(group, path, &f);
651 		if (fd < 0)
652 			return fd;
653 	}
654 	metadata.fd = fd;
655 
656 	if (pidfd_mode) {
657 		/*
658 		 * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
659 		 * exclusion is ever lifted. At the time of incoporating pidfd
660 		 * support within fanotify, the pidfd API only supported the
661 		 * creation of pidfds for thread-group leaders.
662 		 */
663 		WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));
664 
665 		/*
666 		 * The PIDTYPE_TGID check for an event->pid is performed
667 		 * preemptively in an attempt to catch out cases where the event
668 		 * listener reads events after the event generating process has
669 		 * already terminated. Report FAN_NOPIDFD to the event listener
670 		 * in those cases, with all other pidfd creation errors being
671 		 * reported as FAN_EPIDFD.
672 		 */
673 		if (metadata.pid == 0 ||
674 		    !pid_has_task(event->pid, PIDTYPE_TGID)) {
675 			pidfd = FAN_NOPIDFD;
676 		} else {
677 			pidfd = pidfd_create(event->pid, 0);
678 			if (pidfd < 0)
679 				pidfd = FAN_EPIDFD;
680 		}
681 	}
682 
683 	ret = -EFAULT;
684 	/*
685 	 * Sanity check copy size in case get_one_event() and
686 	 * event_len sizes ever get out of sync.
687 	 */
688 	if (WARN_ON_ONCE(metadata.event_len > count))
689 		goto out_close_fd;
690 
691 	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
692 		goto out_close_fd;
693 
694 	buf += FAN_EVENT_METADATA_LEN;
695 	count -= FAN_EVENT_METADATA_LEN;
696 
697 	if (fanotify_is_perm_event(event->mask))
698 		FANOTIFY_PERM(event)->fd = fd;
699 
700 	if (f)
701 		fd_install(fd, f);
702 
703 	if (info_mode) {
704 		ret = copy_info_records_to_user(event, info, info_mode, pidfd,
705 						buf, count);
706 		if (ret < 0)
707 			goto out_close_fd;
708 	}
709 
710 	return metadata.event_len;
711 
712 out_close_fd:
713 	if (fd != FAN_NOFD) {
714 		put_unused_fd(fd);
715 		fput(f);
716 	}
717 
718 	if (pidfd >= 0)
719 		close_fd(pidfd);
720 
721 	return ret;
722 }
723 
/* fanotify userspace file descriptor functions */
725 static __poll_t fanotify_poll(struct file *file, poll_table *wait)
726 {
727 	struct fsnotify_group *group = file->private_data;
728 	__poll_t ret = 0;
729 
730 	poll_wait(file, &group->notification_waitq, wait);
731 	spin_lock(&group->notification_lock);
732 	if (!fsnotify_notify_queue_is_empty(group))
733 		ret = EPOLLIN | EPOLLRDNORM;
734 	spin_unlock(&group->notification_lock);
735 
736 	return ret;
737 }
738 
/*
 * Read events into the user buffer, blocking (unless O_NONBLOCK) until at
 * least one event is available.  Copies as many whole events as fit;
 * returns bytes copied, or a negative error when nothing was copied.
 */
static ssize_t fanotify_read(struct file *file, char __user *buf,
			     size_t count, loff_t *pos)
{
	struct fsnotify_group *group;
	struct fanotify_event *event;
	char __user *start;
	int ret;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	start = buf;
	group = file->private_data;

	pr_debug("%s: group=%p\n", __func__, group);

	add_wait_queue(&group->notification_waitq, &wait);
	while (1) {
		/*
		 * User can supply arbitrarily large buffer. Avoid softlockups
		 * in case there are lots of available events.
		 */
		cond_resched();
		event = get_one_event(group, count);
		if (IS_ERR(event)) {
			/* Remaining buffer too small for the next event */
			ret = PTR_ERR(event);
			break;
		}

		if (!event) {
			/* Queue empty: return what we have, fail, or sleep */
			ret = -EAGAIN;
			if (file->f_flags & O_NONBLOCK)
				break;

			ret = -ERESTARTSYS;
			if (signal_pending(current))
				break;

			if (start != buf)
				break;

			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
			continue;
		}

		ret = copy_event_to_user(group, event, buf, count);
		if (unlikely(ret == -EOPENSTALE)) {
			/*
			 * We cannot report events with stale fd so drop it.
			 * Setting ret to 0 will continue the event loop and
			 * do the right thing if there are no more events to
			 * read (i.e. return bytes read, -EAGAIN or wait).
			 */
			ret = 0;
		}

		/*
		 * Permission events get queued to wait for response.  Other
		 * events can be destroyed now.
		 */
		if (!fanotify_is_perm_event(event->mask)) {
			fsnotify_destroy_event(group, &event->fse);
		} else {
			if (ret <= 0) {
				/*
				 * Copy failed (or event dropped): deny.
				 * finish_permission_event() drops the lock.
				 */
				spin_lock(&group->notification_lock);
				finish_permission_event(group,
					FANOTIFY_PERM(event), FAN_DENY);
				wake_up(&group->fanotify_data.access_waitq);
			} else {
				/* Park the event until userspace answers */
				spin_lock(&group->notification_lock);
				list_add_tail(&event->fse.list,
					&group->fanotify_data.access_list);
				spin_unlock(&group->notification_lock);
			}
		}
		if (ret < 0)
			break;
		buf += ret;
		count -= ret;
	}
	remove_wait_queue(&group->notification_waitq, &wait);

	/* Partial success wins over a late error, except a user fault */
	if (start != buf && ret != -EFAULT)
		ret = buf - start;
	return ret;
}
823 
824 static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
825 {
826 	struct fanotify_response response = { .fd = -1, .response = -1 };
827 	struct fsnotify_group *group;
828 	int ret;
829 
830 	if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
831 		return -EINVAL;
832 
833 	group = file->private_data;
834 
835 	if (count < sizeof(response))
836 		return -EINVAL;
837 
838 	count = sizeof(response);
839 
840 	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
841 
842 	if (copy_from_user(&response, buf, count))
843 		return -EFAULT;
844 
845 	ret = process_access_response(group, &response);
846 	if (ret < 0)
847 		count = ret;
848 
849 	return count;
850 }
851 
/*
 * Release callback for the fanotify fd: drain both queues, answer all
 * outstanding permission events, and tear down the group.
 */
static int fanotify_release(struct inode *ignored, struct file *file)
{
	struct fsnotify_group *group = file->private_data;
	struct fsnotify_event *fsn_event;

	/*
	 * Stop new events from arriving in the notification queue. since
	 * userspace cannot use fanotify fd anymore, no event can enter or
	 * leave access_list by now either.
	 */
	fsnotify_group_stop_queueing(group);

	/*
	 * Process all permission events on access_list and notification queue
	 * and simulate reply from userspace.
	 */
	spin_lock(&group->notification_lock);
	while (!list_empty(&group->fanotify_data.access_list)) {
		struct fanotify_perm_event *event;

		event = list_first_entry(&group->fanotify_data.access_list,
				struct fanotify_perm_event, fae.fse.list);
		list_del_init(&event->fae.fse.list);
		/* Drops notification_lock; re-taken below for the next loop */
		finish_permission_event(group, event, FAN_ALLOW);
		spin_lock(&group->notification_lock);
	}

	/*
	 * Destroy all non-permission events. For permission events just
	 * dequeue them and set the response. They will be freed once the
	 * response is consumed and fanotify_get_response() returns.
	 */
	while ((fsn_event = fsnotify_remove_first_event(group))) {
		struct fanotify_event *event = FANOTIFY_E(fsn_event);

		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
			spin_unlock(&group->notification_lock);
			fsnotify_destroy_event(group, fsn_event);
		} else {
			/* Drops notification_lock */
			finish_permission_event(group, FANOTIFY_PERM(event),
						FAN_ALLOW);
		}
		spin_lock(&group->notification_lock);
	}
	spin_unlock(&group->notification_lock);

	/* Response for all permission events is set, wakeup waiters */
	wake_up(&group->fanotify_data.access_waitq);

	/* matches the fanotify_init->fsnotify_alloc_group */
	fsnotify_destroy_group(group);

	return 0;
}
906 
907 static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
908 {
909 	struct fsnotify_group *group;
910 	struct fsnotify_event *fsn_event;
911 	void __user *p;
912 	int ret = -ENOTTY;
913 	size_t send_len = 0;
914 
915 	group = file->private_data;
916 
917 	p = (void __user *) arg;
918 
919 	switch (cmd) {
920 	case FIONREAD:
921 		spin_lock(&group->notification_lock);
922 		list_for_each_entry(fsn_event, &group->notification_list, list)
923 			send_len += FAN_EVENT_METADATA_LEN;
924 		spin_unlock(&group->notification_lock);
925 		ret = put_user(send_len, (int __user *) p);
926 		break;
927 	}
928 
929 	return ret;
930 }
931 
/* File operations for the fanotify group fd returned by fanotify_init() */
static const struct file_operations fanotify_fops = {
	.show_fdinfo	= fanotify_show_fdinfo,
	.poll		= fanotify_poll,
	.read		= fanotify_read,
	.write		= fanotify_write,
	.fasync		= NULL,
	.release	= fanotify_release,
	.unlocked_ioctl	= fanotify_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};
943 
/*
 * Resolve the object to mark into *path.  A NULL @filename means "use the
 * file behind @dfd"; otherwise @filename is looked up relative to @dfd.
 * On success *path holds a reference the caller must path_put().
 */
static int fanotify_find_path(int dfd, const char __user *filename,
			      struct path *path, unsigned int flags, __u64 mask,
			      unsigned int obj_type)
{
	int ret;

	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
		 dfd, filename, flags);

	if (filename == NULL) {
		struct fd f = fdget(dfd);

		ret = -EBADF;
		if (!f.file)
			goto out;

		/* FAN_MARK_ONLYDIR demands the fd refer to a directory */
		ret = -ENOTDIR;
		if ((flags & FAN_MARK_ONLYDIR) &&
		    !(S_ISDIR(file_inode(f.file)->i_mode))) {
			fdput(f);
			goto out;
		}

		*path = f.file->f_path;
		path_get(path);	/* reference outlives the fdput below */
		fdput(f);
	} else {
		unsigned int lookup_flags = 0;

		if (!(flags & FAN_MARK_DONT_FOLLOW))
			lookup_flags |= LOOKUP_FOLLOW;
		if (flags & FAN_MARK_ONLYDIR)
			lookup_flags |= LOOKUP_DIRECTORY;

		ret = user_path_at(dfd, filename, lookup_flags, path);
		if (ret)
			goto out;
	}

	/* you can only watch an inode if you have read permissions on it */
	ret = path_permission(path, MAY_READ);
	if (ret) {
		path_put(path);
		goto out;
	}

	/* Give LSMs a chance to veto the watch */
	ret = security_path_notify(path, mask, obj_type);
	if (ret)
		path_put(path);

out:
	return ret;
}
997 
998 static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
999 					    __u32 mask, unsigned int flags,
1000 					    __u32 umask, int *destroy)
1001 {
1002 	__u32 oldmask = 0;
1003 
1004 	/* umask bits cannot be removed by user */
1005 	mask &= ~umask;
1006 	spin_lock(&fsn_mark->lock);
1007 	if (!(flags & FAN_MARK_IGNORED_MASK)) {
1008 		oldmask = fsn_mark->mask;
1009 		fsn_mark->mask &= ~mask;
1010 	} else {
1011 		fsn_mark->ignored_mask &= ~mask;
1012 	}
1013 	/*
1014 	 * We need to keep the mark around even if remaining mask cannot
1015 	 * result in any events (e.g. mask == FAN_ONDIR) to support incremenal
1016 	 * changes to the mask.
1017 	 * Destroy mark when only umask bits remain.
1018 	 */
1019 	*destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
1020 	spin_unlock(&fsn_mark->lock);
1021 
1022 	return mask & oldmask;
1023 }
1024 
/*
 * Remove @mask bits from the group's mark on @connp, destroying the mark
 * when nothing but @umask bits remains.  Returns 0 or -ENOENT when the
 * group has no mark on the object.
 */
static int fanotify_remove_mark(struct fsnotify_group *group,
				fsnotify_connp_t *connp, __u32 mask,
				unsigned int flags, __u32 umask)
{
	struct fsnotify_mark *fsn_mark = NULL;
	__u32 removed;
	int destroy_mark;

	mutex_lock(&group->mark_mutex);
	fsn_mark = fsnotify_find_mark(connp, group);
	if (!fsn_mark) {
		mutex_unlock(&group->mark_mutex);
		return -ENOENT;
	}

	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
						 umask, &destroy_mark);
	/* Recompute the connector mask only if a contributing bit went away */
	if (removed & fsnotify_conn_mask(fsn_mark->connector))
		fsnotify_recalc_mask(fsn_mark->connector);
	/* Detach under the mutex; free outside it */
	if (destroy_mark)
		fsnotify_detach_mark(fsn_mark);
	mutex_unlock(&group->mark_mutex);
	if (destroy_mark)
		fsnotify_free_mark(fsn_mark);

	/* matches the fsnotify_find_mark() */
	fsnotify_put_mark(fsn_mark);
	return 0;
}
1054 
/* Remove mask bits from the group's mark on a mount */
static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
					 struct vfsmount *mnt, __u32 mask,
					 unsigned int flags, __u32 umask)
{
	return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
				    mask, flags, umask);
}
1062 
1063 static int fanotify_remove_sb_mark(struct fsnotify_group *group,
1064 				   struct super_block *sb, __u32 mask,
1065 				   unsigned int flags, __u32 umask)
1066 {
1067 	return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask,
1068 				    flags, umask);
1069 }
1070 
1071 static int fanotify_remove_inode_mark(struct fsnotify_group *group,
1072 				      struct inode *inode, __u32 mask,
1073 				      unsigned int flags, __u32 umask)
1074 {
1075 	return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask,
1076 				    flags, umask);
1077 }
1078 
/*
 * Add @mask bits to either the event mask or the ignored mask of @fsn_mark,
 * depending on FAN_MARK_IGNORED_MASK in @flags.
 *
 * Returns the event mask bits that were newly set.  When operating on the
 * ignored mask, oldmask keeps its all-ones initializer, so the return value
 * is 0 and the caller skips recalculating the object mask.
 */
static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
				       __u32 mask,
				       unsigned int flags)
{
	__u32 oldmask = -1;

	spin_lock(&fsn_mark->lock);
	if (!(flags & FAN_MARK_IGNORED_MASK)) {
		oldmask = fsn_mark->mask;
		fsn_mark->mask |= mask;
	} else {
		fsn_mark->ignored_mask |= mask;
		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
	}
	spin_unlock(&fsn_mark->lock);

	return mask & ~oldmask;
}
1098 
/*
 * Allocate a new mark for @group and attach it to the object behind @connp.
 * Must be called with group->mark_mutex held (uses fsnotify_add_mark_locked).
 *
 * Returns the attached mark (with a reference held for the caller) or an
 * ERR_PTR: -ENOSPC when the per-user mark limit is hit, -ENOMEM on
 * allocation failure, or the error from fsnotify_add_mark_locked().
 */
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
						   fsnotify_connp_t *connp,
						   unsigned int obj_type,
						   __kernel_fsid_t *fsid)
{
	struct ucounts *ucounts = group->fanotify_data.ucounts;
	struct fsnotify_mark *mark;
	int ret;

	/*
	 * Enforce per user marks limits per user in all containing user ns.
	 * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
	 * in the limited groups account.
	 */
	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
		return ERR_PTR(-ENOSPC);

	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
	if (!mark) {
		ret = -ENOMEM;
		goto out_dec_ucounts;
	}

	fsnotify_init_mark(mark, group);
	ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid);
	if (ret) {
		/* Drops the init reference and frees the mark */
		fsnotify_put_mark(mark);
		goto out_dec_ucounts;
	}

	return mark;

out_dec_ucounts:
	/* Undo the ucount charge taken above; mirrors the inc_ucount() gate */
	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
		dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
	return ERR_PTR(ret);
}
1137 
1138 static int fanotify_group_init_error_pool(struct fsnotify_group *group)
1139 {
1140 	if (mempool_initialized(&group->fanotify_data.error_events_pool))
1141 		return 0;
1142 
1143 	return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
1144 					 FANOTIFY_DEFAULT_FEE_POOL_SIZE,
1145 					 sizeof(struct fanotify_error_event));
1146 }
1147 
/*
 * Add @mask bits to the @group's mark on the object behind @connp, creating
 * the mark if the group does not have one yet.  Serialized against other
 * mark updates by group->mark_mutex.  Returns 0 or a negative error code.
 */
static int fanotify_add_mark(struct fsnotify_group *group,
			     fsnotify_connp_t *connp, unsigned int obj_type,
			     __u32 mask, unsigned int flags,
			     __kernel_fsid_t *fsid)
{
	struct fsnotify_mark *fsn_mark;
	__u32 added;
	int ret = 0;

	mutex_lock(&group->mark_mutex);
	fsn_mark = fsnotify_find_mark(connp, group);
	if (!fsn_mark) {
		fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fsid);
		if (IS_ERR(fsn_mark)) {
			mutex_unlock(&group->mark_mutex);
			return PTR_ERR(fsn_mark);
		}
	}

	/*
	 * Error events are pre-allocated per group, only if strictly
	 * needed (i.e. FAN_FS_ERROR was requested).
	 */
	if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
		ret = fanotify_group_init_error_pool(group);
		if (ret)
			goto out;
	}

	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
	/* Recalculate the object's combined mask only if new bits appeared */
	if (added & ~fsnotify_conn_mask(fsn_mark->connector))
		fsnotify_recalc_mask(fsn_mark->connector);

out:
	mutex_unlock(&group->mark_mutex);

	/* Drop the reference taken by fsnotify_find_mark()/fanotify_add_new_mark() */
	fsnotify_put_mark(fsn_mark);
	return ret;
}
1187 
1188 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
1189 				      struct vfsmount *mnt, __u32 mask,
1190 				      unsigned int flags, __kernel_fsid_t *fsid)
1191 {
1192 	return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
1193 				 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
1194 }
1195 
1196 static int fanotify_add_sb_mark(struct fsnotify_group *group,
1197 				struct super_block *sb, __u32 mask,
1198 				unsigned int flags, __kernel_fsid_t *fsid)
1199 {
1200 	return fanotify_add_mark(group, &sb->s_fsnotify_marks,
1201 				 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
1202 }
1203 
1204 static int fanotify_add_inode_mark(struct fsnotify_group *group,
1205 				   struct inode *inode, __u32 mask,
1206 				   unsigned int flags, __kernel_fsid_t *fsid)
1207 {
1208 	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
1209 
1210 	/*
1211 	 * If some other task has this inode open for write we should not add
1212 	 * an ignored mark, unless that ignored mark is supposed to survive
1213 	 * modification changes anyway.
1214 	 */
1215 	if ((flags & FAN_MARK_IGNORED_MASK) &&
1216 	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
1217 	    inode_is_open_for_write(inode))
1218 		return 0;
1219 
1220 	return fanotify_add_mark(group, &inode->i_fsnotify_marks,
1221 				 FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
1222 }
1223 
1224 static struct fsnotify_event *fanotify_alloc_overflow_event(void)
1225 {
1226 	struct fanotify_event *oevent;
1227 
1228 	oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
1229 	if (!oevent)
1230 		return NULL;
1231 
1232 	fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
1233 	oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;
1234 
1235 	return &oevent->fse;
1236 }
1237 
1238 static struct hlist_head *fanotify_alloc_merge_hash(void)
1239 {
1240 	struct hlist_head *hash;
1241 
1242 	hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
1243 		       GFP_KERNEL_ACCOUNT);
1244 	if (!hash)
1245 		return NULL;
1246 
1247 	__hash_init(hash, FANOTIFY_HTABLE_SIZE);
1248 
1249 	return hash;
1250 }
1251 
1252 /* fanotify syscalls */
/*
 * fanotify_init(2) - create a new fanotify group and return a file
 * descriptor for it.  Validates @flags and @event_f_flags, allocates the
 * group with its overflow event and merge hash, and charges the group to
 * the creating user's UCOUNT_FANOTIFY_GROUPS limit.
 */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
	struct fsnotify_group *group;
	int f_flags, fd;
	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
	unsigned int class = flags & FANOTIFY_CLASS_BITS;
	unsigned int internal_flags = 0;

	pr_debug("%s: flags=%x event_f_flags=%x\n",
		 __func__, flags, event_f_flags);

	if (!capable(CAP_SYS_ADMIN)) {
		/*
		 * An unprivileged user can setup an fanotify group with
		 * limited functionality - an unprivileged group is limited to
		 * notification events with file handles and it cannot use
		 * unlimited queue/marks.
		 */
		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
			return -EPERM;

		/*
		 * Setting the internal flag FANOTIFY_UNPRIV on the group
		 * prevents setting mount/filesystem marks on this group and
		 * prevents reporting pid and open fd in events.
		 */
		internal_flags |= FANOTIFY_UNPRIV;
	}

	/* FAN_ENABLE_AUDIT is only a valid init flag with audit compiled in */
#ifdef CONFIG_AUDITSYSCALL
	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
#else
	if (flags & ~FANOTIFY_INIT_FLAGS)
#endif
		return -EINVAL;

	/*
	 * A pidfd can only be returned for a thread-group leader; thus
	 * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
	 * exclusive.
	 */
	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
		return -EINVAL;

	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
		return -EINVAL;

	/* Only plain read/write access modes are valid for event fds */
	switch (event_f_flags & O_ACCMODE) {
	case O_RDONLY:
	case O_RDWR:
	case O_WRONLY:
		break;
	default:
		return -EINVAL;
	}

	if (fid_mode && class != FAN_CLASS_NOTIF)
		return -EINVAL;

	/*
	 * Child name is reported with parent fid so requires dir fid.
	 * We can report both child fid and dir fid with or without name.
	 */
	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
		return -EINVAL;

	/*
	 * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID
	 * and is used as an indication to report both dir and child fid on all
	 * dirent events.
	 */
	if ((fid_mode & FAN_REPORT_TARGET_FID) &&
	    (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
		return -EINVAL;

	/* FMODE_NONOTIFY keeps the fanotify fd itself from generating events */
	f_flags = O_RDWR | FMODE_NONOTIFY;
	if (flags & FAN_CLOEXEC)
		f_flags |= O_CLOEXEC;
	if (flags & FAN_NONBLOCK)
		f_flags |= O_NONBLOCK;

	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
	group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
	if (IS_ERR(group)) {
		return PTR_ERR(group);
	}

	/* Enforce groups limits per user in all containing user ns */
	group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
						  current_euid(),
						  UCOUNT_FANOTIFY_GROUPS);
	if (!group->fanotify_data.ucounts) {
		fd = -EMFILE;
		goto out_destroy_group;
	}

	group->fanotify_data.flags = flags | internal_flags;
	group->memcg = get_mem_cgroup_from_mm(current->mm);

	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
	if (!group->fanotify_data.merge_hash) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	group->overflow_event = fanotify_alloc_overflow_event();
	if (unlikely(!group->overflow_event)) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	if (force_o_largefile())
		event_f_flags |= O_LARGEFILE;
	group->fanotify_data.f_flags = event_f_flags;
	init_waitqueue_head(&group->fanotify_data.access_waitq);
	INIT_LIST_HEAD(&group->fanotify_data.access_list);
	/* Map the requested notification class onto an fsnotify priority */
	switch (class) {
	case FAN_CLASS_NOTIF:
		group->priority = FS_PRIO_0;
		break;
	case FAN_CLASS_CONTENT:
		group->priority = FS_PRIO_1;
		break;
	case FAN_CLASS_PRE_CONTENT:
		group->priority = FS_PRIO_2;
		break;
	default:
		fd = -EINVAL;
		goto out_destroy_group;
	}

	if (flags & FAN_UNLIMITED_QUEUE) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
		group->max_events = UINT_MAX;
	} else {
		group->max_events = fanotify_max_queued_events;
	}

	if (flags & FAN_UNLIMITED_MARKS) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
	}

	if (flags & FAN_ENABLE_AUDIT) {
		fd = -EPERM;
		if (!capable(CAP_AUDIT_WRITE))
			goto out_destroy_group;
	}

	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
	if (fd < 0)
		goto out_destroy_group;

	return fd;

out_destroy_group:
	/* Tears down everything set up above, including the ucount charge */
	fsnotify_destroy_group(group);
	return fd;
}
1415 
1416 static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
1417 {
1418 	__kernel_fsid_t root_fsid;
1419 	int err;
1420 
1421 	/*
1422 	 * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
1423 	 */
1424 	err = vfs_get_fsid(dentry, fsid);
1425 	if (err)
1426 		return err;
1427 
1428 	if (!fsid->val[0] && !fsid->val[1])
1429 		return -ENODEV;
1430 
1431 	/*
1432 	 * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
1433 	 * which uses a different fsid than sb root.
1434 	 */
1435 	err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
1436 	if (err)
1437 		return err;
1438 
1439 	if (root_fsid.val[0] != fsid->val[0] ||
1440 	    root_fsid.val[1] != fsid->val[1])
1441 		return -EXDEV;
1442 
1443 	return 0;
1444 }
1445 
1446 /* Check if filesystem can encode a unique fid */
1447 static int fanotify_test_fid(struct dentry *dentry)
1448 {
1449 	/*
1450 	 * We need to make sure that the file system supports at least
1451 	 * encoding a file handle so user can use name_to_handle_at() to
1452 	 * compare fid returned with event to the file handle of watched
1453 	 * objects. However, name_to_handle_at() requires that the
1454 	 * filesystem also supports decoding file handles.
1455 	 */
1456 	if (!dentry->d_sb->s_export_op ||
1457 	    !dentry->d_sb->s_export_op->fh_to_dentry)
1458 		return -EOPNOTSUPP;
1459 
1460 	return 0;
1461 }
1462 
1463 static int fanotify_events_supported(struct path *path, __u64 mask)
1464 {
1465 	/*
1466 	 * Some filesystems such as 'proc' acquire unusual locks when opening
1467 	 * files. For them fanotify permission events have high chances of
1468 	 * deadlocking the system - open done when reporting fanotify event
1469 	 * blocks on this "unusual" lock while another process holding the lock
1470 	 * waits for fanotify permission event to be answered. Just disallow
1471 	 * permission events for such filesystems.
1472 	 */
1473 	if (mask & FANOTIFY_PERM_EVENTS &&
1474 	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
1475 		return -EINVAL;
1476 	return 0;
1477 }
1478 
/*
 * Common implementation of fanotify_mark(2): add, remove or flush marks of
 * type inode/mount/filesystem on the group backing @fanotify_fd.  @dfd and
 * @pathname identify the object to mark.  Returns 0 on success or a
 * negative error code.
 */
static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
			    int dfd, const char  __user *pathname)
{
	struct inode *inode = NULL;
	struct vfsmount *mnt = NULL;
	struct fsnotify_group *group;
	struct fd f;
	struct path path;
	__kernel_fsid_t __fsid, *fsid = NULL;
	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	bool ignored = flags & FAN_MARK_IGNORED_MASK;
	unsigned int obj_type, fid_mode;
	u32 umask = 0;
	int ret;

	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
		 __func__, fanotify_fd, flags, dfd, pathname, mask);

	/* we only use the lower 32 bits as of right now. */
	if (upper_32_bits(mask))
		return -EINVAL;

	if (flags & ~FANOTIFY_MARK_FLAGS)
		return -EINVAL;

	/* Translate the mark type flag into an fsnotify object type */
	switch (mark_type) {
	case FAN_MARK_INODE:
		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
		break;
	case FAN_MARK_MOUNT:
		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
		break;
	case FAN_MARK_FILESYSTEM:
		obj_type = FSNOTIFY_OBJ_TYPE_SB;
		break;
	default:
		return -EINVAL;
	}

	/* Exactly one of ADD/REMOVE/FLUSH must be requested */
	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
	case FAN_MARK_ADD:
	case FAN_MARK_REMOVE:
		if (!mask)
			return -EINVAL;
		break;
	case FAN_MARK_FLUSH:
		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
		valid_mask |= FANOTIFY_PERM_EVENTS;

	if (mask & ~valid_mask)
		return -EINVAL;

	/* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
	if (ignored)
		mask &= ~FANOTIFY_EVENT_FLAGS;

	f = fdget(fanotify_fd);
	if (unlikely(!f.file))
		return -EBADF;

	/* verify that this is indeed an fanotify instance */
	ret = -EINVAL;
	if (unlikely(f.file->f_op != &fanotify_fops))
		goto fput_and_out;
	group = f.file->private_data;

	/*
	 * An unprivileged user is not allowed to setup mount nor filesystem
	 * marks.  This also includes setting up such marks by a group that
	 * was initialized by an unprivileged user.
	 */
	ret = -EPERM;
	if ((!capable(CAP_SYS_ADMIN) ||
	     FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
	    mark_type != FAN_MARK_INODE)
		goto fput_and_out;

	/*
	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
	 * allowed to set permissions events.
	 */
	ret = -EINVAL;
	if (mask & FANOTIFY_PERM_EVENTS &&
	    group->priority == FS_PRIO_0)
		goto fput_and_out;

	/* FAN_FS_ERROR is only valid on a filesystem mark */
	if (mask & FAN_FS_ERROR &&
	    mark_type != FAN_MARK_FILESYSTEM)
		goto fput_and_out;

	/*
	 * Events that do not carry enough information to report
	 * event->fd require a group that supports reporting fid.  Those
	 * events are not supported on a mount mark, because they do not
	 * carry enough information (i.e. path) to be filtered by mount
	 * point.
	 */
	fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
	if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
	    (!fid_mode || mark_type == FAN_MARK_MOUNT))
		goto fput_and_out;

	/*
	 * FAN_RENAME uses special info type records to report the old and
	 * new parent+name.  Reporting only old and new parent id is less
	 * useful and was not implemented.
	 */
	if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
		goto fput_and_out;

	/* FAN_MARK_FLUSH needs no path lookup - clear all marks of this type */
	if (flags & FAN_MARK_FLUSH) {
		ret = 0;
		if (mark_type == FAN_MARK_MOUNT)
			fsnotify_clear_vfsmount_marks_by_group(group);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			fsnotify_clear_sb_marks_by_group(group);
		else
			fsnotify_clear_inode_marks_by_group(group);
		goto fput_and_out;
	}

	ret = fanotify_find_path(dfd, pathname, &path, flags,
			(mask & ALL_FSNOTIFY_EVENTS), obj_type);
	if (ret)
		goto fput_and_out;

	if (flags & FAN_MARK_ADD) {
		ret = fanotify_events_supported(&path, mask);
		if (ret)
			goto path_put_and_out;
	}

	/* fid mode requires a usable, encodeable fsid for the marked object */
	if (fid_mode) {
		ret = fanotify_test_fsid(path.dentry, &__fsid);
		if (ret)
			goto path_put_and_out;

		ret = fanotify_test_fid(path.dentry);
		if (ret)
			goto path_put_and_out;

		fsid = &__fsid;
	}

	/* inode held in place by reference to path; group by fget on fd */
	if (mark_type == FAN_MARK_INODE)
		inode = path.dentry->d_inode;
	else
		mnt = path.mnt;

	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
	if (mnt || !S_ISDIR(inode->i_mode)) {
		mask &= ~FAN_EVENT_ON_CHILD;
		umask = FAN_EVENT_ON_CHILD;
		/*
		 * If group needs to report parent fid, register for getting
		 * events with parent/name info for non-directory.
		 */
		if ((fid_mode & FAN_REPORT_DIR_FID) &&
		    (flags & FAN_MARK_ADD) && !ignored)
			mask |= FAN_EVENT_ON_CHILD;
	}

	/* create/update an inode mark */
	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
	case FAN_MARK_ADD:
		if (mark_type == FAN_MARK_MOUNT)
			ret = fanotify_add_vfsmount_mark(group, mnt, mask,
							 flags, fsid);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
						   flags, fsid);
		else
			ret = fanotify_add_inode_mark(group, inode, mask,
						      flags, fsid);
		break;
	case FAN_MARK_REMOVE:
		if (mark_type == FAN_MARK_MOUNT)
			ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
							    flags, umask);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
						      flags, umask);
		else
			ret = fanotify_remove_inode_mark(group, inode, mask,
							 flags, umask);
		break;
	default:
		ret = -EINVAL;
	}

path_put_and_out:
	path_put(&path);
fput_and_out:
	fdput(f);
	return ret;
}
1684 
1685 #ifndef CONFIG_ARCH_SPLIT_ARG64
/* Native entry point: the 64-bit mask fits in a single syscall argument */
SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
			      __u64, mask, int, dfd,
			      const char  __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
}
1692 #endif
1693 
1694 #if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
/*
 * Entry point for ABIs that split the 64-bit mask across two 32-bit
 * syscall arguments (SC_ARG64); SC_VAL64 reassembles the halves.
 */
SYSCALL32_DEFINE6(fanotify_mark,
				int, fanotify_fd, unsigned int, flags,
				SC_ARG64(mask), int, dfd,
				const char  __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask),
				dfd, pathname);
}
1703 #endif
1704 
1705 /*
1706  * fanotify_user_setup - Our initialization function.  Note that we cannot return
1707  * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
1708  * must result in panic().
1709  */
static int __init fanotify_user_setup(void)
{
	struct sysinfo si;
	int max_marks;

	si_meminfo(&si);
	/*
	 * Allow up to 1% of addressable memory to be accounted for per user
	 * marks limited to the range [8192, 1048576]. mount and sb marks are
	 * a lot cheaper than inode marks, but there is no reason for a user
	 * to have many of those, so calculate by the cost of inode marks.
	 */
	max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
		    INODE_MARK_COST;
	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
				     FANOTIFY_DEFAULT_MAX_USER_MARKS);

	/*
	 * Compile-time sanity checks: internal group flags must not overlap
	 * the UAPI init flags, and the flag population counts must match the
	 * flag definitions this file was written against.
	 */
	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);

	/* SLAB_PANIC: see function comment - failure here must not return */
	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
					 SLAB_PANIC|SLAB_ACCOUNT);
	fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
					       SLAB_PANIC);
	fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
						SLAB_PANIC);
	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
		fanotify_perm_event_cachep =
			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
	}

	/* Establish the default queue/group/mark limits in init_user_ns */
	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
					FANOTIFY_DEFAULT_MAX_GROUPS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;

	return 0;
}
1749 device_initcall(fanotify_user_setup);
1750