1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/fanotify.h>
3 #include <linux/fcntl.h>
4 #include <linux/file.h>
5 #include <linux/fs.h>
6 #include <linux/anon_inodes.h>
7 #include <linux/fsnotify_backend.h>
8 #include <linux/init.h>
9 #include <linux/mount.h>
10 #include <linux/namei.h>
11 #include <linux/poll.h>
12 #include <linux/security.h>
13 #include <linux/syscalls.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/uaccess.h>
17 #include <linux/compat.h>
18 #include <linux/sched/signal.h>
19 #include <linux/memcontrol.h>
20 #include <linux/statfs.h>
21 #include <linux/exportfs.h>
22 
23 #include <asm/ioctls.h>
24 
25 #include "../../mount.h"
26 #include "../fdinfo.h"
27 #include "fanotify.h"
28 
/*
 * Built-in default limits.
 * NOTE(review): the code that applies these defaults is not visible in this
 * part of the file - confirm against the init code before relying on them.
 */
#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
#define FANOTIFY_DEFAULT_MAX_GROUPS	128

/*
 * The legacy fanotify marks limit (8192) was per group; a tunable limit of
 * marks per user was introduced later, similar to inotify.  Effectively, the
 * legacy limit of fanotify marks per user is
 * <max marks per group> * <max groups per user>.
 * This default limit (1M) also happens to match the increased limit of inotify
 * max_user_watches since v5.10.
 */
#define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)

/*
 * Most of the memory cost of adding an inode mark is pinning the marked inode.
 * The size of the filesystem inode struct is not uniform across filesystems,
 * so double the size of a VFS inode is used as a conservative approximation.
 */
#define INODE_MARK_COST	(2 * sizeof(struct inode))

/*
 * Configurable via /proc/sys/fs/fanotify/ (exported as "max_queued_events"
 * in fanotify_table when CONFIG_SYSCTL is enabled).
 */
static int fanotify_max_queued_events __read_mostly;
52 
53 #ifdef CONFIG_SYSCTL
54 
55 #include <linux/sysctl.h>
56 
57 static long ft_zero = 0;
58 static long ft_int_max = INT_MAX;
59 
60 struct ctl_table fanotify_table[] = {
61 	{
62 		.procname	= "max_user_groups",
63 		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
64 		.maxlen		= sizeof(long),
65 		.mode		= 0644,
66 		.proc_handler	= proc_doulongvec_minmax,
67 		.extra1		= &ft_zero,
68 		.extra2		= &ft_int_max,
69 	},
70 	{
71 		.procname	= "max_user_marks",
72 		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
73 		.maxlen		= sizeof(long),
74 		.mode		= 0644,
75 		.proc_handler	= proc_doulongvec_minmax,
76 		.extra1		= &ft_zero,
77 		.extra2		= &ft_int_max,
78 	},
79 	{
80 		.procname	= "max_queued_events",
81 		.data		= &fanotify_max_queued_events,
82 		.maxlen		= sizeof(int),
83 		.mode		= 0644,
84 		.proc_handler	= proc_dointvec_minmax,
85 		.extra1		= SYSCTL_ZERO
86 	},
87 	{ }
88 };
89 #endif /* CONFIG_SYSCTL */
90 
/*
 * All flags that may be specified in parameter event_f_flags of fanotify_init.
 *
 * Internal and external open flags are stored together in field f_flags of
 * struct file. Only external open flags shall be allowed in event_f_flags.
 * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
 * excluded.
 */
#define	FANOTIFY_INIT_ALL_EVENT_F_BITS				( \
		O_ACCMODE	| O_APPEND	| O_NONBLOCK	| \
		__O_SYNC	| O_DSYNC	| O_CLOEXEC     | \
		O_LARGEFILE	| O_NOATIME	)

/* Defined outside this file; NOTE(review): presumably the group's event handling ops. */
extern const struct fsnotify_ops fanotify_fsnotify_ops;

/*
 * Slab caches.  fanotify_mark_cache backs the marks allocated in
 * fanotify_add_new_mark(); NOTE(review): the event caches are presumably
 * used by the event allocation code, which is not visible here.
 */
struct kmem_cache *fanotify_mark_cache __read_mostly;
struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
struct kmem_cache *fanotify_path_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;

/* Info records are rounded up to a multiple of this many bytes. */
#define FANOTIFY_EVENT_ALIGN 4
/* Fixed part of an info record: the fid info struct plus the file_handle header. */
#define FANOTIFY_INFO_HDR_LEN \
	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
114 
115 static int fanotify_fid_info_len(int fh_len, int name_len)
116 {
117 	int info_len = fh_len;
118 
119 	if (name_len)
120 		info_len += name_len + 1;
121 
122 	return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN);
123 }
124 
125 static int fanotify_event_info_len(unsigned int fid_mode,
126 				   struct fanotify_event *event)
127 {
128 	struct fanotify_info *info = fanotify_event_info(event);
129 	int dir_fh_len = fanotify_event_dir_fh_len(event);
130 	int fh_len = fanotify_event_object_fh_len(event);
131 	int info_len = 0;
132 	int dot_len = 0;
133 
134 	if (dir_fh_len) {
135 		info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
136 	} else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) {
137 		/*
138 		 * With group flag FAN_REPORT_NAME, if name was not recorded in
139 		 * event on a directory, we will report the name ".".
140 		 */
141 		dot_len = 1;
142 	}
143 
144 	if (fh_len)
145 		info_len += fanotify_fid_info_len(fh_len, dot_len);
146 
147 	return info_len;
148 }
149 
150 /*
151  * Remove an hashed event from merge hash table.
152  */
153 static void fanotify_unhash_event(struct fsnotify_group *group,
154 				  struct fanotify_event *event)
155 {
156 	assert_spin_locked(&group->notification_lock);
157 
158 	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
159 		 group, event, fanotify_event_hash_bucket(group, event));
160 
161 	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
162 		return;
163 
164 	hlist_del_init(&event->merge_list);
165 }
166 
167 /*
168  * Get an fanotify notification event if one exists and is small
169  * enough to fit in "count". Return an error pointer if the count
170  * is not large enough. When permission event is dequeued, its state is
171  * updated accordingly.
172  */
173 static struct fanotify_event *get_one_event(struct fsnotify_group *group,
174 					    size_t count)
175 {
176 	size_t event_size = FAN_EVENT_METADATA_LEN;
177 	struct fanotify_event *event = NULL;
178 	struct fsnotify_event *fsn_event;
179 	unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
180 
181 	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
182 
183 	spin_lock(&group->notification_lock);
184 	fsn_event = fsnotify_peek_first_event(group);
185 	if (!fsn_event)
186 		goto out;
187 
188 	event = FANOTIFY_E(fsn_event);
189 	if (fid_mode)
190 		event_size += fanotify_event_info_len(fid_mode, event);
191 
192 	if (event_size > count) {
193 		event = ERR_PTR(-EINVAL);
194 		goto out;
195 	}
196 
197 	/*
198 	 * Held the notification_lock the whole time, so this is the
199 	 * same event we peeked above.
200 	 */
201 	fsnotify_remove_first_event(group);
202 	if (fanotify_is_perm_event(event->mask))
203 		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
204 	if (fanotify_is_hashed_event(event->mask))
205 		fanotify_unhash_event(group, event);
206 out:
207 	spin_unlock(&group->notification_lock);
208 	return event;
209 }
210 
211 static int create_fd(struct fsnotify_group *group, struct path *path,
212 		     struct file **file)
213 {
214 	int client_fd;
215 	struct file *new_file;
216 
217 	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
218 	if (client_fd < 0)
219 		return client_fd;
220 
221 	/*
222 	 * we need a new file handle for the userspace program so it can read even if it was
223 	 * originally opened O_WRONLY.
224 	 */
225 	new_file = dentry_open(path,
226 			       group->fanotify_data.f_flags | FMODE_NONOTIFY,
227 			       current_cred());
228 	if (IS_ERR(new_file)) {
229 		/*
230 		 * we still send an event even if we can't open the file.  this
231 		 * can happen when say tasks are gone and we try to open their
232 		 * /proc files or we try to open a WRONLY file like in sysfs
233 		 * we just send the errno to userspace since there isn't much
234 		 * else we can do.
235 		 */
236 		put_unused_fd(client_fd);
237 		client_fd = PTR_ERR(new_file);
238 	} else {
239 		*file = new_file;
240 	}
241 
242 	return client_fd;
243 }
244 
245 /*
246  * Finish processing of permission event by setting it to ANSWERED state and
247  * drop group->notification_lock.
248  */
249 static void finish_permission_event(struct fsnotify_group *group,
250 				    struct fanotify_perm_event *event,
251 				    unsigned int response)
252 				    __releases(&group->notification_lock)
253 {
254 	bool destroy = false;
255 
256 	assert_spin_locked(&group->notification_lock);
257 	event->response = response;
258 	if (event->state == FAN_EVENT_CANCELED)
259 		destroy = true;
260 	else
261 		event->state = FAN_EVENT_ANSWERED;
262 	spin_unlock(&group->notification_lock);
263 	if (destroy)
264 		fsnotify_destroy_event(group, &event->fae.fse);
265 }
266 
267 static int process_access_response(struct fsnotify_group *group,
268 				   struct fanotify_response *response_struct)
269 {
270 	struct fanotify_perm_event *event;
271 	int fd = response_struct->fd;
272 	int response = response_struct->response;
273 
274 	pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
275 		 fd, response);
276 	/*
277 	 * make sure the response is valid, if invalid we do nothing and either
278 	 * userspace can send a valid response or we will clean it up after the
279 	 * timeout
280 	 */
281 	switch (response & ~FAN_AUDIT) {
282 	case FAN_ALLOW:
283 	case FAN_DENY:
284 		break;
285 	default:
286 		return -EINVAL;
287 	}
288 
289 	if (fd < 0)
290 		return -EINVAL;
291 
292 	if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
293 		return -EINVAL;
294 
295 	spin_lock(&group->notification_lock);
296 	list_for_each_entry(event, &group->fanotify_data.access_list,
297 			    fae.fse.list) {
298 		if (event->fd != fd)
299 			continue;
300 
301 		list_del_init(&event->fae.fse.list);
302 		finish_permission_event(group, event, response);
303 		wake_up(&group->fanotify_data.access_waitq);
304 		return 0;
305 	}
306 	spin_unlock(&group->notification_lock);
307 
308 	return -ENOENT;
309 }
310 
311 static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
312 			     int info_type, const char *name, size_t name_len,
313 			     char __user *buf, size_t count)
314 {
315 	struct fanotify_event_info_fid info = { };
316 	struct file_handle handle = { };
317 	unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
318 	size_t fh_len = fh ? fh->len : 0;
319 	size_t info_len = fanotify_fid_info_len(fh_len, name_len);
320 	size_t len = info_len;
321 
322 	pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
323 		 __func__, fh_len, name_len, info_len, count);
324 
325 	if (!fh_len)
326 		return 0;
327 
328 	if (WARN_ON_ONCE(len < sizeof(info) || len > count))
329 		return -EFAULT;
330 
331 	/*
332 	 * Copy event info fid header followed by variable sized file handle
333 	 * and optionally followed by variable sized filename.
334 	 */
335 	switch (info_type) {
336 	case FAN_EVENT_INFO_TYPE_FID:
337 	case FAN_EVENT_INFO_TYPE_DFID:
338 		if (WARN_ON_ONCE(name_len))
339 			return -EFAULT;
340 		break;
341 	case FAN_EVENT_INFO_TYPE_DFID_NAME:
342 		if (WARN_ON_ONCE(!name || !name_len))
343 			return -EFAULT;
344 		break;
345 	default:
346 		return -EFAULT;
347 	}
348 
349 	info.hdr.info_type = info_type;
350 	info.hdr.len = len;
351 	info.fsid = *fsid;
352 	if (copy_to_user(buf, &info, sizeof(info)))
353 		return -EFAULT;
354 
355 	buf += sizeof(info);
356 	len -= sizeof(info);
357 	if (WARN_ON_ONCE(len < sizeof(handle)))
358 		return -EFAULT;
359 
360 	handle.handle_type = fh->type;
361 	handle.handle_bytes = fh_len;
362 	if (copy_to_user(buf, &handle, sizeof(handle)))
363 		return -EFAULT;
364 
365 	buf += sizeof(handle);
366 	len -= sizeof(handle);
367 	if (WARN_ON_ONCE(len < fh_len))
368 		return -EFAULT;
369 
370 	/*
371 	 * For an inline fh and inline file name, copy through stack to exclude
372 	 * the copy from usercopy hardening protections.
373 	 */
374 	fh_buf = fanotify_fh_buf(fh);
375 	if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
376 		memcpy(bounce, fh_buf, fh_len);
377 		fh_buf = bounce;
378 	}
379 	if (copy_to_user(buf, fh_buf, fh_len))
380 		return -EFAULT;
381 
382 	buf += fh_len;
383 	len -= fh_len;
384 
385 	if (name_len) {
386 		/* Copy the filename with terminating null */
387 		name_len++;
388 		if (WARN_ON_ONCE(len < name_len))
389 			return -EFAULT;
390 
391 		if (copy_to_user(buf, name, name_len))
392 			return -EFAULT;
393 
394 		buf += name_len;
395 		len -= name_len;
396 	}
397 
398 	/* Pad with 0's */
399 	WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
400 	if (len > 0 && clear_user(buf, len))
401 		return -EFAULT;
402 
403 	return info_len;
404 }
405 
406 static ssize_t copy_event_to_user(struct fsnotify_group *group,
407 				  struct fanotify_event *event,
408 				  char __user *buf, size_t count)
409 {
410 	struct fanotify_event_metadata metadata;
411 	struct path *path = fanotify_event_path(event);
412 	struct fanotify_info *info = fanotify_event_info(event);
413 	unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
414 	struct file *f = NULL;
415 	int ret, fd = FAN_NOFD;
416 	int info_type = 0;
417 
418 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
419 
420 	metadata.event_len = FAN_EVENT_METADATA_LEN +
421 				fanotify_event_info_len(fid_mode, event);
422 	metadata.metadata_len = FAN_EVENT_METADATA_LEN;
423 	metadata.vers = FANOTIFY_METADATA_VERSION;
424 	metadata.reserved = 0;
425 	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
426 	metadata.pid = pid_vnr(event->pid);
427 	/*
428 	 * For an unprivileged listener, event->pid can be used to identify the
429 	 * events generated by the listener process itself, without disclosing
430 	 * the pids of other processes.
431 	 */
432 	if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
433 	    task_tgid(current) != event->pid)
434 		metadata.pid = 0;
435 
436 	/*
437 	 * For now, fid mode is required for an unprivileged listener and
438 	 * fid mode does not report fd in events.  Keep this check anyway
439 	 * for safety in case fid mode requirement is relaxed in the future
440 	 * to allow unprivileged listener to get events with no fd and no fid.
441 	 */
442 	if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
443 	    path && path->mnt && path->dentry) {
444 		fd = create_fd(group, path, &f);
445 		if (fd < 0)
446 			return fd;
447 	}
448 	metadata.fd = fd;
449 
450 	ret = -EFAULT;
451 	/*
452 	 * Sanity check copy size in case get_one_event() and
453 	 * event_len sizes ever get out of sync.
454 	 */
455 	if (WARN_ON_ONCE(metadata.event_len > count))
456 		goto out_close_fd;
457 
458 	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
459 		goto out_close_fd;
460 
461 	buf += FAN_EVENT_METADATA_LEN;
462 	count -= FAN_EVENT_METADATA_LEN;
463 
464 	if (fanotify_is_perm_event(event->mask))
465 		FANOTIFY_PERM(event)->fd = fd;
466 
467 	if (f)
468 		fd_install(fd, f);
469 
470 	/* Event info records order is: dir fid + name, child fid */
471 	if (fanotify_event_dir_fh_len(event)) {
472 		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
473 					     FAN_EVENT_INFO_TYPE_DFID;
474 		ret = copy_info_to_user(fanotify_event_fsid(event),
475 					fanotify_info_dir_fh(info),
476 					info_type, fanotify_info_name(info),
477 					info->name_len, buf, count);
478 		if (ret < 0)
479 			goto out_close_fd;
480 
481 		buf += ret;
482 		count -= ret;
483 	}
484 
485 	if (fanotify_event_object_fh_len(event)) {
486 		const char *dot = NULL;
487 		int dot_len = 0;
488 
489 		if (fid_mode == FAN_REPORT_FID || info_type) {
490 			/*
491 			 * With only group flag FAN_REPORT_FID only type FID is
492 			 * reported. Second info record type is always FID.
493 			 */
494 			info_type = FAN_EVENT_INFO_TYPE_FID;
495 		} else if ((fid_mode & FAN_REPORT_NAME) &&
496 			   (event->mask & FAN_ONDIR)) {
497 			/*
498 			 * With group flag FAN_REPORT_NAME, if name was not
499 			 * recorded in an event on a directory, report the
500 			 * name "." with info type DFID_NAME.
501 			 */
502 			info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
503 			dot = ".";
504 			dot_len = 1;
505 		} else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
506 			   (event->mask & FAN_ONDIR)) {
507 			/*
508 			 * With group flag FAN_REPORT_DIR_FID, a single info
509 			 * record has type DFID for directory entry modification
510 			 * event and for event on a directory.
511 			 */
512 			info_type = FAN_EVENT_INFO_TYPE_DFID;
513 		} else {
514 			/*
515 			 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
516 			 * a single info record has type FID for event on a
517 			 * non-directory, when there is no directory to report.
518 			 * For example, on FAN_DELETE_SELF event.
519 			 */
520 			info_type = FAN_EVENT_INFO_TYPE_FID;
521 		}
522 
523 		ret = copy_info_to_user(fanotify_event_fsid(event),
524 					fanotify_event_object_fh(event),
525 					info_type, dot, dot_len, buf, count);
526 		if (ret < 0)
527 			goto out_close_fd;
528 
529 		buf += ret;
530 		count -= ret;
531 	}
532 
533 	return metadata.event_len;
534 
535 out_close_fd:
536 	if (fd != FAN_NOFD) {
537 		put_unused_fd(fd);
538 		fput(f);
539 	}
540 	return ret;
541 }
542 
/* fanotify userspace file descriptor functions */
544 static __poll_t fanotify_poll(struct file *file, poll_table *wait)
545 {
546 	struct fsnotify_group *group = file->private_data;
547 	__poll_t ret = 0;
548 
549 	poll_wait(file, &group->notification_waitq, wait);
550 	spin_lock(&group->notification_lock);
551 	if (!fsnotify_notify_queue_is_empty(group))
552 		ret = EPOLLIN | EPOLLRDNORM;
553 	spin_unlock(&group->notification_lock);
554 
555 	return ret;
556 }
557 
558 static ssize_t fanotify_read(struct file *file, char __user *buf,
559 			     size_t count, loff_t *pos)
560 {
561 	struct fsnotify_group *group;
562 	struct fanotify_event *event;
563 	char __user *start;
564 	int ret;
565 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
566 
567 	start = buf;
568 	group = file->private_data;
569 
570 	pr_debug("%s: group=%p\n", __func__, group);
571 
572 	add_wait_queue(&group->notification_waitq, &wait);
573 	while (1) {
574 		/*
575 		 * User can supply arbitrarily large buffer. Avoid softlockups
576 		 * in case there are lots of available events.
577 		 */
578 		cond_resched();
579 		event = get_one_event(group, count);
580 		if (IS_ERR(event)) {
581 			ret = PTR_ERR(event);
582 			break;
583 		}
584 
585 		if (!event) {
586 			ret = -EAGAIN;
587 			if (file->f_flags & O_NONBLOCK)
588 				break;
589 
590 			ret = -ERESTARTSYS;
591 			if (signal_pending(current))
592 				break;
593 
594 			if (start != buf)
595 				break;
596 
597 			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
598 			continue;
599 		}
600 
601 		ret = copy_event_to_user(group, event, buf, count);
602 		if (unlikely(ret == -EOPENSTALE)) {
603 			/*
604 			 * We cannot report events with stale fd so drop it.
605 			 * Setting ret to 0 will continue the event loop and
606 			 * do the right thing if there are no more events to
607 			 * read (i.e. return bytes read, -EAGAIN or wait).
608 			 */
609 			ret = 0;
610 		}
611 
612 		/*
613 		 * Permission events get queued to wait for response.  Other
614 		 * events can be destroyed now.
615 		 */
616 		if (!fanotify_is_perm_event(event->mask)) {
617 			fsnotify_destroy_event(group, &event->fse);
618 		} else {
619 			if (ret <= 0) {
620 				spin_lock(&group->notification_lock);
621 				finish_permission_event(group,
622 					FANOTIFY_PERM(event), FAN_DENY);
623 				wake_up(&group->fanotify_data.access_waitq);
624 			} else {
625 				spin_lock(&group->notification_lock);
626 				list_add_tail(&event->fse.list,
627 					&group->fanotify_data.access_list);
628 				spin_unlock(&group->notification_lock);
629 			}
630 		}
631 		if (ret < 0)
632 			break;
633 		buf += ret;
634 		count -= ret;
635 	}
636 	remove_wait_queue(&group->notification_waitq, &wait);
637 
638 	if (start != buf && ret != -EFAULT)
639 		ret = buf - start;
640 	return ret;
641 }
642 
643 static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
644 {
645 	struct fanotify_response response = { .fd = -1, .response = -1 };
646 	struct fsnotify_group *group;
647 	int ret;
648 
649 	if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
650 		return -EINVAL;
651 
652 	group = file->private_data;
653 
654 	if (count < sizeof(response))
655 		return -EINVAL;
656 
657 	count = sizeof(response);
658 
659 	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
660 
661 	if (copy_from_user(&response, buf, count))
662 		return -EFAULT;
663 
664 	ret = process_access_response(group, &response);
665 	if (ret < 0)
666 		count = ret;
667 
668 	return count;
669 }
670 
671 static int fanotify_release(struct inode *ignored, struct file *file)
672 {
673 	struct fsnotify_group *group = file->private_data;
674 	struct fsnotify_event *fsn_event;
675 
676 	/*
677 	 * Stop new events from arriving in the notification queue. since
678 	 * userspace cannot use fanotify fd anymore, no event can enter or
679 	 * leave access_list by now either.
680 	 */
681 	fsnotify_group_stop_queueing(group);
682 
683 	/*
684 	 * Process all permission events on access_list and notification queue
685 	 * and simulate reply from userspace.
686 	 */
687 	spin_lock(&group->notification_lock);
688 	while (!list_empty(&group->fanotify_data.access_list)) {
689 		struct fanotify_perm_event *event;
690 
691 		event = list_first_entry(&group->fanotify_data.access_list,
692 				struct fanotify_perm_event, fae.fse.list);
693 		list_del_init(&event->fae.fse.list);
694 		finish_permission_event(group, event, FAN_ALLOW);
695 		spin_lock(&group->notification_lock);
696 	}
697 
698 	/*
699 	 * Destroy all non-permission events. For permission events just
700 	 * dequeue them and set the response. They will be freed once the
701 	 * response is consumed and fanotify_get_response() returns.
702 	 */
703 	while ((fsn_event = fsnotify_remove_first_event(group))) {
704 		struct fanotify_event *event = FANOTIFY_E(fsn_event);
705 
706 		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
707 			spin_unlock(&group->notification_lock);
708 			fsnotify_destroy_event(group, fsn_event);
709 		} else {
710 			finish_permission_event(group, FANOTIFY_PERM(event),
711 						FAN_ALLOW);
712 		}
713 		spin_lock(&group->notification_lock);
714 	}
715 	spin_unlock(&group->notification_lock);
716 
717 	/* Response for all permission events it set, wakeup waiters */
718 	wake_up(&group->fanotify_data.access_waitq);
719 
720 	/* matches the fanotify_init->fsnotify_alloc_group */
721 	fsnotify_destroy_group(group);
722 
723 	return 0;
724 }
725 
726 static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
727 {
728 	struct fsnotify_group *group;
729 	struct fsnotify_event *fsn_event;
730 	void __user *p;
731 	int ret = -ENOTTY;
732 	size_t send_len = 0;
733 
734 	group = file->private_data;
735 
736 	p = (void __user *) arg;
737 
738 	switch (cmd) {
739 	case FIONREAD:
740 		spin_lock(&group->notification_lock);
741 		list_for_each_entry(fsn_event, &group->notification_list, list)
742 			send_len += FAN_EVENT_METADATA_LEN;
743 		spin_unlock(&group->notification_lock);
744 		ret = put_user(send_len, (int __user *) p);
745 		break;
746 	}
747 
748 	return ret;
749 }
750 
751 static const struct file_operations fanotify_fops = {
752 	.show_fdinfo	= fanotify_show_fdinfo,
753 	.poll		= fanotify_poll,
754 	.read		= fanotify_read,
755 	.write		= fanotify_write,
756 	.fasync		= NULL,
757 	.release	= fanotify_release,
758 	.unlocked_ioctl	= fanotify_ioctl,
759 	.compat_ioctl	= compat_ptr_ioctl,
760 	.llseek		= noop_llseek,
761 };
762 
763 static int fanotify_find_path(int dfd, const char __user *filename,
764 			      struct path *path, unsigned int flags, __u64 mask,
765 			      unsigned int obj_type)
766 {
767 	int ret;
768 
769 	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
770 		 dfd, filename, flags);
771 
772 	if (filename == NULL) {
773 		struct fd f = fdget(dfd);
774 
775 		ret = -EBADF;
776 		if (!f.file)
777 			goto out;
778 
779 		ret = -ENOTDIR;
780 		if ((flags & FAN_MARK_ONLYDIR) &&
781 		    !(S_ISDIR(file_inode(f.file)->i_mode))) {
782 			fdput(f);
783 			goto out;
784 		}
785 
786 		*path = f.file->f_path;
787 		path_get(path);
788 		fdput(f);
789 	} else {
790 		unsigned int lookup_flags = 0;
791 
792 		if (!(flags & FAN_MARK_DONT_FOLLOW))
793 			lookup_flags |= LOOKUP_FOLLOW;
794 		if (flags & FAN_MARK_ONLYDIR)
795 			lookup_flags |= LOOKUP_DIRECTORY;
796 
797 		ret = user_path_at(dfd, filename, lookup_flags, path);
798 		if (ret)
799 			goto out;
800 	}
801 
802 	/* you can only watch an inode if you have read permissions on it */
803 	ret = path_permission(path, MAY_READ);
804 	if (ret) {
805 		path_put(path);
806 		goto out;
807 	}
808 
809 	ret = security_path_notify(path, mask, obj_type);
810 	if (ret)
811 		path_put(path);
812 
813 out:
814 	return ret;
815 }
816 
817 static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
818 					    __u32 mask, unsigned int flags,
819 					    __u32 umask, int *destroy)
820 {
821 	__u32 oldmask = 0;
822 
823 	/* umask bits cannot be removed by user */
824 	mask &= ~umask;
825 	spin_lock(&fsn_mark->lock);
826 	if (!(flags & FAN_MARK_IGNORED_MASK)) {
827 		oldmask = fsn_mark->mask;
828 		fsn_mark->mask &= ~mask;
829 	} else {
830 		fsn_mark->ignored_mask &= ~mask;
831 	}
832 	/*
833 	 * We need to keep the mark around even if remaining mask cannot
834 	 * result in any events (e.g. mask == FAN_ONDIR) to support incremenal
835 	 * changes to the mask.
836 	 * Destroy mark when only umask bits remain.
837 	 */
838 	*destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
839 	spin_unlock(&fsn_mark->lock);
840 
841 	return mask & oldmask;
842 }
843 
844 static int fanotify_remove_mark(struct fsnotify_group *group,
845 				fsnotify_connp_t *connp, __u32 mask,
846 				unsigned int flags, __u32 umask)
847 {
848 	struct fsnotify_mark *fsn_mark = NULL;
849 	__u32 removed;
850 	int destroy_mark;
851 
852 	mutex_lock(&group->mark_mutex);
853 	fsn_mark = fsnotify_find_mark(connp, group);
854 	if (!fsn_mark) {
855 		mutex_unlock(&group->mark_mutex);
856 		return -ENOENT;
857 	}
858 
859 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
860 						 umask, &destroy_mark);
861 	if (removed & fsnotify_conn_mask(fsn_mark->connector))
862 		fsnotify_recalc_mask(fsn_mark->connector);
863 	if (destroy_mark)
864 		fsnotify_detach_mark(fsn_mark);
865 	mutex_unlock(&group->mark_mutex);
866 	if (destroy_mark)
867 		fsnotify_free_mark(fsn_mark);
868 
869 	/* matches the fsnotify_find_mark() */
870 	fsnotify_put_mark(fsn_mark);
871 	return 0;
872 }
873 
874 static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
875 					 struct vfsmount *mnt, __u32 mask,
876 					 unsigned int flags, __u32 umask)
877 {
878 	return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
879 				    mask, flags, umask);
880 }
881 
882 static int fanotify_remove_sb_mark(struct fsnotify_group *group,
883 				   struct super_block *sb, __u32 mask,
884 				   unsigned int flags, __u32 umask)
885 {
886 	return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask,
887 				    flags, umask);
888 }
889 
890 static int fanotify_remove_inode_mark(struct fsnotify_group *group,
891 				      struct inode *inode, __u32 mask,
892 				      unsigned int flags, __u32 umask)
893 {
894 	return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask,
895 				    flags, umask);
896 }
897 
898 static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
899 				       __u32 mask,
900 				       unsigned int flags)
901 {
902 	__u32 oldmask = -1;
903 
904 	spin_lock(&fsn_mark->lock);
905 	if (!(flags & FAN_MARK_IGNORED_MASK)) {
906 		oldmask = fsn_mark->mask;
907 		fsn_mark->mask |= mask;
908 	} else {
909 		fsn_mark->ignored_mask |= mask;
910 		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
911 			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
912 	}
913 	spin_unlock(&fsn_mark->lock);
914 
915 	return mask & ~oldmask;
916 }
917 
918 static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
919 						   fsnotify_connp_t *connp,
920 						   unsigned int type,
921 						   __kernel_fsid_t *fsid)
922 {
923 	struct ucounts *ucounts = group->fanotify_data.ucounts;
924 	struct fsnotify_mark *mark;
925 	int ret;
926 
927 	/*
928 	 * Enforce per user marks limits per user in all containing user ns.
929 	 * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
930 	 * in the limited groups account.
931 	 */
932 	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
933 	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
934 		return ERR_PTR(-ENOSPC);
935 
936 	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
937 	if (!mark) {
938 		ret = -ENOMEM;
939 		goto out_dec_ucounts;
940 	}
941 
942 	fsnotify_init_mark(mark, group);
943 	ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
944 	if (ret) {
945 		fsnotify_put_mark(mark);
946 		goto out_dec_ucounts;
947 	}
948 
949 	return mark;
950 
951 out_dec_ucounts:
952 	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
953 		dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
954 	return ERR_PTR(ret);
955 }
956 
957 
958 static int fanotify_add_mark(struct fsnotify_group *group,
959 			     fsnotify_connp_t *connp, unsigned int type,
960 			     __u32 mask, unsigned int flags,
961 			     __kernel_fsid_t *fsid)
962 {
963 	struct fsnotify_mark *fsn_mark;
964 	__u32 added;
965 
966 	mutex_lock(&group->mark_mutex);
967 	fsn_mark = fsnotify_find_mark(connp, group);
968 	if (!fsn_mark) {
969 		fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
970 		if (IS_ERR(fsn_mark)) {
971 			mutex_unlock(&group->mark_mutex);
972 			return PTR_ERR(fsn_mark);
973 		}
974 	}
975 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
976 	if (added & ~fsnotify_conn_mask(fsn_mark->connector))
977 		fsnotify_recalc_mask(fsn_mark->connector);
978 	mutex_unlock(&group->mark_mutex);
979 
980 	fsnotify_put_mark(fsn_mark);
981 	return 0;
982 }
983 
984 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
985 				      struct vfsmount *mnt, __u32 mask,
986 				      unsigned int flags, __kernel_fsid_t *fsid)
987 {
988 	return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
989 				 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
990 }
991 
992 static int fanotify_add_sb_mark(struct fsnotify_group *group,
993 				struct super_block *sb, __u32 mask,
994 				unsigned int flags, __kernel_fsid_t *fsid)
995 {
996 	return fanotify_add_mark(group, &sb->s_fsnotify_marks,
997 				 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
998 }
999 
1000 static int fanotify_add_inode_mark(struct fsnotify_group *group,
1001 				   struct inode *inode, __u32 mask,
1002 				   unsigned int flags, __kernel_fsid_t *fsid)
1003 {
1004 	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
1005 
1006 	/*
1007 	 * If some other task has this inode open for write we should not add
1008 	 * an ignored mark, unless that ignored mark is supposed to survive
1009 	 * modification changes anyway.
1010 	 */
1011 	if ((flags & FAN_MARK_IGNORED_MASK) &&
1012 	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
1013 	    inode_is_open_for_write(inode))
1014 		return 0;
1015 
1016 	return fanotify_add_mark(group, &inode->i_fsnotify_marks,
1017 				 FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
1018 }
1019 
1020 static struct fsnotify_event *fanotify_alloc_overflow_event(void)
1021 {
1022 	struct fanotify_event *oevent;
1023 
1024 	oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
1025 	if (!oevent)
1026 		return NULL;
1027 
1028 	fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
1029 	oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;
1030 
1031 	return &oevent->fse;
1032 }
1033 
1034 static struct hlist_head *fanotify_alloc_merge_hash(void)
1035 {
1036 	struct hlist_head *hash;
1037 
1038 	hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
1039 		       GFP_KERNEL_ACCOUNT);
1040 	if (!hash)
1041 		return NULL;
1042 
1043 	__hash_init(hash, FANOTIFY_HTABLE_SIZE);
1044 
1045 	return hash;
1046 }
1047 
/* fanotify syscalls */
/*
 * fanotify_init - create an fanotify group and return a file descriptor
 * that refers to it.
 * @flags:         FAN_* initialisation flags: notification class, fid
 *                 reporting mode, FAN_CLOEXEC/FAN_NONBLOCK, unlimited
 *                 queue/marks and (with CONFIG_AUDITSYSCALL) FAN_ENABLE_AUDIT.
 * @event_f_flags: file status flags recorded in the group's
 *                 fanotify_data.f_flags.
 *
 * Returns the new file descriptor on success or a negative errno:
 *   -EPERM   caller lacks CAP_SYS_ADMIN and asked for admin-only flags or
 *            did not request a fid mode, or lacks the capability required
 *            by FAN_UNLIMITED_QUEUE/FAN_UNLIMITED_MARKS/FAN_ENABLE_AUDIT;
 *   -EINVAL  unknown bits in @flags or @event_f_flags, invalid access mode,
 *            fid mode combined with a non FAN_CLASS_NOTIF class, or
 *            FAN_REPORT_NAME without FAN_REPORT_DIR_FID;
 *   -EMFILE  the per-user limit on fanotify groups was hit;
 *   -ENOMEM  the merge hash or the overflow event could not be allocated;
 *   or the error returned by fsnotify_alloc_user_group() or
 *   anon_inode_getfd().
 */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
	struct fsnotify_group *group;
	int f_flags, fd;
	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
	unsigned int class = flags & FANOTIFY_CLASS_BITS;
	unsigned int internal_flags = 0;

	pr_debug("%s: flags=%x event_f_flags=%x\n",
		 __func__, flags, event_f_flags);

	if (!capable(CAP_SYS_ADMIN)) {
		/*
		 * An unprivileged user can setup an fanotify group with
		 * limited functionality - an unprivileged group is limited to
		 * notification events with file handles and it cannot use
		 * unlimited queue/marks.
		 */
		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
			return -EPERM;

		/*
		 * Setting the internal flag FANOTIFY_UNPRIV on the group
		 * prevents setting mount/filesystem marks on this group and
		 * prevents reporting pid and open fd in events.
		 */
		internal_flags |= FANOTIFY_UNPRIV;
	}

	/*
	 * FAN_ENABLE_AUDIT is only an accepted flag on kernels built with
	 * CONFIG_AUDITSYSCALL; otherwise it is rejected like any unknown bit.
	 */
#ifdef CONFIG_AUDITSYSCALL
	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
#else
	if (flags & ~FANOTIFY_INIT_FLAGS)
#endif
		return -EINVAL;

	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
		return -EINVAL;

	/* Only the three regular access modes are accepted. */
	switch (event_f_flags & O_ACCMODE) {
	case O_RDONLY:
	case O_RDWR:
	case O_WRONLY:
		break;
	default:
		return -EINVAL;
	}

	/* A fid reporting mode is only allowed together with FAN_CLASS_NOTIF. */
	if (fid_mode && class != FAN_CLASS_NOTIF)
		return -EINVAL;

	/*
	 * Child name is reported with parent fid so requires dir fid.
	 * We can report both child fid and dir fid with or without name.
	 */
	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
		return -EINVAL;

	/*
	 * f_flags are the flags of the fanotify fd itself (passed to
	 * anon_inode_getfd() below); @event_f_flags are stored separately in
	 * the group.
	 */
	f_flags = O_RDWR | FMODE_NONOTIFY;
	if (flags & FAN_CLOEXEC)
		f_flags |= O_CLOEXEC;
	if (flags & FAN_NONBLOCK)
		f_flags |= O_NONBLOCK;

	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
	group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
	if (IS_ERR(group)) {
		return PTR_ERR(group);
	}

	/* Enforce groups limits per user in all containing user ns */
	group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
						  current_euid(),
						  UCOUNT_FANOTIFY_GROUPS);
	if (!group->fanotify_data.ucounts) {
		fd = -EMFILE;
		goto out_destroy_group;
	}

	group->fanotify_data.flags = flags | internal_flags;
	group->memcg = get_mem_cgroup_from_mm(current->mm);

	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
	if (!group->fanotify_data.merge_hash) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	group->overflow_event = fanotify_alloc_overflow_event();
	if (unlikely(!group->overflow_event)) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	if (force_o_largefile())
		event_f_flags |= O_LARGEFILE;
	group->fanotify_data.f_flags = event_f_flags;
	init_waitqueue_head(&group->fanotify_data.access_waitq);
	INIT_LIST_HEAD(&group->fanotify_data.access_list);
	/* Map the requested notification class to the group's priority. */
	switch (class) {
	case FAN_CLASS_NOTIF:
		group->priority = FS_PRIO_0;
		break;
	case FAN_CLASS_CONTENT:
		group->priority = FS_PRIO_1;
		break;
	case FAN_CLASS_PRE_CONTENT:
		group->priority = FS_PRIO_2;
		break;
	default:
		fd = -EINVAL;
		goto out_destroy_group;
	}

	/*
	 * Without FAN_UNLIMITED_QUEUE the queue length is bounded by
	 * fanotify_max_queued_events (configurable via
	 * /proc/sys/fs/fanotify/).
	 */
	if (flags & FAN_UNLIMITED_QUEUE) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
		group->max_events = UINT_MAX;
	} else {
		group->max_events = fanotify_max_queued_events;
	}

	if (flags & FAN_UNLIMITED_MARKS) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
	}

	if (flags & FAN_ENABLE_AUDIT) {
		fd = -EPERM;
		if (!capable(CAP_AUDIT_WRITE))
			goto out_destroy_group;
	}

	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
	if (fd < 0)
		goto out_destroy_group;

	return fd;

	/* Common error exit: fd holds the negative errno to return. */
out_destroy_group:
	fsnotify_destroy_group(group);
	return fd;
}
1194 
1195 /* Check if filesystem can encode a unique fid */
1196 static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
1197 {
1198 	__kernel_fsid_t root_fsid;
1199 	int err;
1200 
1201 	/*
1202 	 * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
1203 	 */
1204 	err = vfs_get_fsid(path->dentry, fsid);
1205 	if (err)
1206 		return err;
1207 
1208 	if (!fsid->val[0] && !fsid->val[1])
1209 		return -ENODEV;
1210 
1211 	/*
1212 	 * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
1213 	 * which uses a different fsid than sb root.
1214 	 */
1215 	err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
1216 	if (err)
1217 		return err;
1218 
1219 	if (root_fsid.val[0] != fsid->val[0] ||
1220 	    root_fsid.val[1] != fsid->val[1])
1221 		return -EXDEV;
1222 
1223 	/*
1224 	 * We need to make sure that the file system supports at least
1225 	 * encoding a file handle so user can use name_to_handle_at() to
1226 	 * compare fid returned with event to the file handle of watched
1227 	 * objects. However, name_to_handle_at() requires that the
1228 	 * filesystem also supports decoding file handles.
1229 	 */
1230 	if (!path->dentry->d_sb->s_export_op ||
1231 	    !path->dentry->d_sb->s_export_op->fh_to_dentry)
1232 		return -EOPNOTSUPP;
1233 
1234 	return 0;
1235 }
1236 
1237 static int fanotify_events_supported(struct path *path, __u64 mask)
1238 {
1239 	/*
1240 	 * Some filesystems such as 'proc' acquire unusual locks when opening
1241 	 * files. For them fanotify permission events have high chances of
1242 	 * deadlocking the system - open done when reporting fanotify event
1243 	 * blocks on this "unusual" lock while another process holding the lock
1244 	 * waits for fanotify permission event to be answered. Just disallow
1245 	 * permission events for such filesystems.
1246 	 */
1247 	if (mask & FANOTIFY_PERM_EVENTS &&
1248 	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
1249 		return -EINVAL;
1250 	return 0;
1251 }
1252 
/*
 * do_fanotify_mark - common implementation behind the fanotify_mark
 * syscall entry points.
 * @fanotify_fd: file descriptor that must refer to an fanotify group
 * @flags:       FAN_MARK_* flags: exactly one of ADD/REMOVE/FLUSH, the mark
 *               type (inode, mount or filesystem) and modifiers
 * @mask:        event mask; only the lower 32 bits may be set
 * @dfd:         passed together with @pathname to fanotify_find_path()
 * @pathname:    user pointer to the path of the object to mark
 *
 * For FAN_MARK_FLUSH all marks of the selected type are cleared from the
 * group and the path is not looked up.  Otherwise the path is resolved and a
 * mark is added to or removed from the inode, mount or super block.
 *
 * Returns 0 for a flush, the result of the add/remove helper otherwise, or
 * a negative errno: -EINVAL for invalid arguments or an fd that is not an
 * fanotify fd, -EBADF for a bad @fanotify_fd, -EPERM for mount/filesystem
 * marks without the required privilege, or an error from
 * fanotify_find_path(), fanotify_events_supported() or fanotify_test_fid().
 */
static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
			    int dfd, const char  __user *pathname)
{
	struct inode *inode = NULL;
	struct vfsmount *mnt = NULL;
	struct fsnotify_group *group;
	struct fd f;
	struct path path;
	__kernel_fsid_t __fsid, *fsid = NULL;
	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	bool ignored = flags & FAN_MARK_IGNORED_MASK;
	unsigned int obj_type, fid_mode;
	u32 umask = 0;
	int ret;

	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
		 __func__, fanotify_fd, flags, dfd, pathname, mask);

	/* we only use the lower 32 bits as of right now. */
	if (upper_32_bits(mask))
		return -EINVAL;

	if (flags & ~FANOTIFY_MARK_FLAGS)
		return -EINVAL;

	/* Translate the fanotify mark type to the fsnotify object type. */
	switch (mark_type) {
	case FAN_MARK_INODE:
		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
		break;
	case FAN_MARK_MOUNT:
		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
		break;
	case FAN_MARK_FILESYSTEM:
		obj_type = FSNOTIFY_OBJ_TYPE_SB;
		break;
	default:
		return -EINVAL;
	}

	/*
	 * Exactly one of ADD, REMOVE and FLUSH must be given.  ADD and REMOVE
	 * need a non-empty mask; FLUSH accepts no flags other than the mark
	 * type.
	 */
	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
	case FAN_MARK_ADD:
	case FAN_MARK_REMOVE:
		if (!mask)
			return -EINVAL;
		break;
	case FAN_MARK_FLUSH:
		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	/* Permission events are only valid when support is compiled in. */
	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
		valid_mask |= FANOTIFY_PERM_EVENTS;

	if (mask & ~valid_mask)
		return -EINVAL;

	/* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
	if (ignored)
		mask &= ~FANOTIFY_EVENT_FLAGS;

	f = fdget(fanotify_fd);
	if (unlikely(!f.file))
		return -EBADF;

	/* verify that this is indeed an fanotify instance */
	ret = -EINVAL;
	if (unlikely(f.file->f_op != &fanotify_fops))
		goto fput_and_out;
	group = f.file->private_data;

	/*
	 * An unprivileged user is not allowed to setup mount nor filesystem
	 * marks.  This also includes setting up such marks by a group that
	 * was initialized by an unprivileged user.
	 */
	ret = -EPERM;
	if ((!capable(CAP_SYS_ADMIN) ||
	     FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
	    mark_type != FAN_MARK_INODE)
		goto fput_and_out;

	/*
	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
	 * allowed to set permissions events.
	 */
	ret = -EINVAL;
	if (mask & FANOTIFY_PERM_EVENTS &&
	    group->priority == FS_PRIO_0)
		goto fput_and_out;

	/*
	 * Events with data type inode do not carry enough information to report
	 * event->fd, so we do not allow setting a mask for inode events unless
	 * group supports reporting fid.
	 * inode events are not supported on a mount mark, because they do not
	 * carry enough information (i.e. path) to be filtered by mount point.
	 */
	fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
	if (mask & FANOTIFY_INODE_EVENTS &&
	    (!fid_mode || mark_type == FAN_MARK_MOUNT))
		goto fput_and_out;

	/* Flush clears all marks of the given type; no path is needed. */
	if (flags & FAN_MARK_FLUSH) {
		ret = 0;
		if (mark_type == FAN_MARK_MOUNT)
			fsnotify_clear_vfsmount_marks_by_group(group);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			fsnotify_clear_sb_marks_by_group(group);
		else
			fsnotify_clear_inode_marks_by_group(group);
		goto fput_and_out;
	}

	ret = fanotify_find_path(dfd, pathname, &path, flags,
			(mask & ALL_FSNOTIFY_EVENTS), obj_type);
	if (ret)
		goto fput_and_out;

	/* The filesystem restriction only applies when adding events. */
	if (flags & FAN_MARK_ADD) {
		ret = fanotify_events_supported(&path, mask);
		if (ret)
			goto path_put_and_out;
	}

	/*
	 * Groups reporting fids need a filesystem that can encode a unique
	 * fid; the fsid obtained here is handed to the add helpers below.
	 */
	if (fid_mode) {
		ret = fanotify_test_fid(&path, &__fsid);
		if (ret)
			goto path_put_and_out;

		fsid = &__fsid;
	}

	/* inode held in place by reference to path; group by fget on fd */
	if (mark_type == FAN_MARK_INODE)
		inode = path.dentry->d_inode;
	else
		mnt = path.mnt;

	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
	if (mnt || !S_ISDIR(inode->i_mode)) {
		mask &= ~FAN_EVENT_ON_CHILD;
		/* umask is passed on to the remove helpers below. */
		umask = FAN_EVENT_ON_CHILD;
		/*
		 * If group needs to report parent fid, register for getting
		 * events with parent/name info for non-directory.
		 */
		if ((fid_mode & FAN_REPORT_DIR_FID) &&
		    (flags & FAN_MARK_ADD) && !ignored)
			mask |= FAN_EVENT_ON_CHILD;
	}

	/* create/update an inode mark */
	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
	case FAN_MARK_ADD:
		if (mark_type == FAN_MARK_MOUNT)
			ret = fanotify_add_vfsmount_mark(group, mnt, mask,
							 flags, fsid);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
						   flags, fsid);
		else
			ret = fanotify_add_inode_mark(group, inode, mask,
						      flags, fsid);
		break;
	case FAN_MARK_REMOVE:
		if (mark_type == FAN_MARK_MOUNT)
			ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
							    flags, umask);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
						      flags, umask);
		else
			ret = fanotify_remove_inode_mark(group, inode, mask,
							 flags, umask);
		break;
	default:
		ret = -EINVAL;
	}

	/* Release the path reference, then the fanotify fd reference. */
path_put_and_out:
	path_put(&path);
fput_and_out:
	fdput(f);
	return ret;
}
1442 
#ifndef CONFIG_ARCH_SPLIT_ARG64
/*
 * fanotify_mark syscall entry point used when CONFIG_ARCH_SPLIT_ARG64 is not
 * set: the 64-bit @mask arrives as a single argument and all arguments are
 * handed unchanged to do_fanotify_mark().
 */
SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
			      __u64, mask, int, dfd,
			      const char  __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
}
#endif
1451 
#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
/*
 * fanotify_mark entry point built for CONFIG_ARCH_SPLIT_ARG64 and
 * CONFIG_COMPAT.  @mask is declared with SC_ARG64() and turned back into a
 * __u64 with SC_VAL64() (presumably from two 32-bit halves - see the macro
 * definitions) before calling do_fanotify_mark().
 */
SYSCALL32_DEFINE6(fanotify_mark,
				int, fanotify_fd, unsigned int, flags,
				SC_ARG64(mask), int, dfd,
				const char  __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask),
				dfd, pathname);
}
#endif
1462 
1463 /*
1464  * fanotify_user_setup - Our initialization function.  Note that we cannot return
1465  * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
1466  * must result in panic().
1467  */
1468 static int __init fanotify_user_setup(void)
1469 {
1470 	struct sysinfo si;
1471 	int max_marks;
1472 
1473 	si_meminfo(&si);
1474 	/*
1475 	 * Allow up to 1% of addressable memory to be accounted for per user
1476 	 * marks limited to the range [8192, 1048576]. mount and sb marks are
1477 	 * a lot cheaper than inode marks, but there is no reason for a user
1478 	 * to have many of those, so calculate by the cost of inode marks.
1479 	 */
1480 	max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
1481 		    INODE_MARK_COST;
1482 	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
1483 				     FANOTIFY_DEFAULT_MAX_USER_MARKS);
1484 
1485 	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
1486 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
1487 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
1488 
1489 	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
1490 					 SLAB_PANIC|SLAB_ACCOUNT);
1491 	fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
1492 					       SLAB_PANIC);
1493 	fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
1494 						SLAB_PANIC);
1495 	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
1496 		fanotify_perm_event_cachep =
1497 			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
1498 	}
1499 
1500 	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
1501 	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
1502 					FANOTIFY_DEFAULT_MAX_GROUPS;
1503 	init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
1504 
1505 	return 0;
1506 }
1507 device_initcall(fanotify_user_setup);
1508