xref: /openbmc/linux/kernel/bpf/task_iter.c (revision f2d8e15b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2020 Facebook */
3 
#include <linux/init.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/btf_ids.h>
#include <linux/sched/mm.h>
#include "mmap_unlock_work.h"
12 
/* State shared by every task-based iterator in this file. */
struct bpf_iter_seq_task_common {
	/* pid namespace taken in init_seq_pidns(), dropped in fini_seq_pidns() */
	struct pid_namespace *ns;
};
16 
17 struct bpf_iter_seq_task_info {
18 	/* The first field must be struct bpf_iter_seq_task_common.
19 	 * this is assumed by {init, fini}_seq_pidns() callback functions.
20 	 */
21 	struct bpf_iter_seq_task_common common;
22 	u32 tid;
23 };
24 
25 static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
26 					     u32 *tid,
27 					     bool skip_if_dup_files)
28 {
29 	struct task_struct *task = NULL;
30 	struct pid *pid;
31 
32 	rcu_read_lock();
33 retry:
34 	pid = find_ge_pid(*tid, ns);
35 	if (pid) {
36 		*tid = pid_nr_ns(pid, ns);
37 		task = get_pid_task(pid, PIDTYPE_PID);
38 		if (!task) {
39 			++*tid;
40 			goto retry;
41 		} else if (skip_if_dup_files && !thread_group_leader(task) &&
42 			   task->files == task->group_leader->files) {
43 			put_task_struct(task);
44 			task = NULL;
45 			++*tid;
46 			goto retry;
47 		}
48 	}
49 	rcu_read_unlock();
50 
51 	return task;
52 }
53 
54 static void *task_seq_start(struct seq_file *seq, loff_t *pos)
55 {
56 	struct bpf_iter_seq_task_info *info = seq->private;
57 	struct task_struct *task;
58 
59 	task = task_seq_get_next(info->common.ns, &info->tid, false);
60 	if (!task)
61 		return NULL;
62 
63 	if (*pos == 0)
64 		++*pos;
65 	return task;
66 }
67 
68 static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
69 {
70 	struct bpf_iter_seq_task_info *info = seq->private;
71 	struct task_struct *task;
72 
73 	++*pos;
74 	++info->tid;
75 	put_task_struct((struct task_struct *)v);
76 	task = task_seq_get_next(info->common.ns, &info->tid, false);
77 	if (!task)
78 		return NULL;
79 
80 	return task;
81 }
82 
83 struct bpf_iter__task {
84 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
85 	__bpf_md_ptr(struct task_struct *, task);
86 };
87 
88 DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
89 
90 static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
91 			   bool in_stop)
92 {
93 	struct bpf_iter_meta meta;
94 	struct bpf_iter__task ctx;
95 	struct bpf_prog *prog;
96 
97 	meta.seq = seq;
98 	prog = bpf_iter_get_info(&meta, in_stop);
99 	if (!prog)
100 		return 0;
101 
102 	ctx.meta = &meta;
103 	ctx.task = task;
104 	return bpf_iter_run_prog(prog, &ctx);
105 }
106 
107 static int task_seq_show(struct seq_file *seq, void *v)
108 {
109 	return __task_seq_show(seq, v, false);
110 }
111 
112 static void task_seq_stop(struct seq_file *seq, void *v)
113 {
114 	if (!v)
115 		(void)__task_seq_show(seq, v, true);
116 	else
117 		put_task_struct((struct task_struct *)v);
118 }
119 
120 static const struct seq_operations task_seq_ops = {
121 	.start	= task_seq_start,
122 	.next	= task_seq_next,
123 	.stop	= task_seq_stop,
124 	.show	= task_seq_show,
125 };
126 
127 struct bpf_iter_seq_task_file_info {
128 	/* The first field must be struct bpf_iter_seq_task_common.
129 	 * this is assumed by {init, fini}_seq_pidns() callback functions.
130 	 */
131 	struct bpf_iter_seq_task_common common;
132 	struct task_struct *task;
133 	u32 tid;
134 	u32 fd;
135 };
136 
137 static struct file *
138 task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
139 {
140 	struct pid_namespace *ns = info->common.ns;
141 	u32 curr_tid = info->tid;
142 	struct task_struct *curr_task;
143 	unsigned int curr_fd = info->fd;
144 
145 	/* If this function returns a non-NULL file object,
146 	 * it held a reference to the task/file.
147 	 * Otherwise, it does not hold any reference.
148 	 */
149 again:
150 	if (info->task) {
151 		curr_task = info->task;
152 		curr_fd = info->fd;
153 	} else {
154                 curr_task = task_seq_get_next(ns, &curr_tid, true);
155                 if (!curr_task) {
156                         info->task = NULL;
157                         info->tid = curr_tid;
158                         return NULL;
159                 }
160 
161                 /* set info->task and info->tid */
162 		info->task = curr_task;
163 		if (curr_tid == info->tid) {
164 			curr_fd = info->fd;
165 		} else {
166 			info->tid = curr_tid;
167 			curr_fd = 0;
168 		}
169 	}
170 
171 	rcu_read_lock();
172 	for (;; curr_fd++) {
173 		struct file *f;
174 		f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
175 		if (!f)
176 			break;
177 		if (!get_file_rcu(f))
178 			continue;
179 
180 		/* set info->fd */
181 		info->fd = curr_fd;
182 		rcu_read_unlock();
183 		return f;
184 	}
185 
186 	/* the current task is done, go to the next task */
187 	rcu_read_unlock();
188 	put_task_struct(curr_task);
189 	info->task = NULL;
190 	info->fd = 0;
191 	curr_tid = ++(info->tid);
192 	goto again;
193 }
194 
195 static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
196 {
197 	struct bpf_iter_seq_task_file_info *info = seq->private;
198 	struct file *file;
199 
200 	info->task = NULL;
201 	file = task_file_seq_get_next(info);
202 	if (file && *pos == 0)
203 		++*pos;
204 
205 	return file;
206 }
207 
208 static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
209 {
210 	struct bpf_iter_seq_task_file_info *info = seq->private;
211 
212 	++*pos;
213 	++info->fd;
214 	fput((struct file *)v);
215 	return task_file_seq_get_next(info);
216 }
217 
218 struct bpf_iter__task_file {
219 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
220 	__bpf_md_ptr(struct task_struct *, task);
221 	u32 fd __aligned(8);
222 	__bpf_md_ptr(struct file *, file);
223 };
224 
225 DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
226 		     struct task_struct *task, u32 fd,
227 		     struct file *file)
228 
229 static int __task_file_seq_show(struct seq_file *seq, struct file *file,
230 				bool in_stop)
231 {
232 	struct bpf_iter_seq_task_file_info *info = seq->private;
233 	struct bpf_iter__task_file ctx;
234 	struct bpf_iter_meta meta;
235 	struct bpf_prog *prog;
236 
237 	meta.seq = seq;
238 	prog = bpf_iter_get_info(&meta, in_stop);
239 	if (!prog)
240 		return 0;
241 
242 	ctx.meta = &meta;
243 	ctx.task = info->task;
244 	ctx.fd = info->fd;
245 	ctx.file = file;
246 	return bpf_iter_run_prog(prog, &ctx);
247 }
248 
249 static int task_file_seq_show(struct seq_file *seq, void *v)
250 {
251 	return __task_file_seq_show(seq, v, false);
252 }
253 
254 static void task_file_seq_stop(struct seq_file *seq, void *v)
255 {
256 	struct bpf_iter_seq_task_file_info *info = seq->private;
257 
258 	if (!v) {
259 		(void)__task_file_seq_show(seq, v, true);
260 	} else {
261 		fput((struct file *)v);
262 		put_task_struct(info->task);
263 		info->task = NULL;
264 	}
265 }
266 
267 static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
268 {
269 	struct bpf_iter_seq_task_common *common = priv_data;
270 
271 	common->ns = get_pid_ns(task_active_pid_ns(current));
272 	return 0;
273 }
274 
275 static void fini_seq_pidns(void *priv_data)
276 {
277 	struct bpf_iter_seq_task_common *common = priv_data;
278 
279 	put_pid_ns(common->ns);
280 }
281 
282 static const struct seq_operations task_file_seq_ops = {
283 	.start	= task_file_seq_start,
284 	.next	= task_file_seq_next,
285 	.stop	= task_file_seq_stop,
286 	.show	= task_file_seq_show,
287 };
288 
289 struct bpf_iter_seq_task_vma_info {
290 	/* The first field must be struct bpf_iter_seq_task_common.
291 	 * this is assumed by {init, fini}_seq_pidns() callback functions.
292 	 */
293 	struct bpf_iter_seq_task_common common;
294 	struct task_struct *task;
295 	struct vm_area_struct *vma;
296 	u32 tid;
297 	unsigned long prev_vm_start;
298 	unsigned long prev_vm_end;
299 };
300 
301 enum bpf_task_vma_iter_find_op {
302 	task_vma_iter_first_vma,   /* use mm->mmap */
303 	task_vma_iter_next_vma,    /* use curr_vma->vm_next */
304 	task_vma_iter_find_vma,    /* use find_vma() to find next vma */
305 };
306 
307 static struct vm_area_struct *
308 task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
309 {
310 	struct pid_namespace *ns = info->common.ns;
311 	enum bpf_task_vma_iter_find_op op;
312 	struct vm_area_struct *curr_vma;
313 	struct task_struct *curr_task;
314 	u32 curr_tid = info->tid;
315 
316 	/* If this function returns a non-NULL vma, it holds a reference to
317 	 * the task_struct, and holds read lock on vma->mm->mmap_lock.
318 	 * If this function returns NULL, it does not hold any reference or
319 	 * lock.
320 	 */
321 	if (info->task) {
322 		curr_task = info->task;
323 		curr_vma = info->vma;
324 		/* In case of lock contention, drop mmap_lock to unblock
325 		 * the writer.
326 		 *
327 		 * After relock, call find(mm, prev_vm_end - 1) to find
328 		 * new vma to process.
329 		 *
330 		 *   +------+------+-----------+
331 		 *   | VMA1 | VMA2 | VMA3      |
332 		 *   +------+------+-----------+
333 		 *   |      |      |           |
334 		 *  4k     8k     16k         400k
335 		 *
336 		 * For example, curr_vma == VMA2. Before unlock, we set
337 		 *
338 		 *    prev_vm_start = 8k
339 		 *    prev_vm_end   = 16k
340 		 *
341 		 * There are a few cases:
342 		 *
343 		 * 1) VMA2 is freed, but VMA3 exists.
344 		 *
345 		 *    find_vma() will return VMA3, just process VMA3.
346 		 *
347 		 * 2) VMA2 still exists.
348 		 *
349 		 *    find_vma() will return VMA2, process VMA2->next.
350 		 *
351 		 * 3) no more vma in this mm.
352 		 *
353 		 *    Process the next task.
354 		 *
355 		 * 4) find_vma() returns a different vma, VMA2'.
356 		 *
357 		 *    4.1) If VMA2 covers same range as VMA2', skip VMA2',
358 		 *         because we already covered the range;
359 		 *    4.2) VMA2 and VMA2' covers different ranges, process
360 		 *         VMA2'.
361 		 */
362 		if (mmap_lock_is_contended(curr_task->mm)) {
363 			info->prev_vm_start = curr_vma->vm_start;
364 			info->prev_vm_end = curr_vma->vm_end;
365 			op = task_vma_iter_find_vma;
366 			mmap_read_unlock(curr_task->mm);
367 			if (mmap_read_lock_killable(curr_task->mm))
368 				goto finish;
369 		} else {
370 			op = task_vma_iter_next_vma;
371 		}
372 	} else {
373 again:
374 		curr_task = task_seq_get_next(ns, &curr_tid, true);
375 		if (!curr_task) {
376 			info->tid = curr_tid + 1;
377 			goto finish;
378 		}
379 
380 		if (curr_tid != info->tid) {
381 			info->tid = curr_tid;
382 			/* new task, process the first vma */
383 			op = task_vma_iter_first_vma;
384 		} else {
385 			/* Found the same tid, which means the user space
386 			 * finished data in previous buffer and read more.
387 			 * We dropped mmap_lock before returning to user
388 			 * space, so it is necessary to use find_vma() to
389 			 * find the next vma to process.
390 			 */
391 			op = task_vma_iter_find_vma;
392 		}
393 
394 		if (!curr_task->mm)
395 			goto next_task;
396 
397 		if (mmap_read_lock_killable(curr_task->mm))
398 			goto finish;
399 	}
400 
401 	switch (op) {
402 	case task_vma_iter_first_vma:
403 		curr_vma = curr_task->mm->mmap;
404 		break;
405 	case task_vma_iter_next_vma:
406 		curr_vma = curr_vma->vm_next;
407 		break;
408 	case task_vma_iter_find_vma:
409 		/* We dropped mmap_lock so it is necessary to use find_vma
410 		 * to find the next vma. This is similar to the  mechanism
411 		 * in show_smaps_rollup().
412 		 */
413 		curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
414 		/* case 1) and 4.2) above just use curr_vma */
415 
416 		/* check for case 2) or case 4.1) above */
417 		if (curr_vma &&
418 		    curr_vma->vm_start == info->prev_vm_start &&
419 		    curr_vma->vm_end == info->prev_vm_end)
420 			curr_vma = curr_vma->vm_next;
421 		break;
422 	}
423 	if (!curr_vma) {
424 		/* case 3) above, or case 2) 4.1) with vma->next == NULL */
425 		mmap_read_unlock(curr_task->mm);
426 		goto next_task;
427 	}
428 	info->task = curr_task;
429 	info->vma = curr_vma;
430 	return curr_vma;
431 
432 next_task:
433 	put_task_struct(curr_task);
434 	info->task = NULL;
435 	curr_tid++;
436 	goto again;
437 
438 finish:
439 	if (curr_task)
440 		put_task_struct(curr_task);
441 	info->task = NULL;
442 	info->vma = NULL;
443 	return NULL;
444 }
445 
446 static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
447 {
448 	struct bpf_iter_seq_task_vma_info *info = seq->private;
449 	struct vm_area_struct *vma;
450 
451 	vma = task_vma_seq_get_next(info);
452 	if (vma && *pos == 0)
453 		++*pos;
454 
455 	return vma;
456 }
457 
458 static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
459 {
460 	struct bpf_iter_seq_task_vma_info *info = seq->private;
461 
462 	++*pos;
463 	return task_vma_seq_get_next(info);
464 }
465 
466 struct bpf_iter__task_vma {
467 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
468 	__bpf_md_ptr(struct task_struct *, task);
469 	__bpf_md_ptr(struct vm_area_struct *, vma);
470 };
471 
472 DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
473 		     struct task_struct *task, struct vm_area_struct *vma)
474 
475 static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
476 {
477 	struct bpf_iter_seq_task_vma_info *info = seq->private;
478 	struct bpf_iter__task_vma ctx;
479 	struct bpf_iter_meta meta;
480 	struct bpf_prog *prog;
481 
482 	meta.seq = seq;
483 	prog = bpf_iter_get_info(&meta, in_stop);
484 	if (!prog)
485 		return 0;
486 
487 	ctx.meta = &meta;
488 	ctx.task = info->task;
489 	ctx.vma = info->vma;
490 	return bpf_iter_run_prog(prog, &ctx);
491 }
492 
493 static int task_vma_seq_show(struct seq_file *seq, void *v)
494 {
495 	return __task_vma_seq_show(seq, false);
496 }
497 
498 static void task_vma_seq_stop(struct seq_file *seq, void *v)
499 {
500 	struct bpf_iter_seq_task_vma_info *info = seq->private;
501 
502 	if (!v) {
503 		(void)__task_vma_seq_show(seq, true);
504 	} else {
505 		/* info->vma has not been seen by the BPF program. If the
506 		 * user space reads more, task_vma_seq_get_next should
507 		 * return this vma again. Set prev_vm_start to ~0UL,
508 		 * so that we don't skip the vma returned by the next
509 		 * find_vma() (case task_vma_iter_find_vma in
510 		 * task_vma_seq_get_next()).
511 		 */
512 		info->prev_vm_start = ~0UL;
513 		info->prev_vm_end = info->vma->vm_end;
514 		mmap_read_unlock(info->task->mm);
515 		put_task_struct(info->task);
516 		info->task = NULL;
517 	}
518 }
519 
520 static const struct seq_operations task_vma_seq_ops = {
521 	.start	= task_vma_seq_start,
522 	.next	= task_vma_seq_next,
523 	.stop	= task_vma_seq_stop,
524 	.show	= task_vma_seq_show,
525 };
526 
527 static const struct bpf_iter_seq_info task_seq_info = {
528 	.seq_ops		= &task_seq_ops,
529 	.init_seq_private	= init_seq_pidns,
530 	.fini_seq_private	= fini_seq_pidns,
531 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
532 };
533 
534 static struct bpf_iter_reg task_reg_info = {
535 	.target			= "task",
536 	.feature		= BPF_ITER_RESCHED,
537 	.ctx_arg_info_size	= 1,
538 	.ctx_arg_info		= {
539 		{ offsetof(struct bpf_iter__task, task),
540 		  PTR_TO_BTF_ID_OR_NULL },
541 	},
542 	.seq_info		= &task_seq_info,
543 };
544 
545 static const struct bpf_iter_seq_info task_file_seq_info = {
546 	.seq_ops		= &task_file_seq_ops,
547 	.init_seq_private	= init_seq_pidns,
548 	.fini_seq_private	= fini_seq_pidns,
549 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_file_info),
550 };
551 
552 static struct bpf_iter_reg task_file_reg_info = {
553 	.target			= "task_file",
554 	.feature		= BPF_ITER_RESCHED,
555 	.ctx_arg_info_size	= 2,
556 	.ctx_arg_info		= {
557 		{ offsetof(struct bpf_iter__task_file, task),
558 		  PTR_TO_BTF_ID_OR_NULL },
559 		{ offsetof(struct bpf_iter__task_file, file),
560 		  PTR_TO_BTF_ID_OR_NULL },
561 	},
562 	.seq_info		= &task_file_seq_info,
563 };
564 
565 static const struct bpf_iter_seq_info task_vma_seq_info = {
566 	.seq_ops		= &task_vma_seq_ops,
567 	.init_seq_private	= init_seq_pidns,
568 	.fini_seq_private	= fini_seq_pidns,
569 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_vma_info),
570 };
571 
572 static struct bpf_iter_reg task_vma_reg_info = {
573 	.target			= "task_vma",
574 	.feature		= BPF_ITER_RESCHED,
575 	.ctx_arg_info_size	= 2,
576 	.ctx_arg_info		= {
577 		{ offsetof(struct bpf_iter__task_vma, task),
578 		  PTR_TO_BTF_ID_OR_NULL },
579 		{ offsetof(struct bpf_iter__task_vma, vma),
580 		  PTR_TO_BTF_ID_OR_NULL },
581 	},
582 	.seq_info		= &task_vma_seq_info,
583 };
584 
585 BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
586 	   bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
587 {
588 	struct mmap_unlock_irq_work *work = NULL;
589 	struct vm_area_struct *vma;
590 	bool irq_work_busy = false;
591 	struct mm_struct *mm;
592 	int ret = -ENOENT;
593 
594 	if (flags)
595 		return -EINVAL;
596 
597 	if (!task)
598 		return -ENOENT;
599 
600 	mm = task->mm;
601 	if (!mm)
602 		return -ENOENT;
603 
604 	irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
605 
606 	if (irq_work_busy || !mmap_read_trylock(mm))
607 		return -EBUSY;
608 
609 	vma = find_vma(mm, start);
610 
611 	if (vma && vma->vm_start <= start && vma->vm_end > start) {
612 		callback_fn((u64)(long)task, (u64)(long)vma,
613 			    (u64)(long)callback_ctx, 0, 0);
614 		ret = 0;
615 	}
616 	bpf_mmap_unlock_mm(work, mm);
617 	return ret;
618 }
619 
620 const struct bpf_func_proto bpf_find_vma_proto = {
621 	.func		= bpf_find_vma,
622 	.ret_type	= RET_INTEGER,
623 	.arg1_type	= ARG_PTR_TO_BTF_ID,
624 	.arg1_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
625 	.arg2_type	= ARG_ANYTHING,
626 	.arg3_type	= ARG_PTR_TO_FUNC,
627 	.arg4_type	= ARG_PTR_TO_STACK_OR_NULL,
628 	.arg5_type	= ARG_ANYTHING,
629 };
630 
631 DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
632 
633 static void do_mmap_read_unlock(struct irq_work *entry)
634 {
635 	struct mmap_unlock_irq_work *work;
636 
637 	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
638 		return;
639 
640 	work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
641 	mmap_read_unlock_non_owner(work->mm);
642 }
643 
644 static int __init task_iter_init(void)
645 {
646 	struct mmap_unlock_irq_work *work;
647 	int ret, cpu;
648 
649 	for_each_possible_cpu(cpu) {
650 		work = per_cpu_ptr(&mmap_unlock_work, cpu);
651 		init_irq_work(&work->irq_work, do_mmap_read_unlock);
652 	}
653 
654 	task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
655 	ret = bpf_iter_reg_target(&task_reg_info);
656 	if (ret)
657 		return ret;
658 
659 	task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
660 	task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
661 	ret =  bpf_iter_reg_target(&task_file_reg_info);
662 	if (ret)
663 		return ret;
664 
665 	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
666 	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
667 	return bpf_iter_reg_target(&task_vma_reg_info);
668 }
669 late_initcall(task_iter_init);
670