xref: /openbmc/linux/kernel/bpf/task_iter.c (revision 869b6ca39c08c5b10eeb29d4b3c4bc433bf8ba5e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2020 Facebook */
3 
4 #include <linux/init.h>
5 #include <linux/namei.h>
6 #include <linux/pid_namespace.h>
7 #include <linux/fs.h>
8 #include <linux/fdtable.h>
9 #include <linux/filter.h>
10 #include <linux/btf_ids.h>
11 
/* Private state shared by every task-based iterator in this file. */
struct bpf_iter_seq_task_common {
	/* pid namespace the iterator walks; a reference is taken in
	 * init_seq_pidns() and dropped in fini_seq_pidns().
	 */
	struct pid_namespace *ns;
};
15 
16 struct bpf_iter_seq_task_info {
17 	/* The first field must be struct bpf_iter_seq_task_common.
18 	 * this is assumed by {init, fini}_seq_pidns() callback functions.
19 	 */
20 	struct bpf_iter_seq_task_common common;
21 	u32 tid;
22 };
23 
24 static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
25 					     u32 *tid,
26 					     bool skip_if_dup_files)
27 {
28 	struct task_struct *task = NULL;
29 	struct pid *pid;
30 
31 	rcu_read_lock();
32 retry:
33 	pid = find_ge_pid(*tid, ns);
34 	if (pid) {
35 		*tid = pid_nr_ns(pid, ns);
36 		task = get_pid_task(pid, PIDTYPE_PID);
37 		if (!task) {
38 			++*tid;
39 			goto retry;
40 		} else if (skip_if_dup_files && !thread_group_leader(task) &&
41 			   task->files == task->group_leader->files) {
42 			put_task_struct(task);
43 			task = NULL;
44 			++*tid;
45 			goto retry;
46 		}
47 	}
48 	rcu_read_unlock();
49 
50 	return task;
51 }
52 
53 static void *task_seq_start(struct seq_file *seq, loff_t *pos)
54 {
55 	struct bpf_iter_seq_task_info *info = seq->private;
56 	struct task_struct *task;
57 
58 	task = task_seq_get_next(info->common.ns, &info->tid, false);
59 	if (!task)
60 		return NULL;
61 
62 	if (*pos == 0)
63 		++*pos;
64 	return task;
65 }
66 
67 static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
68 {
69 	struct bpf_iter_seq_task_info *info = seq->private;
70 	struct task_struct *task;
71 
72 	++*pos;
73 	++info->tid;
74 	put_task_struct((struct task_struct *)v);
75 	task = task_seq_get_next(info->common.ns, &info->tid, false);
76 	if (!task)
77 		return NULL;
78 
79 	return task;
80 }
81 
82 struct bpf_iter__task {
83 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
84 	__bpf_md_ptr(struct task_struct *, task);
85 };
86 
87 DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
88 
89 static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
90 			   bool in_stop)
91 {
92 	struct bpf_iter_meta meta;
93 	struct bpf_iter__task ctx;
94 	struct bpf_prog *prog;
95 
96 	meta.seq = seq;
97 	prog = bpf_iter_get_info(&meta, in_stop);
98 	if (!prog)
99 		return 0;
100 
101 	meta.seq = seq;
102 	ctx.meta = &meta;
103 	ctx.task = task;
104 	return bpf_iter_run_prog(prog, &ctx);
105 }
106 
107 static int task_seq_show(struct seq_file *seq, void *v)
108 {
109 	return __task_seq_show(seq, v, false);
110 }
111 
112 static void task_seq_stop(struct seq_file *seq, void *v)
113 {
114 	if (!v)
115 		(void)__task_seq_show(seq, v, true);
116 	else
117 		put_task_struct((struct task_struct *)v);
118 }
119 
120 static const struct seq_operations task_seq_ops = {
121 	.start	= task_seq_start,
122 	.next	= task_seq_next,
123 	.stop	= task_seq_stop,
124 	.show	= task_seq_show,
125 };
126 
127 struct bpf_iter_seq_task_file_info {
128 	/* The first field must be struct bpf_iter_seq_task_common.
129 	 * this is assumed by {init, fini}_seq_pidns() callback functions.
130 	 */
131 	struct bpf_iter_seq_task_common common;
132 	struct task_struct *task;
133 	u32 tid;
134 	u32 fd;
135 };
136 
137 static struct file *
138 task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
139 {
140 	struct pid_namespace *ns = info->common.ns;
141 	u32 curr_tid = info->tid;
142 	struct task_struct *curr_task;
143 	unsigned int curr_fd = info->fd;
144 
145 	/* If this function returns a non-NULL file object,
146 	 * it held a reference to the task/file.
147 	 * Otherwise, it does not hold any reference.
148 	 */
149 again:
150 	if (info->task) {
151 		curr_task = info->task;
152 		curr_fd = info->fd;
153 	} else {
154                 curr_task = task_seq_get_next(ns, &curr_tid, true);
155                 if (!curr_task) {
156                         info->task = NULL;
157                         info->tid = curr_tid;
158                         return NULL;
159                 }
160 
161                 /* set info->task and info->tid */
162 		info->task = curr_task;
163 		if (curr_tid == info->tid) {
164 			curr_fd = info->fd;
165 		} else {
166 			info->tid = curr_tid;
167 			curr_fd = 0;
168 		}
169 	}
170 
171 	rcu_read_lock();
172 	for (;; curr_fd++) {
173 		struct file *f;
174 		f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
175 		if (!f)
176 			break;
177 		if (!get_file_rcu(f))
178 			continue;
179 
180 		/* set info->fd */
181 		info->fd = curr_fd;
182 		rcu_read_unlock();
183 		return f;
184 	}
185 
186 	/* the current task is done, go to the next task */
187 	rcu_read_unlock();
188 	put_task_struct(curr_task);
189 	info->task = NULL;
190 	info->fd = 0;
191 	curr_tid = ++(info->tid);
192 	goto again;
193 }
194 
195 static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
196 {
197 	struct bpf_iter_seq_task_file_info *info = seq->private;
198 	struct file *file;
199 
200 	info->task = NULL;
201 	file = task_file_seq_get_next(info);
202 	if (file && *pos == 0)
203 		++*pos;
204 
205 	return file;
206 }
207 
208 static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
209 {
210 	struct bpf_iter_seq_task_file_info *info = seq->private;
211 
212 	++*pos;
213 	++info->fd;
214 	fput((struct file *)v);
215 	return task_file_seq_get_next(info);
216 }
217 
218 struct bpf_iter__task_file {
219 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
220 	__bpf_md_ptr(struct task_struct *, task);
221 	u32 fd __aligned(8);
222 	__bpf_md_ptr(struct file *, file);
223 };
224 
225 DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
226 		     struct task_struct *task, u32 fd,
227 		     struct file *file)
228 
229 static int __task_file_seq_show(struct seq_file *seq, struct file *file,
230 				bool in_stop)
231 {
232 	struct bpf_iter_seq_task_file_info *info = seq->private;
233 	struct bpf_iter__task_file ctx;
234 	struct bpf_iter_meta meta;
235 	struct bpf_prog *prog;
236 
237 	meta.seq = seq;
238 	prog = bpf_iter_get_info(&meta, in_stop);
239 	if (!prog)
240 		return 0;
241 
242 	ctx.meta = &meta;
243 	ctx.task = info->task;
244 	ctx.fd = info->fd;
245 	ctx.file = file;
246 	return bpf_iter_run_prog(prog, &ctx);
247 }
248 
249 static int task_file_seq_show(struct seq_file *seq, void *v)
250 {
251 	return __task_file_seq_show(seq, v, false);
252 }
253 
254 static void task_file_seq_stop(struct seq_file *seq, void *v)
255 {
256 	struct bpf_iter_seq_task_file_info *info = seq->private;
257 
258 	if (!v) {
259 		(void)__task_file_seq_show(seq, v, true);
260 	} else {
261 		fput((struct file *)v);
262 		put_task_struct(info->task);
263 		info->task = NULL;
264 	}
265 }
266 
267 static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
268 {
269 	struct bpf_iter_seq_task_common *common = priv_data;
270 
271 	common->ns = get_pid_ns(task_active_pid_ns(current));
272 	return 0;
273 }
274 
275 static void fini_seq_pidns(void *priv_data)
276 {
277 	struct bpf_iter_seq_task_common *common = priv_data;
278 
279 	put_pid_ns(common->ns);
280 }
281 
282 static const struct seq_operations task_file_seq_ops = {
283 	.start	= task_file_seq_start,
284 	.next	= task_file_seq_next,
285 	.stop	= task_file_seq_stop,
286 	.show	= task_file_seq_show,
287 };
288 
289 struct bpf_iter_seq_task_vma_info {
290 	/* The first field must be struct bpf_iter_seq_task_common.
291 	 * this is assumed by {init, fini}_seq_pidns() callback functions.
292 	 */
293 	struct bpf_iter_seq_task_common common;
294 	struct task_struct *task;
295 	struct vm_area_struct *vma;
296 	u32 tid;
297 	unsigned long prev_vm_start;
298 	unsigned long prev_vm_end;
299 };
300 
/* How task_vma_seq_get_next() picks the vma to return next. */
enum bpf_task_vma_iter_find_op {
	/* first vma of the task: mm->mmap */
	task_vma_iter_first_vma,
	/* successor of the current vma: curr_vma->vm_next */
	task_vma_iter_next_vma,
	/* look the position up again with find_vma() */
	task_vma_iter_find_vma,
};
306 
307 static struct vm_area_struct *
308 task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
309 {
310 	struct pid_namespace *ns = info->common.ns;
311 	enum bpf_task_vma_iter_find_op op;
312 	struct vm_area_struct *curr_vma;
313 	struct task_struct *curr_task;
314 	u32 curr_tid = info->tid;
315 
316 	/* If this function returns a non-NULL vma, it holds a reference to
317 	 * the task_struct, and holds read lock on vma->mm->mmap_lock.
318 	 * If this function returns NULL, it does not hold any reference or
319 	 * lock.
320 	 */
321 	if (info->task) {
322 		curr_task = info->task;
323 		curr_vma = info->vma;
324 		/* In case of lock contention, drop mmap_lock to unblock
325 		 * the writer.
326 		 *
327 		 * After relock, call find(mm, prev_vm_end - 1) to find
328 		 * new vma to process.
329 		 *
330 		 *   +------+------+-----------+
331 		 *   | VMA1 | VMA2 | VMA3      |
332 		 *   +------+------+-----------+
333 		 *   |      |      |           |
334 		 *  4k     8k     16k         400k
335 		 *
336 		 * For example, curr_vma == VMA2. Before unlock, we set
337 		 *
338 		 *    prev_vm_start = 8k
339 		 *    prev_vm_end   = 16k
340 		 *
341 		 * There are a few cases:
342 		 *
343 		 * 1) VMA2 is freed, but VMA3 exists.
344 		 *
345 		 *    find_vma() will return VMA3, just process VMA3.
346 		 *
347 		 * 2) VMA2 still exists.
348 		 *
349 		 *    find_vma() will return VMA2, process VMA2->next.
350 		 *
351 		 * 3) no more vma in this mm.
352 		 *
353 		 *    Process the next task.
354 		 *
355 		 * 4) find_vma() returns a different vma, VMA2'.
356 		 *
357 		 *    4.1) If VMA2 covers same range as VMA2', skip VMA2',
358 		 *         because we already covered the range;
359 		 *    4.2) VMA2 and VMA2' covers different ranges, process
360 		 *         VMA2'.
361 		 */
362 		if (mmap_lock_is_contended(curr_task->mm)) {
363 			info->prev_vm_start = curr_vma->vm_start;
364 			info->prev_vm_end = curr_vma->vm_end;
365 			op = task_vma_iter_find_vma;
366 			mmap_read_unlock(curr_task->mm);
367 			if (mmap_read_lock_killable(curr_task->mm))
368 				goto finish;
369 		} else {
370 			op = task_vma_iter_next_vma;
371 		}
372 	} else {
373 again:
374 		curr_task = task_seq_get_next(ns, &curr_tid, true);
375 		if (!curr_task) {
376 			info->tid = curr_tid + 1;
377 			goto finish;
378 		}
379 
380 		if (curr_tid != info->tid) {
381 			info->tid = curr_tid;
382 			/* new task, process the first vma */
383 			op = task_vma_iter_first_vma;
384 		} else {
385 			/* Found the same tid, which means the user space
386 			 * finished data in previous buffer and read more.
387 			 * We dropped mmap_lock before returning to user
388 			 * space, so it is necessary to use find_vma() to
389 			 * find the next vma to process.
390 			 */
391 			op = task_vma_iter_find_vma;
392 		}
393 
394 		if (!curr_task->mm)
395 			goto next_task;
396 
397 		if (mmap_read_lock_killable(curr_task->mm))
398 			goto finish;
399 	}
400 
401 	switch (op) {
402 	case task_vma_iter_first_vma:
403 		curr_vma = curr_task->mm->mmap;
404 		break;
405 	case task_vma_iter_next_vma:
406 		curr_vma = curr_vma->vm_next;
407 		break;
408 	case task_vma_iter_find_vma:
409 		/* We dropped mmap_lock so it is necessary to use find_vma
410 		 * to find the next vma. This is similar to the  mechanism
411 		 * in show_smaps_rollup().
412 		 */
413 		curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
414 		/* case 1) and 4.2) above just use curr_vma */
415 
416 		/* check for case 2) or case 4.1) above */
417 		if (curr_vma &&
418 		    curr_vma->vm_start == info->prev_vm_start &&
419 		    curr_vma->vm_end == info->prev_vm_end)
420 			curr_vma = curr_vma->vm_next;
421 		break;
422 	}
423 	if (!curr_vma) {
424 		/* case 3) above, or case 2) 4.1) with vma->next == NULL */
425 		mmap_read_unlock(curr_task->mm);
426 		goto next_task;
427 	}
428 	info->task = curr_task;
429 	info->vma = curr_vma;
430 	return curr_vma;
431 
432 next_task:
433 	put_task_struct(curr_task);
434 	info->task = NULL;
435 	curr_tid++;
436 	goto again;
437 
438 finish:
439 	if (curr_task)
440 		put_task_struct(curr_task);
441 	info->task = NULL;
442 	info->vma = NULL;
443 	return NULL;
444 }
445 
446 static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
447 {
448 	struct bpf_iter_seq_task_vma_info *info = seq->private;
449 	struct vm_area_struct *vma;
450 
451 	vma = task_vma_seq_get_next(info);
452 	if (vma && *pos == 0)
453 		++*pos;
454 
455 	return vma;
456 }
457 
458 static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
459 {
460 	struct bpf_iter_seq_task_vma_info *info = seq->private;
461 
462 	++*pos;
463 	return task_vma_seq_get_next(info);
464 }
465 
466 struct bpf_iter__task_vma {
467 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
468 	__bpf_md_ptr(struct task_struct *, task);
469 	__bpf_md_ptr(struct vm_area_struct *, vma);
470 };
471 
472 DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
473 		     struct task_struct *task, struct vm_area_struct *vma)
474 
475 static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
476 {
477 	struct bpf_iter_seq_task_vma_info *info = seq->private;
478 	struct bpf_iter__task_vma ctx;
479 	struct bpf_iter_meta meta;
480 	struct bpf_prog *prog;
481 
482 	meta.seq = seq;
483 	prog = bpf_iter_get_info(&meta, in_stop);
484 	if (!prog)
485 		return 0;
486 
487 	ctx.meta = &meta;
488 	ctx.task = info->task;
489 	ctx.vma = info->vma;
490 	return bpf_iter_run_prog(prog, &ctx);
491 }
492 
493 static int task_vma_seq_show(struct seq_file *seq, void *v)
494 {
495 	return __task_vma_seq_show(seq, false);
496 }
497 
498 static void task_vma_seq_stop(struct seq_file *seq, void *v)
499 {
500 	struct bpf_iter_seq_task_vma_info *info = seq->private;
501 
502 	if (!v) {
503 		(void)__task_vma_seq_show(seq, true);
504 	} else {
505 		/* info->vma has not been seen by the BPF program. If the
506 		 * user space reads more, task_vma_seq_get_next should
507 		 * return this vma again. Set prev_vm_start to ~0UL,
508 		 * so that we don't skip the vma returned by the next
509 		 * find_vma() (case task_vma_iter_find_vma in
510 		 * task_vma_seq_get_next()).
511 		 */
512 		info->prev_vm_start = ~0UL;
513 		info->prev_vm_end = info->vma->vm_end;
514 		mmap_read_unlock(info->task->mm);
515 		put_task_struct(info->task);
516 		info->task = NULL;
517 	}
518 }
519 
520 static const struct seq_operations task_vma_seq_ops = {
521 	.start	= task_vma_seq_start,
522 	.next	= task_vma_seq_next,
523 	.stop	= task_vma_seq_stop,
524 	.show	= task_vma_seq_show,
525 };
526 
527 BTF_ID_LIST(btf_task_file_ids)
528 BTF_ID(struct, file)
529 BTF_ID(struct, vm_area_struct)
530 
531 static const struct bpf_iter_seq_info task_seq_info = {
532 	.seq_ops		= &task_seq_ops,
533 	.init_seq_private	= init_seq_pidns,
534 	.fini_seq_private	= fini_seq_pidns,
535 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
536 };
537 
538 static struct bpf_iter_reg task_reg_info = {
539 	.target			= "task",
540 	.feature		= BPF_ITER_RESCHED,
541 	.ctx_arg_info_size	= 1,
542 	.ctx_arg_info		= {
543 		{ offsetof(struct bpf_iter__task, task),
544 		  PTR_TO_BTF_ID_OR_NULL },
545 	},
546 	.seq_info		= &task_seq_info,
547 };
548 
549 static const struct bpf_iter_seq_info task_file_seq_info = {
550 	.seq_ops		= &task_file_seq_ops,
551 	.init_seq_private	= init_seq_pidns,
552 	.fini_seq_private	= fini_seq_pidns,
553 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_file_info),
554 };
555 
556 static struct bpf_iter_reg task_file_reg_info = {
557 	.target			= "task_file",
558 	.feature		= BPF_ITER_RESCHED,
559 	.ctx_arg_info_size	= 2,
560 	.ctx_arg_info		= {
561 		{ offsetof(struct bpf_iter__task_file, task),
562 		  PTR_TO_BTF_ID_OR_NULL },
563 		{ offsetof(struct bpf_iter__task_file, file),
564 		  PTR_TO_BTF_ID_OR_NULL },
565 	},
566 	.seq_info		= &task_file_seq_info,
567 };
568 
569 static const struct bpf_iter_seq_info task_vma_seq_info = {
570 	.seq_ops		= &task_vma_seq_ops,
571 	.init_seq_private	= init_seq_pidns,
572 	.fini_seq_private	= fini_seq_pidns,
573 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_vma_info),
574 };
575 
576 static struct bpf_iter_reg task_vma_reg_info = {
577 	.target			= "task_vma",
578 	.feature		= BPF_ITER_RESCHED,
579 	.ctx_arg_info_size	= 2,
580 	.ctx_arg_info		= {
581 		{ offsetof(struct bpf_iter__task_vma, task),
582 		  PTR_TO_BTF_ID_OR_NULL },
583 		{ offsetof(struct bpf_iter__task_vma, vma),
584 		  PTR_TO_BTF_ID_OR_NULL },
585 	},
586 	.seq_info		= &task_vma_seq_info,
587 };
588 
589 static int __init task_iter_init(void)
590 {
591 	int ret;
592 
593 	task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
594 	ret = bpf_iter_reg_target(&task_reg_info);
595 	if (ret)
596 		return ret;
597 
598 	task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
599 	task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0];
600 	ret =  bpf_iter_reg_target(&task_file_reg_info);
601 	if (ret)
602 		return ret;
603 
604 	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
605 	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
606 	return bpf_iter_reg_target(&task_vma_reg_info);
607 }
608 late_initcall(task_iter_init);
609