// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */

#include <linux/init.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/btf_ids.h>
#include "mmap_unlock_work.h"

struct bpf_iter_seq_task_common {
	struct pid_namespace *ns;
};

struct bpf_iter_seq_task_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * this is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	u32 tid;
};

static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
					     u32 *tid,
					     bool skip_if_dup_files)
{
	struct task_struct *task = NULL;
	struct pid *pid;

	rcu_read_lock();
retry:
	pid = find_ge_pid(*tid, ns);
	if (pid) {
		*tid = pid_nr_ns(pid, ns);
		task = get_pid_task(pid, PIDTYPE_PID);
		if (!task) {
			++*tid;
			goto retry;
		} else if (skip_if_dup_files && !thread_group_leader(task) &&
			   task->files == task->group_leader->files) {
			put_task_struct(task);
			task = NULL;
			++*tid;
			goto retry;
		}
	}
	rcu_read_unlock();

	return task;
}

static void *task_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_info *info = seq->private;
	struct task_struct *task;

	task = task_seq_get_next(info->common.ns, &info->tid, false);
	if (!task)
		return NULL;

	if (*pos == 0)
		++*pos;
	return task;
}

static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_info *info = seq->private;
	struct task_struct *task;

	++*pos;
	++info->tid;
	put_task_struct((struct task_struct *)v);
	task = task_seq_get_next(info->common.ns, &info->tid, false);
	if (!task)
		return NULL;

	return task;
}

struct bpf_iter__task {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
};

DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)

static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
			   bool in_stop)
{
	struct bpf_iter_meta meta;
	struct bpf_iter__task ctx;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = task;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_seq_show(struct seq_file *seq, void *v)
{
	return __task_seq_show(seq, v, false);
}

static void task_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__task_seq_show(seq, v, true);
	else
		put_task_struct((struct task_struct *)v);
}

static const struct seq_operations task_seq_ops = {
	.start = task_seq_start,
	.next = task_seq_next,
	.stop = task_seq_stop,
	.show = task_seq_show,
};
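/*
 * Consumer-side sketch (not part of this file): a minimal BPF program
 * attached to the "task" iterator above might look like the following.
 * SEC() and BPF_SEQ_PRINTF() come from the libbpf headers; the program
 * name is illustrative only.
 *
 *	SEC("iter/task")
 *	int dump_task(struct bpf_iter__task *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		struct task_struct *task = ctx->task;
 *
 *		if (!task)
 *			return 0;
 *
 *		BPF_SEQ_PRINTF(seq, "%8d %s\n", task->pid, task->comm);
 *		return 0;
 *	}
 */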
struct bpf_iter_seq_task_file_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * this is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;
	u32 tid;
	u32 fd;
};

static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
	struct pid_namespace *ns = info->common.ns;
	u32 curr_tid = info->tid;
	struct task_struct *curr_task;
	unsigned int curr_fd = info->fd;

	/* If this function returns a non-NULL file object,
	 * it holds a reference to the task/file.
	 * Otherwise, it does not hold any reference.
	 */
again:
	if (info->task) {
		curr_task = info->task;
		curr_fd = info->fd;
	} else {
		curr_task = task_seq_get_next(ns, &curr_tid, true);
		if (!curr_task) {
			info->task = NULL;
			info->tid = curr_tid;
			return NULL;
		}

		/* set info->task and info->tid */
		info->task = curr_task;
		if (curr_tid == info->tid) {
			curr_fd = info->fd;
		} else {
			info->tid = curr_tid;
			curr_fd = 0;
		}
	}

	rcu_read_lock();
	for (;; curr_fd++) {
		struct file *f;
		f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
		if (!f)
			break;
		if (!get_file_rcu(f))
			continue;

		/* set info->fd */
		info->fd = curr_fd;
		rcu_read_unlock();
		return f;
	}

	/* the current task is done, go to the next task */
	rcu_read_unlock();
	put_task_struct(curr_task);
	info->task = NULL;
	info->fd = 0;
	curr_tid = ++(info->tid);
	goto again;
}

static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;
	struct file *file;

	info->task = NULL;
	file = task_file_seq_get_next(info);
	if (file && *pos == 0)
		++*pos;

	return file;
}

static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;

	++*pos;
	++info->fd;
	fput((struct file *)v);
	return task_file_seq_get_next(info);
}

struct bpf_iter__task_file {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	u32 fd __aligned(8);
	__bpf_md_ptr(struct file *, file);
};

DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
		     struct task_struct *task, u32 fd,
		     struct file *file)

static int __task_file_seq_show(struct seq_file *seq, struct file *file,
				bool in_stop)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;
	struct bpf_iter__task_file ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = info->task;
	ctx.fd = info->fd;
	ctx.file = file;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_file_seq_show(struct seq_file *seq, void *v)
{
	return __task_file_seq_show(seq, v, false);
}

static void task_file_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;

	if (!v) {
		(void)__task_file_seq_show(seq, v, true);
	} else {
		fput((struct file *)v);
		put_task_struct(info->task);
		info->task = NULL;
	}
}

static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	common->ns = get_pid_ns(task_active_pid_ns(current));
	return 0;
}

static void fini_seq_pidns(void *priv_data)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	put_pid_ns(common->ns);
}

static const struct seq_operations task_file_seq_ops = {
	.start = task_file_seq_start,
	.next = task_file_seq_next,
	.stop = task_file_seq_stop,
	.show = task_file_seq_show,
};

struct bpf_iter_seq_task_vma_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * this is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;
	struct vm_area_struct *vma;
	u32 tid;
	unsigned long prev_vm_start;
	unsigned long prev_vm_end;
};

enum bpf_task_vma_iter_find_op {
	task_vma_iter_first_vma,   /* use mm->mmap */
	task_vma_iter_next_vma,    /* use curr_vma->vm_next */
	task_vma_iter_find_vma,    /* use find_vma() to find next vma */
};

static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
	struct pid_namespace *ns = info->common.ns;
	enum bpf_task_vma_iter_find_op op;
	struct vm_area_struct *curr_vma;
	struct task_struct *curr_task;
	u32 curr_tid = info->tid;

	/* If this function returns a non-NULL vma, it holds a reference to
	 * the task_struct, and holds a read lock on vma->mm->mmap_lock.
	 * If this function returns NULL, it does not hold any reference or
	 * lock.
	 */
	if (info->task) {
		curr_task = info->task;
		curr_vma = info->vma;
		/* In case of lock contention, drop mmap_lock to unblock
		 * the writer.
		 *
		 * After relock, call find_vma(mm, prev_vm_end - 1) to find
		 * the new vma to process.
		 *
		 * +------+------+-----------+
		 * | VMA1 | VMA2 | VMA3      |
		 * +------+------+-----------+
		 * |      |      |           |
		 * 4k     8k     16k         400k
		 *
		 * For example, curr_vma == VMA2. Before unlock, we set
		 *
		 *    prev_vm_start = 8k
		 *    prev_vm_end   = 16k
		 *
		 * There are a few cases:
		 *
		 * 1) VMA2 is freed, but VMA3 exists.
		 *
		 *    find_vma() will return VMA3, just process VMA3.
		 *
		 * 2) VMA2 still exists.
		 *
		 *    find_vma() will return VMA2, process VMA2->next.
		 *
		 * 3) no more vma in this mm.
		 *
		 *    Process the next task.
		 *
		 * 4) find_vma() returns a different vma, VMA2'.
		 *
		 *    4.1) If VMA2 covers the same range as VMA2', skip VMA2',
		 *         because we already covered the range;
		 *    4.2) VMA2 and VMA2' cover different ranges, process
		 *         VMA2'.
		 */
		if (mmap_lock_is_contended(curr_task->mm)) {
			info->prev_vm_start = curr_vma->vm_start;
			info->prev_vm_end = curr_vma->vm_end;
			op = task_vma_iter_find_vma;
			mmap_read_unlock(curr_task->mm);
			if (mmap_read_lock_killable(curr_task->mm))
				goto finish;
		} else {
			op = task_vma_iter_next_vma;
		}
	} else {
again:
		curr_task = task_seq_get_next(ns, &curr_tid, true);
		if (!curr_task) {
			info->tid = curr_tid + 1;
			goto finish;
		}

		if (curr_tid != info->tid) {
			info->tid = curr_tid;
			/* new task, process the first vma */
			op = task_vma_iter_first_vma;
		} else {
			/* Found the same tid, which means the user space
			 * finished the data in the previous buffer and read
			 * more. We dropped mmap_lock before returning to user
			 * space, so it is necessary to use find_vma() to
			 * find the next vma to process.
			 */
			op = task_vma_iter_find_vma;
		}

		if (!curr_task->mm)
			goto next_task;

		if (mmap_read_lock_killable(curr_task->mm))
			goto finish;
	}

	switch (op) {
	case task_vma_iter_first_vma:
		curr_vma = curr_task->mm->mmap;
		break;
	case task_vma_iter_next_vma:
		curr_vma = curr_vma->vm_next;
		break;
	case task_vma_iter_find_vma:
		/* We dropped mmap_lock so it is necessary to use find_vma
		 * to find the next vma. This is similar to the mechanism
		 * in show_smaps_rollup().
		 */
		curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
		/* case 1) and 4.2) above just use curr_vma */

		/* check for case 2) or case 4.1) above */
		if (curr_vma &&
		    curr_vma->vm_start == info->prev_vm_start &&
		    curr_vma->vm_end == info->prev_vm_end)
			curr_vma = curr_vma->vm_next;
		break;
	}
	if (!curr_vma) {
		/* case 3) above, or case 2) 4.1) with vma->next == NULL */
		mmap_read_unlock(curr_task->mm);
		goto next_task;
	}
	info->task = curr_task;
	info->vma = curr_vma;
	return curr_vma;

next_task:
	put_task_struct(curr_task);
	info->task = NULL;
	curr_tid++;
	goto again;

finish:
	if (curr_task)
		put_task_struct(curr_task);
	info->task = NULL;
	info->vma = NULL;
	return NULL;
}

static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;
	struct vm_area_struct *vma;

	vma = task_vma_seq_get_next(info);
	if (vma && *pos == 0)
		++*pos;

	return vma;
}

static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;

	++*pos;
	return task_vma_seq_get_next(info);
}

struct bpf_iter__task_vma {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	__bpf_md_ptr(struct vm_area_struct *, vma);
};

DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
		     struct task_struct *task, struct vm_area_struct *vma)

static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;
	struct bpf_iter__task_vma ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = info->task;
	ctx.vma = info->vma;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_vma_seq_show(struct seq_file *seq, void *v)
{
	return __task_vma_seq_show(seq, false);
}

static void task_vma_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;

	if (!v) {
		(void)__task_vma_seq_show(seq, true);
	} else {
		/* info->vma has not been seen by the BPF program. If the
		 * user space reads more, task_vma_seq_get_next should
		 * return this vma again. Set prev_vm_start to ~0UL,
		 * so that we don't skip the vma returned by the next
		 * find_vma() (case task_vma_iter_find_vma in
		 * task_vma_seq_get_next()).
		 */
		info->prev_vm_start = ~0UL;
		info->prev_vm_end = info->vma->vm_end;
		mmap_read_unlock(info->task->mm);
		put_task_struct(info->task);
		info->task = NULL;
	}
}

static const struct seq_operations task_vma_seq_ops = {
	.start = task_vma_seq_start,
	.next = task_vma_seq_next,
	.stop = task_vma_seq_stop,
	.show = task_vma_seq_show,
};

static const struct bpf_iter_seq_info task_seq_info = {
	.seq_ops = &task_seq_ops,
	.init_seq_private = init_seq_pidns,
	.fini_seq_private = fini_seq_pidns,
	.seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
};

static struct bpf_iter_reg task_reg_info = {
	.target = "task",
	.feature = BPF_ITER_RESCHED,
	.ctx_arg_info_size = 1,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__task, task),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info = &task_seq_info,
};

static const struct bpf_iter_seq_info task_file_seq_info = {
	.seq_ops = &task_file_seq_ops,
	.init_seq_private = init_seq_pidns,
	.fini_seq_private = fini_seq_pidns,
	.seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info),
};

static struct bpf_iter_reg task_file_reg_info = {
	.target = "task_file",
	.feature = BPF_ITER_RESCHED,
	.ctx_arg_info_size = 2,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__task_file, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_file, file),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info = &task_file_seq_info,
};

static const struct bpf_iter_seq_info task_vma_seq_info = {
	.seq_ops = &task_vma_seq_ops,
	.init_seq_private = init_seq_pidns,
	.fini_seq_private = fini_seq_pidns,
	.seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info),
};

static struct bpf_iter_reg task_vma_reg_info = {
	.target = "task_vma",
	.feature = BPF_ITER_RESCHED,
	.ctx_arg_info_size = 2,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__task_vma, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_vma, vma),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info = &task_vma_seq_info,
};

BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
	   bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
{
	struct mmap_unlock_irq_work *work = NULL;
	struct vm_area_struct *vma;
	bool irq_work_busy = false;
	struct mm_struct *mm;
	int ret = -ENOENT;

	if (flags)
		return -EINVAL;

	if (!task)
		return -ENOENT;

	mm = task->mm;
	if (!mm)
		return -ENOENT;

	irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);

	if (irq_work_busy || !mmap_read_trylock(mm))
		return -EBUSY;

	vma = find_vma(mm, start);

	if (vma && vma->vm_start <= start && vma->vm_end > start) {
		callback_fn((u64)(long)task, (u64)(long)vma,
			    (u64)(long)callback_ctx, 0, 0);
		ret = 0;
	}
	bpf_mmap_unlock_mm(work, mm);
	return ret;
}

const struct bpf_func_proto bpf_find_vma_proto = {
	.func		= bpf_find_vma,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_FUNC,
	.arg4_type	= ARG_PTR_TO_STACK_OR_NULL,
	.arg5_type	= ARG_ANYTHING,
};
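/*
 * Caller-side sketch (not part of this file): a tracing BPF program uses
 * bpf_find_vma() with a callback that receives the task, the vma covering
 * 'start' (if any), and the opaque callback context. The callback name and
 * context layout below are illustrative only.
 *
 *	struct callback_ctx {
 *		unsigned long vm_start;
 *	};
 *
 *	static long check_vma(struct task_struct *task,
 *			      struct vm_area_struct *vma,
 *			      struct callback_ctx *data)
 *	{
 *		data->vm_start = vma->vm_start;
 *		return 0;
 *	}
 *
 *	// from the program body, with 'addr' being the address of interest:
 *	//	struct callback_ctx data = {};
 *	//	bpf_find_vma(bpf_get_current_task_btf(), addr,
 *	//		     check_vma, &data, 0);
 */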
DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);

static void do_mmap_read_unlock(struct irq_work *entry)
{
	struct mmap_unlock_irq_work *work;

	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
		return;

	work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
	mmap_read_unlock_non_owner(work->mm);
}

static int __init task_iter_init(void)
{
	struct mmap_unlock_irq_work *work;
	int ret, cpu;

	for_each_possible_cpu(cpu) {
		work = per_cpu_ptr(&mmap_unlock_work, cpu);
		init_irq_work(&work->irq_work, do_mmap_read_unlock);
	}

	task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	ret = bpf_iter_reg_target(&task_reg_info);
	if (ret)
		return ret;

	task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
	ret = bpf_iter_reg_target(&task_file_reg_info);
	if (ret)
		return ret;

	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
	return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
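/*
 * Consumer-side sketch (not part of this file): a BPF program attached to
 * the "task_vma" iterator registered above is invoked once per vma while
 * the iterator holds mmap_lock for read. The program name is illustrative
 * only; SEC() and BPF_SEQ_PRINTF() come from the libbpf headers.
 *
 *	SEC("iter/task_vma")
 *	int dump_vma(struct bpf_iter__task_vma *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		struct vm_area_struct *vma = ctx->vma;
 *		struct task_struct *task = ctx->task;
 *
 *		if (!vma || !task)
 *			return 0;
 *
 *		BPF_SEQ_PRINTF(seq, "%d: %08lx-%08lx\n", task->tgid,
 *			       vma->vm_start, vma->vm_end);
 *		return 0;
 *	}
 */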