// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */

#include <linux/init.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/btf_ids.h>
#include "mmap_unlock_work.h"

struct bpf_iter_seq_task_common {
	struct pid_namespace *ns;
};

struct bpf_iter_seq_task_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * This is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	u32 tid;
};

static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
					     u32 *tid,
					     bool skip_if_dup_files)
{
	struct task_struct *task = NULL;
	struct pid *pid;

	rcu_read_lock();
retry:
	pid = find_ge_pid(*tid, ns);
	if (pid) {
		*tid = pid_nr_ns(pid, ns);
		task = get_pid_task(pid, PIDTYPE_PID);
		if (!task) {
			++*tid;
			goto retry;
		} else if (skip_if_dup_files && !thread_group_leader(task) &&
			   task->files == task->group_leader->files) {
			put_task_struct(task);
			task = NULL;
			++*tid;
			goto retry;
		}
	}
	rcu_read_unlock();

	return task;
}

static void *task_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_info *info = seq->private;
	struct task_struct *task;

	task = task_seq_get_next(info->common.ns, &info->tid, false);
	if (!task)
		return NULL;

	if (*pos == 0)
		++*pos;
	return task;
}

static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_info *info = seq->private;
	struct task_struct *task;

	++*pos;
	++info->tid;
	put_task_struct((struct task_struct *)v);
	task = task_seq_get_next(info->common.ns, &info->tid, false);
	if (!task)
		return NULL;

	return task;
}

struct bpf_iter__task {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
};

DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)

static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
			   bool in_stop)
{
	struct bpf_iter_meta meta;
	struct bpf_iter__task ctx;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	meta.seq = seq;
	ctx.meta = &meta;
	ctx.task = task;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_seq_show(struct seq_file *seq, void *v)
{
	return __task_seq_show(seq, v, false);
}

static void task_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__task_seq_show(seq, v, true);
	else
		put_task_struct((struct task_struct *)v);
}

static const struct seq_operations task_seq_ops = {
	.start = task_seq_start,
	.next = task_seq_next,
	.stop = task_seq_stop,
	.show = task_seq_show,
};
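
/* Usage note: the task iterator above is consumed by BPF programs attached
 * to the "task" bpf_iter target. An illustrative sketch of such a program
 * (roughly following the bpf_iter selftests; not part of this file, and the
 * program/section names are examples only):
 *
 *	SEC("iter/task")
 *	int dump_task(struct bpf_iter__task *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		struct task_struct *task = ctx->task;
 *
 *		if (!task)		// NULL on the final stop() pass
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "%8d %s\n", task->pid, task->comm);
 *		return 0;
 *	}
 */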

struct bpf_iter_seq_task_file_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * This is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;
	u32 tid;
	u32 fd;
};

static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
	struct pid_namespace *ns = info->common.ns;
	u32 curr_tid = info->tid;
	struct task_struct *curr_task;
	unsigned int curr_fd = info->fd;

	/* If this function returns a non-NULL file object,
	 * it holds a reference to the task/file.
	 * Otherwise, it does not hold any reference.
	 */
again:
	if (info->task) {
		curr_task = info->task;
		curr_fd = info->fd;
	} else {
		curr_task = task_seq_get_next(ns, &curr_tid, true);
		if (!curr_task) {
			info->task = NULL;
			info->tid = curr_tid;
			return NULL;
		}

		/* set info->task and info->tid */
		info->task = curr_task;
		if (curr_tid == info->tid) {
			curr_fd = info->fd;
		} else {
			info->tid = curr_tid;
			curr_fd = 0;
		}
	}

	rcu_read_lock();
	for (;; curr_fd++) {
		struct file *f;
		f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
		if (!f)
			break;
		if (!get_file_rcu(f))
			continue;

		/* set info->fd */
		info->fd = curr_fd;
		rcu_read_unlock();
		return f;
	}

	/* the current task is done, go to the next task */
	rcu_read_unlock();
	put_task_struct(curr_task);
	info->task = NULL;
	info->fd = 0;
	curr_tid = ++(info->tid);
	goto again;
}

static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;
	struct file *file;

	info->task = NULL;
	file = task_file_seq_get_next(info);
	if (file && *pos == 0)
		++*pos;

	return file;
}

static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;

	++*pos;
	++info->fd;
	fput((struct file *)v);
	return task_file_seq_get_next(info);
}

struct bpf_iter__task_file {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	u32 fd __aligned(8);
	__bpf_md_ptr(struct file *, file);
};

DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
		     struct task_struct *task, u32 fd,
		     struct file *file)

static int __task_file_seq_show(struct seq_file *seq, struct file *file,
				bool in_stop)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;
	struct bpf_iter__task_file ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = info->task;
	ctx.fd = info->fd;
	ctx.file = file;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_file_seq_show(struct seq_file *seq, void *v)
{
	return __task_file_seq_show(seq, v, false);
}

static void task_file_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;

	if (!v) {
		(void)__task_file_seq_show(seq, v, true);
	} else {
		fput((struct file *)v);
		put_task_struct(info->task);
		info->task = NULL;
	}
}
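
/* init/fini callbacks for the seq_file private data shared by the task,
 * task_file and task_vma iterators (see the bpf_iter_seq_info definitions
 * below). The pid namespace is taken from current at setup time via
 * get_pid_ns(task_active_pid_ns(current)) and pinned here, so every walk
 * is scoped to that namespace until fini_seq_pidns() drops the reference.
 */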

static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	common->ns = get_pid_ns(task_active_pid_ns(current));
	return 0;
}

static void fini_seq_pidns(void *priv_data)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	put_pid_ns(common->ns);
}

static const struct seq_operations task_file_seq_ops = {
	.start = task_file_seq_start,
	.next = task_file_seq_next,
	.stop = task_file_seq_stop,
	.show = task_file_seq_show,
};
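
/* Iteration state for the task_vma iterator. In addition to the current
 * task and vma, prev_vm_start/prev_vm_end record the range of the vma that
 * was being processed when mmap_lock was dropped (on contention, or when
 * returning to user space), so that task_vma_seq_get_next() can resume via
 * find_vma(mm, prev_vm_end - 1) after re-taking the lock.
 */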

struct bpf_iter_seq_task_vma_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * This is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;
	struct vm_area_struct *vma;
	u32 tid;
	unsigned long prev_vm_start;
	unsigned long prev_vm_end;
};

enum bpf_task_vma_iter_find_op {
	task_vma_iter_first_vma,	/* use mm->mmap */
	task_vma_iter_next_vma,		/* use curr_vma->vm_next */
	task_vma_iter_find_vma,		/* use find_vma() to find next vma */
};

static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
	struct pid_namespace *ns = info->common.ns;
	enum bpf_task_vma_iter_find_op op;
	struct vm_area_struct *curr_vma;
	struct task_struct *curr_task;
	u32 curr_tid = info->tid;

	/* If this function returns a non-NULL vma, it holds a reference to
	 * the task_struct, and holds the read lock on vma->mm->mmap_lock.
	 * If this function returns NULL, it does not hold any reference or
	 * lock.
	 */
	if (info->task) {
		curr_task = info->task;
		curr_vma = info->vma;
		/* In case of lock contention, drop mmap_lock to unblock
		 * the writer.
		 *
		 * After relock, call find_vma(mm, prev_vm_end - 1) to find
		 * the new vma to process.
		 *
		 * +------+------+-----------+
		 * | VMA1 | VMA2 | VMA3      |
		 * +------+------+-----------+
		 * |      |      |           |
		 * 4k     8k     16k         400k
		 *
		 * For example, curr_vma == VMA2. Before unlock, we set
		 *
		 *    prev_vm_start = 8k
		 *    prev_vm_end   = 16k
		 *
		 * There are a few cases:
		 *
		 * 1) VMA2 is freed, but VMA3 exists.
		 *
		 *    find_vma() will return VMA3, just process VMA3.
		 *
		 * 2) VMA2 still exists.
		 *
		 *    find_vma() will return VMA2, process VMA2->next.
		 *
		 * 3) no more vma in this mm.
		 *
		 *    Process the next task.
		 *
		 * 4) find_vma() returns a different vma, VMA2'.
		 *
		 *    4.1) If VMA2 covers the same range as VMA2', skip VMA2',
		 *         because we already covered the range;
		 *    4.2) VMA2 and VMA2' cover different ranges, process
		 *         VMA2'.
		 */
		if (mmap_lock_is_contended(curr_task->mm)) {
			info->prev_vm_start = curr_vma->vm_start;
			info->prev_vm_end = curr_vma->vm_end;
			op = task_vma_iter_find_vma;
			mmap_read_unlock(curr_task->mm);
			if (mmap_read_lock_killable(curr_task->mm))
				goto finish;
		} else {
			op = task_vma_iter_next_vma;
		}
	} else {
again:
		curr_task = task_seq_get_next(ns, &curr_tid, true);
		if (!curr_task) {
			info->tid = curr_tid + 1;
			goto finish;
		}

		if (curr_tid != info->tid) {
			info->tid = curr_tid;
			/* new task, process the first vma */
			op = task_vma_iter_first_vma;
		} else {
			/* Found the same tid, which means user space
			 * finished the data in the previous buffer and is
			 * reading more. We dropped mmap_lock before
			 * returning to user space, so it is necessary to
			 * use find_vma() to find the next vma to process.
			 */
			op = task_vma_iter_find_vma;
		}

		if (!curr_task->mm)
			goto next_task;

		if (mmap_read_lock_killable(curr_task->mm))
			goto finish;
	}

	switch (op) {
	case task_vma_iter_first_vma:
		curr_vma = curr_task->mm->mmap;
		break;
	case task_vma_iter_next_vma:
		curr_vma = curr_vma->vm_next;
		break;
	case task_vma_iter_find_vma:
		/* We dropped mmap_lock so it is necessary to use find_vma
		 * to find the next vma. This is similar to the mechanism
		 * in show_smaps_rollup().
		 */
		curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
		/* case 1) and 4.2) above just use curr_vma */

		/* check for case 2) or case 4.1) above */
		if (curr_vma &&
		    curr_vma->vm_start == info->prev_vm_start &&
		    curr_vma->vm_end == info->prev_vm_end)
			curr_vma = curr_vma->vm_next;
		break;
	}
	if (!curr_vma) {
		/* case 3) above, or case 2) 4.1) with vma->next == NULL */
		mmap_read_unlock(curr_task->mm);
		goto next_task;
	}
	info->task = curr_task;
	info->vma = curr_vma;
	return curr_vma;

next_task:
	put_task_struct(curr_task);
	info->task = NULL;
	curr_tid++;
	goto again;

finish:
	if (curr_task)
		put_task_struct(curr_task);
	info->task = NULL;
	info->vma = NULL;
	return NULL;
}

static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;
	struct vm_area_struct *vma;

	vma = task_vma_seq_get_next(info);
	if (vma && *pos == 0)
		++*pos;

	return vma;
}

static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;

	++*pos;
	return task_vma_seq_get_next(info);
}

struct bpf_iter__task_vma {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	__bpf_md_ptr(struct vm_area_struct *, vma);
};

DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
		     struct task_struct *task, struct vm_area_struct *vma)
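
/* Illustrative sketch of a BPF program attached to the "task_vma" iterator
 * (not part of this file; program/section names are examples only). Field
 * names follow struct bpf_iter__task_vma above, and mmap_lock is read-held
 * while the program runs:
 *
 *	SEC("iter/task_vma")
 *	int dump_vma(struct bpf_iter__task_vma *ctx)
 *	{
 *		struct vm_area_struct *vma = ctx->vma;
 *		struct task_struct *task = ctx->task;
 *
 *		if (!task || !vma)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "%8d %lx-%lx\n",
 *			       task->pid, vma->vm_start, vma->vm_end);
 *		return 0;
 *	}
 */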

static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;
	struct bpf_iter__task_vma ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = info->task;
	ctx.vma = info->vma;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_vma_seq_show(struct seq_file *seq, void *v)
{
	return __task_vma_seq_show(seq, false);
}

static void task_vma_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;

	if (!v) {
		(void)__task_vma_seq_show(seq, true);
	} else {
		/* info->vma has not been seen by the BPF program. If the
		 * user space reads more, task_vma_seq_get_next should
		 * return this vma again. Set prev_vm_start to ~0UL,
		 * so that we don't skip the vma returned by the next
		 * find_vma() (case task_vma_iter_find_vma in
		 * task_vma_seq_get_next()).
		 */
		info->prev_vm_start = ~0UL;
		info->prev_vm_end = info->vma->vm_end;
		mmap_read_unlock(info->task->mm);
		put_task_struct(info->task);
		info->task = NULL;
	}
}

static const struct seq_operations task_vma_seq_ops = {
	.start = task_vma_seq_start,
	.next = task_vma_seq_next,
	.stop = task_vma_seq_stop,
	.show = task_vma_seq_show,
};

static const struct bpf_iter_seq_info task_seq_info = {
	.seq_ops = &task_seq_ops,
	.init_seq_private = init_seq_pidns,
	.fini_seq_private = fini_seq_pidns,
	.seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
};

static struct bpf_iter_reg task_reg_info = {
	.target = "task",
	.feature = BPF_ITER_RESCHED,
	.ctx_arg_info_size = 1,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__task, task),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info = &task_seq_info,
};

static const struct bpf_iter_seq_info task_file_seq_info = {
	.seq_ops = &task_file_seq_ops,
	.init_seq_private = init_seq_pidns,
	.fini_seq_private = fini_seq_pidns,
	.seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info),
};

static struct bpf_iter_reg task_file_reg_info = {
	.target = "task_file",
	.feature = BPF_ITER_RESCHED,
	.ctx_arg_info_size = 2,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__task_file, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_file, file),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info = &task_file_seq_info,
};

static const struct bpf_iter_seq_info task_vma_seq_info = {
	.seq_ops = &task_vma_seq_ops,
	.init_seq_private = init_seq_pidns,
	.fini_seq_private = fini_seq_pidns,
	.seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info),
};

static struct bpf_iter_reg task_vma_reg_info = {
	.target = "task_vma",
	.feature = BPF_ITER_RESCHED,
	.ctx_arg_info_size = 2,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__task_vma, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_vma, vma),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info = &task_vma_seq_info,
};

BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
	   bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
{
	struct mmap_unlock_irq_work *work = NULL;
	struct vm_area_struct *vma;
	bool irq_work_busy = false;
	struct mm_struct *mm;
	int ret = -ENOENT;

	if (flags)
		return -EINVAL;

	if (!task)
		return -ENOENT;

	mm = task->mm;
	if (!mm)
		return -ENOENT;

	irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);

	if (irq_work_busy || !mmap_read_trylock(mm))
		return -EBUSY;

	vma = find_vma(mm, start);

	if (vma && vma->vm_start <= start && vma->vm_end > start) {
		callback_fn((u64)(long)task, (u64)(long)vma,
			    (u64)(long)callback_ctx, 0, 0);
		ret = 0;
	}
	bpf_mmap_unlock_mm(work, mm);
	return ret;
}

const struct bpf_func_proto bpf_find_vma_proto = {
	.func = bpf_find_vma,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_BTF_ID,
	.arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
	.arg2_type = ARG_ANYTHING,
	.arg3_type = ARG_PTR_TO_FUNC,
	.arg4_type = ARG_PTR_TO_STACK_OR_NULL,
	.arg5_type = ARG_ANYTHING,
};
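
/* Illustrative sketch (not part of this file; names and the attach point
 * are examples only) of how a BPF program would use the bpf_find_vma()
 * helper defined above. The callback receives the task, the vma covering
 * the requested address, and callback_ctx:
 *
 *	static long check_vma(struct task_struct *task,
 *			      struct vm_area_struct *vma, void *ctx)
 *	{
 *		// e.g. record vma->vm_start / vma->vm_flags into ctx
 *		return 0;
 *	}
 *
 *	SEC("fentry/do_nanosleep")
 *	int BPF_PROG(probe)
 *	{
 *		struct task_struct *task = bpf_get_current_task_btf();
 *
 *		// addr: address of interest (placeholder); returns 0 on
 *		// success, -ENOENT if no vma covers addr, -EBUSY if
 *		// mmap_lock (or the irq_work slot) is busy
 *		return bpf_find_vma(task, addr, check_vma, NULL, 0);
 *	}
 */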

DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);

static void do_mmap_read_unlock(struct irq_work *entry)
{
	struct mmap_unlock_irq_work *work;

	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
		return;

	work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
	mmap_read_unlock_non_owner(work->mm);
}

static int __init task_iter_init(void)
{
	struct mmap_unlock_irq_work *work;
	int ret, cpu;

	for_each_possible_cpu(cpu) {
		work = per_cpu_ptr(&mmap_unlock_work, cpu);
		init_irq_work(&work->irq_work, do_mmap_read_unlock);
	}

	task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	ret = bpf_iter_reg_target(&task_reg_info);
	if (ret)
		return ret;

	task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
	ret = bpf_iter_reg_target(&task_file_reg_info);
	if (ret)
		return ret;

	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
	return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);