1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2020 Facebook */ 3 4 #include <linux/init.h> 5 #include <linux/namei.h> 6 #include <linux/pid_namespace.h> 7 #include <linux/fs.h> 8 #include <linux/fdtable.h> 9 #include <linux/filter.h> 10 #include <linux/btf_ids.h> 11 12 struct bpf_iter_seq_task_common { 13 struct pid_namespace *ns; 14 }; 15 16 struct bpf_iter_seq_task_info { 17 /* The first field must be struct bpf_iter_seq_task_common. 18 * this is assumed by {init, fini}_seq_pidns() callback functions. 19 */ 20 struct bpf_iter_seq_task_common common; 21 u32 tid; 22 }; 23 24 static struct task_struct *task_seq_get_next(struct pid_namespace *ns, 25 u32 *tid, 26 bool skip_if_dup_files) 27 { 28 struct task_struct *task = NULL; 29 struct pid *pid; 30 31 rcu_read_lock(); 32 retry: 33 pid = find_ge_pid(*tid, ns); 34 if (pid) { 35 *tid = pid_nr_ns(pid, ns); 36 task = get_pid_task(pid, PIDTYPE_PID); 37 if (!task) { 38 ++*tid; 39 goto retry; 40 } else if (skip_if_dup_files && !thread_group_leader(task) && 41 task->files == task->group_leader->files) { 42 put_task_struct(task); 43 task = NULL; 44 ++*tid; 45 goto retry; 46 } 47 } 48 rcu_read_unlock(); 49 50 return task; 51 } 52 53 static void *task_seq_start(struct seq_file *seq, loff_t *pos) 54 { 55 struct bpf_iter_seq_task_info *info = seq->private; 56 struct task_struct *task; 57 58 task = task_seq_get_next(info->common.ns, &info->tid, false); 59 if (!task) 60 return NULL; 61 62 if (*pos == 0) 63 ++*pos; 64 return task; 65 } 66 67 static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) 68 { 69 struct bpf_iter_seq_task_info *info = seq->private; 70 struct task_struct *task; 71 72 ++*pos; 73 ++info->tid; 74 put_task_struct((struct task_struct *)v); 75 task = task_seq_get_next(info->common.ns, &info->tid, false); 76 if (!task) 77 return NULL; 78 79 return task; 80 } 81 82 struct bpf_iter__task { 83 __bpf_md_ptr(struct bpf_iter_meta *, meta); 84 __bpf_md_ptr(struct task_struct *, task); 85 }; 86 87 DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task) 88 89 static int __task_seq_show(struct seq_file *seq, struct task_struct *task, 90 bool in_stop) 91 { 92 struct bpf_iter_meta meta; 93 struct bpf_iter__task ctx; 94 struct bpf_prog *prog; 95 96 meta.seq = seq; 97 prog = bpf_iter_get_info(&meta, in_stop); 98 if (!prog) 99 return 0; 100 101 meta.seq = seq; 102 ctx.meta = &meta; 103 ctx.task = task; 104 return bpf_iter_run_prog(prog, &ctx); 105 } 106 107 static int task_seq_show(struct seq_file *seq, void *v) 108 { 109 return __task_seq_show(seq, v, false); 110 } 111 112 static void task_seq_stop(struct seq_file *seq, void *v) 113 { 114 if (!v) 115 (void)__task_seq_show(seq, v, true); 116 else 117 put_task_struct((struct task_struct *)v); 118 } 119 120 static const struct seq_operations task_seq_ops = { 121 .start = task_seq_start, 122 .next = task_seq_next, 123 .stop = task_seq_stop, 124 .show = task_seq_show, 125 }; 126 127 struct bpf_iter_seq_task_file_info { 128 /* The first field must be struct bpf_iter_seq_task_common. 129 * this is assumed by {init, fini}_seq_pidns() callback functions. 130 */ 131 struct bpf_iter_seq_task_common common; 132 struct task_struct *task; 133 u32 tid; 134 u32 fd; 135 }; 136 137 static struct file * 138 task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) 139 { 140 struct pid_namespace *ns = info->common.ns; 141 u32 curr_tid = info->tid; 142 struct task_struct *curr_task; 143 unsigned int curr_fd = info->fd; 144 145 /* If this function returns a non-NULL file object, 146 * it held a reference to the task/file. 147 * Otherwise, it does not hold any reference. 148 */ 149 again: 150 if (info->task) { 151 curr_task = info->task; 152 curr_fd = info->fd; 153 } else { 154 curr_task = task_seq_get_next(ns, &curr_tid, true); 155 if (!curr_task) { 156 info->task = NULL; 157 info->tid = curr_tid; 158 return NULL; 159 } 160 161 /* set info->task and info->tid */ 162 info->task = curr_task; 163 if (curr_tid == info->tid) { 164 curr_fd = info->fd; 165 } else { 166 info->tid = curr_tid; 167 curr_fd = 0; 168 } 169 } 170 171 rcu_read_lock(); 172 for (;; curr_fd++) { 173 struct file *f; 174 f = task_lookup_next_fd_rcu(curr_task, &curr_fd); 175 if (!f) 176 break; 177 if (!get_file_rcu(f)) 178 continue; 179 180 /* set info->fd */ 181 info->fd = curr_fd; 182 rcu_read_unlock(); 183 return f; 184 } 185 186 /* the current task is done, go to the next task */ 187 rcu_read_unlock(); 188 put_task_struct(curr_task); 189 info->task = NULL; 190 info->fd = 0; 191 curr_tid = ++(info->tid); 192 goto again; 193 } 194 195 static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) 196 { 197 struct bpf_iter_seq_task_file_info *info = seq->private; 198 struct file *file; 199 200 info->task = NULL; 201 file = task_file_seq_get_next(info); 202 if (file && *pos == 0) 203 ++*pos; 204 205 return file; 206 } 207 208 static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) 209 { 210 struct bpf_iter_seq_task_file_info *info = seq->private; 211 212 ++*pos; 213 ++info->fd; 214 fput((struct file *)v); 215 return task_file_seq_get_next(info); 216 } 217 218 struct bpf_iter__task_file { 219 __bpf_md_ptr(struct bpf_iter_meta *, meta); 220 __bpf_md_ptr(struct task_struct *, task); 221 u32 fd __aligned(8); 222 __bpf_md_ptr(struct file *, file); 223 }; 224 225 DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta, 226 struct task_struct *task, u32 fd, 227 struct file *file) 228 229 static int __task_file_seq_show(struct seq_file *seq, struct file *file, 230 bool in_stop) 231 { 232 struct bpf_iter_seq_task_file_info *info = seq->private; 233 struct bpf_iter__task_file ctx; 234 struct bpf_iter_meta meta; 235 struct bpf_prog *prog; 236 237 meta.seq = seq; 238 prog = bpf_iter_get_info(&meta, in_stop); 239 if (!prog) 240 return 0; 241 242 ctx.meta = &meta; 243 ctx.task = info->task; 244 ctx.fd = info->fd; 245 ctx.file = file; 246 return bpf_iter_run_prog(prog, &ctx); 247 } 248 249 static int task_file_seq_show(struct seq_file *seq, void *v) 250 { 251 return __task_file_seq_show(seq, v, false); 252 } 253 254 static void task_file_seq_stop(struct seq_file *seq, void *v) 255 { 256 struct bpf_iter_seq_task_file_info *info = seq->private; 257 258 if (!v) { 259 (void)__task_file_seq_show(seq, v, true); 260 } else { 261 fput((struct file *)v); 262 put_task_struct(info->task); 263 info->task = NULL; 264 } 265 } 266 267 static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) 268 { 269 struct bpf_iter_seq_task_common *common = priv_data; 270 271 common->ns = get_pid_ns(task_active_pid_ns(current)); 272 return 0; 273 } 274 275 static void fini_seq_pidns(void *priv_data) 276 { 277 struct bpf_iter_seq_task_common *common = priv_data; 278 279 put_pid_ns(common->ns); 280 } 281 282 static const struct seq_operations task_file_seq_ops = { 283 .start = task_file_seq_start, 284 .next = task_file_seq_next, 285 .stop = task_file_seq_stop, 286 .show = task_file_seq_show, 287 }; 288 289 struct bpf_iter_seq_task_vma_info { 290 /* The first field must be struct bpf_iter_seq_task_common. 291 * this is assumed by {init, fini}_seq_pidns() callback functions. 292 */ 293 struct bpf_iter_seq_task_common common; 294 struct task_struct *task; 295 struct vm_area_struct *vma; 296 u32 tid; 297 unsigned long prev_vm_start; 298 unsigned long prev_vm_end; 299 }; 300 301 enum bpf_task_vma_iter_find_op { 302 task_vma_iter_first_vma, /* use mm->mmap */ 303 task_vma_iter_next_vma, /* use curr_vma->vm_next */ 304 task_vma_iter_find_vma, /* use find_vma() to find next vma */ 305 }; 306 307 static struct vm_area_struct * 308 task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) 309 { 310 struct pid_namespace *ns = info->common.ns; 311 enum bpf_task_vma_iter_find_op op; 312 struct vm_area_struct *curr_vma; 313 struct task_struct *curr_task; 314 u32 curr_tid = info->tid; 315 316 /* If this function returns a non-NULL vma, it holds a reference to 317 * the task_struct, and holds read lock on vma->mm->mmap_lock. 318 * If this function returns NULL, it does not hold any reference or 319 * lock. 320 */ 321 if (info->task) { 322 curr_task = info->task; 323 curr_vma = info->vma; 324 /* In case of lock contention, drop mmap_lock to unblock 325 * the writer. 326 * 327 * After relock, call find(mm, prev_vm_end - 1) to find 328 * new vma to process. 329 * 330 * +------+------+-----------+ 331 * | VMA1 | VMA2 | VMA3 | 332 * +------+------+-----------+ 333 * | | | | 334 * 4k 8k 16k 400k 335 * 336 * For example, curr_vma == VMA2. Before unlock, we set 337 * 338 * prev_vm_start = 8k 339 * prev_vm_end = 16k 340 * 341 * There are a few cases: 342 * 343 * 1) VMA2 is freed, but VMA3 exists. 344 * 345 * find_vma() will return VMA3, just process VMA3. 346 * 347 * 2) VMA2 still exists. 348 * 349 * find_vma() will return VMA2, process VMA2->next. 350 * 351 * 3) no more vma in this mm. 352 * 353 * Process the next task. 354 * 355 * 4) find_vma() returns a different vma, VMA2'. 356 * 357 * 4.1) If VMA2 covers same range as VMA2', skip VMA2', 358 * because we already covered the range; 359 * 4.2) VMA2 and VMA2' covers different ranges, process 360 * VMA2'. 361 */ 362 if (mmap_lock_is_contended(curr_task->mm)) { 363 info->prev_vm_start = curr_vma->vm_start; 364 info->prev_vm_end = curr_vma->vm_end; 365 op = task_vma_iter_find_vma; 366 mmap_read_unlock(curr_task->mm); 367 if (mmap_read_lock_killable(curr_task->mm)) 368 goto finish; 369 } else { 370 op = task_vma_iter_next_vma; 371 } 372 } else { 373 again: 374 curr_task = task_seq_get_next(ns, &curr_tid, true); 375 if (!curr_task) { 376 info->tid = curr_tid + 1; 377 goto finish; 378 } 379 380 if (curr_tid != info->tid) { 381 info->tid = curr_tid; 382 /* new task, process the first vma */ 383 op = task_vma_iter_first_vma; 384 } else { 385 /* Found the same tid, which means the user space 386 * finished data in previous buffer and read more. 387 * We dropped mmap_lock before returning to user 388 * space, so it is necessary to use find_vma() to 389 * find the next vma to process. 390 */ 391 op = task_vma_iter_find_vma; 392 } 393 394 if (!curr_task->mm) 395 goto next_task; 396 397 if (mmap_read_lock_killable(curr_task->mm)) 398 goto finish; 399 } 400 401 switch (op) { 402 case task_vma_iter_first_vma: 403 curr_vma = curr_task->mm->mmap; 404 break; 405 case task_vma_iter_next_vma: 406 curr_vma = curr_vma->vm_next; 407 break; 408 case task_vma_iter_find_vma: 409 /* We dropped mmap_lock so it is necessary to use find_vma 410 * to find the next vma. This is similar to the mechanism 411 * in show_smaps_rollup(). 412 */ 413 curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1); 414 /* case 1) and 4.2) above just use curr_vma */ 415 416 /* check for case 2) or case 4.1) above */ 417 if (curr_vma && 418 curr_vma->vm_start == info->prev_vm_start && 419 curr_vma->vm_end == info->prev_vm_end) 420 curr_vma = curr_vma->vm_next; 421 break; 422 } 423 if (!curr_vma) { 424 /* case 3) above, or case 2) 4.1) with vma->next == NULL */ 425 mmap_read_unlock(curr_task->mm); 426 goto next_task; 427 } 428 info->task = curr_task; 429 info->vma = curr_vma; 430 return curr_vma; 431 432 next_task: 433 put_task_struct(curr_task); 434 info->task = NULL; 435 curr_tid++; 436 goto again; 437 438 finish: 439 if (curr_task) 440 put_task_struct(curr_task); 441 info->task = NULL; 442 info->vma = NULL; 443 return NULL; 444 } 445 446 static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos) 447 { 448 struct bpf_iter_seq_task_vma_info *info = seq->private; 449 struct vm_area_struct *vma; 450 451 vma = task_vma_seq_get_next(info); 452 if (vma && *pos == 0) 453 ++*pos; 454 455 return vma; 456 } 457 458 static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos) 459 { 460 struct bpf_iter_seq_task_vma_info *info = seq->private; 461 462 ++*pos; 463 return task_vma_seq_get_next(info); 464 } 465 466 struct bpf_iter__task_vma { 467 __bpf_md_ptr(struct bpf_iter_meta *, meta); 468 __bpf_md_ptr(struct task_struct *, task); 469 __bpf_md_ptr(struct vm_area_struct *, vma); 470 }; 471 472 DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta, 473 struct task_struct *task, struct vm_area_struct *vma) 474 475 static int __task_vma_seq_show(struct seq_file *seq, bool in_stop) 476 { 477 struct bpf_iter_seq_task_vma_info *info = seq->private; 478 struct bpf_iter__task_vma ctx; 479 struct bpf_iter_meta meta; 480 struct bpf_prog *prog; 481 482 meta.seq = seq; 483 prog = bpf_iter_get_info(&meta, in_stop); 484 if (!prog) 485 return 0; 486 487 ctx.meta = &meta; 488 ctx.task = info->task; 489 ctx.vma = info->vma; 490 return bpf_iter_run_prog(prog, &ctx); 491 } 492 493 static int task_vma_seq_show(struct seq_file *seq, void *v) 494 { 495 return __task_vma_seq_show(seq, false); 496 } 497 498 static void task_vma_seq_stop(struct seq_file *seq, void *v) 499 { 500 struct bpf_iter_seq_task_vma_info *info = seq->private; 501 502 if (!v) { 503 (void)__task_vma_seq_show(seq, true); 504 } else { 505 /* info->vma has not been seen by the BPF program. If the 506 * user space reads more, task_vma_seq_get_next should 507 * return this vma again. Set prev_vm_start to ~0UL, 508 * so that we don't skip the vma returned by the next 509 * find_vma() (case task_vma_iter_find_vma in 510 * task_vma_seq_get_next()). 511 */ 512 info->prev_vm_start = ~0UL; 513 info->prev_vm_end = info->vma->vm_end; 514 mmap_read_unlock(info->task->mm); 515 put_task_struct(info->task); 516 info->task = NULL; 517 } 518 } 519 520 static const struct seq_operations task_vma_seq_ops = { 521 .start = task_vma_seq_start, 522 .next = task_vma_seq_next, 523 .stop = task_vma_seq_stop, 524 .show = task_vma_seq_show, 525 }; 526 527 BTF_ID_LIST(btf_task_file_ids) 528 BTF_ID(struct, file) 529 BTF_ID(struct, vm_area_struct) 530 531 static const struct bpf_iter_seq_info task_seq_info = { 532 .seq_ops = &task_seq_ops, 533 .init_seq_private = init_seq_pidns, 534 .fini_seq_private = fini_seq_pidns, 535 .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), 536 }; 537 538 static struct bpf_iter_reg task_reg_info = { 539 .target = "task", 540 .feature = BPF_ITER_RESCHED, 541 .ctx_arg_info_size = 1, 542 .ctx_arg_info = { 543 { offsetof(struct bpf_iter__task, task), 544 PTR_TO_BTF_ID_OR_NULL }, 545 }, 546 .seq_info = &task_seq_info, 547 }; 548 549 static const struct bpf_iter_seq_info task_file_seq_info = { 550 .seq_ops = &task_file_seq_ops, 551 .init_seq_private = init_seq_pidns, 552 .fini_seq_private = fini_seq_pidns, 553 .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), 554 }; 555 556 static struct bpf_iter_reg task_file_reg_info = { 557 .target = "task_file", 558 .feature = BPF_ITER_RESCHED, 559 .ctx_arg_info_size = 2, 560 .ctx_arg_info = { 561 { offsetof(struct bpf_iter__task_file, task), 562 PTR_TO_BTF_ID_OR_NULL }, 563 { offsetof(struct bpf_iter__task_file, file), 564 PTR_TO_BTF_ID_OR_NULL }, 565 }, 566 .seq_info = &task_file_seq_info, 567 }; 568 569 static const struct bpf_iter_seq_info task_vma_seq_info = { 570 .seq_ops = &task_vma_seq_ops, 571 .init_seq_private = init_seq_pidns, 572 .fini_seq_private = fini_seq_pidns, 573 .seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info), 574 }; 575 576 static struct bpf_iter_reg task_vma_reg_info = { 577 .target = "task_vma", 578 .feature = BPF_ITER_RESCHED, 579 .ctx_arg_info_size = 2, 580 .ctx_arg_info = { 581 { offsetof(struct bpf_iter__task_vma, task), 582 PTR_TO_BTF_ID_OR_NULL }, 583 { offsetof(struct bpf_iter__task_vma, vma), 584 PTR_TO_BTF_ID_OR_NULL }, 585 }, 586 .seq_info = &task_vma_seq_info, 587 }; 588 589 static int __init task_iter_init(void) 590 { 591 int ret; 592 593 task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; 594 ret = bpf_iter_reg_target(&task_reg_info); 595 if (ret) 596 return ret; 597 598 task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; 599 task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0]; 600 ret = bpf_iter_reg_target(&task_file_reg_info); 601 if (ret) 602 return ret; 603 604 task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; 605 task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; 606 return bpf_iter_reg_target(&task_vma_reg_info); 607 } 608 late_initcall(task_iter_init); 609