1 /* 2 * taskstats.c - Export per-task statistics to userland 3 * 4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 5 * (C) Balbir Singh, IBM Corp. 2006 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 */ 18 19 #include <linux/kernel.h> 20 #include <linux/taskstats_kern.h> 21 #include <linux/tsacct_kern.h> 22 #include <linux/delayacct.h> 23 #include <linux/cpumask.h> 24 #include <linux/percpu.h> 25 #include <linux/slab.h> 26 #include <linux/cgroupstats.h> 27 #include <linux/cgroup.h> 28 #include <linux/fs.h> 29 #include <linux/file.h> 30 #include <linux/pid_namespace.h> 31 #include <net/genetlink.h> 32 #include <linux/atomic.h> 33 #include <linux/sched/cputime.h> 34 35 /* 36 * Maximum length of a cpumask that can be specified in 37 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute 38 */ 39 #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 40 41 static DEFINE_PER_CPU(__u32, taskstats_seqnum); 42 static int family_registered; 43 struct kmem_cache *taskstats_cache; 44 45 static struct genl_family family; 46 47 static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { 48 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 49 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 50 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 51 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 52 53 /* 54 * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. 55 * Make sure they are always aligned. 56 */ 57 static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { 58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 59 }; 60 61 struct listener { 62 struct list_head list; 63 pid_t pid; 64 char valid; 65 }; 66 67 struct listener_list { 68 struct rw_semaphore sem; 69 struct list_head list; 70 }; 71 static DEFINE_PER_CPU(struct listener_list, listener_array); 72 73 enum actions { 74 REGISTER, 75 DEREGISTER, 76 CPU_DONT_CARE 77 }; 78 79 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 80 size_t size) 81 { 82 struct sk_buff *skb; 83 void *reply; 84 85 /* 86 * If new attributes are added, please revisit this allocation 87 */ 88 skb = genlmsg_new(size, GFP_KERNEL); 89 if (!skb) 90 return -ENOMEM; 91 92 if (!info) { 93 int seq = this_cpu_inc_return(taskstats_seqnum) - 1; 94 95 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 96 } else 97 reply = genlmsg_put_reply(skb, info, &family, 0, cmd); 98 if (reply == NULL) { 99 nlmsg_free(skb); 100 return -EINVAL; 101 } 102 103 *skbp = skb; 104 return 0; 105 } 106 107 /* 108 * Send taskstats data in @skb to listener with nl_pid @pid 109 */ 110 static int send_reply(struct sk_buff *skb, struct genl_info *info) 111 { 112 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 113 void *reply = genlmsg_data(genlhdr); 114 115 genlmsg_end(skb, reply); 116 117 return genlmsg_reply(skb, info); 118 } 119 120 /* 121 * Send taskstats data in @skb to listeners registered for @cpu's exit data 122 */ 123 static void send_cpu_listeners(struct sk_buff *skb, 124 struct listener_list *listeners) 125 { 126 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 127 struct listener *s, *tmp; 128 struct sk_buff *skb_next, *skb_cur = skb; 129 void *reply = genlmsg_data(genlhdr); 130 int rc, delcount = 0; 131 132 genlmsg_end(skb, reply); 133 134 rc = 0; 135 down_read(&listeners->sem); 136 list_for_each_entry(s, &listeners->list, list) { 137 skb_next = NULL; 138 if (!list_is_last(&s->list, &listeners->list)) { 139 skb_next = skb_clone(skb_cur, GFP_KERNEL); 140 if (!skb_next) 141 break; 142 } 143 rc = genlmsg_unicast(&init_net, skb_cur, s->pid); 144 if (rc == -ECONNREFUSED) { 145 s->valid = 0; 146 delcount++; 147 } 148 skb_cur = skb_next; 149 } 150 up_read(&listeners->sem); 151 152 if (skb_cur) 153 nlmsg_free(skb_cur); 154 155 if (!delcount) 156 return; 157 158 /* Delete invalidated entries */ 159 down_write(&listeners->sem); 160 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 161 if (!s->valid) { 162 list_del(&s->list); 163 kfree(s); 164 } 165 } 166 up_write(&listeners->sem); 167 } 168 169 static void fill_stats(struct user_namespace *user_ns, 170 struct pid_namespace *pid_ns, 171 struct task_struct *tsk, struct taskstats *stats) 172 { 173 memset(stats, 0, sizeof(*stats)); 174 /* 175 * Each accounting subsystem adds calls to its functions to 176 * fill in relevant parts of struct taskstsats as follows 177 * 178 * per-task-foo(stats, tsk); 179 */ 180 181 delayacct_add_tsk(stats, tsk); 182 183 /* fill in basic acct fields */ 184 stats->version = TASKSTATS_VERSION; 185 stats->nvcsw = tsk->nvcsw; 186 stats->nivcsw = tsk->nivcsw; 187 bacct_add_tsk(user_ns, pid_ns, stats, tsk); 188 189 /* fill in extended acct fields */ 190 xacct_add_tsk(stats, tsk); 191 } 192 193 static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) 194 { 195 struct task_struct *tsk; 196 197 tsk = find_get_task_by_vpid(pid); 198 if (!tsk) 199 return -ESRCH; 200 fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); 201 put_task_struct(tsk); 202 return 0; 203 } 204 205 static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) 206 { 207 struct task_struct *tsk, *first; 208 unsigned long flags; 209 int rc = -ESRCH; 210 u64 delta, utime, stime; 211 u64 start_time; 212 213 /* 214 * Add additional stats from live tasks except zombie thread group 215 * leaders who are already counted with the dead tasks 216 */ 217 rcu_read_lock(); 218 first = find_task_by_vpid(tgid); 219 220 if (!first || !lock_task_sighand(first, &flags)) 221 goto out; 222 223 if (first->signal->stats) 224 memcpy(stats, first->signal->stats, sizeof(*stats)); 225 else 226 memset(stats, 0, sizeof(*stats)); 227 228 tsk = first; 229 start_time = ktime_get_ns(); 230 do { 231 if (tsk->exit_state) 232 continue; 233 /* 234 * Accounting subsystem can call its functions here to 235 * fill in relevant parts of struct taskstsats as follows 236 * 237 * per-task-foo(stats, tsk); 238 */ 239 delayacct_add_tsk(stats, tsk); 240 241 /* calculate task elapsed time in nsec */ 242 delta = start_time - tsk->start_time; 243 /* Convert to micro seconds */ 244 do_div(delta, NSEC_PER_USEC); 245 stats->ac_etime += delta; 246 247 task_cputime(tsk, &utime, &stime); 248 stats->ac_utime += div_u64(utime, NSEC_PER_USEC); 249 stats->ac_stime += div_u64(stime, NSEC_PER_USEC); 250 251 stats->nvcsw += tsk->nvcsw; 252 stats->nivcsw += tsk->nivcsw; 253 } while_each_thread(first, tsk); 254 255 unlock_task_sighand(first, &flags); 256 rc = 0; 257 out: 258 rcu_read_unlock(); 259 260 stats->version = TASKSTATS_VERSION; 261 /* 262 * Accounting subsystems can also add calls here to modify 263 * fields of taskstats. 264 */ 265 return rc; 266 } 267 268 static void fill_tgid_exit(struct task_struct *tsk) 269 { 270 unsigned long flags; 271 272 spin_lock_irqsave(&tsk->sighand->siglock, flags); 273 if (!tsk->signal->stats) 274 goto ret; 275 276 /* 277 * Each accounting subsystem calls its functions here to 278 * accumalate its per-task stats for tsk, into the per-tgid structure 279 * 280 * per-task-foo(tsk->signal->stats, tsk); 281 */ 282 delayacct_add_tsk(tsk->signal->stats, tsk); 283 ret: 284 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 285 return; 286 } 287 288 static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 289 { 290 struct listener_list *listeners; 291 struct listener *s, *tmp, *s2; 292 unsigned int cpu; 293 int ret = 0; 294 295 if (!cpumask_subset(mask, cpu_possible_mask)) 296 return -EINVAL; 297 298 if (current_user_ns() != &init_user_ns) 299 return -EINVAL; 300 301 if (task_active_pid_ns(current) != &init_pid_ns) 302 return -EINVAL; 303 304 if (isadd == REGISTER) { 305 for_each_cpu(cpu, mask) { 306 s = kmalloc_node(sizeof(struct listener), 307 GFP_KERNEL, cpu_to_node(cpu)); 308 if (!s) { 309 ret = -ENOMEM; 310 goto cleanup; 311 } 312 s->pid = pid; 313 s->valid = 1; 314 315 listeners = &per_cpu(listener_array, cpu); 316 down_write(&listeners->sem); 317 list_for_each_entry(s2, &listeners->list, list) { 318 if (s2->pid == pid && s2->valid) 319 goto exists; 320 } 321 list_add(&s->list, &listeners->list); 322 s = NULL; 323 exists: 324 up_write(&listeners->sem); 325 kfree(s); /* nop if NULL */ 326 } 327 return 0; 328 } 329 330 /* Deregister or cleanup */ 331 cleanup: 332 for_each_cpu(cpu, mask) { 333 listeners = &per_cpu(listener_array, cpu); 334 down_write(&listeners->sem); 335 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 336 if (s->pid == pid) { 337 list_del(&s->list); 338 kfree(s); 339 break; 340 } 341 } 342 up_write(&listeners->sem); 343 } 344 return ret; 345 } 346 347 static int parse(struct nlattr *na, struct cpumask *mask) 348 { 349 char *data; 350 int len; 351 int ret; 352 353 if (na == NULL) 354 return 1; 355 len = nla_len(na); 356 if (len > TASKSTATS_CPUMASK_MAXLEN) 357 return -E2BIG; 358 if (len < 1) 359 return -EINVAL; 360 data = kmalloc(len, GFP_KERNEL); 361 if (!data) 362 return -ENOMEM; 363 nla_strlcpy(data, na, len); 364 ret = cpulist_parse(data, mask); 365 kfree(data); 366 return ret; 367 } 368 369 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 370 { 371 struct nlattr *na, *ret; 372 int aggr; 373 374 aggr = (type == TASKSTATS_TYPE_PID) 375 ? TASKSTATS_TYPE_AGGR_PID 376 : TASKSTATS_TYPE_AGGR_TGID; 377 378 na = nla_nest_start(skb, aggr); 379 if (!na) 380 goto err; 381 382 if (nla_put(skb, type, sizeof(pid), &pid) < 0) { 383 nla_nest_cancel(skb, na); 384 goto err; 385 } 386 ret = nla_reserve_64bit(skb, TASKSTATS_TYPE_STATS, 387 sizeof(struct taskstats), TASKSTATS_TYPE_NULL); 388 if (!ret) { 389 nla_nest_cancel(skb, na); 390 goto err; 391 } 392 nla_nest_end(skb, na); 393 394 return nla_data(ret); 395 err: 396 return NULL; 397 } 398 399 static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 400 { 401 int rc = 0; 402 struct sk_buff *rep_skb; 403 struct cgroupstats *stats; 404 struct nlattr *na; 405 size_t size; 406 u32 fd; 407 struct fd f; 408 409 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 410 if (!na) 411 return -EINVAL; 412 413 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 414 f = fdget(fd); 415 if (!f.file) 416 return 0; 417 418 size = nla_total_size(sizeof(struct cgroupstats)); 419 420 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 421 size); 422 if (rc < 0) 423 goto err; 424 425 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 426 sizeof(struct cgroupstats)); 427 if (na == NULL) { 428 nlmsg_free(rep_skb); 429 rc = -EMSGSIZE; 430 goto err; 431 } 432 433 stats = nla_data(na); 434 memset(stats, 0, sizeof(*stats)); 435 436 rc = cgroupstats_build(stats, f.file->f_path.dentry); 437 if (rc < 0) { 438 nlmsg_free(rep_skb); 439 goto err; 440 } 441 442 rc = send_reply(rep_skb, info); 443 444 err: 445 fdput(f); 446 return rc; 447 } 448 449 static int cmd_attr_register_cpumask(struct genl_info *info) 450 { 451 cpumask_var_t mask; 452 int rc; 453 454 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 455 return -ENOMEM; 456 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 457 if (rc < 0) 458 goto out; 459 rc = add_del_listener(info->snd_portid, mask, REGISTER); 460 out: 461 free_cpumask_var(mask); 462 return rc; 463 } 464 465 static int cmd_attr_deregister_cpumask(struct genl_info *info) 466 { 467 cpumask_var_t mask; 468 int rc; 469 470 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 471 return -ENOMEM; 472 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 473 if (rc < 0) 474 goto out; 475 rc = add_del_listener(info->snd_portid, mask, DEREGISTER); 476 out: 477 free_cpumask_var(mask); 478 return rc; 479 } 480 481 static size_t taskstats_packet_size(void) 482 { 483 size_t size; 484 485 size = nla_total_size(sizeof(u32)) + 486 nla_total_size_64bit(sizeof(struct taskstats)) + 487 nla_total_size(0); 488 489 return size; 490 } 491 492 static int cmd_attr_pid(struct genl_info *info) 493 { 494 struct taskstats *stats; 495 struct sk_buff *rep_skb; 496 size_t size; 497 u32 pid; 498 int rc; 499 500 size = taskstats_packet_size(); 501 502 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 503 if (rc < 0) 504 return rc; 505 506 rc = -EINVAL; 507 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 508 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 509 if (!stats) 510 goto err; 511 512 rc = fill_stats_for_pid(pid, stats); 513 if (rc < 0) 514 goto err; 515 return send_reply(rep_skb, info); 516 err: 517 nlmsg_free(rep_skb); 518 return rc; 519 } 520 521 static int cmd_attr_tgid(struct genl_info *info) 522 { 523 struct taskstats *stats; 524 struct sk_buff *rep_skb; 525 size_t size; 526 u32 tgid; 527 int rc; 528 529 size = taskstats_packet_size(); 530 531 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 532 if (rc < 0) 533 return rc; 534 535 rc = -EINVAL; 536 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 537 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 538 if (!stats) 539 goto err; 540 541 rc = fill_stats_for_tgid(tgid, stats); 542 if (rc < 0) 543 goto err; 544 return send_reply(rep_skb, info); 545 err: 546 nlmsg_free(rep_skb); 547 return rc; 548 } 549 550 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 551 { 552 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) 553 return cmd_attr_register_cpumask(info); 554 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) 555 return cmd_attr_deregister_cpumask(info); 556 else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) 557 return cmd_attr_pid(info); 558 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) 559 return cmd_attr_tgid(info); 560 else 561 return -EINVAL; 562 } 563 564 static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 565 { 566 struct signal_struct *sig = tsk->signal; 567 struct taskstats *stats; 568 569 if (sig->stats || thread_group_empty(tsk)) 570 goto ret; 571 572 /* No problem if kmem_cache_zalloc() fails */ 573 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); 574 575 spin_lock_irq(&tsk->sighand->siglock); 576 if (!sig->stats) { 577 sig->stats = stats; 578 stats = NULL; 579 } 580 spin_unlock_irq(&tsk->sighand->siglock); 581 582 if (stats) 583 kmem_cache_free(taskstats_cache, stats); 584 ret: 585 return sig->stats; 586 } 587 588 /* Send pid data out on exit */ 589 void taskstats_exit(struct task_struct *tsk, int group_dead) 590 { 591 int rc; 592 struct listener_list *listeners; 593 struct taskstats *stats; 594 struct sk_buff *rep_skb; 595 size_t size; 596 int is_thread_group; 597 598 if (!family_registered) 599 return; 600 601 /* 602 * Size includes space for nested attributes 603 */ 604 size = taskstats_packet_size(); 605 606 is_thread_group = !!taskstats_tgid_alloc(tsk); 607 if (is_thread_group) { 608 /* PID + STATS + TGID + STATS */ 609 size = 2 * size; 610 /* fill the tsk->signal->stats structure */ 611 fill_tgid_exit(tsk); 612 } 613 614 listeners = raw_cpu_ptr(&listener_array); 615 if (list_empty(&listeners->list)) 616 return; 617 618 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); 619 if (rc < 0) 620 return; 621 622 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, 623 task_pid_nr_ns(tsk, &init_pid_ns)); 624 if (!stats) 625 goto err; 626 627 fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); 628 629 /* 630 * Doesn't matter if tsk is the leader or the last group member leaving 631 */ 632 if (!is_thread_group || !group_dead) 633 goto send; 634 635 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, 636 task_tgid_nr_ns(tsk, &init_pid_ns)); 637 if (!stats) 638 goto err; 639 640 memcpy(stats, tsk->signal->stats, sizeof(*stats)); 641 642 send: 643 send_cpu_listeners(rep_skb, listeners); 644 return; 645 err: 646 nlmsg_free(rep_skb); 647 } 648 649 static const struct genl_ops taskstats_ops[] = { 650 { 651 .cmd = TASKSTATS_CMD_GET, 652 .doit = taskstats_user_cmd, 653 .policy = taskstats_cmd_get_policy, 654 .flags = GENL_ADMIN_PERM, 655 }, 656 { 657 .cmd = CGROUPSTATS_CMD_GET, 658 .doit = cgroupstats_user_cmd, 659 .policy = cgroupstats_cmd_get_policy, 660 }, 661 }; 662 663 static struct genl_family family __ro_after_init = { 664 .name = TASKSTATS_GENL_NAME, 665 .version = TASKSTATS_GENL_VERSION, 666 .maxattr = TASKSTATS_CMD_ATTR_MAX, 667 .module = THIS_MODULE, 668 .ops = taskstats_ops, 669 .n_ops = ARRAY_SIZE(taskstats_ops), 670 }; 671 672 /* Needed early in initialization */ 673 void __init taskstats_init_early(void) 674 { 675 unsigned int i; 676 677 taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); 678 for_each_possible_cpu(i) { 679 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 680 init_rwsem(&(per_cpu(listener_array, i).sem)); 681 } 682 } 683 684 static int __init taskstats_init(void) 685 { 686 int rc; 687 688 rc = genl_register_family(&family); 689 if (rc) 690 return rc; 691 692 family_registered = 1; 693 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 694 return 0; 695 } 696 697 /* 698 * late initcall ensures initialization of statistics collection 699 * mechanisms precedes initialization of the taskstats interface 700 */ 701 late_initcall(taskstats_init); 702