1 /* 2 * taskstats.c - Export per-task statistics to userland 3 * 4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 5 * (C) Balbir Singh, IBM Corp. 2006 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 */ 18 19 #include <linux/kernel.h> 20 #include <linux/taskstats_kern.h> 21 #include <linux/tsacct_kern.h> 22 #include <linux/delayacct.h> 23 #include <linux/cpumask.h> 24 #include <linux/percpu.h> 25 #include <linux/slab.h> 26 #include <linux/cgroupstats.h> 27 #include <linux/cgroup.h> 28 #include <linux/fs.h> 29 #include <linux/file.h> 30 #include <linux/pid_namespace.h> 31 #include <net/genetlink.h> 32 #include <linux/atomic.h> 33 #include <linux/sched/cputime.h> 34 35 /* 36 * Maximum length of a cpumask that can be specified in 37 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute 38 */ 39 #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 40 41 static DEFINE_PER_CPU(__u32, taskstats_seqnum); 42 static int family_registered; 43 struct kmem_cache *taskstats_cache; 44 45 static struct genl_family family; 46 47 static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { 48 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 49 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 50 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 51 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 52 53 /* 54 * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. 55 * Make sure they are always aligned. 56 */ 57 static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { 58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 59 }; 60 61 struct listener { 62 struct list_head list; 63 pid_t pid; 64 char valid; 65 }; 66 67 struct listener_list { 68 struct rw_semaphore sem; 69 struct list_head list; 70 }; 71 static DEFINE_PER_CPU(struct listener_list, listener_array); 72 73 enum actions { 74 REGISTER, 75 DEREGISTER, 76 CPU_DONT_CARE 77 }; 78 79 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 80 size_t size) 81 { 82 struct sk_buff *skb; 83 void *reply; 84 85 /* 86 * If new attributes are added, please revisit this allocation 87 */ 88 skb = genlmsg_new(size, GFP_KERNEL); 89 if (!skb) 90 return -ENOMEM; 91 92 if (!info) { 93 int seq = this_cpu_inc_return(taskstats_seqnum) - 1; 94 95 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 96 } else 97 reply = genlmsg_put_reply(skb, info, &family, 0, cmd); 98 if (reply == NULL) { 99 nlmsg_free(skb); 100 return -EINVAL; 101 } 102 103 *skbp = skb; 104 return 0; 105 } 106 107 /* 108 * Send taskstats data in @skb to listener with nl_pid @pid 109 */ 110 static int send_reply(struct sk_buff *skb, struct genl_info *info) 111 { 112 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 113 void *reply = genlmsg_data(genlhdr); 114 115 genlmsg_end(skb, reply); 116 117 return genlmsg_reply(skb, info); 118 } 119 120 /* 121 * Send taskstats data in @skb to listeners registered for @cpu's exit data 122 */ 123 static void send_cpu_listeners(struct sk_buff *skb, 124 struct listener_list *listeners) 125 { 126 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 127 struct listener *s, *tmp; 128 struct sk_buff *skb_next, *skb_cur = skb; 129 void *reply = genlmsg_data(genlhdr); 130 int rc, delcount = 0; 131 132 genlmsg_end(skb, reply); 133 134 rc = 0; 135 down_read(&listeners->sem); 136 list_for_each_entry(s, &listeners->list, list) { 137 skb_next = NULL; 138 if (!list_is_last(&s->list, &listeners->list)) { 139 skb_next = skb_clone(skb_cur, GFP_KERNEL); 140 if (!skb_next) 141 break; 142 } 143 rc = genlmsg_unicast(&init_net, skb_cur, s->pid); 144 if (rc == -ECONNREFUSED) { 145 s->valid = 0; 146 delcount++; 147 } 148 skb_cur = skb_next; 149 } 150 up_read(&listeners->sem); 151 152 if (skb_cur) 153 nlmsg_free(skb_cur); 154 155 if (!delcount) 156 return; 157 158 /* Delete invalidated entries */ 159 down_write(&listeners->sem); 160 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 161 if (!s->valid) { 162 list_del(&s->list); 163 kfree(s); 164 } 165 } 166 up_write(&listeners->sem); 167 } 168 169 static void fill_stats(struct user_namespace *user_ns, 170 struct pid_namespace *pid_ns, 171 struct task_struct *tsk, struct taskstats *stats) 172 { 173 memset(stats, 0, sizeof(*stats)); 174 /* 175 * Each accounting subsystem adds calls to its functions to 176 * fill in relevant parts of struct taskstsats as follows 177 * 178 * per-task-foo(stats, tsk); 179 */ 180 181 delayacct_add_tsk(stats, tsk); 182 183 /* fill in basic acct fields */ 184 stats->version = TASKSTATS_VERSION; 185 stats->nvcsw = tsk->nvcsw; 186 stats->nivcsw = tsk->nivcsw; 187 bacct_add_tsk(user_ns, pid_ns, stats, tsk); 188 189 /* fill in extended acct fields */ 190 xacct_add_tsk(stats, tsk); 191 } 192 193 static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) 194 { 195 struct task_struct *tsk; 196 197 rcu_read_lock(); 198 tsk = find_task_by_vpid(pid); 199 if (tsk) 200 get_task_struct(tsk); 201 rcu_read_unlock(); 202 if (!tsk) 203 return -ESRCH; 204 fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); 205 put_task_struct(tsk); 206 return 0; 207 } 208 209 static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) 210 { 211 struct task_struct *tsk, *first; 212 unsigned long flags; 213 int rc = -ESRCH; 214 u64 delta, utime, stime; 215 u64 start_time; 216 217 /* 218 * Add additional stats from live tasks except zombie thread group 219 * leaders who are already counted with the dead tasks 220 */ 221 rcu_read_lock(); 222 first = find_task_by_vpid(tgid); 223 224 if (!first || !lock_task_sighand(first, &flags)) 225 goto out; 226 227 if (first->signal->stats) 228 memcpy(stats, first->signal->stats, sizeof(*stats)); 229 else 230 memset(stats, 0, sizeof(*stats)); 231 232 tsk = first; 233 start_time = ktime_get_ns(); 234 do { 235 if (tsk->exit_state) 236 continue; 237 /* 238 * Accounting subsystem can call its functions here to 239 * fill in relevant parts of struct taskstsats as follows 240 * 241 * per-task-foo(stats, tsk); 242 */ 243 delayacct_add_tsk(stats, tsk); 244 245 /* calculate task elapsed time in nsec */ 246 delta = start_time - tsk->start_time; 247 /* Convert to micro seconds */ 248 do_div(delta, NSEC_PER_USEC); 249 stats->ac_etime += delta; 250 251 task_cputime(tsk, &utime, &stime); 252 stats->ac_utime += div_u64(utime, NSEC_PER_USEC); 253 stats->ac_stime += div_u64(stime, NSEC_PER_USEC); 254 255 stats->nvcsw += tsk->nvcsw; 256 stats->nivcsw += tsk->nivcsw; 257 } while_each_thread(first, tsk); 258 259 unlock_task_sighand(first, &flags); 260 rc = 0; 261 out: 262 rcu_read_unlock(); 263 264 stats->version = TASKSTATS_VERSION; 265 /* 266 * Accounting subsystems can also add calls here to modify 267 * fields of taskstats. 268 */ 269 return rc; 270 } 271 272 static void fill_tgid_exit(struct task_struct *tsk) 273 { 274 unsigned long flags; 275 276 spin_lock_irqsave(&tsk->sighand->siglock, flags); 277 if (!tsk->signal->stats) 278 goto ret; 279 280 /* 281 * Each accounting subsystem calls its functions here to 282 * accumalate its per-task stats for tsk, into the per-tgid structure 283 * 284 * per-task-foo(tsk->signal->stats, tsk); 285 */ 286 delayacct_add_tsk(tsk->signal->stats, tsk); 287 ret: 288 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 289 return; 290 } 291 292 static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 293 { 294 struct listener_list *listeners; 295 struct listener *s, *tmp, *s2; 296 unsigned int cpu; 297 int ret = 0; 298 299 if (!cpumask_subset(mask, cpu_possible_mask)) 300 return -EINVAL; 301 302 if (current_user_ns() != &init_user_ns) 303 return -EINVAL; 304 305 if (task_active_pid_ns(current) != &init_pid_ns) 306 return -EINVAL; 307 308 if (isadd == REGISTER) { 309 for_each_cpu(cpu, mask) { 310 s = kmalloc_node(sizeof(struct listener), 311 GFP_KERNEL, cpu_to_node(cpu)); 312 if (!s) { 313 ret = -ENOMEM; 314 goto cleanup; 315 } 316 s->pid = pid; 317 s->valid = 1; 318 319 listeners = &per_cpu(listener_array, cpu); 320 down_write(&listeners->sem); 321 list_for_each_entry(s2, &listeners->list, list) { 322 if (s2->pid == pid && s2->valid) 323 goto exists; 324 } 325 list_add(&s->list, &listeners->list); 326 s = NULL; 327 exists: 328 up_write(&listeners->sem); 329 kfree(s); /* nop if NULL */ 330 } 331 return 0; 332 } 333 334 /* Deregister or cleanup */ 335 cleanup: 336 for_each_cpu(cpu, mask) { 337 listeners = &per_cpu(listener_array, cpu); 338 down_write(&listeners->sem); 339 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 340 if (s->pid == pid) { 341 list_del(&s->list); 342 kfree(s); 343 break; 344 } 345 } 346 up_write(&listeners->sem); 347 } 348 return ret; 349 } 350 351 static int parse(struct nlattr *na, struct cpumask *mask) 352 { 353 char *data; 354 int len; 355 int ret; 356 357 if (na == NULL) 358 return 1; 359 len = nla_len(na); 360 if (len > TASKSTATS_CPUMASK_MAXLEN) 361 return -E2BIG; 362 if (len < 1) 363 return -EINVAL; 364 data = kmalloc(len, GFP_KERNEL); 365 if (!data) 366 return -ENOMEM; 367 nla_strlcpy(data, na, len); 368 ret = cpulist_parse(data, mask); 369 kfree(data); 370 return ret; 371 } 372 373 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 374 { 375 struct nlattr *na, *ret; 376 int aggr; 377 378 aggr = (type == TASKSTATS_TYPE_PID) 379 ? TASKSTATS_TYPE_AGGR_PID 380 : TASKSTATS_TYPE_AGGR_TGID; 381 382 na = nla_nest_start(skb, aggr); 383 if (!na) 384 goto err; 385 386 if (nla_put(skb, type, sizeof(pid), &pid) < 0) { 387 nla_nest_cancel(skb, na); 388 goto err; 389 } 390 ret = nla_reserve_64bit(skb, TASKSTATS_TYPE_STATS, 391 sizeof(struct taskstats), TASKSTATS_TYPE_NULL); 392 if (!ret) { 393 nla_nest_cancel(skb, na); 394 goto err; 395 } 396 nla_nest_end(skb, na); 397 398 return nla_data(ret); 399 err: 400 return NULL; 401 } 402 403 static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 404 { 405 int rc = 0; 406 struct sk_buff *rep_skb; 407 struct cgroupstats *stats; 408 struct nlattr *na; 409 size_t size; 410 u32 fd; 411 struct fd f; 412 413 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 414 if (!na) 415 return -EINVAL; 416 417 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 418 f = fdget(fd); 419 if (!f.file) 420 return 0; 421 422 size = nla_total_size(sizeof(struct cgroupstats)); 423 424 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 425 size); 426 if (rc < 0) 427 goto err; 428 429 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 430 sizeof(struct cgroupstats)); 431 if (na == NULL) { 432 nlmsg_free(rep_skb); 433 rc = -EMSGSIZE; 434 goto err; 435 } 436 437 stats = nla_data(na); 438 memset(stats, 0, sizeof(*stats)); 439 440 rc = cgroupstats_build(stats, f.file->f_path.dentry); 441 if (rc < 0) { 442 nlmsg_free(rep_skb); 443 goto err; 444 } 445 446 rc = send_reply(rep_skb, info); 447 448 err: 449 fdput(f); 450 return rc; 451 } 452 453 static int cmd_attr_register_cpumask(struct genl_info *info) 454 { 455 cpumask_var_t mask; 456 int rc; 457 458 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 459 return -ENOMEM; 460 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 461 if (rc < 0) 462 goto out; 463 rc = add_del_listener(info->snd_portid, mask, REGISTER); 464 out: 465 free_cpumask_var(mask); 466 return rc; 467 } 468 469 static int cmd_attr_deregister_cpumask(struct genl_info *info) 470 { 471 cpumask_var_t mask; 472 int rc; 473 474 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 475 return -ENOMEM; 476 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 477 if (rc < 0) 478 goto out; 479 rc = add_del_listener(info->snd_portid, mask, DEREGISTER); 480 out: 481 free_cpumask_var(mask); 482 return rc; 483 } 484 485 static size_t taskstats_packet_size(void) 486 { 487 size_t size; 488 489 size = nla_total_size(sizeof(u32)) + 490 nla_total_size_64bit(sizeof(struct taskstats)) + 491 nla_total_size(0); 492 493 return size; 494 } 495 496 static int cmd_attr_pid(struct genl_info *info) 497 { 498 struct taskstats *stats; 499 struct sk_buff *rep_skb; 500 size_t size; 501 u32 pid; 502 int rc; 503 504 size = taskstats_packet_size(); 505 506 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 507 if (rc < 0) 508 return rc; 509 510 rc = -EINVAL; 511 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 512 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 513 if (!stats) 514 goto err; 515 516 rc = fill_stats_for_pid(pid, stats); 517 if (rc < 0) 518 goto err; 519 return send_reply(rep_skb, info); 520 err: 521 nlmsg_free(rep_skb); 522 return rc; 523 } 524 525 static int cmd_attr_tgid(struct genl_info *info) 526 { 527 struct taskstats *stats; 528 struct sk_buff *rep_skb; 529 size_t size; 530 u32 tgid; 531 int rc; 532 533 size = taskstats_packet_size(); 534 535 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 536 if (rc < 0) 537 return rc; 538 539 rc = -EINVAL; 540 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 541 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 542 if (!stats) 543 goto err; 544 545 rc = fill_stats_for_tgid(tgid, stats); 546 if (rc < 0) 547 goto err; 548 return send_reply(rep_skb, info); 549 err: 550 nlmsg_free(rep_skb); 551 return rc; 552 } 553 554 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 555 { 556 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) 557 return cmd_attr_register_cpumask(info); 558 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) 559 return cmd_attr_deregister_cpumask(info); 560 else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) 561 return cmd_attr_pid(info); 562 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) 563 return cmd_attr_tgid(info); 564 else 565 return -EINVAL; 566 } 567 568 static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 569 { 570 struct signal_struct *sig = tsk->signal; 571 struct taskstats *stats; 572 573 if (sig->stats || thread_group_empty(tsk)) 574 goto ret; 575 576 /* No problem if kmem_cache_zalloc() fails */ 577 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); 578 579 spin_lock_irq(&tsk->sighand->siglock); 580 if (!sig->stats) { 581 sig->stats = stats; 582 stats = NULL; 583 } 584 spin_unlock_irq(&tsk->sighand->siglock); 585 586 if (stats) 587 kmem_cache_free(taskstats_cache, stats); 588 ret: 589 return sig->stats; 590 } 591 592 /* Send pid data out on exit */ 593 void taskstats_exit(struct task_struct *tsk, int group_dead) 594 { 595 int rc; 596 struct listener_list *listeners; 597 struct taskstats *stats; 598 struct sk_buff *rep_skb; 599 size_t size; 600 int is_thread_group; 601 602 if (!family_registered) 603 return; 604 605 /* 606 * Size includes space for nested attributes 607 */ 608 size = taskstats_packet_size(); 609 610 is_thread_group = !!taskstats_tgid_alloc(tsk); 611 if (is_thread_group) { 612 /* PID + STATS + TGID + STATS */ 613 size = 2 * size; 614 /* fill the tsk->signal->stats structure */ 615 fill_tgid_exit(tsk); 616 } 617 618 listeners = raw_cpu_ptr(&listener_array); 619 if (list_empty(&listeners->list)) 620 return; 621 622 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); 623 if (rc < 0) 624 return; 625 626 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, 627 task_pid_nr_ns(tsk, &init_pid_ns)); 628 if (!stats) 629 goto err; 630 631 fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); 632 633 /* 634 * Doesn't matter if tsk is the leader or the last group member leaving 635 */ 636 if (!is_thread_group || !group_dead) 637 goto send; 638 639 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, 640 task_tgid_nr_ns(tsk, &init_pid_ns)); 641 if (!stats) 642 goto err; 643 644 memcpy(stats, tsk->signal->stats, sizeof(*stats)); 645 646 send: 647 send_cpu_listeners(rep_skb, listeners); 648 return; 649 err: 650 nlmsg_free(rep_skb); 651 } 652 653 static const struct genl_ops taskstats_ops[] = { 654 { 655 .cmd = TASKSTATS_CMD_GET, 656 .doit = taskstats_user_cmd, 657 .policy = taskstats_cmd_get_policy, 658 .flags = GENL_ADMIN_PERM, 659 }, 660 { 661 .cmd = CGROUPSTATS_CMD_GET, 662 .doit = cgroupstats_user_cmd, 663 .policy = cgroupstats_cmd_get_policy, 664 }, 665 }; 666 667 static struct genl_family family __ro_after_init = { 668 .name = TASKSTATS_GENL_NAME, 669 .version = TASKSTATS_GENL_VERSION, 670 .maxattr = TASKSTATS_CMD_ATTR_MAX, 671 .module = THIS_MODULE, 672 .ops = taskstats_ops, 673 .n_ops = ARRAY_SIZE(taskstats_ops), 674 }; 675 676 /* Needed early in initialization */ 677 void __init taskstats_init_early(void) 678 { 679 unsigned int i; 680 681 taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); 682 for_each_possible_cpu(i) { 683 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 684 init_rwsem(&(per_cpu(listener_array, i).sem)); 685 } 686 } 687 688 static int __init taskstats_init(void) 689 { 690 int rc; 691 692 rc = genl_register_family(&family); 693 if (rc) 694 return rc; 695 696 family_registered = 1; 697 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 698 return 0; 699 } 700 701 /* 702 * late initcall ensures initialization of statistics collection 703 * mechanisms precedes initialization of the taskstats interface 704 */ 705 late_initcall(taskstats_init); 706