/*
 * taskstats.c - Export per-task statistics to userland
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 *           (C) Balbir Singh,   IBM Corp. 2006
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 */

#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
#include <linux/tsacct_kern.h>
#include <linux/delayacct.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/cgroupstats.h>
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <net/genetlink.h>
#include <linux/atomic.h>

/*
 * Maximum length of a cpumask that can be specified in
 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
 */
#define TASKSTATS_CPUMASK_MAXLEN	(100+6*NR_CPUS)

static DEFINE_PER_CPU(__u32, taskstats_seqnum);
static int family_registered;
struct kmem_cache *taskstats_cache;

static struct genl_family family = {
	.id		= GENL_ID_GENERATE,
	.name		= TASKSTATS_GENL_NAME,
	.version	= TASKSTATS_GENL_VERSION,
	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
};

static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },
};

static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
	[CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
};

struct listener {
	struct list_head list;
	pid_t pid;
	char valid;
};

struct listener_list {
	struct rw_semaphore sem;
	struct list_head list;
};
static DEFINE_PER_CPU(struct listener_list, listener_array);

enum actions {
	REGISTER,
	DEREGISTER,
	CPU_DONT_CARE
};

static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
				size_t size)
{
	struct sk_buff *skb;
	void *reply;

	/*
	 * If new attributes are added, please revisit this allocation
	 */
	skb = genlmsg_new(size, GFP_KERNEL);
	if (!skb)
		return -ENOMEM;

	if (!info) {
		int seq = this_cpu_inc_return(taskstats_seqnum) - 1;

		reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
	} else
		reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
	if (reply == NULL) {
		nlmsg_free(skb);
		return -EINVAL;
	}

	*skbp = skb;
	return 0;
}

/*
 * Send taskstats data in @skb to listener with nl_pid @pid
 */
static int send_reply(struct sk_buff *skb, struct genl_info *info)
{
	struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
	void *reply = genlmsg_data(genlhdr);
	int rc;

	rc = genlmsg_end(skb, reply);
	if (rc < 0) {
		nlmsg_free(skb);
		return rc;
	}

	return genlmsg_reply(skb, info);
}
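
/*
 * genlmsg_unicast() consumes the buffer it is handed, so the fanout loop
 * below clones @skb ahead of each send while more listeners remain. A
 * listener whose socket has gone away fails with -ECONNREFUSED; it is
 * only marked invalid under the read lock and reaped afterwards under
 * the write lock, keeping the common path cheap.
 */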
/*
 * Send taskstats data in @skb to listeners registered for @cpu's exit data
 */
static void send_cpu_listeners(struct sk_buff *skb,
					struct listener_list *listeners)
{
	struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
	struct listener *s, *tmp;
	struct sk_buff *skb_next, *skb_cur = skb;
	void *reply = genlmsg_data(genlhdr);
	int rc, delcount = 0;

	rc = genlmsg_end(skb, reply);
	if (rc < 0) {
		nlmsg_free(skb);
		return;
	}

	rc = 0;
	down_read(&listeners->sem);
	list_for_each_entry(s, &listeners->list, list) {
		skb_next = NULL;
		if (!list_is_last(&s->list, &listeners->list)) {
			skb_next = skb_clone(skb_cur, GFP_KERNEL);
			if (!skb_next)
				break;
		}
		rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
		if (rc == -ECONNREFUSED) {
			s->valid = 0;
			delcount++;
		}
		skb_cur = skb_next;
	}
	up_read(&listeners->sem);

	if (skb_cur)
		nlmsg_free(skb_cur);

	if (!delcount)
		return;

	/* Delete invalidated entries */
	down_write(&listeners->sem);
	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
		if (!s->valid) {
			list_del(&s->list);
			kfree(s);
		}
	}
	up_write(&listeners->sem);
}

static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
{
	memset(stats, 0, sizeof(*stats));
	/*
	 * Each accounting subsystem adds calls to its functions to
	 * fill in relevant parts of struct taskstats as follows
	 *
	 *	per-task-foo(stats, tsk);
	 */

	delayacct_add_tsk(stats, tsk);

	/* fill in basic acct fields */
	stats->version = TASKSTATS_VERSION;
	stats->nvcsw = tsk->nvcsw;
	stats->nivcsw = tsk->nivcsw;
	bacct_add_tsk(stats, tsk);

	/* fill in extended acct fields */
	xacct_add_tsk(stats, tsk);
}
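
/*
 * find_task_by_vpid() is only safe under rcu_read_lock(); take a
 * reference on the task before dropping the RCU lock so the
 * task_struct cannot be freed while the stats are being filled in.
 */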
static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
{
	struct task_struct *tsk;

	rcu_read_lock();
	tsk = find_task_by_vpid(pid);
	if (tsk)
		get_task_struct(tsk);
	rcu_read_unlock();
	if (!tsk)
		return -ESRCH;
	fill_stats(tsk, stats);
	put_task_struct(tsk);
	return 0;
}

static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
{
	struct task_struct *tsk, *first;
	unsigned long flags;
	int rc = -ESRCH;

	/*
	 * Add additional stats from live tasks except zombie thread group
	 * leaders who are already counted with the dead tasks
	 */
	rcu_read_lock();
	first = find_task_by_vpid(tgid);

	if (!first || !lock_task_sighand(first, &flags))
		goto out;

	if (first->signal->stats)
		memcpy(stats, first->signal->stats, sizeof(*stats));
	else
		memset(stats, 0, sizeof(*stats));

	tsk = first;
	do {
		if (tsk->exit_state)
			continue;
		/*
		 * Accounting subsystem can call its functions here to
		 * fill in relevant parts of struct taskstats as follows
		 *
		 *	per-task-foo(stats, tsk);
		 */
		delayacct_add_tsk(stats, tsk);

		stats->nvcsw += tsk->nvcsw;
		stats->nivcsw += tsk->nivcsw;
	} while_each_thread(first, tsk);

	unlock_task_sighand(first, &flags);
	rc = 0;
out:
	rcu_read_unlock();

	stats->version = TASKSTATS_VERSION;
	/*
	 * Accounting subsystems can also add calls here to modify
	 * fields of taskstats.
	 */
	return rc;
}

static void fill_tgid_exit(struct task_struct *tsk)
{
	unsigned long flags;

	spin_lock_irqsave(&tsk->sighand->siglock, flags);
	if (!tsk->signal->stats)
		goto ret;

	/*
	 * Each accounting subsystem calls its functions here to
	 * accumulate its per-task stats for tsk, into the per-tgid structure
	 *
	 *	per-task-foo(tsk->signal->stats, tsk);
	 */
	delayacct_add_tsk(tsk->signal->stats, tsk);
ret:
	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
	return;
}

static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
{
	struct listener_list *listeners;
	struct listener *s, *tmp, *s2;
	unsigned int cpu;

	if (!cpumask_subset(mask, cpu_possible_mask))
		return -EINVAL;

	if (isadd == REGISTER) {
		for_each_cpu(cpu, mask) {
			s = kmalloc_node(sizeof(struct listener),
					GFP_KERNEL, cpu_to_node(cpu));
			if (!s)
				goto cleanup;

			s->pid = pid;
			s->valid = 1;

			listeners = &per_cpu(listener_array, cpu);
			down_write(&listeners->sem);
			list_for_each_entry(s2, &listeners->list, list) {
				if (s2->pid == pid && s2->valid)
					goto exists;
			}
			list_add(&s->list, &listeners->list);
			s = NULL;
exists:
			up_write(&listeners->sem);
			kfree(s); /* nop if NULL */
		}
		return 0;
	}

	/* Deregister or cleanup */
cleanup:
	for_each_cpu(cpu, mask) {
		listeners = &per_cpu(listener_array, cpu);
		down_write(&listeners->sem);
		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
			if (s->pid == pid) {
				list_del(&s->list);
				kfree(s);
				break;
			}
		}
		up_write(&listeners->sem);
	}
	return 0;
}

static int parse(struct nlattr *na, struct cpumask *mask)
{
	char *data;
	int len;
	int ret;

	if (na == NULL)
		return 1;
	len = nla_len(na);
	if (len > TASKSTATS_CPUMASK_MAXLEN)
		return -E2BIG;
	if (len < 1)
		return -EINVAL;
	data = kmalloc(len, GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	nla_strlcpy(data, na, len);
	ret = cpulist_parse(data, mask);
	kfree(data);
	return ret;
}
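
/*
 * Layout of the reply built by mk_reply() below, for a pid query (the
 * tgid case substitutes the *_TGID types):
 *
 *	NLMSG_HDR | GENL_HDR | [TASKSTATS_TYPE_NULL pad] |
 *	TASKSTATS_TYPE_AGGR_PID {
 *		TASKSTATS_TYPE_PID	(u32)
 *		TASKSTATS_TYPE_STATS	(struct taskstats)
 *	}
 *
 * The pad attribute is only emitted when TASKSTATS_NEEDS_PADDING is
 * defined; see the comment in mk_reply().
 */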
#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
#define TASKSTATS_NEEDS_PADDING 1
#endif

static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
{
	struct nlattr *na, *ret;
	int aggr;

	aggr = (type == TASKSTATS_TYPE_PID)
			? TASKSTATS_TYPE_AGGR_PID
			: TASKSTATS_TYPE_AGGR_TGID;

	/*
	 * The taskstats structure is internally aligned on 8 byte
	 * boundaries but the layout of the aggregate reply, with
	 * two NLA headers and the pid (each 4 bytes), actually
	 * forces the entire structure to be unaligned. This causes
	 * the kernel to issue unaligned access warnings on some
	 * architectures like ia64. Unfortunately, some software out there
	 * doesn't properly unroll the NLA packet and assumes that the start
	 * of the taskstats structure will always be 20 bytes from the start
	 * of the netlink payload. Aligning the start of the taskstats
	 * structure breaks this software, which we don't want. So, for now
	 * the alignment only happens on architectures that require it
	 * and those users will have to update to fixed versions of those
	 * packages. Space is reserved in the packet only when needed.
	 * This ifdef should be removed in several years e.g. 2012 once
	 * we can be confident that fixed versions are installed on most
	 * systems. We add the padding before the aggregate since the
	 * aggregate is already a defined type.
	 */
#ifdef TASKSTATS_NEEDS_PADDING
	if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
		goto err;
#endif
	na = nla_nest_start(skb, aggr);
	if (!na)
		goto err;

	if (nla_put(skb, type, sizeof(pid), &pid) < 0)
		goto err;
	ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
	if (!ret)
		goto err;
	nla_nest_end(skb, na);

	return nla_data(ret);
err:
	return NULL;
}
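
/*
 * CGROUPSTATS_CMD_GET: userspace opens a cgroupfs directory and passes
 * the descriptor in CGROUPSTATS_CMD_ATTR_FD; the reply carries one
 * struct cgroupstats built from that directory's dentry.
 */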
static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
	int rc = 0;
	struct sk_buff *rep_skb;
	struct cgroupstats *stats;
	struct nlattr *na;
	size_t size;
	u32 fd;
	struct file *file;
	int fput_needed;

	na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
	if (!na)
		return -EINVAL;

	fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
	file = fget_light(fd, &fput_needed);
	if (!file)
		return 0;

	size = nla_total_size(sizeof(struct cgroupstats));

	rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
				size);
	if (rc < 0)
		goto err;

	na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
				sizeof(struct cgroupstats));
	stats = nla_data(na);
	memset(stats, 0, sizeof(*stats));

	rc = cgroupstats_build(stats, file->f_dentry);
	if (rc < 0) {
		nlmsg_free(rep_skb);
		goto err;
	}

	rc = send_reply(rep_skb, info);

err:
	fput_light(file, fput_needed);
	return rc;
}

static int cmd_attr_register_cpumask(struct genl_info *info)
{
	cpumask_var_t mask;
	int rc;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
	if (rc < 0)
		goto out;
	rc = add_del_listener(info->snd_pid, mask, REGISTER);
out:
	free_cpumask_var(mask);
	return rc;
}

static int cmd_attr_deregister_cpumask(struct genl_info *info)
{
	cpumask_var_t mask;
	int rc;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
	if (rc < 0)
		goto out;
	rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
out:
	free_cpumask_var(mask);
	return rc;
}

static size_t taskstats_packet_size(void)
{
	size_t size;

	size = nla_total_size(sizeof(u32)) +
		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
#ifdef TASKSTATS_NEEDS_PADDING
	size += nla_total_size(0); /* Padding for alignment */
#endif
	return size;
}

static int cmd_attr_pid(struct genl_info *info)
{
	struct taskstats *stats;
	struct sk_buff *rep_skb;
	size_t size;
	u32 pid;
	int rc;

	size = taskstats_packet_size();

	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
	if (rc < 0)
		return rc;

	rc = -EINVAL;
	pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
	stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
	if (!stats)
		goto err;

	rc = fill_stats_for_pid(pid, stats);
	if (rc < 0)
		goto err;
	return send_reply(rep_skb, info);
err:
	nlmsg_free(rep_skb);
	return rc;
}

static int cmd_attr_tgid(struct genl_info *info)
{
	struct taskstats *stats;
	struct sk_buff *rep_skb;
	size_t size;
	u32 tgid;
	int rc;

	size = taskstats_packet_size();

	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
	if (rc < 0)
		return rc;

	rc = -EINVAL;
	tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
	if (!stats)
		goto err;

	rc = fill_stats_for_tgid(tgid, stats);
	if (rc < 0)
		goto err;
	return send_reply(rep_skb, info);
err:
	nlmsg_free(rep_skb);
	return rc;
}

static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
	if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
		return cmd_attr_register_cpumask(info);
	else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
		return cmd_attr_deregister_cpumask(info);
	else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
		return cmd_attr_pid(info);
	else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
		return cmd_attr_tgid(info);
	else
		return -EINVAL;
}
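
/*
 * Allocate the per-tgid stats optimistically, outside the siglock; if
 * two threads race here, the loser frees its copy after the winner has
 * installed sig->stats. A NULL return from kmem_cache_zalloc() is
 * tolerated and simply leaves sig->stats unset.
 */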
static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
	struct signal_struct *sig = tsk->signal;
	struct taskstats *stats;

	if (sig->stats || thread_group_empty(tsk))
		goto ret;

	/* No problem if kmem_cache_zalloc() fails */
	stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);

	spin_lock_irq(&tsk->sighand->siglock);
	if (!sig->stats) {
		sig->stats = stats;
		stats = NULL;
	}
	spin_unlock_irq(&tsk->sighand->siglock);

	if (stats)
		kmem_cache_free(taskstats_cache, stats);
ret:
	return sig->stats;
}

/* Send pid data out on exit */
void taskstats_exit(struct task_struct *tsk, int group_dead)
{
	int rc;
	struct listener_list *listeners;
	struct taskstats *stats;
	struct sk_buff *rep_skb;
	size_t size;
	int is_thread_group;

	if (!family_registered)
		return;

	/*
	 * Size includes space for nested attributes
	 */
	size = taskstats_packet_size();

	is_thread_group = !!taskstats_tgid_alloc(tsk);
	if (is_thread_group) {
		/* PID + STATS + TGID + STATS */
		size = 2 * size;
		/* fill the tsk->signal->stats structure */
		fill_tgid_exit(tsk);
	}

	listeners = __this_cpu_ptr(&listener_array);
	if (list_empty(&listeners->list))
		return;

	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
	if (rc < 0)
		return;

	stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
	if (!stats)
		goto err;

	fill_stats(tsk, stats);

	/*
	 * Doesn't matter if tsk is the leader or the last group member leaving
	 */
	if (!is_thread_group || !group_dead)
		goto send;

	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
	if (!stats)
		goto err;

	memcpy(stats, tsk->signal->stats, sizeof(*stats));

send:
	send_cpu_listeners(rep_skb, listeners);
	return;
err:
	nlmsg_free(rep_skb);
}

static struct genl_ops taskstats_ops = {
	.cmd		= TASKSTATS_CMD_GET,
	.doit		= taskstats_user_cmd,
	.policy		= taskstats_cmd_get_policy,
	.flags		= GENL_ADMIN_PERM,
};

static struct genl_ops cgroupstats_ops = {
	.cmd		= CGROUPSTATS_CMD_GET,
	.doit		= cgroupstats_user_cmd,
	.policy		= cgroupstats_cmd_get_policy,
};

/* Needed early in initialization */
void __init taskstats_init_early(void)
{
	unsigned int i;

	taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC);
	for_each_possible_cpu(i) {
		INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
		init_rwsem(&(per_cpu(listener_array, i).sem));
	}
}

static int __init taskstats_init(void)
{
	int rc;

	rc = genl_register_family(&family);
	if (rc)
		return rc;

	rc = genl_register_ops(&family, &taskstats_ops);
	if (rc < 0)
		goto err;

	rc = genl_register_ops(&family, &cgroupstats_ops);
	if (rc < 0)
		goto err_cgroup_ops;

	family_registered = 1;
	pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
	return 0;
err_cgroup_ops:
	genl_unregister_ops(&family, &taskstats_ops);
err:
	genl_unregister_family(&family);
	return rc;
}

/*
 * late initcall ensures initialization of statistics collection
 * mechanisms precedes initialization of the taskstats interface
 */
late_initcall(taskstats_init);
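
/*
 * For reference, a minimal userspace sketch of the query side of this
 * interface, modeled loosely on Documentation/accounting/getdelays.c.
 * It resolves the dynamic family id through the genetlink controller,
 * then issues TASKSTATS_CMD_GET with TASKSTATS_CMD_ATTR_PID. Error
 * handling is abbreviated, the optional TASKSTATS_TYPE_NULL padding
 * attribute (see mk_reply()) is not handled, and msgtemplate/send_cmd
 * are helpers local to the example, not part of any API. Needs
 * CAP_NET_ADMIN because of GENL_ADMIN_PERM above. Guarded by #if 0 so
 * it is never built as part of the kernel.
 */
#if 0
#include <linux/genetlink.h>
#include <linux/netlink.h>
#include <linux/taskstats.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#define GENLMSG_DATA(glh) ((void *)((char *)NLMSG_DATA(glh) + GENL_HDRLEN))
#define NLA_DATA(na)	  ((void *)((char *)(na) + NLA_HDRLEN))

struct msgtemplate {
	struct nlmsghdr n;
	struct genlmsghdr g;
	char buf[1024];
};

/* Build and send one genetlink request carrying a single attribute */
static int send_cmd(int fd, __u16 nlmsg_type, __u8 genl_cmd,
		    __u16 nla_type, const void *data, __u16 len)
{
	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
	struct msgtemplate msg;
	struct nlattr *na;

	memset(&msg, 0, sizeof(msg));
	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
	msg.n.nlmsg_type = nlmsg_type;
	msg.n.nlmsg_flags = NLM_F_REQUEST;
	msg.g.cmd = genl_cmd;
	msg.g.version = 0x1;

	na = (struct nlattr *)GENLMSG_DATA(&msg);
	na->nla_type = nla_type;
	na->nla_len = len + NLA_HDRLEN;
	memcpy(NLA_DATA(na), data, len);
	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);

	return sendto(fd, &msg, msg.n.nlmsg_len, 0,
		      (struct sockaddr *)&nladdr, sizeof(nladdr));
}

int main(void)
{
	struct msgtemplate ans;
	struct nlattr *na;
	__u16 family_id;
	__u32 pid = getpid();
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);

	/* Resolve the dynamic taskstats family id from the controller */
	send_cmd(fd, GENL_ID_CTRL, CTRL_CMD_GETFAMILY,
		 CTRL_ATTR_FAMILY_NAME, TASKSTATS_GENL_NAME,
		 strlen(TASKSTATS_GENL_NAME) + 1);
	if (recv(fd, &ans, sizeof(ans), 0) < 0 ||
	    ans.n.nlmsg_type == NLMSG_ERROR)
		return 1;
	na = (struct nlattr *)GENLMSG_DATA(&ans);	/* FAMILY_NAME */
	na = (struct nlattr *)((char *)na + NLA_ALIGN(na->nla_len));
	family_id = *(__u16 *)NLA_DATA(na);		/* FAMILY_ID */

	/* Query stats for our own pid */
	send_cmd(fd, family_id, TASKSTATS_CMD_GET,
		 TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid));
	if (recv(fd, &ans, sizeof(ans), 0) < 0 ||
	    ans.n.nlmsg_type == NLMSG_ERROR)
		return 1;

	/* Reply: TASKSTATS_TYPE_AGGR_PID { TYPE_PID, TYPE_STATS } */
	na = (struct nlattr *)GENLMSG_DATA(&ans);	/* AGGR_PID nest */
	na = (struct nlattr *)NLA_DATA(na);		/* TYPE_PID */
	na = (struct nlattr *)((char *)na + NLA_ALIGN(na->nla_len));
	printf("taskstats version %u\n",
	       ((struct taskstats *)NLA_DATA(na))->version);

	close(fd);
	return 0;
}
#endif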