1 /* 2 * taskstats.c - Export per-task statistics to userland 3 * 4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 5 * (C) Balbir Singh, IBM Corp. 2006 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 */ 18 19 #include <linux/kernel.h> 20 #include <linux/taskstats_kern.h> 21 #include <linux/tsacct_kern.h> 22 #include <linux/delayacct.h> 23 #include <linux/cpumask.h> 24 #include <linux/percpu.h> 25 #include <linux/slab.h> 26 #include <linux/cgroupstats.h> 27 #include <linux/cgroup.h> 28 #include <linux/fs.h> 29 #include <linux/file.h> 30 #include <net/genetlink.h> 31 #include <linux/atomic.h> 32 33 /* 34 * Maximum length of a cpumask that can be specified in 35 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute 36 */ 37 #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 38 39 static DEFINE_PER_CPU(__u32, taskstats_seqnum); 40 static int family_registered; 41 struct kmem_cache *taskstats_cache; 42 43 static struct genl_family family = { 44 .id = GENL_ID_GENERATE, 45 .name = TASKSTATS_GENL_NAME, 46 .version = TASKSTATS_GENL_VERSION, 47 .maxattr = TASKSTATS_CMD_ATTR_MAX, 48 }; 49 50 static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { 51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 55 56 static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { 57 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 58 }; 59 60 struct listener { 61 struct list_head list; 62 pid_t pid; 63 char valid; 64 }; 65 66 struct listener_list { 67 struct rw_semaphore sem; 68 struct list_head list; 69 }; 70 static DEFINE_PER_CPU(struct listener_list, listener_array); 71 72 enum actions { 73 REGISTER, 74 DEREGISTER, 75 CPU_DONT_CARE 76 }; 77 78 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 79 size_t size) 80 { 81 struct sk_buff *skb; 82 void *reply; 83 84 /* 85 * If new attributes are added, please revisit this allocation 86 */ 87 skb = genlmsg_new(size, GFP_KERNEL); 88 if (!skb) 89 return -ENOMEM; 90 91 if (!info) { 92 int seq = this_cpu_inc_return(taskstats_seqnum) - 1; 93 94 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 95 } else 96 reply = genlmsg_put_reply(skb, info, &family, 0, cmd); 97 if (reply == NULL) { 98 nlmsg_free(skb); 99 return -EINVAL; 100 } 101 102 *skbp = skb; 103 return 0; 104 } 105 106 /* 107 * Send taskstats data in @skb to listener with nl_pid @pid 108 */ 109 static int send_reply(struct sk_buff *skb, struct genl_info *info) 110 { 111 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 112 void *reply = genlmsg_data(genlhdr); 113 int rc; 114 115 rc = genlmsg_end(skb, reply); 116 if (rc < 0) { 117 nlmsg_free(skb); 118 return rc; 119 } 120 121 return genlmsg_reply(skb, info); 122 } 123 124 /* 125 * Send taskstats data in @skb to listeners registered for @cpu's exit data 126 */ 127 static void send_cpu_listeners(struct sk_buff *skb, 128 struct listener_list *listeners) 129 { 130 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 131 struct listener *s, *tmp; 132 struct sk_buff *skb_next, *skb_cur = skb; 133 void *reply = genlmsg_data(genlhdr); 134 int rc, delcount = 0; 135 136 rc = genlmsg_end(skb, reply); 137 if (rc < 0) { 138 nlmsg_free(skb); 139 return; 140 } 141 142 rc = 0; 143 down_read(&listeners->sem); 144 list_for_each_entry(s, &listeners->list, list) { 145 skb_next = NULL; 146 if (!list_is_last(&s->list, &listeners->list)) { 147 skb_next = skb_clone(skb_cur, GFP_KERNEL); 148 if (!skb_next) 149 break; 150 } 151 rc = genlmsg_unicast(&init_net, skb_cur, s->pid); 152 if (rc == -ECONNREFUSED) { 153 s->valid = 0; 154 delcount++; 155 } 156 skb_cur = skb_next; 157 } 158 up_read(&listeners->sem); 159 160 if (skb_cur) 161 nlmsg_free(skb_cur); 162 163 if (!delcount) 164 return; 165 166 /* Delete invalidated entries */ 167 down_write(&listeners->sem); 168 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 169 if (!s->valid) { 170 list_del(&s->list); 171 kfree(s); 172 } 173 } 174 up_write(&listeners->sem); 175 } 176 177 static void fill_stats(struct task_struct *tsk, struct taskstats *stats) 178 { 179 memset(stats, 0, sizeof(*stats)); 180 /* 181 * Each accounting subsystem adds calls to its functions to 182 * fill in relevant parts of struct taskstsats as follows 183 * 184 * per-task-foo(stats, tsk); 185 */ 186 187 delayacct_add_tsk(stats, tsk); 188 189 /* fill in basic acct fields */ 190 stats->version = TASKSTATS_VERSION; 191 stats->nvcsw = tsk->nvcsw; 192 stats->nivcsw = tsk->nivcsw; 193 bacct_add_tsk(stats, tsk); 194 195 /* fill in extended acct fields */ 196 xacct_add_tsk(stats, tsk); 197 } 198 199 static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) 200 { 201 struct task_struct *tsk; 202 203 rcu_read_lock(); 204 tsk = find_task_by_vpid(pid); 205 if (tsk) 206 get_task_struct(tsk); 207 rcu_read_unlock(); 208 if (!tsk) 209 return -ESRCH; 210 fill_stats(tsk, stats); 211 put_task_struct(tsk); 212 return 0; 213 } 214 215 static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) 216 { 217 struct task_struct *tsk, *first; 218 unsigned long flags; 219 int rc = -ESRCH; 220 221 /* 222 * Add additional stats from live tasks except zombie thread group 223 * leaders who are already counted with the dead tasks 224 */ 225 rcu_read_lock(); 226 first = find_task_by_vpid(tgid); 227 228 if (!first || !lock_task_sighand(first, &flags)) 229 goto out; 230 231 if (first->signal->stats) 232 memcpy(stats, first->signal->stats, sizeof(*stats)); 233 else 234 memset(stats, 0, sizeof(*stats)); 235 236 tsk = first; 237 do { 238 if (tsk->exit_state) 239 continue; 240 /* 241 * Accounting subsystem can call its functions here to 242 * fill in relevant parts of struct taskstsats as follows 243 * 244 * per-task-foo(stats, tsk); 245 */ 246 delayacct_add_tsk(stats, tsk); 247 248 stats->nvcsw += tsk->nvcsw; 249 stats->nivcsw += tsk->nivcsw; 250 } while_each_thread(first, tsk); 251 252 unlock_task_sighand(first, &flags); 253 rc = 0; 254 out: 255 rcu_read_unlock(); 256 257 stats->version = TASKSTATS_VERSION; 258 /* 259 * Accounting subsystems can also add calls here to modify 260 * fields of taskstats. 261 */ 262 return rc; 263 } 264 265 static void fill_tgid_exit(struct task_struct *tsk) 266 { 267 unsigned long flags; 268 269 spin_lock_irqsave(&tsk->sighand->siglock, flags); 270 if (!tsk->signal->stats) 271 goto ret; 272 273 /* 274 * Each accounting subsystem calls its functions here to 275 * accumalate its per-task stats for tsk, into the per-tgid structure 276 * 277 * per-task-foo(tsk->signal->stats, tsk); 278 */ 279 delayacct_add_tsk(tsk->signal->stats, tsk); 280 ret: 281 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 282 return; 283 } 284 285 static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 286 { 287 struct listener_list *listeners; 288 struct listener *s, *tmp, *s2; 289 unsigned int cpu; 290 291 if (!cpumask_subset(mask, cpu_possible_mask)) 292 return -EINVAL; 293 294 if (isadd == REGISTER) { 295 for_each_cpu(cpu, mask) { 296 s = kmalloc_node(sizeof(struct listener), 297 GFP_KERNEL, cpu_to_node(cpu)); 298 if (!s) 299 goto cleanup; 300 301 s->pid = pid; 302 s->valid = 1; 303 304 listeners = &per_cpu(listener_array, cpu); 305 down_write(&listeners->sem); 306 list_for_each_entry(s2, &listeners->list, list) { 307 if (s2->pid == pid && s2->valid) 308 goto exists; 309 } 310 list_add(&s->list, &listeners->list); 311 s = NULL; 312 exists: 313 up_write(&listeners->sem); 314 kfree(s); /* nop if NULL */ 315 } 316 return 0; 317 } 318 319 /* Deregister or cleanup */ 320 cleanup: 321 for_each_cpu(cpu, mask) { 322 listeners = &per_cpu(listener_array, cpu); 323 down_write(&listeners->sem); 324 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 325 if (s->pid == pid) { 326 list_del(&s->list); 327 kfree(s); 328 break; 329 } 330 } 331 up_write(&listeners->sem); 332 } 333 return 0; 334 } 335 336 static int parse(struct nlattr *na, struct cpumask *mask) 337 { 338 char *data; 339 int len; 340 int ret; 341 342 if (na == NULL) 343 return 1; 344 len = nla_len(na); 345 if (len > TASKSTATS_CPUMASK_MAXLEN) 346 return -E2BIG; 347 if (len < 1) 348 return -EINVAL; 349 data = kmalloc(len, GFP_KERNEL); 350 if (!data) 351 return -ENOMEM; 352 nla_strlcpy(data, na, len); 353 ret = cpulist_parse(data, mask); 354 kfree(data); 355 return ret; 356 } 357 358 #if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 359 #define TASKSTATS_NEEDS_PADDING 1 360 #endif 361 362 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 363 { 364 struct nlattr *na, *ret; 365 int aggr; 366 367 aggr = (type == TASKSTATS_TYPE_PID) 368 ? TASKSTATS_TYPE_AGGR_PID 369 : TASKSTATS_TYPE_AGGR_TGID; 370 371 /* 372 * The taskstats structure is internally aligned on 8 byte 373 * boundaries but the layout of the aggregrate reply, with 374 * two NLA headers and the pid (each 4 bytes), actually 375 * force the entire structure to be unaligned. This causes 376 * the kernel to issue unaligned access warnings on some 377 * architectures like ia64. Unfortunately, some software out there 378 * doesn't properly unroll the NLA packet and assumes that the start 379 * of the taskstats structure will always be 20 bytes from the start 380 * of the netlink payload. Aligning the start of the taskstats 381 * structure breaks this software, which we don't want. So, for now 382 * the alignment only happens on architectures that require it 383 * and those users will have to update to fixed versions of those 384 * packages. Space is reserved in the packet only when needed. 385 * This ifdef should be removed in several years e.g. 2012 once 386 * we can be confident that fixed versions are installed on most 387 * systems. We add the padding before the aggregate since the 388 * aggregate is already a defined type. 389 */ 390 #ifdef TASKSTATS_NEEDS_PADDING 391 if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) 392 goto err; 393 #endif 394 na = nla_nest_start(skb, aggr); 395 if (!na) 396 goto err; 397 398 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 399 goto err; 400 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 401 if (!ret) 402 goto err; 403 nla_nest_end(skb, na); 404 405 return nla_data(ret); 406 err: 407 return NULL; 408 } 409 410 static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 411 { 412 int rc = 0; 413 struct sk_buff *rep_skb; 414 struct cgroupstats *stats; 415 struct nlattr *na; 416 size_t size; 417 u32 fd; 418 struct file *file; 419 int fput_needed; 420 421 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 422 if (!na) 423 return -EINVAL; 424 425 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 426 file = fget_light(fd, &fput_needed); 427 if (!file) 428 return 0; 429 430 size = nla_total_size(sizeof(struct cgroupstats)); 431 432 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 433 size); 434 if (rc < 0) 435 goto err; 436 437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 438 sizeof(struct cgroupstats)); 439 if (na == NULL) { 440 rc = -EMSGSIZE; 441 goto err; 442 } 443 444 stats = nla_data(na); 445 memset(stats, 0, sizeof(*stats)); 446 447 rc = cgroupstats_build(stats, file->f_dentry); 448 if (rc < 0) { 449 nlmsg_free(rep_skb); 450 goto err; 451 } 452 453 rc = send_reply(rep_skb, info); 454 455 err: 456 fput_light(file, fput_needed); 457 return rc; 458 } 459 460 static int cmd_attr_register_cpumask(struct genl_info *info) 461 { 462 cpumask_var_t mask; 463 int rc; 464 465 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 466 return -ENOMEM; 467 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 468 if (rc < 0) 469 goto out; 470 rc = add_del_listener(info->snd_pid, mask, REGISTER); 471 out: 472 free_cpumask_var(mask); 473 return rc; 474 } 475 476 static int cmd_attr_deregister_cpumask(struct genl_info *info) 477 { 478 cpumask_var_t mask; 479 int rc; 480 481 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 482 return -ENOMEM; 483 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 484 if (rc < 0) 485 goto out; 486 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 487 out: 488 free_cpumask_var(mask); 489 return rc; 490 } 491 492 static size_t taskstats_packet_size(void) 493 { 494 size_t size; 495 496 size = nla_total_size(sizeof(u32)) + 497 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 498 #ifdef TASKSTATS_NEEDS_PADDING 499 size += nla_total_size(0); /* Padding for alignment */ 500 #endif 501 return size; 502 } 503 504 static int cmd_attr_pid(struct genl_info *info) 505 { 506 struct taskstats *stats; 507 struct sk_buff *rep_skb; 508 size_t size; 509 u32 pid; 510 int rc; 511 512 size = taskstats_packet_size(); 513 514 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 515 if (rc < 0) 516 return rc; 517 518 rc = -EINVAL; 519 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 520 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 521 if (!stats) 522 goto err; 523 524 rc = fill_stats_for_pid(pid, stats); 525 if (rc < 0) 526 goto err; 527 return send_reply(rep_skb, info); 528 err: 529 nlmsg_free(rep_skb); 530 return rc; 531 } 532 533 static int cmd_attr_tgid(struct genl_info *info) 534 { 535 struct taskstats *stats; 536 struct sk_buff *rep_skb; 537 size_t size; 538 u32 tgid; 539 int rc; 540 541 size = taskstats_packet_size(); 542 543 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 544 if (rc < 0) 545 return rc; 546 547 rc = -EINVAL; 548 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 549 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 550 if (!stats) 551 goto err; 552 553 rc = fill_stats_for_tgid(tgid, stats); 554 if (rc < 0) 555 goto err; 556 return send_reply(rep_skb, info); 557 err: 558 nlmsg_free(rep_skb); 559 return rc; 560 } 561 562 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 563 { 564 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) 565 return cmd_attr_register_cpumask(info); 566 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) 567 return cmd_attr_deregister_cpumask(info); 568 else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) 569 return cmd_attr_pid(info); 570 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) 571 return cmd_attr_tgid(info); 572 else 573 return -EINVAL; 574 } 575 576 static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 577 { 578 struct signal_struct *sig = tsk->signal; 579 struct taskstats *stats; 580 581 if (sig->stats || thread_group_empty(tsk)) 582 goto ret; 583 584 /* No problem if kmem_cache_zalloc() fails */ 585 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); 586 587 spin_lock_irq(&tsk->sighand->siglock); 588 if (!sig->stats) { 589 sig->stats = stats; 590 stats = NULL; 591 } 592 spin_unlock_irq(&tsk->sighand->siglock); 593 594 if (stats) 595 kmem_cache_free(taskstats_cache, stats); 596 ret: 597 return sig->stats; 598 } 599 600 /* Send pid data out on exit */ 601 void taskstats_exit(struct task_struct *tsk, int group_dead) 602 { 603 int rc; 604 struct listener_list *listeners; 605 struct taskstats *stats; 606 struct sk_buff *rep_skb; 607 size_t size; 608 int is_thread_group; 609 610 if (!family_registered) 611 return; 612 613 /* 614 * Size includes space for nested attributes 615 */ 616 size = taskstats_packet_size(); 617 618 is_thread_group = !!taskstats_tgid_alloc(tsk); 619 if (is_thread_group) { 620 /* PID + STATS + TGID + STATS */ 621 size = 2 * size; 622 /* fill the tsk->signal->stats structure */ 623 fill_tgid_exit(tsk); 624 } 625 626 listeners = __this_cpu_ptr(&listener_array); 627 if (list_empty(&listeners->list)) 628 return; 629 630 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); 631 if (rc < 0) 632 return; 633 634 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); 635 if (!stats) 636 goto err; 637 638 fill_stats(tsk, stats); 639 640 /* 641 * Doesn't matter if tsk is the leader or the last group member leaving 642 */ 643 if (!is_thread_group || !group_dead) 644 goto send; 645 646 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); 647 if (!stats) 648 goto err; 649 650 memcpy(stats, tsk->signal->stats, sizeof(*stats)); 651 652 send: 653 send_cpu_listeners(rep_skb, listeners); 654 return; 655 err: 656 nlmsg_free(rep_skb); 657 } 658 659 static struct genl_ops taskstats_ops = { 660 .cmd = TASKSTATS_CMD_GET, 661 .doit = taskstats_user_cmd, 662 .policy = taskstats_cmd_get_policy, 663 .flags = GENL_ADMIN_PERM, 664 }; 665 666 static struct genl_ops cgroupstats_ops = { 667 .cmd = CGROUPSTATS_CMD_GET, 668 .doit = cgroupstats_user_cmd, 669 .policy = cgroupstats_cmd_get_policy, 670 }; 671 672 /* Needed early in initialization */ 673 void __init taskstats_init_early(void) 674 { 675 unsigned int i; 676 677 taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); 678 for_each_possible_cpu(i) { 679 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 680 init_rwsem(&(per_cpu(listener_array, i).sem)); 681 } 682 } 683 684 static int __init taskstats_init(void) 685 { 686 int rc; 687 688 rc = genl_register_family(&family); 689 if (rc) 690 return rc; 691 692 rc = genl_register_ops(&family, &taskstats_ops); 693 if (rc < 0) 694 goto err; 695 696 rc = genl_register_ops(&family, &cgroupstats_ops); 697 if (rc < 0) 698 goto err_cgroup_ops; 699 700 family_registered = 1; 701 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 702 return 0; 703 err_cgroup_ops: 704 genl_unregister_ops(&family, &taskstats_ops); 705 err: 706 genl_unregister_family(&family); 707 return rc; 708 } 709 710 /* 711 * late initcall ensures initialization of statistics collection 712 * mechanisms precedes initialization of the taskstats interface 713 */ 714 late_initcall(taskstats_init); 715