1 /* 2 * taskstats.c - Export per-task statistics to userland 3 * 4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 5 * (C) Balbir Singh, IBM Corp. 2006 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 */ 18 19 #include <linux/kernel.h> 20 #include <linux/taskstats_kern.h> 21 #include <linux/tsacct_kern.h> 22 #include <linux/delayacct.h> 23 #include <linux/cpumask.h> 24 #include <linux/percpu.h> 25 #include <linux/slab.h> 26 #include <linux/cgroupstats.h> 27 #include <linux/cgroup.h> 28 #include <linux/fs.h> 29 #include <linux/file.h> 30 #include <net/genetlink.h> 31 #include <asm/atomic.h> 32 33 /* 34 * Maximum length of a cpumask that can be specified in 35 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute 36 */ 37 #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 38 39 static DEFINE_PER_CPU(__u32, taskstats_seqnum); 40 static int family_registered; 41 struct kmem_cache *taskstats_cache; 42 43 static struct genl_family family = { 44 .id = GENL_ID_GENERATE, 45 .name = TASKSTATS_GENL_NAME, 46 .version = TASKSTATS_GENL_VERSION, 47 .maxattr = TASKSTATS_CMD_ATTR_MAX, 48 }; 49 50 static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { 51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 55 56 static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { 57 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 58 }; 59 60 struct listener { 61 struct list_head list; 62 pid_t pid; 63 char valid; 64 }; 65 66 struct listener_list { 67 struct rw_semaphore sem; 68 struct list_head list; 69 }; 70 static DEFINE_PER_CPU(struct listener_list, listener_array); 71 72 enum actions { 73 REGISTER, 74 DEREGISTER, 75 CPU_DONT_CARE 76 }; 77 78 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 79 size_t size) 80 { 81 struct sk_buff *skb; 82 void *reply; 83 84 /* 85 * If new attributes are added, please revisit this allocation 86 */ 87 skb = genlmsg_new(size, GFP_KERNEL); 88 if (!skb) 89 return -ENOMEM; 90 91 if (!info) { 92 int seq = this_cpu_inc_return(taskstats_seqnum) - 1; 93 94 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 95 } else 96 reply = genlmsg_put_reply(skb, info, &family, 0, cmd); 97 if (reply == NULL) { 98 nlmsg_free(skb); 99 return -EINVAL; 100 } 101 102 *skbp = skb; 103 return 0; 104 } 105 106 /* 107 * Send taskstats data in @skb to listener with nl_pid @pid 108 */ 109 static int send_reply(struct sk_buff *skb, struct genl_info *info) 110 { 111 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 112 void *reply = genlmsg_data(genlhdr); 113 int rc; 114 115 rc = genlmsg_end(skb, reply); 116 if (rc < 0) { 117 nlmsg_free(skb); 118 return rc; 119 } 120 121 return genlmsg_reply(skb, info); 122 } 123 124 /* 125 * Send taskstats data in @skb to listeners registered for @cpu's exit data 126 */ 127 static void send_cpu_listeners(struct sk_buff *skb, 128 struct listener_list *listeners) 129 { 130 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 131 struct listener *s, *tmp; 132 struct sk_buff *skb_next, *skb_cur = skb; 133 void *reply = genlmsg_data(genlhdr); 134 int rc, delcount = 0; 135 136 rc = genlmsg_end(skb, reply); 137 if (rc < 0) { 138 nlmsg_free(skb); 139 return; 140 } 141 142 rc = 0; 143 down_read(&listeners->sem); 144 list_for_each_entry(s, &listeners->list, list) { 145 skb_next = NULL; 146 if (!list_is_last(&s->list, &listeners->list)) { 147 skb_next = skb_clone(skb_cur, GFP_KERNEL); 148 if (!skb_next) 149 break; 150 } 151 rc = genlmsg_unicast(&init_net, skb_cur, s->pid); 152 if (rc == -ECONNREFUSED) { 153 s->valid = 0; 154 delcount++; 155 } 156 skb_cur = skb_next; 157 } 158 up_read(&listeners->sem); 159 160 if (skb_cur) 161 nlmsg_free(skb_cur); 162 163 if (!delcount) 164 return; 165 166 /* Delete invalidated entries */ 167 down_write(&listeners->sem); 168 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 169 if (!s->valid) { 170 list_del(&s->list); 171 kfree(s); 172 } 173 } 174 up_write(&listeners->sem); 175 } 176 177 static void fill_stats(struct task_struct *tsk, struct taskstats *stats) 178 { 179 memset(stats, 0, sizeof(*stats)); 180 /* 181 * Each accounting subsystem adds calls to its functions to 182 * fill in relevant parts of struct taskstsats as follows 183 * 184 * per-task-foo(stats, tsk); 185 */ 186 187 delayacct_add_tsk(stats, tsk); 188 189 /* fill in basic acct fields */ 190 stats->version = TASKSTATS_VERSION; 191 stats->nvcsw = tsk->nvcsw; 192 stats->nivcsw = tsk->nivcsw; 193 bacct_add_tsk(stats, tsk); 194 195 /* fill in extended acct fields */ 196 xacct_add_tsk(stats, tsk); 197 } 198 199 static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) 200 { 201 struct task_struct *tsk; 202 203 rcu_read_lock(); 204 tsk = find_task_by_vpid(pid); 205 if (tsk) 206 get_task_struct(tsk); 207 rcu_read_unlock(); 208 if (!tsk) 209 return -ESRCH; 210 fill_stats(tsk, stats); 211 put_task_struct(tsk); 212 return 0; 213 } 214 215 static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) 216 { 217 struct task_struct *tsk, *first; 218 unsigned long flags; 219 int rc = -ESRCH; 220 221 /* 222 * Add additional stats from live tasks except zombie thread group 223 * leaders who are already counted with the dead tasks 224 */ 225 rcu_read_lock(); 226 first = find_task_by_vpid(tgid); 227 228 if (!first || !lock_task_sighand(first, &flags)) 229 goto out; 230 231 if (first->signal->stats) 232 memcpy(stats, first->signal->stats, sizeof(*stats)); 233 else 234 memset(stats, 0, sizeof(*stats)); 235 236 tsk = first; 237 do { 238 if (tsk->exit_state) 239 continue; 240 /* 241 * Accounting subsystem can call its functions here to 242 * fill in relevant parts of struct taskstsats as follows 243 * 244 * per-task-foo(stats, tsk); 245 */ 246 delayacct_add_tsk(stats, tsk); 247 248 stats->nvcsw += tsk->nvcsw; 249 stats->nivcsw += tsk->nivcsw; 250 } while_each_thread(first, tsk); 251 252 unlock_task_sighand(first, &flags); 253 rc = 0; 254 out: 255 rcu_read_unlock(); 256 257 stats->version = TASKSTATS_VERSION; 258 /* 259 * Accounting subsystems can also add calls here to modify 260 * fields of taskstats. 261 */ 262 return rc; 263 } 264 265 static void fill_tgid_exit(struct task_struct *tsk) 266 { 267 unsigned long flags; 268 269 spin_lock_irqsave(&tsk->sighand->siglock, flags); 270 if (!tsk->signal->stats) 271 goto ret; 272 273 /* 274 * Each accounting subsystem calls its functions here to 275 * accumalate its per-task stats for tsk, into the per-tgid structure 276 * 277 * per-task-foo(tsk->signal->stats, tsk); 278 */ 279 delayacct_add_tsk(tsk->signal->stats, tsk); 280 ret: 281 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 282 return; 283 } 284 285 static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 286 { 287 struct listener_list *listeners; 288 struct listener *s, *tmp; 289 unsigned int cpu; 290 291 if (!cpumask_subset(mask, cpu_possible_mask)) 292 return -EINVAL; 293 294 if (isadd == REGISTER) { 295 for_each_cpu(cpu, mask) { 296 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 297 cpu_to_node(cpu)); 298 if (!s) 299 goto cleanup; 300 s->pid = pid; 301 INIT_LIST_HEAD(&s->list); 302 s->valid = 1; 303 304 listeners = &per_cpu(listener_array, cpu); 305 down_write(&listeners->sem); 306 list_add(&s->list, &listeners->list); 307 up_write(&listeners->sem); 308 } 309 return 0; 310 } 311 312 /* Deregister or cleanup */ 313 cleanup: 314 for_each_cpu(cpu, mask) { 315 listeners = &per_cpu(listener_array, cpu); 316 down_write(&listeners->sem); 317 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 318 if (s->pid == pid) { 319 list_del(&s->list); 320 kfree(s); 321 break; 322 } 323 } 324 up_write(&listeners->sem); 325 } 326 return 0; 327 } 328 329 static int parse(struct nlattr *na, struct cpumask *mask) 330 { 331 char *data; 332 int len; 333 int ret; 334 335 if (na == NULL) 336 return 1; 337 len = nla_len(na); 338 if (len > TASKSTATS_CPUMASK_MAXLEN) 339 return -E2BIG; 340 if (len < 1) 341 return -EINVAL; 342 data = kmalloc(len, GFP_KERNEL); 343 if (!data) 344 return -ENOMEM; 345 nla_strlcpy(data, na, len); 346 ret = cpulist_parse(data, mask); 347 kfree(data); 348 return ret; 349 } 350 351 #if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 352 #define TASKSTATS_NEEDS_PADDING 1 353 #endif 354 355 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 356 { 357 struct nlattr *na, *ret; 358 int aggr; 359 360 aggr = (type == TASKSTATS_TYPE_PID) 361 ? TASKSTATS_TYPE_AGGR_PID 362 : TASKSTATS_TYPE_AGGR_TGID; 363 364 /* 365 * The taskstats structure is internally aligned on 8 byte 366 * boundaries but the layout of the aggregrate reply, with 367 * two NLA headers and the pid (each 4 bytes), actually 368 * force the entire structure to be unaligned. This causes 369 * the kernel to issue unaligned access warnings on some 370 * architectures like ia64. Unfortunately, some software out there 371 * doesn't properly unroll the NLA packet and assumes that the start 372 * of the taskstats structure will always be 20 bytes from the start 373 * of the netlink payload. Aligning the start of the taskstats 374 * structure breaks this software, which we don't want. So, for now 375 * the alignment only happens on architectures that require it 376 * and those users will have to update to fixed versions of those 377 * packages. Space is reserved in the packet only when needed. 378 * This ifdef should be removed in several years e.g. 2012 once 379 * we can be confident that fixed versions are installed on most 380 * systems. We add the padding before the aggregate since the 381 * aggregate is already a defined type. 382 */ 383 #ifdef TASKSTATS_NEEDS_PADDING 384 if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) 385 goto err; 386 #endif 387 na = nla_nest_start(skb, aggr); 388 if (!na) 389 goto err; 390 391 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 392 goto err; 393 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 394 if (!ret) 395 goto err; 396 nla_nest_end(skb, na); 397 398 return nla_data(ret); 399 err: 400 return NULL; 401 } 402 403 static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 404 { 405 int rc = 0; 406 struct sk_buff *rep_skb; 407 struct cgroupstats *stats; 408 struct nlattr *na; 409 size_t size; 410 u32 fd; 411 struct file *file; 412 int fput_needed; 413 414 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 415 if (!na) 416 return -EINVAL; 417 418 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 419 file = fget_light(fd, &fput_needed); 420 if (!file) 421 return 0; 422 423 size = nla_total_size(sizeof(struct cgroupstats)); 424 425 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 426 size); 427 if (rc < 0) 428 goto err; 429 430 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 431 sizeof(struct cgroupstats)); 432 stats = nla_data(na); 433 memset(stats, 0, sizeof(*stats)); 434 435 rc = cgroupstats_build(stats, file->f_dentry); 436 if (rc < 0) { 437 nlmsg_free(rep_skb); 438 goto err; 439 } 440 441 rc = send_reply(rep_skb, info); 442 443 err: 444 fput_light(file, fput_needed); 445 return rc; 446 } 447 448 static int cmd_attr_register_cpumask(struct genl_info *info) 449 { 450 cpumask_var_t mask; 451 int rc; 452 453 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 454 return -ENOMEM; 455 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 456 if (rc < 0) 457 goto out; 458 rc = add_del_listener(info->snd_pid, mask, REGISTER); 459 out: 460 free_cpumask_var(mask); 461 return rc; 462 } 463 464 static int cmd_attr_deregister_cpumask(struct genl_info *info) 465 { 466 cpumask_var_t mask; 467 int rc; 468 469 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 470 return -ENOMEM; 471 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 472 if (rc < 0) 473 goto out; 474 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 475 out: 476 free_cpumask_var(mask); 477 return rc; 478 } 479 480 static size_t taskstats_packet_size(void) 481 { 482 size_t size; 483 484 size = nla_total_size(sizeof(u32)) + 485 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 486 #ifdef TASKSTATS_NEEDS_PADDING 487 size += nla_total_size(0); /* Padding for alignment */ 488 #endif 489 return size; 490 } 491 492 static int cmd_attr_pid(struct genl_info *info) 493 { 494 struct taskstats *stats; 495 struct sk_buff *rep_skb; 496 size_t size; 497 u32 pid; 498 int rc; 499 500 size = taskstats_packet_size(); 501 502 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 503 if (rc < 0) 504 return rc; 505 506 rc = -EINVAL; 507 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 508 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 509 if (!stats) 510 goto err; 511 512 rc = fill_stats_for_pid(pid, stats); 513 if (rc < 0) 514 goto err; 515 return send_reply(rep_skb, info); 516 err: 517 nlmsg_free(rep_skb); 518 return rc; 519 } 520 521 static int cmd_attr_tgid(struct genl_info *info) 522 { 523 struct taskstats *stats; 524 struct sk_buff *rep_skb; 525 size_t size; 526 u32 tgid; 527 int rc; 528 529 size = taskstats_packet_size(); 530 531 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 532 if (rc < 0) 533 return rc; 534 535 rc = -EINVAL; 536 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 537 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 538 if (!stats) 539 goto err; 540 541 rc = fill_stats_for_tgid(tgid, stats); 542 if (rc < 0) 543 goto err; 544 return send_reply(rep_skb, info); 545 err: 546 nlmsg_free(rep_skb); 547 return rc; 548 } 549 550 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 551 { 552 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) 553 return cmd_attr_register_cpumask(info); 554 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) 555 return cmd_attr_deregister_cpumask(info); 556 else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) 557 return cmd_attr_pid(info); 558 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) 559 return cmd_attr_tgid(info); 560 else 561 return -EINVAL; 562 } 563 564 static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 565 { 566 struct signal_struct *sig = tsk->signal; 567 struct taskstats *stats; 568 569 if (sig->stats || thread_group_empty(tsk)) 570 goto ret; 571 572 /* No problem if kmem_cache_zalloc() fails */ 573 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); 574 575 spin_lock_irq(&tsk->sighand->siglock); 576 if (!sig->stats) { 577 sig->stats = stats; 578 stats = NULL; 579 } 580 spin_unlock_irq(&tsk->sighand->siglock); 581 582 if (stats) 583 kmem_cache_free(taskstats_cache, stats); 584 ret: 585 return sig->stats; 586 } 587 588 /* Send pid data out on exit */ 589 void taskstats_exit(struct task_struct *tsk, int group_dead) 590 { 591 int rc; 592 struct listener_list *listeners; 593 struct taskstats *stats; 594 struct sk_buff *rep_skb; 595 size_t size; 596 int is_thread_group; 597 598 if (!family_registered) 599 return; 600 601 /* 602 * Size includes space for nested attributes 603 */ 604 size = taskstats_packet_size(); 605 606 is_thread_group = !!taskstats_tgid_alloc(tsk); 607 if (is_thread_group) { 608 /* PID + STATS + TGID + STATS */ 609 size = 2 * size; 610 /* fill the tsk->signal->stats structure */ 611 fill_tgid_exit(tsk); 612 } 613 614 listeners = __this_cpu_ptr(&listener_array); 615 if (list_empty(&listeners->list)) 616 return; 617 618 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); 619 if (rc < 0) 620 return; 621 622 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); 623 if (!stats) 624 goto err; 625 626 fill_stats(tsk, stats); 627 628 /* 629 * Doesn't matter if tsk is the leader or the last group member leaving 630 */ 631 if (!is_thread_group || !group_dead) 632 goto send; 633 634 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); 635 if (!stats) 636 goto err; 637 638 memcpy(stats, tsk->signal->stats, sizeof(*stats)); 639 640 send: 641 send_cpu_listeners(rep_skb, listeners); 642 return; 643 err: 644 nlmsg_free(rep_skb); 645 } 646 647 static struct genl_ops taskstats_ops = { 648 .cmd = TASKSTATS_CMD_GET, 649 .doit = taskstats_user_cmd, 650 .policy = taskstats_cmd_get_policy, 651 }; 652 653 static struct genl_ops cgroupstats_ops = { 654 .cmd = CGROUPSTATS_CMD_GET, 655 .doit = cgroupstats_user_cmd, 656 .policy = cgroupstats_cmd_get_policy, 657 }; 658 659 /* Needed early in initialization */ 660 void __init taskstats_init_early(void) 661 { 662 unsigned int i; 663 664 taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); 665 for_each_possible_cpu(i) { 666 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 667 init_rwsem(&(per_cpu(listener_array, i).sem)); 668 } 669 } 670 671 static int __init taskstats_init(void) 672 { 673 int rc; 674 675 rc = genl_register_family(&family); 676 if (rc) 677 return rc; 678 679 rc = genl_register_ops(&family, &taskstats_ops); 680 if (rc < 0) 681 goto err; 682 683 rc = genl_register_ops(&family, &cgroupstats_ops); 684 if (rc < 0) 685 goto err_cgroup_ops; 686 687 family_registered = 1; 688 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 689 return 0; 690 err_cgroup_ops: 691 genl_unregister_ops(&family, &taskstats_ops); 692 err: 693 genl_unregister_family(&family); 694 return rc; 695 } 696 697 /* 698 * late initcall ensures initialization of statistics collection 699 * mechanisms precedes initialization of the taskstats interface 700 */ 701 late_initcall(taskstats_init); 702