1 /* 2 * taskstats.c - Export per-task statistics to userland 3 * 4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 5 * (C) Balbir Singh, IBM Corp. 2006 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 */ 18 19 #include <linux/kernel.h> 20 #include <linux/taskstats_kern.h> 21 #include <linux/tsacct_kern.h> 22 #include <linux/delayacct.h> 23 #include <linux/cpumask.h> 24 #include <linux/percpu.h> 25 #include <linux/slab.h> 26 #include <linux/cgroupstats.h> 27 #include <linux/cgroup.h> 28 #include <linux/fs.h> 29 #include <linux/file.h> 30 #include <net/genetlink.h> 31 #include <asm/atomic.h> 32 33 /* 34 * Maximum length of a cpumask that can be specified in 35 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute 36 */ 37 #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 38 39 static DEFINE_PER_CPU(__u32, taskstats_seqnum); 40 static int family_registered; 41 struct kmem_cache *taskstats_cache; 42 43 static struct genl_family family = { 44 .id = GENL_ID_GENERATE, 45 .name = TASKSTATS_GENL_NAME, 46 .version = TASKSTATS_GENL_VERSION, 47 .maxattr = TASKSTATS_CMD_ATTR_MAX, 48 }; 49 50 static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { 51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 55 56 static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { 57 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 58 }; 59 60 struct listener { 61 struct list_head list; 62 pid_t pid; 63 char valid; 64 }; 65 66 struct listener_list { 67 struct rw_semaphore sem; 68 struct list_head list; 69 }; 70 static DEFINE_PER_CPU(struct listener_list, listener_array); 71 72 enum actions { 73 REGISTER, 74 DEREGISTER, 75 CPU_DONT_CARE 76 }; 77 78 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 79 size_t size) 80 { 81 struct sk_buff *skb; 82 void *reply; 83 84 /* 85 * If new attributes are added, please revisit this allocation 86 */ 87 skb = genlmsg_new(size, GFP_KERNEL); 88 if (!skb) 89 return -ENOMEM; 90 91 if (!info) { 92 int seq = get_cpu_var(taskstats_seqnum)++; 93 put_cpu_var(taskstats_seqnum); 94 95 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 96 } else 97 reply = genlmsg_put_reply(skb, info, &family, 0, cmd); 98 if (reply == NULL) { 99 nlmsg_free(skb); 100 return -EINVAL; 101 } 102 103 *skbp = skb; 104 return 0; 105 } 106 107 /* 108 * Send taskstats data in @skb to listener with nl_pid @pid 109 */ 110 static int send_reply(struct sk_buff *skb, struct genl_info *info) 111 { 112 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 113 void *reply = genlmsg_data(genlhdr); 114 int rc; 115 116 rc = genlmsg_end(skb, reply); 117 if (rc < 0) { 118 nlmsg_free(skb); 119 return rc; 120 } 121 122 return genlmsg_reply(skb, info); 123 } 124 125 /* 126 * Send taskstats data in @skb to listeners registered for @cpu's exit data 127 */ 128 static void send_cpu_listeners(struct sk_buff *skb, 129 struct listener_list *listeners) 130 { 131 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 132 struct listener *s, *tmp; 133 struct sk_buff *skb_next, *skb_cur = skb; 134 void *reply = genlmsg_data(genlhdr); 135 int rc, delcount = 0; 136 137 rc = genlmsg_end(skb, reply); 138 if (rc < 0) { 139 nlmsg_free(skb); 140 return; 141 } 142 143 rc = 0; 144 down_read(&listeners->sem); 145 list_for_each_entry(s, &listeners->list, list) { 146 skb_next = NULL; 147 if (!list_is_last(&s->list, &listeners->list)) { 148 skb_next = skb_clone(skb_cur, GFP_KERNEL); 149 if (!skb_next) 150 break; 151 } 152 rc = genlmsg_unicast(&init_net, skb_cur, s->pid); 153 if (rc == -ECONNREFUSED) { 154 s->valid = 0; 155 delcount++; 156 } 157 skb_cur = skb_next; 158 } 159 up_read(&listeners->sem); 160 161 if (skb_cur) 162 nlmsg_free(skb_cur); 163 164 if (!delcount) 165 return; 166 167 /* Delete invalidated entries */ 168 down_write(&listeners->sem); 169 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 170 if (!s->valid) { 171 list_del(&s->list); 172 kfree(s); 173 } 174 } 175 up_write(&listeners->sem); 176 } 177 178 static int fill_pid(pid_t pid, struct task_struct *tsk, 179 struct taskstats *stats) 180 { 181 int rc = 0; 182 183 if (!tsk) { 184 rcu_read_lock(); 185 tsk = find_task_by_vpid(pid); 186 if (tsk) 187 get_task_struct(tsk); 188 rcu_read_unlock(); 189 if (!tsk) 190 return -ESRCH; 191 } else 192 get_task_struct(tsk); 193 194 memset(stats, 0, sizeof(*stats)); 195 /* 196 * Each accounting subsystem adds calls to its functions to 197 * fill in relevant parts of struct taskstsats as follows 198 * 199 * per-task-foo(stats, tsk); 200 */ 201 202 delayacct_add_tsk(stats, tsk); 203 204 /* fill in basic acct fields */ 205 stats->version = TASKSTATS_VERSION; 206 stats->nvcsw = tsk->nvcsw; 207 stats->nivcsw = tsk->nivcsw; 208 bacct_add_tsk(stats, tsk); 209 210 /* fill in extended acct fields */ 211 xacct_add_tsk(stats, tsk); 212 213 /* Define err: label here if needed */ 214 put_task_struct(tsk); 215 return rc; 216 217 } 218 219 static int fill_tgid(pid_t tgid, struct task_struct *first, 220 struct taskstats *stats) 221 { 222 struct task_struct *tsk; 223 unsigned long flags; 224 int rc = -ESRCH; 225 226 /* 227 * Add additional stats from live tasks except zombie thread group 228 * leaders who are already counted with the dead tasks 229 */ 230 rcu_read_lock(); 231 if (!first) 232 first = find_task_by_vpid(tgid); 233 234 if (!first || !lock_task_sighand(first, &flags)) 235 goto out; 236 237 if (first->signal->stats) 238 memcpy(stats, first->signal->stats, sizeof(*stats)); 239 else 240 memset(stats, 0, sizeof(*stats)); 241 242 tsk = first; 243 do { 244 if (tsk->exit_state) 245 continue; 246 /* 247 * Accounting subsystem can call its functions here to 248 * fill in relevant parts of struct taskstsats as follows 249 * 250 * per-task-foo(stats, tsk); 251 */ 252 delayacct_add_tsk(stats, tsk); 253 254 stats->nvcsw += tsk->nvcsw; 255 stats->nivcsw += tsk->nivcsw; 256 } while_each_thread(first, tsk); 257 258 unlock_task_sighand(first, &flags); 259 rc = 0; 260 out: 261 rcu_read_unlock(); 262 263 stats->version = TASKSTATS_VERSION; 264 /* 265 * Accounting subsystems can also add calls here to modify 266 * fields of taskstats. 267 */ 268 return rc; 269 } 270 271 272 static void fill_tgid_exit(struct task_struct *tsk) 273 { 274 unsigned long flags; 275 276 spin_lock_irqsave(&tsk->sighand->siglock, flags); 277 if (!tsk->signal->stats) 278 goto ret; 279 280 /* 281 * Each accounting subsystem calls its functions here to 282 * accumalate its per-task stats for tsk, into the per-tgid structure 283 * 284 * per-task-foo(tsk->signal->stats, tsk); 285 */ 286 delayacct_add_tsk(tsk->signal->stats, tsk); 287 ret: 288 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 289 return; 290 } 291 292 static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 293 { 294 struct listener_list *listeners; 295 struct listener *s, *tmp; 296 unsigned int cpu; 297 298 if (!cpumask_subset(mask, cpu_possible_mask)) 299 return -EINVAL; 300 301 if (isadd == REGISTER) { 302 for_each_cpu(cpu, mask) { 303 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 304 cpu_to_node(cpu)); 305 if (!s) 306 goto cleanup; 307 s->pid = pid; 308 INIT_LIST_HEAD(&s->list); 309 s->valid = 1; 310 311 listeners = &per_cpu(listener_array, cpu); 312 down_write(&listeners->sem); 313 list_add(&s->list, &listeners->list); 314 up_write(&listeners->sem); 315 } 316 return 0; 317 } 318 319 /* Deregister or cleanup */ 320 cleanup: 321 for_each_cpu(cpu, mask) { 322 listeners = &per_cpu(listener_array, cpu); 323 down_write(&listeners->sem); 324 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 325 if (s->pid == pid) { 326 list_del(&s->list); 327 kfree(s); 328 break; 329 } 330 } 331 up_write(&listeners->sem); 332 } 333 return 0; 334 } 335 336 static int parse(struct nlattr *na, struct cpumask *mask) 337 { 338 char *data; 339 int len; 340 int ret; 341 342 if (na == NULL) 343 return 1; 344 len = nla_len(na); 345 if (len > TASKSTATS_CPUMASK_MAXLEN) 346 return -E2BIG; 347 if (len < 1) 348 return -EINVAL; 349 data = kmalloc(len, GFP_KERNEL); 350 if (!data) 351 return -ENOMEM; 352 nla_strlcpy(data, na, len); 353 ret = cpulist_parse(data, mask); 354 kfree(data); 355 return ret; 356 } 357 358 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 359 { 360 struct nlattr *na, *ret; 361 int aggr; 362 363 aggr = (type == TASKSTATS_TYPE_PID) 364 ? TASKSTATS_TYPE_AGGR_PID 365 : TASKSTATS_TYPE_AGGR_TGID; 366 367 na = nla_nest_start(skb, aggr); 368 if (!na) 369 goto err; 370 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 371 goto err; 372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 373 if (!ret) 374 goto err; 375 nla_nest_end(skb, na); 376 377 return nla_data(ret); 378 err: 379 return NULL; 380 } 381 382 static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 383 { 384 int rc = 0; 385 struct sk_buff *rep_skb; 386 struct cgroupstats *stats; 387 struct nlattr *na; 388 size_t size; 389 u32 fd; 390 struct file *file; 391 int fput_needed; 392 393 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 394 if (!na) 395 return -EINVAL; 396 397 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 398 file = fget_light(fd, &fput_needed); 399 if (!file) 400 return 0; 401 402 size = nla_total_size(sizeof(struct cgroupstats)); 403 404 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 405 size); 406 if (rc < 0) 407 goto err; 408 409 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 410 sizeof(struct cgroupstats)); 411 stats = nla_data(na); 412 memset(stats, 0, sizeof(*stats)); 413 414 rc = cgroupstats_build(stats, file->f_dentry); 415 if (rc < 0) { 416 nlmsg_free(rep_skb); 417 goto err; 418 } 419 420 rc = send_reply(rep_skb, info); 421 422 err: 423 fput_light(file, fput_needed); 424 return rc; 425 } 426 427 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 428 { 429 int rc; 430 struct sk_buff *rep_skb; 431 struct taskstats *stats; 432 size_t size; 433 cpumask_var_t mask; 434 435 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 436 return -ENOMEM; 437 438 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 439 if (rc < 0) 440 goto free_return_rc; 441 if (rc == 0) { 442 rc = add_del_listener(info->snd_pid, mask, REGISTER); 443 goto free_return_rc; 444 } 445 446 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 447 if (rc < 0) 448 goto free_return_rc; 449 if (rc == 0) { 450 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 451 free_return_rc: 452 free_cpumask_var(mask); 453 return rc; 454 } 455 free_cpumask_var(mask); 456 457 /* 458 * Size includes space for nested attributes 459 */ 460 size = nla_total_size(sizeof(u32)) + 461 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 462 463 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 464 if (rc < 0) 465 return rc; 466 467 rc = -EINVAL; 468 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 469 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 470 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 471 if (!stats) 472 goto err; 473 474 rc = fill_pid(pid, NULL, stats); 475 if (rc < 0) 476 goto err; 477 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 478 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 479 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 480 if (!stats) 481 goto err; 482 483 rc = fill_tgid(tgid, NULL, stats); 484 if (rc < 0) 485 goto err; 486 } else 487 goto err; 488 489 return send_reply(rep_skb, info); 490 err: 491 nlmsg_free(rep_skb); 492 return rc; 493 } 494 495 static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 496 { 497 struct signal_struct *sig = tsk->signal; 498 struct taskstats *stats; 499 500 if (sig->stats || thread_group_empty(tsk)) 501 goto ret; 502 503 /* No problem if kmem_cache_zalloc() fails */ 504 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); 505 506 spin_lock_irq(&tsk->sighand->siglock); 507 if (!sig->stats) { 508 sig->stats = stats; 509 stats = NULL; 510 } 511 spin_unlock_irq(&tsk->sighand->siglock); 512 513 if (stats) 514 kmem_cache_free(taskstats_cache, stats); 515 ret: 516 return sig->stats; 517 } 518 519 /* Send pid data out on exit */ 520 void taskstats_exit(struct task_struct *tsk, int group_dead) 521 { 522 int rc; 523 struct listener_list *listeners; 524 struct taskstats *stats; 525 struct sk_buff *rep_skb; 526 size_t size; 527 int is_thread_group; 528 529 if (!family_registered) 530 return; 531 532 /* 533 * Size includes space for nested attributes 534 */ 535 size = nla_total_size(sizeof(u32)) + 536 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 537 538 is_thread_group = !!taskstats_tgid_alloc(tsk); 539 if (is_thread_group) { 540 /* PID + STATS + TGID + STATS */ 541 size = 2 * size; 542 /* fill the tsk->signal->stats structure */ 543 fill_tgid_exit(tsk); 544 } 545 546 listeners = &__raw_get_cpu_var(listener_array); 547 if (list_empty(&listeners->list)) 548 return; 549 550 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); 551 if (rc < 0) 552 return; 553 554 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); 555 if (!stats) 556 goto err; 557 558 rc = fill_pid(-1, tsk, stats); 559 if (rc < 0) 560 goto err; 561 562 /* 563 * Doesn't matter if tsk is the leader or the last group member leaving 564 */ 565 if (!is_thread_group || !group_dead) 566 goto send; 567 568 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); 569 if (!stats) 570 goto err; 571 572 memcpy(stats, tsk->signal->stats, sizeof(*stats)); 573 574 send: 575 send_cpu_listeners(rep_skb, listeners); 576 return; 577 err: 578 nlmsg_free(rep_skb); 579 } 580 581 static struct genl_ops taskstats_ops = { 582 .cmd = TASKSTATS_CMD_GET, 583 .doit = taskstats_user_cmd, 584 .policy = taskstats_cmd_get_policy, 585 }; 586 587 static struct genl_ops cgroupstats_ops = { 588 .cmd = CGROUPSTATS_CMD_GET, 589 .doit = cgroupstats_user_cmd, 590 .policy = cgroupstats_cmd_get_policy, 591 }; 592 593 /* Needed early in initialization */ 594 void __init taskstats_init_early(void) 595 { 596 unsigned int i; 597 598 taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); 599 for_each_possible_cpu(i) { 600 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 601 init_rwsem(&(per_cpu(listener_array, i).sem)); 602 } 603 } 604 605 static int __init taskstats_init(void) 606 { 607 int rc; 608 609 rc = genl_register_family(&family); 610 if (rc) 611 return rc; 612 613 rc = genl_register_ops(&family, &taskstats_ops); 614 if (rc < 0) 615 goto err; 616 617 rc = genl_register_ops(&family, &cgroupstats_ops); 618 if (rc < 0) 619 goto err_cgroup_ops; 620 621 family_registered = 1; 622 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 623 return 0; 624 err_cgroup_ops: 625 genl_unregister_ops(&family, &taskstats_ops); 626 err: 627 genl_unregister_family(&family); 628 return rc; 629 } 630 631 /* 632 * late initcall ensures initialization of statistics collection 633 * mechanisms precedes initialization of the taskstats interface 634 */ 635 late_initcall(taskstats_init); 636