1 /* 2 * taskstats.c - Export per-task statistics to userland 3 * 4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 5 * (C) Balbir Singh, IBM Corp. 2006 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 */ 18 19 #include <linux/kernel.h> 20 #include <linux/taskstats_kern.h> 21 #include <linux/tsacct_kern.h> 22 #include <linux/delayacct.h> 23 #include <linux/cpumask.h> 24 #include <linux/percpu.h> 25 #include <linux/cgroupstats.h> 26 #include <linux/cgroup.h> 27 #include <linux/fs.h> 28 #include <linux/file.h> 29 #include <net/genetlink.h> 30 #include <asm/atomic.h> 31 32 /* 33 * Maximum length of a cpumask that can be specified in 34 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute 35 */ 36 #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 37 38 static DEFINE_PER_CPU(__u32, taskstats_seqnum); 39 static int family_registered; 40 struct kmem_cache *taskstats_cache; 41 42 static struct genl_family family = { 43 .id = GENL_ID_GENERATE, 44 .name = TASKSTATS_GENL_NAME, 45 .version = TASKSTATS_GENL_VERSION, 46 .maxattr = TASKSTATS_CMD_ATTR_MAX, 47 }; 48 49 static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 50 __read_mostly = { 51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 55 56 static struct nla_policy 57 cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = { 58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 59 }; 60 61 struct listener { 62 struct list_head list; 63 pid_t pid; 64 char valid; 65 }; 66 67 struct listener_list { 68 struct rw_semaphore sem; 69 struct list_head list; 70 }; 71 static DEFINE_PER_CPU(struct listener_list, listener_array); 72 73 enum actions { 74 REGISTER, 75 DEREGISTER, 76 CPU_DONT_CARE 77 }; 78 79 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 80 size_t size) 81 { 82 struct sk_buff *skb; 83 void *reply; 84 85 /* 86 * If new attributes are added, please revisit this allocation 87 */ 88 skb = genlmsg_new(size, GFP_KERNEL); 89 if (!skb) 90 return -ENOMEM; 91 92 if (!info) { 93 int seq = get_cpu_var(taskstats_seqnum)++; 94 put_cpu_var(taskstats_seqnum); 95 96 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 97 } else 98 reply = genlmsg_put_reply(skb, info, &family, 0, cmd); 99 if (reply == NULL) { 100 nlmsg_free(skb); 101 return -EINVAL; 102 } 103 104 *skbp = skb; 105 return 0; 106 } 107 108 /* 109 * Send taskstats data in @skb to listener with nl_pid @pid 110 */ 111 static int send_reply(struct sk_buff *skb, struct genl_info *info) 112 { 113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 114 void *reply = genlmsg_data(genlhdr); 115 int rc; 116 117 rc = genlmsg_end(skb, reply); 118 if (rc < 0) { 119 nlmsg_free(skb); 120 return rc; 121 } 122 123 return genlmsg_reply(skb, info); 124 } 125 126 /* 127 * Send taskstats data in @skb to listeners registered for @cpu's exit data 128 */ 129 static void send_cpu_listeners(struct sk_buff *skb, 130 struct listener_list *listeners) 131 { 132 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 133 struct listener *s, *tmp; 134 struct sk_buff *skb_next, *skb_cur = skb; 135 void *reply = genlmsg_data(genlhdr); 136 int rc, delcount = 0; 137 138 rc = genlmsg_end(skb, reply); 139 if (rc < 0) { 140 nlmsg_free(skb); 141 return; 142 } 143 144 rc = 0; 145 down_read(&listeners->sem); 146 list_for_each_entry(s, &listeners->list, list) { 147 skb_next = NULL; 148 if (!list_is_last(&s->list, &listeners->list)) { 149 skb_next = skb_clone(skb_cur, GFP_KERNEL); 150 if (!skb_next) 151 break; 152 } 153 rc = genlmsg_unicast(&init_net, skb_cur, s->pid); 154 if (rc == -ECONNREFUSED) { 155 s->valid = 0; 156 delcount++; 157 } 158 skb_cur = skb_next; 159 } 160 up_read(&listeners->sem); 161 162 if (skb_cur) 163 nlmsg_free(skb_cur); 164 165 if (!delcount) 166 return; 167 168 /* Delete invalidated entries */ 169 down_write(&listeners->sem); 170 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 171 if (!s->valid) { 172 list_del(&s->list); 173 kfree(s); 174 } 175 } 176 up_write(&listeners->sem); 177 } 178 179 static int fill_pid(pid_t pid, struct task_struct *tsk, 180 struct taskstats *stats) 181 { 182 int rc = 0; 183 184 if (!tsk) { 185 rcu_read_lock(); 186 tsk = find_task_by_vpid(pid); 187 if (tsk) 188 get_task_struct(tsk); 189 rcu_read_unlock(); 190 if (!tsk) 191 return -ESRCH; 192 } else 193 get_task_struct(tsk); 194 195 memset(stats, 0, sizeof(*stats)); 196 /* 197 * Each accounting subsystem adds calls to its functions to 198 * fill in relevant parts of struct taskstsats as follows 199 * 200 * per-task-foo(stats, tsk); 201 */ 202 203 delayacct_add_tsk(stats, tsk); 204 205 /* fill in basic acct fields */ 206 stats->version = TASKSTATS_VERSION; 207 stats->nvcsw = tsk->nvcsw; 208 stats->nivcsw = tsk->nivcsw; 209 bacct_add_tsk(stats, tsk); 210 211 /* fill in extended acct fields */ 212 xacct_add_tsk(stats, tsk); 213 214 /* Define err: label here if needed */ 215 put_task_struct(tsk); 216 return rc; 217 218 } 219 220 static int fill_tgid(pid_t tgid, struct task_struct *first, 221 struct taskstats *stats) 222 { 223 struct task_struct *tsk; 224 unsigned long flags; 225 int rc = -ESRCH; 226 227 /* 228 * Add additional stats from live tasks except zombie thread group 229 * leaders who are already counted with the dead tasks 230 */ 231 rcu_read_lock(); 232 if (!first) 233 first = find_task_by_vpid(tgid); 234 235 if (!first || !lock_task_sighand(first, &flags)) 236 goto out; 237 238 if (first->signal->stats) 239 memcpy(stats, first->signal->stats, sizeof(*stats)); 240 else 241 memset(stats, 0, sizeof(*stats)); 242 243 tsk = first; 244 do { 245 if (tsk->exit_state) 246 continue; 247 /* 248 * Accounting subsystem can call its functions here to 249 * fill in relevant parts of struct taskstsats as follows 250 * 251 * per-task-foo(stats, tsk); 252 */ 253 delayacct_add_tsk(stats, tsk); 254 255 stats->nvcsw += tsk->nvcsw; 256 stats->nivcsw += tsk->nivcsw; 257 } while_each_thread(first, tsk); 258 259 unlock_task_sighand(first, &flags); 260 rc = 0; 261 out: 262 rcu_read_unlock(); 263 264 stats->version = TASKSTATS_VERSION; 265 /* 266 * Accounting subsystems can also add calls here to modify 267 * fields of taskstats. 268 */ 269 return rc; 270 } 271 272 273 static void fill_tgid_exit(struct task_struct *tsk) 274 { 275 unsigned long flags; 276 277 spin_lock_irqsave(&tsk->sighand->siglock, flags); 278 if (!tsk->signal->stats) 279 goto ret; 280 281 /* 282 * Each accounting subsystem calls its functions here to 283 * accumalate its per-task stats for tsk, into the per-tgid structure 284 * 285 * per-task-foo(tsk->signal->stats, tsk); 286 */ 287 delayacct_add_tsk(tsk->signal->stats, tsk); 288 ret: 289 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 290 return; 291 } 292 293 static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 294 { 295 struct listener_list *listeners; 296 struct listener *s, *tmp; 297 unsigned int cpu; 298 299 if (!cpumask_subset(mask, cpu_possible_mask)) 300 return -EINVAL; 301 302 if (isadd == REGISTER) { 303 for_each_cpu(cpu, mask) { 304 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 305 cpu_to_node(cpu)); 306 if (!s) 307 goto cleanup; 308 s->pid = pid; 309 INIT_LIST_HEAD(&s->list); 310 s->valid = 1; 311 312 listeners = &per_cpu(listener_array, cpu); 313 down_write(&listeners->sem); 314 list_add(&s->list, &listeners->list); 315 up_write(&listeners->sem); 316 } 317 return 0; 318 } 319 320 /* Deregister or cleanup */ 321 cleanup: 322 for_each_cpu(cpu, mask) { 323 listeners = &per_cpu(listener_array, cpu); 324 down_write(&listeners->sem); 325 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 326 if (s->pid == pid) { 327 list_del(&s->list); 328 kfree(s); 329 break; 330 } 331 } 332 up_write(&listeners->sem); 333 } 334 return 0; 335 } 336 337 static int parse(struct nlattr *na, struct cpumask *mask) 338 { 339 char *data; 340 int len; 341 int ret; 342 343 if (na == NULL) 344 return 1; 345 len = nla_len(na); 346 if (len > TASKSTATS_CPUMASK_MAXLEN) 347 return -E2BIG; 348 if (len < 1) 349 return -EINVAL; 350 data = kmalloc(len, GFP_KERNEL); 351 if (!data) 352 return -ENOMEM; 353 nla_strlcpy(data, na, len); 354 ret = cpulist_parse(data, mask); 355 kfree(data); 356 return ret; 357 } 358 359 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 360 { 361 struct nlattr *na, *ret; 362 int aggr; 363 364 aggr = (type == TASKSTATS_TYPE_PID) 365 ? TASKSTATS_TYPE_AGGR_PID 366 : TASKSTATS_TYPE_AGGR_TGID; 367 368 na = nla_nest_start(skb, aggr); 369 if (!na) 370 goto err; 371 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 372 goto err; 373 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 374 if (!ret) 375 goto err; 376 nla_nest_end(skb, na); 377 378 return nla_data(ret); 379 err: 380 return NULL; 381 } 382 383 static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 384 { 385 int rc = 0; 386 struct sk_buff *rep_skb; 387 struct cgroupstats *stats; 388 struct nlattr *na; 389 size_t size; 390 u32 fd; 391 struct file *file; 392 int fput_needed; 393 394 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 395 if (!na) 396 return -EINVAL; 397 398 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 399 file = fget_light(fd, &fput_needed); 400 if (!file) 401 return 0; 402 403 size = nla_total_size(sizeof(struct cgroupstats)); 404 405 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 406 size); 407 if (rc < 0) 408 goto err; 409 410 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 411 sizeof(struct cgroupstats)); 412 stats = nla_data(na); 413 memset(stats, 0, sizeof(*stats)); 414 415 rc = cgroupstats_build(stats, file->f_dentry); 416 if (rc < 0) { 417 nlmsg_free(rep_skb); 418 goto err; 419 } 420 421 rc = send_reply(rep_skb, info); 422 423 err: 424 fput_light(file, fput_needed); 425 return rc; 426 } 427 428 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 429 { 430 int rc; 431 struct sk_buff *rep_skb; 432 struct taskstats *stats; 433 size_t size; 434 cpumask_var_t mask; 435 436 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 437 return -ENOMEM; 438 439 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 440 if (rc < 0) 441 goto free_return_rc; 442 if (rc == 0) { 443 rc = add_del_listener(info->snd_pid, mask, REGISTER); 444 goto free_return_rc; 445 } 446 447 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 448 if (rc < 0) 449 goto free_return_rc; 450 if (rc == 0) { 451 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 452 free_return_rc: 453 free_cpumask_var(mask); 454 return rc; 455 } 456 free_cpumask_var(mask); 457 458 /* 459 * Size includes space for nested attributes 460 */ 461 size = nla_total_size(sizeof(u32)) + 462 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 463 464 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 465 if (rc < 0) 466 return rc; 467 468 rc = -EINVAL; 469 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 470 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 471 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 472 if (!stats) 473 goto err; 474 475 rc = fill_pid(pid, NULL, stats); 476 if (rc < 0) 477 goto err; 478 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 479 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 480 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 481 if (!stats) 482 goto err; 483 484 rc = fill_tgid(tgid, NULL, stats); 485 if (rc < 0) 486 goto err; 487 } else 488 goto err; 489 490 return send_reply(rep_skb, info); 491 err: 492 nlmsg_free(rep_skb); 493 return rc; 494 } 495 496 static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 497 { 498 struct signal_struct *sig = tsk->signal; 499 struct taskstats *stats; 500 501 if (sig->stats || thread_group_empty(tsk)) 502 goto ret; 503 504 /* No problem if kmem_cache_zalloc() fails */ 505 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); 506 507 spin_lock_irq(&tsk->sighand->siglock); 508 if (!sig->stats) { 509 sig->stats = stats; 510 stats = NULL; 511 } 512 spin_unlock_irq(&tsk->sighand->siglock); 513 514 if (stats) 515 kmem_cache_free(taskstats_cache, stats); 516 ret: 517 return sig->stats; 518 } 519 520 /* Send pid data out on exit */ 521 void taskstats_exit(struct task_struct *tsk, int group_dead) 522 { 523 int rc; 524 struct listener_list *listeners; 525 struct taskstats *stats; 526 struct sk_buff *rep_skb; 527 size_t size; 528 int is_thread_group; 529 530 if (!family_registered) 531 return; 532 533 /* 534 * Size includes space for nested attributes 535 */ 536 size = nla_total_size(sizeof(u32)) + 537 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 538 539 is_thread_group = !!taskstats_tgid_alloc(tsk); 540 if (is_thread_group) { 541 /* PID + STATS + TGID + STATS */ 542 size = 2 * size; 543 /* fill the tsk->signal->stats structure */ 544 fill_tgid_exit(tsk); 545 } 546 547 listeners = &__raw_get_cpu_var(listener_array); 548 if (list_empty(&listeners->list)) 549 return; 550 551 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); 552 if (rc < 0) 553 return; 554 555 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); 556 if (!stats) 557 goto err; 558 559 rc = fill_pid(-1, tsk, stats); 560 if (rc < 0) 561 goto err; 562 563 /* 564 * Doesn't matter if tsk is the leader or the last group member leaving 565 */ 566 if (!is_thread_group || !group_dead) 567 goto send; 568 569 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); 570 if (!stats) 571 goto err; 572 573 memcpy(stats, tsk->signal->stats, sizeof(*stats)); 574 575 send: 576 send_cpu_listeners(rep_skb, listeners); 577 return; 578 err: 579 nlmsg_free(rep_skb); 580 } 581 582 static struct genl_ops taskstats_ops = { 583 .cmd = TASKSTATS_CMD_GET, 584 .doit = taskstats_user_cmd, 585 .policy = taskstats_cmd_get_policy, 586 }; 587 588 static struct genl_ops cgroupstats_ops = { 589 .cmd = CGROUPSTATS_CMD_GET, 590 .doit = cgroupstats_user_cmd, 591 .policy = cgroupstats_cmd_get_policy, 592 }; 593 594 /* Needed early in initialization */ 595 void __init taskstats_init_early(void) 596 { 597 unsigned int i; 598 599 taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); 600 for_each_possible_cpu(i) { 601 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 602 init_rwsem(&(per_cpu(listener_array, i).sem)); 603 } 604 } 605 606 static int __init taskstats_init(void) 607 { 608 int rc; 609 610 rc = genl_register_family(&family); 611 if (rc) 612 return rc; 613 614 rc = genl_register_ops(&family, &taskstats_ops); 615 if (rc < 0) 616 goto err; 617 618 rc = genl_register_ops(&family, &cgroupstats_ops); 619 if (rc < 0) 620 goto err_cgroup_ops; 621 622 family_registered = 1; 623 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 624 return 0; 625 err_cgroup_ops: 626 genl_unregister_ops(&family, &taskstats_ops); 627 err: 628 genl_unregister_family(&family); 629 return rc; 630 } 631 632 /* 633 * late initcall ensures initialization of statistics collection 634 * mechanisms precedes initialization of the taskstats interface 635 */ 636 late_initcall(taskstats_init); 637