/*
 * taskstats.c - Export per-task statistics to userland
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 *           (C) Balbir Singh,   IBM Corp. 2006
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
#include <linux/tsacct_kern.h>
#include <linux/delayacct.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/cgroupstats.h>
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <net/genetlink.h>
#include <asm/atomic.h>

/*
 * Maximum length of a cpumask that can be specified in
 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
 */
#define TASKSTATS_CPUMASK_MAXLEN	(100 + 6 * NR_CPUS)

static DEFINE_PER_CPU(__u32, taskstats_seqnum);
static int family_registered;
struct kmem_cache *taskstats_cache;

static struct genl_family family = {
        .id             = GENL_ID_GENERATE,
        .name           = TASKSTATS_GENL_NAME,
        .version        = TASKSTATS_GENL_VERSION,
        .maxattr        = TASKSTATS_CMD_ATTR_MAX,
};

static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
__read_mostly = {
        [TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
        [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
        [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
        [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },
};

static struct nla_policy
cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
        [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
};

struct listener {
        struct list_head list;
        pid_t pid;
        char valid;
};

struct listener_list {
        struct rw_semaphore sem;
        struct list_head list;
};
static DEFINE_PER_CPU(struct listener_list, listener_array);

enum actions {
        REGISTER,
        DEREGISTER,
        CPU_DONT_CARE
};

static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
                                size_t size)
{
        struct sk_buff *skb;
        void *reply;

        /*
         * If new attributes are added, please revisit this allocation
         */
        skb = genlmsg_new(size, GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        if (!info) {
                int seq = get_cpu_var(taskstats_seqnum)++;
                put_cpu_var(taskstats_seqnum);

                reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
        } else
                reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
        if (reply == NULL) {
                nlmsg_free(skb);
                return -EINVAL;
        }

        *skbp = skb;
        return 0;
}

/*
 * Send taskstats data in @skb to listener with nl_pid @pid
 */
static int send_reply(struct sk_buff *skb, pid_t pid)
{
        struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
        void *reply = genlmsg_data(genlhdr);
        int rc;

        rc = genlmsg_end(skb, reply);
        if (rc < 0) {
                nlmsg_free(skb);
                return rc;
        }

        return genlmsg_unicast(skb, pid);
}
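/*
 * The two helpers above build and send replies on the kernel side.  For
 * orientation, a userspace requester might issue the matching query like
 * this (illustrative sketch only, not part of this file; assumes
 * libnl-genl-3 and omits error handling):
 *
 *	struct nl_sock *sk = nl_socket_alloc();
 *	struct nl_msg *msg = nlmsg_alloc();
 *	int family;
 *
 *	genl_connect(sk);
 *	family = genl_ctrl_resolve(sk, TASKSTATS_GENL_NAME);
 *	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
 *		    TASKSTATS_CMD_GET, TASKSTATS_GENL_VERSION);
 *	nla_put_u32(msg, TASKSTATS_CMD_ATTR_PID, getpid());
 *	nl_send_auto(sk, msg);
 *	// the TASKSTATS_CMD_NEW reply then arrives via nl_recvmsgs_default()
 */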
/*
 * Send taskstats data in @skb to listeners registered for @cpu's exit data
 */
static void send_cpu_listeners(struct sk_buff *skb,
                                        struct listener_list *listeners)
{
        struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
        struct listener *s, *tmp;
        struct sk_buff *skb_next, *skb_cur = skb;
        void *reply = genlmsg_data(genlhdr);
        int rc, delcount = 0;

        rc = genlmsg_end(skb, reply);
        if (rc < 0) {
                nlmsg_free(skb);
                return;
        }

        rc = 0;
        down_read(&listeners->sem);
        list_for_each_entry(s, &listeners->list, list) {
                skb_next = NULL;
                if (!list_is_last(&s->list, &listeners->list)) {
                        skb_next = skb_clone(skb_cur, GFP_KERNEL);
                        if (!skb_next)
                                break;
                }
                rc = genlmsg_unicast(skb_cur, s->pid);
                if (rc == -ECONNREFUSED) {
                        s->valid = 0;
                        delcount++;
                }
                skb_cur = skb_next;
        }
        up_read(&listeners->sem);

        if (skb_cur)
                nlmsg_free(skb_cur);

        if (!delcount)
                return;

        /* Delete invalidated entries */
        down_write(&listeners->sem);
        list_for_each_entry_safe(s, tmp, &listeners->list, list) {
                if (!s->valid) {
                        list_del(&s->list);
                        kfree(s);
                }
        }
        up_write(&listeners->sem);
}

static int fill_pid(pid_t pid, struct task_struct *tsk,
                struct taskstats *stats)
{
        int rc = 0;

        if (!tsk) {
                rcu_read_lock();
                tsk = find_task_by_vpid(pid);
                if (tsk)
                        get_task_struct(tsk);
                rcu_read_unlock();
                if (!tsk)
                        return -ESRCH;
        } else
                get_task_struct(tsk);

        memset(stats, 0, sizeof(*stats));
        /*
         * Each accounting subsystem adds calls to its functions to
         * fill in relevant parts of struct taskstats as follows
         *
         *	per-task-foo(stats, tsk);
         */

        delayacct_add_tsk(stats, tsk);

        /* fill in basic acct fields */
        stats->version = TASKSTATS_VERSION;
        stats->nvcsw = tsk->nvcsw;
        stats->nivcsw = tsk->nivcsw;
        bacct_add_tsk(stats, tsk);

        /* fill in extended acct fields */
        xacct_add_tsk(stats, tsk);

        /* Define err: label here if needed */
        put_task_struct(tsk);
        return rc;
}
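/*
 * For clarity, the "per-task-foo(stats, tsk)" placeholder in the comment
 * above denotes a hook of roughly the following shape.  This is an
 * illustrative sketch only: per_task_foo is a made-up name, and
 * delayacct_add_tsk()/bacct_add_tsk()/xacct_add_tsk() are the real
 * in-tree instances of the pattern.
 *
 *	static void per_task_foo(struct taskstats *stats,
 *				 struct task_struct *tsk)
 *	{
 *		// copy only the fields this subsystem owns; fill_pid()
 *		// has already zeroed the whole structure
 *		stats->ac_minflt = tsk->min_flt;
 *		stats->ac_majflt = tsk->maj_flt;
 *	}
 */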
static int fill_tgid(pid_t tgid, struct task_struct *first,
                struct taskstats *stats)
{
        struct task_struct *tsk;
        unsigned long flags;
        int rc = -ESRCH;

        /*
         * Add additional stats from live tasks except zombie thread group
         * leaders who are already counted with the dead tasks
         */
        rcu_read_lock();
        if (!first)
                first = find_task_by_vpid(tgid);

        if (!first || !lock_task_sighand(first, &flags))
                goto out;

        if (first->signal->stats)
                memcpy(stats, first->signal->stats, sizeof(*stats));
        else
                memset(stats, 0, sizeof(*stats));

        tsk = first;
        do {
                if (tsk->exit_state)
                        continue;
                /*
                 * Accounting subsystem can call its functions here to
                 * fill in relevant parts of struct taskstats as follows
                 *
                 *	per-task-foo(stats, tsk);
                 */
                delayacct_add_tsk(stats, tsk);

                stats->nvcsw += tsk->nvcsw;
                stats->nivcsw += tsk->nivcsw;
        } while_each_thread(first, tsk);

        unlock_task_sighand(first, &flags);
        rc = 0;
out:
        rcu_read_unlock();

        stats->version = TASKSTATS_VERSION;
        /*
         * Accounting subsystems can also add calls here to modify
         * fields of taskstats.
         */
        return rc;
}

static void fill_tgid_exit(struct task_struct *tsk)
{
        unsigned long flags;

        spin_lock_irqsave(&tsk->sighand->siglock, flags);
        if (!tsk->signal->stats)
                goto ret;

        /*
         * Each accounting subsystem calls its functions here to
         * accumulate its per-task stats for tsk, into the per-tgid structure
         *
         *	per-task-foo(tsk->signal->stats, tsk);
         */
        delayacct_add_tsk(tsk->signal->stats, tsk);
ret:
        spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
        return;
}

static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
{
        struct listener_list *listeners;
        struct listener *s, *tmp;
        unsigned int cpu;
        cpumask_t mask = *maskp;

        if (!cpus_subset(mask, cpu_possible_map))
                return -EINVAL;

        if (isadd == REGISTER) {
                for_each_cpu_mask_nr(cpu, mask) {
                        s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
                                         cpu_to_node(cpu));
                        if (!s)
                                goto cleanup;
                        s->pid = pid;
                        INIT_LIST_HEAD(&s->list);
                        s->valid = 1;

                        listeners = &per_cpu(listener_array, cpu);
                        down_write(&listeners->sem);
                        list_add(&s->list, &listeners->list);
                        up_write(&listeners->sem);
                }
                return 0;
        }

        /* Deregister or cleanup */
cleanup:
        for_each_cpu_mask_nr(cpu, mask) {
                listeners = &per_cpu(listener_array, cpu);
                down_write(&listeners->sem);
                list_for_each_entry_safe(s, tmp, &listeners->list, list) {
                        if (s->pid == pid) {
                                list_del(&s->list);
                                kfree(s);
                                break;
                        }
                }
                up_write(&listeners->sem);
        }
        return 0;
}

static int parse(struct nlattr *na, cpumask_t *mask)
{
        char *data;
        int len;
        int ret;

        if (na == NULL)
                return 1;
        len = nla_len(na);
        if (len > TASKSTATS_CPUMASK_MAXLEN)
                return -E2BIG;
        if (len < 1)
                return -EINVAL;
        data = kmalloc(len, GFP_KERNEL);
        if (!data)
                return -ENOMEM;
        nla_strlcpy(data, na, len);
        ret = cpulist_parse(data, *mask);
        kfree(data);
        return ret;
}
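/*
 * parse() accepts the standard kernel cpulist syntax: comma-separated
 * decimal CPU numbers and ranges, e.g. "0", "0-3" or "0-2,5,7-8".  A
 * registration request therefore just carries that string in a string
 * attribute (illustrative userspace fragment, assuming libnl-genl-3 and
 * a message built as in the earlier sketch; not part of this file):
 *
 *	nla_put_string(msg, TASKSTATS_CMD_ATTR_REGISTER_CPUMASK, "0-2,5");
 *
 * Deregistration is symmetric via TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK.
 */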
static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
{
        struct nlattr *na, *ret;
        int aggr;

        aggr = (type == TASKSTATS_TYPE_PID)
                        ? TASKSTATS_TYPE_AGGR_PID
                        : TASKSTATS_TYPE_AGGR_TGID;

        na = nla_nest_start(skb, aggr);
        if (!na)
                goto err;
        if (nla_put(skb, type, sizeof(pid), &pid) < 0)
                goto err;
        ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
        if (!ret)
                goto err;
        nla_nest_end(skb, na);

        return nla_data(ret);
err:
        return NULL;
}

static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
        int rc = 0;
        struct sk_buff *rep_skb;
        struct cgroupstats *stats;
        struct nlattr *na;
        size_t size;
        u32 fd;
        struct file *file;
        int fput_needed;

        na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
        if (!na)
                return -EINVAL;

        fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
        file = fget_light(fd, &fput_needed);
        if (!file)
                return 0;

        size = nla_total_size(sizeof(struct cgroupstats));

        rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
                                size);
        if (rc < 0)
                goto err;

        na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
                                sizeof(struct cgroupstats));
        /* nla_reserve() can fail on a full skb; don't dereference NULL */
        if (na == NULL) {
                nlmsg_free(rep_skb);
                rc = -EMSGSIZE;
                goto err;
        }

        stats = nla_data(na);
        memset(stats, 0, sizeof(*stats));

        rc = cgroupstats_build(stats, file->f_dentry);
        if (rc < 0) {
                nlmsg_free(rep_skb);
                goto err;
        }

        rc = send_reply(rep_skb, info->snd_pid);

err:
        fput_light(file, fput_needed);
        return rc;
}

static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
        int rc = 0;
        struct sk_buff *rep_skb;
        struct taskstats *stats;
        size_t size;
        cpumask_t mask;

        rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
        if (rc < 0)
                return rc;
        if (rc == 0)
                return add_del_listener(info->snd_pid, &mask, REGISTER);

        rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
        if (rc < 0)
                return rc;
        if (rc == 0)
                return add_del_listener(info->snd_pid, &mask, DEREGISTER);

        /*
         * Size includes space for nested attributes
         */
        size = nla_total_size(sizeof(u32)) +
                nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

        rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
        if (rc < 0)
                return rc;

        rc = -EINVAL;
        if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
                u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
                stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
                if (!stats)
                        goto err;

                rc = fill_pid(pid, NULL, stats);
                if (rc < 0)
                        goto err;
        } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
                u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
                stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
                if (!stats)
                        goto err;

                rc = fill_tgid(tgid, NULL, stats);
                if (rc < 0)
                        goto err;
        } else
                goto err;

        return send_reply(rep_skb, info->snd_pid);
err:
        nlmsg_free(rep_skb);
        return rc;
}
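/*
 * The reply built by mk_reply()/taskstats_user_cmd() nests the id and the
 * stats under a single aggregate attribute:
 *
 *	TASKSTATS_TYPE_AGGR_PID (or _AGGR_TGID)
 *	  +- TASKSTATS_TYPE_PID/TGID  (u32)
 *	  +- TASKSTATS_TYPE_STATS     (struct taskstats)
 *
 * A userspace receiver might unpack it like this (illustrative sketch,
 * assuming libnl-genl-3; not part of this file):
 *
 *	struct nlattr *attrs[TASKSTATS_TYPE_MAX + 1];
 *	struct nlattr *nested[TASKSTATS_TYPE_MAX + 1];
 *	struct taskstats ts;
 *
 *	genlmsg_parse(nlmsg_hdr(msg), 0, attrs, TASKSTATS_TYPE_MAX, NULL);
 *	if (attrs[TASKSTATS_TYPE_AGGR_PID]) {
 *		nla_parse_nested(nested, TASKSTATS_TYPE_MAX,
 *				 attrs[TASKSTATS_TYPE_AGGR_PID], NULL);
 *		memcpy(&ts, nla_data(nested[TASKSTATS_TYPE_STATS]),
 *		       sizeof(ts));
 *	}
 */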
static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
        struct signal_struct *sig = tsk->signal;
        struct taskstats *stats;

        if (sig->stats || thread_group_empty(tsk))
                goto ret;

        /* No problem if kmem_cache_zalloc() fails */
        stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);

        spin_lock_irq(&tsk->sighand->siglock);
        if (!sig->stats) {
                sig->stats = stats;
                stats = NULL;
        }
        spin_unlock_irq(&tsk->sighand->siglock);

        if (stats)
                kmem_cache_free(taskstats_cache, stats);
ret:
        return sig->stats;
}

/* Send pid data out on exit */
void taskstats_exit(struct task_struct *tsk, int group_dead)
{
        int rc;
        struct listener_list *listeners;
        struct taskstats *stats;
        struct sk_buff *rep_skb;
        size_t size;
        int is_thread_group;

        if (!family_registered)
                return;

        /*
         * Size includes space for nested attributes
         */
        size = nla_total_size(sizeof(u32)) +
                nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

        is_thread_group = !!taskstats_tgid_alloc(tsk);
        if (is_thread_group) {
                /* PID + STATS + TGID + STATS */
                size = 2 * size;
                /* fill the tsk->signal->stats structure */
                fill_tgid_exit(tsk);
        }

        listeners = &__raw_get_cpu_var(listener_array);
        if (list_empty(&listeners->list))
                return;

        rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
        if (rc < 0)
                return;

        stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
        if (!stats)
                goto err;

        rc = fill_pid(-1, tsk, stats);
        if (rc < 0)
                goto err;

        /*
         * Doesn't matter if tsk is the leader or the last group member leaving
         */
        if (!is_thread_group || !group_dead)
                goto send;

        stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
        if (!stats)
                goto err;

        memcpy(stats, tsk->signal->stats, sizeof(*stats));

send:
        send_cpu_listeners(rep_skb, listeners);
        return;
err:
        nlmsg_free(rep_skb);
}

static struct genl_ops taskstats_ops = {
        .cmd            = TASKSTATS_CMD_GET,
        .doit           = taskstats_user_cmd,
        .policy         = taskstats_cmd_get_policy,
};

static struct genl_ops cgroupstats_ops = {
        .cmd            = CGROUPSTATS_CMD_GET,
        .doit           = cgroupstats_user_cmd,
        .policy         = cgroupstats_cmd_get_policy,
};

/* Needed early in initialization */
void __init taskstats_init_early(void)
{
        unsigned int i;

        taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC);
        for_each_possible_cpu(i) {
                INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
                init_rwsem(&(per_cpu(listener_array, i).sem));
        }
}

static int __init taskstats_init(void)
{
        int rc;

        rc = genl_register_family(&family);
        if (rc)
                return rc;

        rc = genl_register_ops(&family, &taskstats_ops);
        if (rc < 0)
                goto err;

        rc = genl_register_ops(&family, &cgroupstats_ops);
        if (rc < 0)
                goto err_cgroup_ops;

        family_registered = 1;
        printk(KERN_INFO "registered taskstats version %d\n",
               TASKSTATS_GENL_VERSION);
        return 0;
err_cgroup_ops:
        genl_unregister_ops(&family, &taskstats_ops);
err:
        genl_unregister_family(&family);
        return rc;
}

/*
 * late initcall ensures initialization of statistics collection
 * mechanisms precedes initialization of the taskstats interface
 */
late_initcall(taskstats_init);
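/*
 * End-to-end usage sketch (illustrative only, not part of this file): a
 * userspace listener that registers for exit data from CPUs 0-3 and then
 * receives the TASKSTATS_CMD_NEW messages pushed by taskstats_exit().
 * Assumes libnl-genl-3; error handling omitted for brevity.  The kernel
 * tree ships a complete raw-netlink example in
 * Documentation/accounting/getdelays.c.
 *
 *	#include <netlink/netlink.h>
 *	#include <netlink/genl/genl.h>
 *	#include <netlink/genl/ctrl.h>
 *	#include <linux/taskstats.h>
 *
 *	static int handle_exit_msg(struct nl_msg *msg, void *arg)
 *	{
 *		// unpack TASKSTATS_TYPE_AGGR_PID as in the sketch above
 *		return NL_OK;
 *	}
 *
 *	int main(void)
 *	{
 *		struct nl_sock *sk = nl_socket_alloc();
 *		struct nl_msg *msg;
 *		int family;
 *
 *		genl_connect(sk);
 *		family = genl_ctrl_resolve(sk, TASKSTATS_GENL_NAME);
 *
 *		// exit records carry kernel-generated sequence numbers
 *		// (see taskstats_seqnum above), so relax libnl's checks
 *		nl_socket_disable_seq_check(sk);
 *
 *		// register this socket as a listener for CPUs 0-3
 *		msg = nlmsg_alloc();
 *		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
 *			    TASKSTATS_CMD_GET, TASKSTATS_GENL_VERSION);
 *		nla_put_string(msg, TASKSTATS_CMD_ATTR_REGISTER_CPUMASK,
 *			       "0-3");
 *		nl_send_auto(sk, msg);
 *		nlmsg_free(msg);
 *
 *		// receive exit records as tasks die on those CPUs
 *		nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM,
 *				    handle_exit_msg, NULL);
 *		for (;;)
 *			nl_recvmsgs_default(sk);
 *	}
 */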