// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple CPU accounting cgroup controller
 */

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in vtime_account, on corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in other CPU reading this CPU's irq time and can
 * race with irq/vtime_account on this CPU. We would either get old
 * or new value with a side effect of accounting a slice of irq time to the
 * wrong task when irq is in progress while we read rq->clock. That is a
 * worthy compromise in place of having locks on each irq in
 * account_system_time.
 */
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);

static int sched_clock_irqtime;

void enable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 1;
}

void disable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 0;
}

static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
				  enum cpu_usage_stat idx)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	u64_stats_update_begin(&irqtime->sync);
	cpustat[idx] += delta;
	irqtime->total += delta;
	irqtime->tick_delta += delta;
	u64_stats_update_end(&irqtime->sync);
}

/*
 * Called after incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
{
	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
	unsigned int pc;
	s64 delta;
	int cpu;

	if (!sched_clock_irqtime)
		return;

	cpu = smp_processor_id();
	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
	irqtime->irq_start_time += delta;
	pc = irq_count() - offset;

	/*
	 * We do not account for softirq time from ksoftirqd here.
	 * We want to continue accounting softirq time to the ksoftirqd thread
	 * in that case, so as not to confuse the scheduler with a special task
	 * that does not consume any time but still wants to run.
	 */
	if (pc & HARDIRQ_MASK)
		irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
	else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
		irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}

static u64 irqtime_tick_accounted(u64 maxtime)
{
	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
	u64 delta;

	delta = min(irqtime->tick_delta, maxtime);
	irqtime->tick_delta -= delta;

	return delta;
}

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

#define sched_clock_irqtime	(0)

static u64 irqtime_tick_accounted(u64 dummy)
{
	return 0;
}

#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

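/*
 * For illustration only: readers on other CPUs are expected to sample
 * cpu_irqtime under the u64_stats seqcount taken by the writer above
 * (cf. irq_time_read() in kernel/sched/sched.h), roughly:
 *
 *	do {
 *		seq = u64_stats_fetch_begin(&irqtime->sync);
 *		total = irqtime->total;
 *	} while (u64_stats_fetch_retry(&irqtime->sync, seq));
 *
 * On 64-bit this compiles down to plain loads; on 32-bit it prevents torn
 * reads of the 64-bit counters updated in irqtime_account_delta().
 */
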
static inline void task_group_account_field(struct task_struct *p, int index,
					    u64 tmp)
{
	/*
	 * Since all updates are sure to touch the root cgroup, we
	 * get ourselves ahead and touch it first. If the root cgroup
	 * is the only cgroup, then nothing else should be necessary.
	 */
	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);

	cgroup_account_cputime_field(p, index, tmp);
}

/*
 * Account user CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in user space since the last update
 */
void account_user_time(struct task_struct *p, u64 cputime)
{
	int index;

	/* Add user time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);

	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

	/* Add user time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for user time used */
	acct_account_cputime(p);
}

/*
 * Account guest CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in the virtual machine since the last update
 */
void account_guest_time(struct task_struct *p, u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);
	p->gtime += cputime;

	/* Add guest time to cpustat. */
	if (task_nice(p) > 0) {
		task_group_account_field(p, CPUTIME_NICE, cputime);
		cpustat[CPUTIME_GUEST_NICE] += cputime;
	} else {
		task_group_account_field(p, CPUTIME_USER, cputime);
		cpustat[CPUTIME_GUEST] += cputime;
	}
}

/*
 * Account system CPU time to a process and desired cpustat field
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in kernel space since the last update
 * @index: the cpustat field that has to be updated
 */
void account_system_index_time(struct task_struct *p,
			       u64 cputime, enum cpu_usage_stat index)
{
	/* Add system time to process. */
	p->stime += cputime;
	account_group_system_time(p, cputime);

	/* Add system time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for system time used */
	acct_account_cputime(p);
}

/*
 * Account system CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the CPU time spent in kernel space since the last update
 */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
	int index;

	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
		account_guest_time(p, cputime);
		return;
	}

	if (hardirq_count() - hardirq_offset)
		index = CPUTIME_IRQ;
	else if (in_serving_softirq())
		index = CPUTIME_SOFTIRQ;
	else
		index = CPUTIME_SYSTEM;

	account_system_index_time(p, cputime, index);
}

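/*
 * For illustration: the periodic tick (see account_process_tick() below)
 * charges a steal-adjusted tick to the current task as
 *
 *	account_system_time(p, HARDIRQ_OFFSET, cputime);
 *
 * subtracting the tick handler's own hardirq level, so a tick that lands
 * while a nested hardirq or a softirq was already being served is charged
 * to CPUTIME_IRQ/CPUTIME_SOFTIRQ, while a tick that merely interrupted
 * normal kernel code is charged to CPUTIME_SYSTEM.
 */
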
/*
 * Account for involuntary wait time.
 * @cputime: the CPU time spent in involuntary wait
 */
void account_steal_time(u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	cpustat[CPUTIME_STEAL] += cputime;
}

/*
 * Account for idle time.
 * @cputime: the CPU time spent in idle wait
 */
void account_idle_time(u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	struct rq *rq = this_rq();

	if (atomic_read(&rq->nr_iowait) > 0)
		cpustat[CPUTIME_IOWAIT] += cputime;
	else
		cpustat[CPUTIME_IDLE] += cputime;
}

#ifdef CONFIG_SCHED_CORE
/*
 * Account for forceidle time due to core scheduling.
 *
 * REQUIRES: schedstat is enabled.
 */
void __account_forceidle_time(struct task_struct *p, u64 delta)
{
	__schedstat_add(p->stats.core_forceidle_sum, delta);

	task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
}
#endif

/*
 * When a guest is interrupted for a longer amount of time, missed clock
 * ticks are not redelivered later. Due to that, this function may on
 * occasion account more time than the calling functions think elapsed.
 */
static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		u64 steal;

		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;
		steal = min(steal, maxtime);
		account_steal_time(steal);
		this_rq()->prev_steal_time += steal;

		return steal;
	}
#endif
	return 0;
}

/*
 * Account how much elapsed time was spent in steal, irq, or softirq time.
 */
static inline u64 account_other_time(u64 max)
{
	u64 accounted;

	lockdep_assert_irqs_disabled();

	accounted = steal_account_process_time(max);

	if (accounted < max)
		accounted += irqtime_tick_accounted(max - accounted);

	return accounted;
}

#ifdef CONFIG_64BIT
static inline u64 read_sum_exec_runtime(struct task_struct *t)
{
	return t->se.sum_exec_runtime;
}
#else
static u64 read_sum_exec_runtime(struct task_struct *t)
{
	u64 ns;
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(t, &rf);
	ns = t->se.sum_exec_runtime;
	task_rq_unlock(rq, t, &rf);

	return ns;
}
#endif

/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	u64 utime, stime;
	struct task_struct *t;
	unsigned int seq, nextseq;
	unsigned long flags;

	/*
	 * Update current task runtime to account pending time since last
	 * scheduler action or thread_group_cputime() call. This thread group
	 * might have other running tasks on different CPUs, but updating
	 * their runtime can affect syscall performance, so we skip accounting
	 * those pending times and rely only on values updated on tick or
	 * other scheduler action.
	 */
	if (same_thread_group(current, tsk))
		(void) task_sched_runtime(current);

	rcu_read_lock();
	/* Attempt a lockless read on the first round. */
	nextseq = 0;
	do {
		seq = nextseq;
		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
		times->utime = sig->utime;
		times->stime = sig->stime;
		times->sum_exec_runtime = sig->sum_sched_runtime;

		for_each_thread(tsk, t) {
			task_cputime(t, &utime, &stime);
			times->utime += utime;
			times->stime += stime;
			times->sum_exec_runtime += read_sum_exec_runtime(t);
		}
		/* If lockless access failed, take the lock. */
		nextseq = 1;
	} while (need_seqretry(&sig->stats_lock, seq));
	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
	rcu_read_unlock();
}

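/*
 * For illustration: the loop above uses the "lockless first, locked on
 * retry" seqlock read pattern. Its general shape is roughly:
 *
 *	nextseq = 0;				   (even seq: lockless pass)
 *	do {
 *		seq = nextseq;
 *		read_seqbegin_or_lock(&lock, &seq);
 *		... read the protected data ...
 *		nextseq = 1;			   (odd seq: lock on a retry)
 *	} while (need_seqretry(&lock, seq));
 *	done_seqretry(&lock, seq);
 *
 * A stable snapshot normally costs no lock at all; only a concurrent
 * writer forces one locked iteration. thread_group_cputime() uses the
 * _irqsave variant of the same pattern.
 */
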
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
 * @p: the process that the CPU time gets accounted to
 * @user_tick: is the tick from userspace
 * @ticks: number of ticks being accounted
 *
 * Tick demultiplexing follows the order
 * - pending hardirq update
 * - pending softirq update
 * - user_time
 * - idle_time
 * - system time
 *   - check for guest_time
 *   - else account as system_time
 *
 * The check for hardirq is done for both system and user time as there is
 * no timer going off while we are on hardirq and hence we may never get an
 * opportunity to update it solely in system time.
 * p->stime and friends are only updated on system time and not on irq/softirq
 * as those do not count in task exec_runtime any more.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
					 int ticks)
{
	u64 other, cputime = TICK_NSEC * ticks;

	/*
	 * When returning from idle, many ticks can get accounted at
	 * once, including some ticks of steal, irq, and softirq time.
	 * Subtract those ticks from the amount of time accounted to
	 * idle, or potentially user or system time. Due to rounding,
	 * other time can exceed ticks occasionally.
	 */
	other = account_other_time(ULONG_MAX);
	if (other >= cputime)
		return;

	cputime -= other;

	if (this_cpu_ksoftirqd() == p) {
		/*
		 * ksoftirqd time does not get accounted in cpu_softirq_time.
		 * So, we have to handle it separately here.
		 * Also, p->stime needs to be updated for ksoftirqd.
		 */
		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
	} else if (user_tick) {
		account_user_time(p, cputime);
	} else if (p == this_rq()->idle) {
		account_idle_time(cputime);
	} else if (p->flags & PF_VCPU) { /* System time or guest time */
		account_guest_time(p, cputime);
	} else {
		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
	}
}

static void irqtime_account_idle_ticks(int ticks)
{
	irqtime_account_process_tick(current, 0, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
						int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

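/*
 * Worked example (illustrative numbers): waking from a 4-tick nohz idle
 * period, irqtime_account_process_tick() is invoked with ticks = 4, i.e.
 * cputime = 4 * TICK_NSEC. If account_other_time() reports 1.5 ticks worth
 * of accumulated steal/irq/softirq time, only the remaining 2.5 ticks are
 * charged to idle (or to the woken task); if it reports 4 ticks or more,
 * nothing extra is accounted at all, which is why the "other >= cputime"
 * early return above is safe.
 */
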
/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE

# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_task_switch(struct task_struct *prev)
{
	if (is_idle_task(prev))
		vtime_account_idle(prev);
	else
		vtime_account_kernel(prev);

	vtime_flush(prev);
	arch_vtime_task_switch(prev);
}
# endif

void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
	unsigned int pc = irq_count() - offset;

	if (pc & HARDIRQ_OFFSET) {
		vtime_account_hardirq(tsk);
	} else if (pc & SOFTIRQ_OFFSET) {
		vtime_account_softirq(tsk);
	} else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
		   is_idle_task(tsk)) {
		vtime_account_idle(tsk);
	} else {
		vtime_account_kernel(tsk);
	}
}

void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
		    u64 *ut, u64 *st)
{
	*ut = curr->utime;
	*st = curr->stime;
}

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	*ut = p->utime;
	*st = p->stime;
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);

	*ut = cputime.utime;
	*st = cputime.stime;
}

#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */

/*
 * Account a single tick of CPU time.
 * @p: the process that the CPU time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
	u64 cputime, steal;

	if (vtime_accounting_enabled_this_cpu())
		return;

	if (sched_clock_irqtime) {
		irqtime_account_process_tick(p, user_tick, 1);
		return;
	}

	cputime = TICK_NSEC;
	steal = steal_account_process_time(ULONG_MAX);

	if (steal >= cputime)
		return;

	cputime -= steal;

	if (user_tick)
		account_user_time(p, cputime);
	else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
		account_system_time(p, HARDIRQ_OFFSET, cputime);
	else
		account_idle_time(cputime);
}

/*
 * Account multiple ticks of idle time.
 * @ticks: number of stolen ticks
 */
void account_idle_ticks(unsigned long ticks)
{
	u64 cputime, steal;

	if (sched_clock_irqtime) {
		irqtime_account_idle_ticks(ticks);
		return;
	}

	cputime = ticks * TICK_NSEC;
	steal = steal_account_process_time(ULONG_MAX);

	if (steal >= cputime)
		return;

	cputime -= steal;
	account_idle_time(cputime);
}

/*
 * Adjust tick based cputime random precision against scheduler runtime
 * accounting.
 *
 * Tick based cputime accounting depends on the scheduling timeslices of a
 * task being interrupted or not by the timer. Depending on these
 * circumstances, the number of these interrupts may over- or under-estimate
 * the real user and system cputime, matching it only with a variable
 * precision.
 *
 * Fix this by scaling these tick based values against the total runtime
 * accounted by the CFS scheduler.
 *
 * This code provides the following guarantees:
 *
 *   stime + utime == rtime
 *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 *
 * Assuming that rtime_i+1 >= rtime_i.
 */

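/*
 * Worked example (illustrative numbers): suppose the scheduler reports
 * rtime = 10ms of real runtime, while the tick sampled 3 ticks of system
 * time and 1 tick of user time. The scaling below yields
 *
 *	stime = 3 * 10ms / (3 + 1) = 7.5ms
 *	utime = rtime - stime      = 2.5ms
 *
 * i.e. the tick-observed 3:1 ratio is preserved, but the absolute values
 * now sum exactly to the precise runtime. The clamps against prev->stime
 * and prev->utime then keep both values monotonic across calls.
 */
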
void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
		    u64 *ut, u64 *st)
{
	u64 rtime, stime, utime;
	unsigned long flags;

	/* Serialize concurrent callers such that we can honour our guarantees */
	raw_spin_lock_irqsave(&prev->lock, flags);
	rtime = curr->sum_exec_runtime;

	/*
	 * This is possible under two circumstances:
	 *  - rtime isn't monotonic after all (a bug);
	 *  - we got reordered by the lock.
	 *
	 * In both cases this acts as a filter such that the rest of the code
	 * can assume it is monotonic regardless of anything else.
	 */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	stime = curr->stime;
	utime = curr->utime;

	/*
	 * If either stime or utime is 0, assume all runtime is userspace.
	 * Once a task gets some ticks, the monotonicity code at 'update:'
	 * will ensure things converge to the observed ratio.
	 */
	if (stime == 0) {
		utime = rtime;
		goto update;
	}

	if (utime == 0) {
		stime = rtime;
		goto update;
	}

	stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);

update:
	/*
	 * Make sure stime doesn't go backwards; this preserves monotonicity
	 * for utime because rtime is monotonic.
	 *
	 *  utime_i+1 = rtime_i+1 - stime_i
	 *            = rtime_i+1 - (rtime_i - utime_i)
	 *            = (rtime_i+1 - rtime_i) + utime_i
	 *            >= utime_i
	 */
	if (stime < prev->stime)
		stime = prev->stime;
	utime = rtime - stime;

	/*
	 * Make sure utime doesn't go backwards; this still preserves
	 * monotonicity for stime, analogous argument to above.
	 */
	if (utime < prev->utime) {
		utime = prev->utime;
		stime = rtime - utime;
	}

	prev->stime = stime;
	prev->utime = utime;
out:
	*ut = prev->utime;
	*st = prev->stime;
	raw_spin_unlock_irqrestore(&prev->lock, flags);
}

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime = {
		.sum_exec_runtime = p->se.sum_exec_runtime,
	};

	if (task_cputime(p, &cputime.utime, &cputime.stime))
		cputime.sum_exec_runtime = task_sched_runtime(p);
	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);
	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static u64 vtime_delta(struct vtime *vtime)
{
	unsigned long long clock;

	clock = sched_clock();
	if (clock < vtime->starttime)
		return 0;

	return clock - vtime->starttime;
}

static u64 get_vtime_delta(struct vtime *vtime)
{
	u64 delta = vtime_delta(vtime);
	u64 other;

	/*
	 * Unlike tick based timing, vtime based timing never has lost
	 * ticks, so there is no need for steal time accounting to make up
	 * for lost ticks. Vtime accounts a rounded version of actual
	 * elapsed time. Limit account_other_time to prevent rounding
	 * errors from causing elapsed vtime to go negative.
	 */
	other = account_other_time(delta);
	WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
	vtime->starttime += delta;

	return delta - other;
}

static void vtime_account_system(struct task_struct *tsk,
				 struct vtime *vtime)
{
	vtime->stime += get_vtime_delta(vtime);
	if (vtime->stime >= TICK_NSEC) {
		account_system_time(tsk, irq_count(), vtime->stime);
		vtime->stime = 0;
	}
}

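/*
 * For illustration (assuming HZ=1000, i.e. TICK_NSEC = 1,000,000ns): a task
 * doing many short syscalls accumulates their durations in vtime->stime
 * across kernel entries; only once the running total reaches a full tick's
 * worth is it flushed into the cpustat/task counters via
 * account_system_time(), keeping the common path to roughly a clock read
 * and a couple of additions. vtime_account_guest() and vtime_user_exit()
 * below apply the same batching to guest and user time.
 */
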
static void vtime_account_guest(struct task_struct *tsk,
				struct vtime *vtime)
{
	vtime->gtime += get_vtime_delta(vtime);
	if (vtime->gtime >= TICK_NSEC) {
		account_guest_time(tsk, vtime->gtime);
		vtime->gtime = 0;
	}
}

static void __vtime_account_kernel(struct task_struct *tsk,
				   struct vtime *vtime)
{
	/* We might have scheduled out from guest path */
	if (vtime->state == VTIME_GUEST)
		vtime_account_guest(tsk, vtime);
	else
		vtime_account_system(tsk, vtime);
}

void vtime_account_kernel(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	if (!vtime_delta(vtime))
		return;

	write_seqcount_begin(&vtime->seqcount);
	__vtime_account_kernel(tsk, vtime);
	write_seqcount_end(&vtime->seqcount);
}

void vtime_user_enter(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	write_seqcount_begin(&vtime->seqcount);
	vtime_account_system(tsk, vtime);
	vtime->state = VTIME_USER;
	write_seqcount_end(&vtime->seqcount);
}

void vtime_user_exit(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	write_seqcount_begin(&vtime->seqcount);
	vtime->utime += get_vtime_delta(vtime);
	if (vtime->utime >= TICK_NSEC) {
		account_user_time(tsk, vtime->utime);
		vtime->utime = 0;
	}
	vtime->state = VTIME_SYS;
	write_seqcount_end(&vtime->seqcount);
}

void vtime_guest_enter(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;
	/*
	 * The flags must be updated under the lock with
	 * the vtime_starttime flush and update.
	 * That enforces the right ordering and update sequence
	 * synchronization against the reader (task_gtime())
	 * that can thus safely catch up with a tickless delta.
	 */
	write_seqcount_begin(&vtime->seqcount);
	vtime_account_system(tsk, vtime);
	tsk->flags |= PF_VCPU;
	vtime->state = VTIME_GUEST;
	write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);

void vtime_guest_exit(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	write_seqcount_begin(&vtime->seqcount);
	vtime_account_guest(tsk, vtime);
	tsk->flags &= ~PF_VCPU;
	vtime->state = VTIME_SYS;
	write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);

void vtime_account_idle(struct task_struct *tsk)
{
	account_idle_time(get_vtime_delta(&tsk->vtime));
}

void vtime_task_switch_generic(struct task_struct *prev)
{
	struct vtime *vtime = &prev->vtime;

	write_seqcount_begin(&vtime->seqcount);
	if (vtime->state == VTIME_IDLE)
		vtime_account_idle(prev);
	else
		__vtime_account_kernel(prev, vtime);
	vtime->state = VTIME_INACTIVE;
	vtime->cpu = -1;
	write_seqcount_end(&vtime->seqcount);

	vtime = &current->vtime;

	write_seqcount_begin(&vtime->seqcount);
	if (is_idle_task(current))
		vtime->state = VTIME_IDLE;
	else if (current->flags & PF_VCPU)
		vtime->state = VTIME_GUEST;
	else
		vtime->state = VTIME_SYS;
	vtime->starttime = sched_clock();
	vtime->cpu = smp_processor_id();
	write_seqcount_end(&vtime->seqcount);
}

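/*
 * For illustration, the per-task vtime state machine driven by the helpers
 * above (the state lives in vtime->state; every transition happens under
 * vtime->seqcount so readers always see a consistent state/starttime pair):
 *
 *	VTIME_INACTIVE --switch-in-->  VTIME_IDLE / VTIME_SYS / VTIME_GUEST
 *	VTIME_SYS   --vtime_user_enter-->  VTIME_USER
 *	VTIME_USER  --vtime_user_exit-->   VTIME_SYS
 *	VTIME_SYS   --vtime_guest_enter--> VTIME_GUEST
 *	VTIME_GUEST --vtime_guest_exit-->  VTIME_SYS
 *	any state   --switch-out-->  VTIME_INACTIVE (pending time flushed)
 */
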
void vtime_init_idle(struct task_struct *t, int cpu)
{
	struct vtime *vtime = &t->vtime;
	unsigned long flags;

	local_irq_save(flags);
	write_seqcount_begin(&vtime->seqcount);
	vtime->state = VTIME_IDLE;
	vtime->starttime = sched_clock();
	vtime->cpu = cpu;
	write_seqcount_end(&vtime->seqcount);
	local_irq_restore(flags);
}

u64 task_gtime(struct task_struct *t)
{
	struct vtime *vtime = &t->vtime;
	unsigned int seq;
	u64 gtime;

	if (!vtime_accounting_enabled())
		return t->gtime;

	do {
		seq = read_seqcount_begin(&vtime->seqcount);

		gtime = t->gtime;
		if (vtime->state == VTIME_GUEST)
			gtime += vtime->gtime + vtime_delta(vtime);

	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return gtime;
}

/*
 * Fetch cputime raw values from fields of task_struct and
 * add up the pending nohz execution time since the last
 * cputime snapshot.
 */
bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
	struct vtime *vtime = &t->vtime;
	unsigned int seq;
	u64 delta;
	int ret;

	if (!vtime_accounting_enabled()) {
		*utime = t->utime;
		*stime = t->stime;
		return false;
	}

	do {
		ret = false;
		seq = read_seqcount_begin(&vtime->seqcount);

		*utime = t->utime;
		*stime = t->stime;

		/* Task is sleeping or idle, nothing to add */
		if (vtime->state < VTIME_SYS)
			continue;

		ret = true;
		delta = vtime_delta(vtime);

		/*
		 * Task runs either in user (including guest) or kernel space,
		 * add pending nohz time to the right place.
		 */
		if (vtime->state == VTIME_SYS)
			*stime += vtime->stime + delta;
		else
			*utime += vtime->utime + delta;
	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return ret;
}

static int vtime_state_fetch(struct vtime *vtime, int cpu)
{
	int state = READ_ONCE(vtime->state);

	/*
	 * We raced against a context switch, fetch the
	 * kcpustat task again.
	 */
	if (vtime->cpu != cpu && vtime->cpu != -1)
		return -EAGAIN;

	/*
	 * Two possible things here:
	 * 1) We are seeing the scheduling out task (prev) or any past one.
	 * 2) We are seeing the scheduling in task (next) but it hasn't
	 *    passed through vtime_task_switch() yet so the pending
	 *    cputime of the prev task may not be flushed yet.
	 *
	 * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
	 */
	if (state == VTIME_INACTIVE)
		return -EAGAIN;

	return state;
}

static u64 kcpustat_user_vtime(struct vtime *vtime)
{
	if (vtime->state == VTIME_USER)
		return vtime->utime + vtime_delta(vtime);
	else if (vtime->state == VTIME_GUEST)
		return vtime->gtime + vtime_delta(vtime);
	return 0;
}

static int kcpustat_field_vtime(u64 *cpustat,
				struct task_struct *tsk,
				enum cpu_usage_stat usage,
				int cpu, u64 *val)
{
	struct vtime *vtime = &tsk->vtime;
	unsigned int seq;

	do {
		int state;

		seq = read_seqcount_begin(&vtime->seqcount);

		state = vtime_state_fetch(vtime, cpu);
		if (state < 0)
			return state;

		*val = cpustat[usage];

		/*
		 * Nice vs unnice cputime accounting may be inaccurate if
		 * the nice value has changed since the last vtime update.
		 * But a proper fix would involve interrupting the target on
		 * nice updates, which is a no-go on nohz_full (although the
		 * scheduler may still interrupt the target if rescheduling
		 * is needed...)
		 */
		switch (usage) {
		case CPUTIME_SYSTEM:
			if (state == VTIME_SYS)
				*val += vtime->stime + vtime_delta(vtime);
			break;
		case CPUTIME_USER:
			if (task_nice(tsk) <= 0)
				*val += kcpustat_user_vtime(vtime);
			break;
		case CPUTIME_NICE:
			if (task_nice(tsk) > 0)
				*val += kcpustat_user_vtime(vtime);
			break;
		case CPUTIME_GUEST:
			if (state == VTIME_GUEST && task_nice(tsk) <= 0)
				*val += vtime->gtime + vtime_delta(vtime);
			break;
		case CPUTIME_GUEST_NICE:
			if (state == VTIME_GUEST && task_nice(tsk) > 0)
				*val += vtime->gtime + vtime_delta(vtime);
			break;
		default:
			break;
		}
	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return 0;
}

u64 kcpustat_field(struct kernel_cpustat *kcpustat,
		   enum cpu_usage_stat usage, int cpu)
{
	u64 *cpustat = kcpustat->cpustat;
	u64 val = cpustat[usage];
	struct rq *rq;
	int err;

	if (!vtime_accounting_enabled_cpu(cpu))
		return val;

	rq = cpu_rq(cpu);

	for (;;) {
		struct task_struct *curr;

		rcu_read_lock();
		curr = rcu_dereference(rq->curr);
		if (WARN_ON_ONCE(!curr)) {
			rcu_read_unlock();
			return cpustat[usage];
		}

		err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
		rcu_read_unlock();

		if (!err)
			return val;

		cpu_relax();
	}
}
EXPORT_SYMBOL_GPL(kcpustat_field);

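/*
 * For illustration, a reader that wants nohz_full-accurate per-CPU numbers
 * (e.g. a /proc/stat style consumer) is expected to go through these
 * accessors rather than reading kcpustat_cpu(cpu) directly, roughly:
 *
 *	u64 idle = kcpustat_field(&kcpustat_cpu(cpu), CPUTIME_IDLE, cpu);
 *
 * or, for the whole array at once, kcpustat_cpu_fetch() below. On a CPU
 * without generic vtime accounting both simply return the raw snapshot.
 */
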
static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
				    const struct kernel_cpustat *src,
				    struct task_struct *tsk, int cpu)
{
	struct vtime *vtime = &tsk->vtime;
	unsigned int seq;

	do {
		u64 *cpustat;
		u64 delta;
		int state;

		seq = read_seqcount_begin(&vtime->seqcount);

		state = vtime_state_fetch(vtime, cpu);
		if (state < 0)
			return state;

		*dst = *src;
		cpustat = dst->cpustat;

		/* Task is sleeping, dead or idle, nothing to add */
		if (state < VTIME_SYS)
			continue;

		delta = vtime_delta(vtime);

		/*
		 * Task runs either in user (including guest) or kernel space,
		 * add pending nohz time to the right place.
		 */
		if (state == VTIME_SYS) {
			cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
		} else if (state == VTIME_USER) {
			if (task_nice(tsk) > 0)
				cpustat[CPUTIME_NICE] += vtime->utime + delta;
			else
				cpustat[CPUTIME_USER] += vtime->utime + delta;
		} else {
			WARN_ON_ONCE(state != VTIME_GUEST);
			if (task_nice(tsk) > 0) {
				cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
				cpustat[CPUTIME_NICE] += vtime->gtime + delta;
			} else {
				cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
				cpustat[CPUTIME_USER] += vtime->gtime + delta;
			}
		}
	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return 0;
}

void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
{
	const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
	struct rq *rq;
	int err;

	if (!vtime_accounting_enabled_cpu(cpu)) {
		*dst = *src;
		return;
	}

	rq = cpu_rq(cpu);

	for (;;) {
		struct task_struct *curr;

		rcu_read_lock();
		curr = rcu_dereference(rq->curr);
		if (WARN_ON_ONCE(!curr)) {
			rcu_read_unlock();
			*dst = *src;
			return;
		}

		err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
		rcu_read_unlock();

		if (!err)
			return;

		cpu_relax();
	}
}
EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);

#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */