xref: /openbmc/linux/kernel/sched/cputime.c (revision e657c18a)
1 /*
2  * Simple CPU accounting cgroup controller
3  */
4 #include "sched.h"
5 
6 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
7 
8 /*
9  * There are no locks covering percpu hardirq/softirq time.
10  * They are only modified in vtime_account, on corresponding CPU
11  * with interrupts disabled. So, writes are safe.
12  * They are read and saved off onto struct rq in update_rq_clock().
13  * This may result in other CPU reading this CPU's irq time and can
14  * race with irq/vtime_account on this CPU. We would either get old
15  * or new value with a side effect of accounting a slice of irq time to wrong
16  * task when irq is in progress while we read rq->clock. That is a worthy
17  * compromise in place of having locks on each irq in account_system_time.
18  */
19 DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
20 
21 static int sched_clock_irqtime;
22 
23 void enable_sched_clock_irqtime(void)
24 {
25 	sched_clock_irqtime = 1;
26 }
27 
28 void disable_sched_clock_irqtime(void)
29 {
30 	sched_clock_irqtime = 0;
31 }
32 
33 static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
34 				  enum cpu_usage_stat idx)
35 {
36 	u64 *cpustat = kcpustat_this_cpu->cpustat;
37 
38 	u64_stats_update_begin(&irqtime->sync);
39 	cpustat[idx] += delta;
40 	irqtime->total += delta;
41 	irqtime->tick_delta += delta;
42 	u64_stats_update_end(&irqtime->sync);
43 }
44 
45 /*
46  * Called before incrementing preempt_count on {soft,}irq_enter
47  * and before decrementing preempt_count on {soft,}irq_exit.
48  */
49 void irqtime_account_irq(struct task_struct *curr)
50 {
51 	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
52 	s64 delta;
53 	int cpu;
54 
55 	if (!sched_clock_irqtime)
56 		return;
57 
58 	cpu = smp_processor_id();
59 	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
60 	irqtime->irq_start_time += delta;
61 
62 	/*
63 	 * We do not account for softirq time from ksoftirqd here.
64 	 * We want to continue accounting softirq time to ksoftirqd thread
65 	 * in that case, so as not to confuse scheduler with a special task
66 	 * that do not consume any time, but still wants to run.
67 	 */
68 	if (hardirq_count())
69 		irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
70 	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
71 		irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
72 }
73 EXPORT_SYMBOL_GPL(irqtime_account_irq);
74 
75 static u64 irqtime_tick_accounted(u64 maxtime)
76 {
77 	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
78 	u64 delta;
79 
80 	delta = min(irqtime->tick_delta, maxtime);
81 	irqtime->tick_delta -= delta;
82 
83 	return delta;
84 }
85 
86 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
87 
88 #define sched_clock_irqtime	(0)
89 
90 static u64 irqtime_tick_accounted(u64 dummy)
91 {
92 	return 0;
93 }
94 
95 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
96 
97 static inline void task_group_account_field(struct task_struct *p, int index,
98 					    u64 tmp)
99 {
100 	/*
101 	 * Since all updates are sure to touch the root cgroup, we
102 	 * get ourselves ahead and touch it first. If the root cgroup
103 	 * is the only cgroup, then nothing else should be necessary.
104 	 *
105 	 */
106 	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
107 
108 	cgroup_account_cputime_field(p, index, tmp);
109 }
110 
111 /*
112  * Account user CPU time to a process.
113  * @p: the process that the CPU time gets accounted to
114  * @cputime: the CPU time spent in user space since the last update
115  */
116 void account_user_time(struct task_struct *p, u64 cputime)
117 {
118 	int index;
119 
120 	/* Add user time to process. */
121 	p->utime += cputime;
122 	account_group_user_time(p, cputime);
123 
124 	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
125 
126 	/* Add user time to cpustat. */
127 	task_group_account_field(p, index, cputime);
128 
129 	/* Account for user time used */
130 	acct_account_cputime(p);
131 }
132 
133 /*
134  * Account guest CPU time to a process.
135  * @p: the process that the CPU time gets accounted to
136  * @cputime: the CPU time spent in virtual machine since the last update
137  */
138 void account_guest_time(struct task_struct *p, u64 cputime)
139 {
140 	u64 *cpustat = kcpustat_this_cpu->cpustat;
141 
142 	/* Add guest time to process. */
143 	p->utime += cputime;
144 	account_group_user_time(p, cputime);
145 	p->gtime += cputime;
146 
147 	/* Add guest time to cpustat. */
148 	if (task_nice(p) > 0) {
149 		cpustat[CPUTIME_NICE] += cputime;
150 		cpustat[CPUTIME_GUEST_NICE] += cputime;
151 	} else {
152 		cpustat[CPUTIME_USER] += cputime;
153 		cpustat[CPUTIME_GUEST] += cputime;
154 	}
155 }
156 
157 /*
158  * Account system CPU time to a process and desired cpustat field
159  * @p: the process that the CPU time gets accounted to
160  * @cputime: the CPU time spent in kernel space since the last update
161  * @index: pointer to cpustat field that has to be updated
162  */
163 void account_system_index_time(struct task_struct *p,
164 			       u64 cputime, enum cpu_usage_stat index)
165 {
166 	/* Add system time to process. */
167 	p->stime += cputime;
168 	account_group_system_time(p, cputime);
169 
170 	/* Add system time to cpustat. */
171 	task_group_account_field(p, index, cputime);
172 
173 	/* Account for system time used */
174 	acct_account_cputime(p);
175 }
176 
177 /*
178  * Account system CPU time to a process.
179  * @p: the process that the CPU time gets accounted to
180  * @hardirq_offset: the offset to subtract from hardirq_count()
181  * @cputime: the CPU time spent in kernel space since the last update
182  */
183 void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
184 {
185 	int index;
186 
187 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
188 		account_guest_time(p, cputime);
189 		return;
190 	}
191 
192 	if (hardirq_count() - hardirq_offset)
193 		index = CPUTIME_IRQ;
194 	else if (in_serving_softirq())
195 		index = CPUTIME_SOFTIRQ;
196 	else
197 		index = CPUTIME_SYSTEM;
198 
199 	account_system_index_time(p, cputime, index);
200 }
201 
202 /*
203  * Account for involuntary wait time.
204  * @cputime: the CPU time spent in involuntary wait
205  */
206 void account_steal_time(u64 cputime)
207 {
208 	u64 *cpustat = kcpustat_this_cpu->cpustat;
209 
210 	cpustat[CPUTIME_STEAL] += cputime;
211 }
212 
213 /*
214  * Account for idle time.
215  * @cputime: the CPU time spent in idle wait
216  */
217 void account_idle_time(u64 cputime)
218 {
219 	u64 *cpustat = kcpustat_this_cpu->cpustat;
220 	struct rq *rq = this_rq();
221 
222 	if (atomic_read(&rq->nr_iowait) > 0)
223 		cpustat[CPUTIME_IOWAIT] += cputime;
224 	else
225 		cpustat[CPUTIME_IDLE] += cputime;
226 }
227 
228 /*
229  * When a guest is interrupted for a longer amount of time, missed clock
230  * ticks are not redelivered later. Due to that, this function may on
231  * occasion account more time than the calling functions think elapsed.
232  */
233 static __always_inline u64 steal_account_process_time(u64 maxtime)
234 {
235 #ifdef CONFIG_PARAVIRT
236 	if (static_key_false(&paravirt_steal_enabled)) {
237 		u64 steal;
238 
239 		steal = paravirt_steal_clock(smp_processor_id());
240 		steal -= this_rq()->prev_steal_time;
241 		steal = min(steal, maxtime);
242 		account_steal_time(steal);
243 		this_rq()->prev_steal_time += steal;
244 
245 		return steal;
246 	}
247 #endif
248 	return 0;
249 }
250 
251 /*
252  * Account how much elapsed time was spent in steal, irq, or softirq time.
253  */
254 static inline u64 account_other_time(u64 max)
255 {
256 	u64 accounted;
257 
258 	lockdep_assert_irqs_disabled();
259 
260 	accounted = steal_account_process_time(max);
261 
262 	if (accounted < max)
263 		accounted += irqtime_tick_accounted(max - accounted);
264 
265 	return accounted;
266 }
267 
268 #ifdef CONFIG_64BIT
269 static inline u64 read_sum_exec_runtime(struct task_struct *t)
270 {
271 	return t->se.sum_exec_runtime;
272 }
273 #else
274 static u64 read_sum_exec_runtime(struct task_struct *t)
275 {
276 	u64 ns;
277 	struct rq_flags rf;
278 	struct rq *rq;
279 
280 	rq = task_rq_lock(t, &rf);
281 	ns = t->se.sum_exec_runtime;
282 	task_rq_unlock(rq, t, &rf);
283 
284 	return ns;
285 }
286 #endif
287 
288 /*
289  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
290  * tasks (sum on group iteration) belonging to @tsk's group.
291  */
292 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
293 {
294 	struct signal_struct *sig = tsk->signal;
295 	u64 utime, stime;
296 	struct task_struct *t;
297 	unsigned int seq, nextseq;
298 	unsigned long flags;
299 
300 	/*
301 	 * Update current task runtime to account pending time since last
302 	 * scheduler action or thread_group_cputime() call. This thread group
303 	 * might have other running tasks on different CPUs, but updating
304 	 * their runtime can affect syscall performance, so we skip account
305 	 * those pending times and rely only on values updated on tick or
306 	 * other scheduler action.
307 	 */
308 	if (same_thread_group(current, tsk))
309 		(void) task_sched_runtime(current);
310 
311 	rcu_read_lock();
312 	/* Attempt a lockless read on the first round. */
313 	nextseq = 0;
314 	do {
315 		seq = nextseq;
316 		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
317 		times->utime = sig->utime;
318 		times->stime = sig->stime;
319 		times->sum_exec_runtime = sig->sum_sched_runtime;
320 
321 		for_each_thread(tsk, t) {
322 			task_cputime(t, &utime, &stime);
323 			times->utime += utime;
324 			times->stime += stime;
325 			times->sum_exec_runtime += read_sum_exec_runtime(t);
326 		}
327 		/* If lockless access failed, take the lock. */
328 		nextseq = 1;
329 	} while (need_seqretry(&sig->stats_lock, seq));
330 	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
331 	rcu_read_unlock();
332 }
333 
334 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
335 /*
336  * Account a tick to a process and cpustat
337  * @p: the process that the CPU time gets accounted to
338  * @user_tick: is the tick from userspace
339  * @rq: the pointer to rq
340  *
341  * Tick demultiplexing follows the order
342  * - pending hardirq update
343  * - pending softirq update
344  * - user_time
345  * - idle_time
346  * - system time
347  *   - check for guest_time
348  *   - else account as system_time
349  *
350  * Check for hardirq is done both for system and user time as there is
351  * no timer going off while we are on hardirq and hence we may never get an
352  * opportunity to update it solely in system time.
353  * p->stime and friends are only updated on system time and not on irq
354  * softirq as those do not count in task exec_runtime any more.
355  */
356 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
357 					 struct rq *rq, int ticks)
358 {
359 	u64 other, cputime = TICK_NSEC * ticks;
360 
361 	/*
362 	 * When returning from idle, many ticks can get accounted at
363 	 * once, including some ticks of steal, irq, and softirq time.
364 	 * Subtract those ticks from the amount of time accounted to
365 	 * idle, or potentially user or system time. Due to rounding,
366 	 * other time can exceed ticks occasionally.
367 	 */
368 	other = account_other_time(ULONG_MAX);
369 	if (other >= cputime)
370 		return;
371 
372 	cputime -= other;
373 
374 	if (this_cpu_ksoftirqd() == p) {
375 		/*
376 		 * ksoftirqd time do not get accounted in cpu_softirq_time.
377 		 * So, we have to handle it separately here.
378 		 * Also, p->stime needs to be updated for ksoftirqd.
379 		 */
380 		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
381 	} else if (user_tick) {
382 		account_user_time(p, cputime);
383 	} else if (p == rq->idle) {
384 		account_idle_time(cputime);
385 	} else if (p->flags & PF_VCPU) { /* System time or guest time */
386 		account_guest_time(p, cputime);
387 	} else {
388 		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
389 	}
390 }
391 
392 static void irqtime_account_idle_ticks(int ticks)
393 {
394 	struct rq *rq = this_rq();
395 
396 	irqtime_account_process_tick(current, 0, rq, ticks);
397 }
398 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
/* No-op stub used when CONFIG_IRQ_TIME_ACCOUNTING is not set. */
static inline void irqtime_account_idle_ticks(int ticks)
{
}
/* No-op stub used when CONFIG_IRQ_TIME_ACCOUNTING is not set. */
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
						struct rq *rq, int nr_ticks)
{
}
402 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
403 
404 /*
405  * Use precise platform statistics if available:
406  */
407 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
408 # ifndef __ARCH_HAS_VTIME_TASK_SWITCH
/*
 * Generic task-switch hook: account the time pending for @prev (as idle
 * time for an idle task, as system time otherwise), flush it, then hand
 * over to arch_vtime_task_switch().
 */
void vtime_common_task_switch(struct task_struct *prev)
{
	if (!is_idle_task(prev))
		vtime_account_system(prev);
	else
		vtime_account_idle(prev);

	vtime_flush(prev);
	arch_vtime_task_switch(prev);
}
419 # endif
420 #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
421 
422 
423 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
 * Archs that account the whole time spent in the idle task
 * (outside irq) as idle time can rely on this and just implement
 * vtime_account_system() and vtime_account_idle(). Archs that
 * have another meaning of the idle time (s390 only includes the
 * time spent by the CPU when it's in low power mode) must define
 * __ARCH_HAS_VTIME_ACCOUNT and override vtime_account_irq_enter().
 */
432 #ifndef __ARCH_HAS_VTIME_ACCOUNT
/*
 * Account the time pending for @tsk on irq entry: idle time when the
 * idle task is running outside of interrupt context, system time in
 * every other case.
 */
void vtime_account_irq_enter(struct task_struct *tsk)
{
	if (in_interrupt() || !is_idle_task(tsk))
		vtime_account_system(tsk);
	else
		vtime_account_idle(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
441 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
442 
443 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
444 		    u64 *ut, u64 *st)
445 {
446 	*ut = curr->utime;
447 	*st = curr->stime;
448 }
449 
450 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
451 {
452 	*ut = p->utime;
453 	*st = p->stime;
454 }
455 EXPORT_SYMBOL_GPL(task_cputime_adjusted);
456 
457 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
458 {
459 	struct task_cputime cputime;
460 
461 	thread_group_cputime(p, &cputime);
462 
463 	*ut = cputime.utime;
464 	*st = cputime.stime;
465 }
466 
467 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
468 
469 /*
470  * Account a single tick of CPU time.
471  * @p: the process that the CPU time gets accounted to
472  * @user_tick: indicates if the tick is a user or a system tick
473  */
474 void account_process_tick(struct task_struct *p, int user_tick)
475 {
476 	u64 cputime, steal;
477 	struct rq *rq = this_rq();
478 
479 	if (vtime_accounting_cpu_enabled())
480 		return;
481 
482 	if (sched_clock_irqtime) {
483 		irqtime_account_process_tick(p, user_tick, rq, 1);
484 		return;
485 	}
486 
487 	cputime = TICK_NSEC;
488 	steal = steal_account_process_time(ULONG_MAX);
489 
490 	if (steal >= cputime)
491 		return;
492 
493 	cputime -= steal;
494 
495 	if (user_tick)
496 		account_user_time(p, cputime);
497 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
498 		account_system_time(p, HARDIRQ_OFFSET, cputime);
499 	else
500 		account_idle_time(cputime);
501 }
502 
503 /*
504  * Account multiple ticks of idle time.
505  * @ticks: number of stolen ticks
506  */
507 void account_idle_ticks(unsigned long ticks)
508 {
509 	u64 cputime, steal;
510 
511 	if (sched_clock_irqtime) {
512 		irqtime_account_idle_ticks(ticks);
513 		return;
514 	}
515 
516 	cputime = ticks * TICK_NSEC;
517 	steal = steal_account_process_time(ULONG_MAX);
518 
519 	if (steal >= cputime)
520 		return;
521 
522 	cputime -= steal;
523 	account_idle_time(cputime);
524 }
525 
526 /*
527  * Perform (stime * rtime) / total, but avoid multiplication overflow by
528  * losing precision when the numbers are big.
529  */
530 static u64 scale_stime(u64 stime, u64 rtime, u64 total)
531 {
532 	u64 scaled;
533 
534 	for (;;) {
535 		/* Make sure "rtime" is the bigger of stime/rtime */
536 		if (stime > rtime)
537 			swap(rtime, stime);
538 
539 		/* Make sure 'total' fits in 32 bits */
540 		if (total >> 32)
541 			goto drop_precision;
542 
543 		/* Does rtime (and thus stime) fit in 32 bits? */
544 		if (!(rtime >> 32))
545 			break;
546 
547 		/* Can we just balance rtime/stime rather than dropping bits? */
548 		if (stime >> 31)
549 			goto drop_precision;
550 
551 		/* We can grow stime and shrink rtime and try to make them both fit */
552 		stime <<= 1;
553 		rtime >>= 1;
554 		continue;
555 
556 drop_precision:
557 		/* We drop from rtime, it has more bits than stime */
558 		rtime >>= 1;
559 		total >>= 1;
560 	}
561 
562 	/*
563 	 * Make sure gcc understands that this is a 32x32->64 multiply,
564 	 * followed by a 64/32->64 divide.
565 	 */
566 	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
567 	return scaled;
568 }
569 
570 /*
571  * Adjust tick based cputime random precision against scheduler runtime
572  * accounting.
573  *
574  * Tick based cputime accounting depend on random scheduling timeslices of a
575  * task to be interrupted or not by the timer.  Depending on these
576  * circumstances, the number of these interrupts may be over or
577  * under-optimistic, matching the real user and system cputime with a variable
578  * precision.
579  *
580  * Fix this by scaling these tick based values against the total runtime
581  * accounted by the CFS scheduler.
582  *
583  * This code provides the following guarantees:
584  *
585  *   stime + utime == rtime
586  *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
587  *
588  * Assuming that rtime_i+1 >= rtime_i.
589  */
590 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
591 		    u64 *ut, u64 *st)
592 {
593 	u64 rtime, stime, utime;
594 	unsigned long flags;
595 
596 	/* Serialize concurrent callers such that we can honour our guarantees */
597 	raw_spin_lock_irqsave(&prev->lock, flags);
598 	rtime = curr->sum_exec_runtime;
599 
600 	/*
601 	 * This is possible under two circumstances:
602 	 *  - rtime isn't monotonic after all (a bug);
603 	 *  - we got reordered by the lock.
604 	 *
605 	 * In both cases this acts as a filter such that the rest of the code
606 	 * can assume it is monotonic regardless of anything else.
607 	 */
608 	if (prev->stime + prev->utime >= rtime)
609 		goto out;
610 
611 	stime = curr->stime;
612 	utime = curr->utime;
613 
614 	/*
615 	 * If either stime or utime are 0, assume all runtime is userspace.
616 	 * Once a task gets some ticks, the monotonicy code at 'update:'
617 	 * will ensure things converge to the observed ratio.
618 	 */
619 	if (stime == 0) {
620 		utime = rtime;
621 		goto update;
622 	}
623 
624 	if (utime == 0) {
625 		stime = rtime;
626 		goto update;
627 	}
628 
629 	stime = scale_stime(stime, rtime, stime + utime);
630 
631 update:
632 	/*
633 	 * Make sure stime doesn't go backwards; this preserves monotonicity
634 	 * for utime because rtime is monotonic.
635 	 *
636 	 *  utime_i+1 = rtime_i+1 - stime_i
637 	 *            = rtime_i+1 - (rtime_i - utime_i)
638 	 *            = (rtime_i+1 - rtime_i) + utime_i
639 	 *            >= utime_i
640 	 */
641 	if (stime < prev->stime)
642 		stime = prev->stime;
643 	utime = rtime - stime;
644 
645 	/*
646 	 * Make sure utime doesn't go backwards; this still preserves
647 	 * monotonicity for stime, analogous argument to above.
648 	 */
649 	if (utime < prev->utime) {
650 		utime = prev->utime;
651 		stime = rtime - utime;
652 	}
653 
654 	prev->stime = stime;
655 	prev->utime = utime;
656 out:
657 	*ut = prev->utime;
658 	*st = prev->stime;
659 	raw_spin_unlock_irqrestore(&prev->lock, flags);
660 }
661 
662 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
663 {
664 	struct task_cputime cputime = {
665 		.sum_exec_runtime = p->se.sum_exec_runtime,
666 	};
667 
668 	task_cputime(p, &cputime.utime, &cputime.stime);
669 	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
670 }
671 EXPORT_SYMBOL_GPL(task_cputime_adjusted);
672 
673 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
674 {
675 	struct task_cputime cputime;
676 
677 	thread_group_cputime(p, &cputime);
678 	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
679 }
680 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
681 
682 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
683 static u64 vtime_delta(struct vtime *vtime)
684 {
685 	unsigned long long clock;
686 
687 	clock = sched_clock();
688 	if (clock < vtime->starttime)
689 		return 0;
690 
691 	return clock - vtime->starttime;
692 }
693 
694 static u64 get_vtime_delta(struct vtime *vtime)
695 {
696 	u64 delta = vtime_delta(vtime);
697 	u64 other;
698 
699 	/*
700 	 * Unlike tick based timing, vtime based timing never has lost
701 	 * ticks, and no need for steal time accounting to make up for
702 	 * lost ticks. Vtime accounts a rounded version of actual
703 	 * elapsed time. Limit account_other_time to prevent rounding
704 	 * errors from causing elapsed vtime to go negative.
705 	 */
706 	other = account_other_time(delta);
707 	WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
708 	vtime->starttime += delta;
709 
710 	return delta - other;
711 }
712 
713 static void __vtime_account_system(struct task_struct *tsk,
714 				   struct vtime *vtime)
715 {
716 	vtime->stime += get_vtime_delta(vtime);
717 	if (vtime->stime >= TICK_NSEC) {
718 		account_system_time(tsk, irq_count(), vtime->stime);
719 		vtime->stime = 0;
720 	}
721 }
722 
723 static void vtime_account_guest(struct task_struct *tsk,
724 				struct vtime *vtime)
725 {
726 	vtime->gtime += get_vtime_delta(vtime);
727 	if (vtime->gtime >= TICK_NSEC) {
728 		account_guest_time(tsk, vtime->gtime);
729 		vtime->gtime = 0;
730 	}
731 }
732 
733 void vtime_account_system(struct task_struct *tsk)
734 {
735 	struct vtime *vtime = &tsk->vtime;
736 
737 	if (!vtime_delta(vtime))
738 		return;
739 
740 	write_seqcount_begin(&vtime->seqcount);
741 	/* We might have scheduled out from guest path */
742 	if (current->flags & PF_VCPU)
743 		vtime_account_guest(tsk, vtime);
744 	else
745 		__vtime_account_system(tsk, vtime);
746 	write_seqcount_end(&vtime->seqcount);
747 }
748 
749 void vtime_user_enter(struct task_struct *tsk)
750 {
751 	struct vtime *vtime = &tsk->vtime;
752 
753 	write_seqcount_begin(&vtime->seqcount);
754 	__vtime_account_system(tsk, vtime);
755 	vtime->state = VTIME_USER;
756 	write_seqcount_end(&vtime->seqcount);
757 }
758 
759 void vtime_user_exit(struct task_struct *tsk)
760 {
761 	struct vtime *vtime = &tsk->vtime;
762 
763 	write_seqcount_begin(&vtime->seqcount);
764 	vtime->utime += get_vtime_delta(vtime);
765 	if (vtime->utime >= TICK_NSEC) {
766 		account_user_time(tsk, vtime->utime);
767 		vtime->utime = 0;
768 	}
769 	vtime->state = VTIME_SYS;
770 	write_seqcount_end(&vtime->seqcount);
771 }
772 
773 void vtime_guest_enter(struct task_struct *tsk)
774 {
775 	struct vtime *vtime = &tsk->vtime;
776 	/*
777 	 * The flags must be updated under the lock with
778 	 * the vtime_starttime flush and update.
779 	 * That enforces a right ordering and update sequence
780 	 * synchronization against the reader (task_gtime())
781 	 * that can thus safely catch up with a tickless delta.
782 	 */
783 	write_seqcount_begin(&vtime->seqcount);
784 	__vtime_account_system(tsk, vtime);
785 	current->flags |= PF_VCPU;
786 	write_seqcount_end(&vtime->seqcount);
787 }
788 EXPORT_SYMBOL_GPL(vtime_guest_enter);
789 
790 void vtime_guest_exit(struct task_struct *tsk)
791 {
792 	struct vtime *vtime = &tsk->vtime;
793 
794 	write_seqcount_begin(&vtime->seqcount);
795 	vtime_account_guest(tsk, vtime);
796 	current->flags &= ~PF_VCPU;
797 	write_seqcount_end(&vtime->seqcount);
798 }
799 EXPORT_SYMBOL_GPL(vtime_guest_exit);
800 
801 void vtime_account_idle(struct task_struct *tsk)
802 {
803 	account_idle_time(get_vtime_delta(&tsk->vtime));
804 }
805 
806 void arch_vtime_task_switch(struct task_struct *prev)
807 {
808 	struct vtime *vtime = &prev->vtime;
809 
810 	write_seqcount_begin(&vtime->seqcount);
811 	vtime->state = VTIME_INACTIVE;
812 	write_seqcount_end(&vtime->seqcount);
813 
814 	vtime = &current->vtime;
815 
816 	write_seqcount_begin(&vtime->seqcount);
817 	vtime->state = VTIME_SYS;
818 	vtime->starttime = sched_clock();
819 	write_seqcount_end(&vtime->seqcount);
820 }
821 
822 void vtime_init_idle(struct task_struct *t, int cpu)
823 {
824 	struct vtime *vtime = &t->vtime;
825 	unsigned long flags;
826 
827 	local_irq_save(flags);
828 	write_seqcount_begin(&vtime->seqcount);
829 	vtime->state = VTIME_SYS;
830 	vtime->starttime = sched_clock();
831 	write_seqcount_end(&vtime->seqcount);
832 	local_irq_restore(flags);
833 }
834 
835 u64 task_gtime(struct task_struct *t)
836 {
837 	struct vtime *vtime = &t->vtime;
838 	unsigned int seq;
839 	u64 gtime;
840 
841 	if (!vtime_accounting_enabled())
842 		return t->gtime;
843 
844 	do {
845 		seq = read_seqcount_begin(&vtime->seqcount);
846 
847 		gtime = t->gtime;
848 		if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
849 			gtime += vtime->gtime + vtime_delta(vtime);
850 
851 	} while (read_seqcount_retry(&vtime->seqcount, seq));
852 
853 	return gtime;
854 }
855 
856 /*
857  * Fetch cputime raw values from fields of task_struct and
858  * add up the pending nohz execution time since the last
859  * cputime snapshot.
860  */
861 void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
862 {
863 	struct vtime *vtime = &t->vtime;
864 	unsigned int seq;
865 	u64 delta;
866 
867 	if (!vtime_accounting_enabled()) {
868 		*utime = t->utime;
869 		*stime = t->stime;
870 		return;
871 	}
872 
873 	do {
874 		seq = read_seqcount_begin(&vtime->seqcount);
875 
876 		*utime = t->utime;
877 		*stime = t->stime;
878 
879 		/* Task is sleeping, nothing to add */
880 		if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
881 			continue;
882 
883 		delta = vtime_delta(vtime);
884 
885 		/*
886 		 * Task runs either in user or kernel space, add pending nohz time to
887 		 * the right place.
888 		 */
889 		if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
890 			*utime += vtime->utime + delta;
891 		else if (vtime->state == VTIME_SYS)
892 			*stime += vtime->stime + delta;
893 	} while (read_seqcount_retry(&vtime->seqcount, seq));
894 }
895 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
896