// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

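/*
 * rstat locking is two level: cgroup_rstat_lock serializes flushers
 * system-wide, while each CPU's updated-cgroups tree is protected by its
 * own cgroup_rstat_cpu_lock so that hot-path updates only contend per
 * cpu.  The per-cpu locks are raw spinlocks because updates arrive from
 * scheduler paths and must keep spinning (with interrupts off) even on
 * PREEMPT_RT, where spinlock_t sleeps.
 */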
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
 * rstat_cpu->updated_children list.  See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	unsigned long flags;

	/*
	 * Speculative already-on-list test. This may race leading to
	 * temporary inaccuracies, which is fine.
	 *
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @cgrp is on the list by
	 * testing the next pointer for NULL.
	 */
	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* put @cgrp and all ancestors on the corresponding updated lists */
	while (true) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
		struct cgroup *parent = cgroup_parent(cgrp);
		struct cgroup_rstat_cpu *prstatc;

		/*
		 * Both additions and removals are bottom-up.  If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		/* Root has no parent to link it to, but mark it busy */
		if (!parent) {
			rstatc->updated_next = cgrp;
			break;
		}

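		/*
		 * Link @cgrp at the head of the parent's singly linked
		 * updated_children list.
		 */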
		prstatc = cgroup_rstat_cpu(parent, cpu);
		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = cgrp;

		cgrp = parent;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
 * the traversal and %NULL return indicates the end.  During traversal,
 * each returned cgroup is unlinked from the tree.  Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
						   struct cgroup *root, int cpu)
{
	struct cgroup_rstat_cpu *rstatc;
	struct cgroup *parent;

	if (pos == root)
		return NULL;

	/*
	 * We're going to walk down to the first leaf and visit/remove it.
	 * We can pick any unvisited node as the starting point.
	 */
	if (!pos) {
		pos = root;
		/* return NULL if this subtree is not on-list */
		if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
			return NULL;
	} else {
		pos = cgroup_parent(pos);
	}

	/* walk down to the first leaf */
	while (true) {
		rstatc = cgroup_rstat_cpu(pos, cpu);
		if (rstatc->updated_children == pos)
			break;
		pos = rstatc->updated_children;
	}

	/*
	 * Unlink @pos from the tree.  As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 * However, due to the way we traverse, @pos will be the first
	 * child in most cases. The only exception is @root.
	 */
	parent = cgroup_parent(pos);
	if (parent) {
		struct cgroup_rstat_cpu *prstatc;
		struct cgroup **nextp;

		prstatc = cgroup_rstat_cpu(parent, cpu);
		nextp = &prstatc->updated_children;
		while (*nextp != pos) {
			struct cgroup_rstat_cpu *nrstatc;

			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
			WARN_ON_ONCE(*nextp == parent);
			nextp = &nrstatc->updated_next;
		}
		*nextp = rstatc->updated_next;
	}

	rstatc->updated_next = NULL;
	return pos;
}

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 *
 * __diag_* below are needed to dismiss the missing prototype warning.
 */
__diag_push();
__diag_ignore_all("-Wmissing-prototypes",
		  "kfuncs which will be used in BPF programs");

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
				     struct cgroup *parent, int cpu)
{
}

__diag_pop();

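/*
 * A minimal sketch (not part of this file) of a collector attaching to
 * the hook above with fentry; all names other than bpf_rstat_flush()
 * itself are illustrative:
 *
 *	SEC("fentry/bpf_rstat_flush")
 *	int BPF_PROG(collector_flush, struct cgroup *cgrp,
 *		     struct cgroup *parent, int cpu)
 *	{
 *		// fold pending per-cpu counters for @cgrp into its
 *		// totals, propagating into @parent as needed
 *		return 0;
 *	}
 *
 * Updaters call the cgroup_rstat_updated() kfunc as stats change; readers
 * call the cgroup_rstat_flush() kfunc (both registered at the bottom of
 * this file) before consuming the totals.
 */
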
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
	int cpu;

	lockdep_assert_held(&cgroup_rstat_lock);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
						       cpu);
		struct cgroup *pos = NULL;
		unsigned long flags;

		/*
		 * The _irqsave() is needed because cgroup_rstat_lock is
		 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
		 * this lock with the _irq() suffix only disables interrupts on
		 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
		 * interrupts on both configurations. The _irqsave() ensures
		 * that interrupts are always disabled and later restored.
		 */
		raw_spin_lock_irqsave(cpu_lock, flags);
		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
			struct cgroup_subsys_state *css;

			cgroup_base_stat_flush(pos, cpu);
			bpf_rstat_flush(pos, cgroup_parent(pos), cpu);

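			/*
			 * Flush subsystem stats: only css's whose subsystem
			 * implements css_rstat_flush() are on the
			 * RCU-protected rstat_css_list.
			 */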
			rcu_read_lock();
			list_for_each_entry_rcu(css, &pos->rstat_css_list,
						rstat_css_node)
				css->ss->css_rstat_flush(css, cpu);
			rcu_read_unlock();
		}
		raw_spin_unlock_irqrestore(cpu_lock, flags);

		/* play nice and yield if necessary */
		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
			spin_unlock_irq(&cgroup_rstat_lock);
			if (!cond_resched())
				cpu_relax();
			spin_lock_irq(&cgroup_rstat_lock);
		}
	}
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
	might_sleep();

	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
	spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
	__acquires(&cgroup_rstat_lock)
{
	might_sleep();
	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
	__releases(&cgroup_rstat_lock)
{
	spin_unlock_irq(&cgroup_rstat_lock);
}
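
/*
 * Typical hold/release usage, as in cgroup_base_stat_cputime_show() below:
 *
 *	cgroup_rstat_flush_hold(cgrp);
 *	... read cgrp->bstat while further flushes are excluded ...
 *	cgroup_rstat_flush_release();
 */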
2676162cef0STejun Heo 
int cgroup_rstat_init(struct cgroup *cgrp)
{
	int cpu;

	/* the root cgrp has rstat_cpu preallocated */
	if (!cgrp->rstat_cpu) {
		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
		if (!cgrp->rstat_cpu)
			return -ENOMEM;
	}

	/* ->updated_children list is self terminated */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		rstatc->updated_children = cgrp;
		u64_stats_init(&rstatc->bsync);
	}

	return 0;
}

void cgroup_rstat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_rstat_flush(cgrp);

	/* sanity check */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
		    WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	free_percpu(cgrp->rstat_cpu);
	cgrp->rstat_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_rstat_cpu *prstatc;
	struct cgroup_base_stat delta;
	unsigned seq;

	/* Root-level stats are sourced from system-wide CPU stats */
	if (!parent)
		return;

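	/*
	 * Flushing is delta based: compare the current per-cpu values
	 * against the snapshot taken at the last flush (last_bstat and
	 * friends), fold the difference into the running totals, then
	 * advance the snapshot so each increment is counted exactly once
	 * no matter how often flushes run.
	 */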
	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatc->bsync);
		delta = rstatc->bstat;
	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
	cgroup_base_stat_add(&cgrp->bstat, &delta);
	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
	cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);

	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
	if (cgroup_parent(parent)) {
		delta = cgrp->bstat;
		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
		cgroup_base_stat_add(&parent->bstat, &delta);
		cgroup_base_stat_add(&cgrp->last_bstat, &delta);

		delta = rstatc->subtree_bstat;
		prstatc = cgroup_rstat_cpu(parent, cpu);
		cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
		cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
		cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
	}
}

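/*
 * bstat accounting runs on scheduler hot paths.  get_cpu_ptr() pins the
 * current CPU, and bsync (a u64_stats_sync) lets the flusher read a
 * consistent 64-bit snapshot on 32-bit machines where these updates
 * aren't atomic.
 */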
static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
	*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
	return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_cpu *rstatc,
						 unsigned long flags)
{
	u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
	cgroup_rstat_updated(cgrp, smp_processor_id());
	put_cpu_ptr(rstatc);
}

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

	switch (index) {
	case CPUTIME_USER:
	case CPUTIME_NICE:
		rstatc->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		rstatc->bstat.cputime.stime += delta_exec;
		break;
#ifdef CONFIG_SCHED_CORE
	case CPUTIME_FORCEIDLE:
		rstatc->bstat.forceidle_sum += delta_exec;
		break;
#endif
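	/* remaining cputime classes aren't accounted per cgroup here */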
	default:
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

/*
 * Compute the cputime for the root cgroup by fetching the system-wide
 * per-cpu data, then categorizing the fields the same way
 * __cgroup_account_cputime_field() does for cpu time attributed to a
 * cgroup.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
	struct task_cputime *cputime = &bstat->cputime;
	int i;

	memset(bstat, 0, sizeof(*bstat));
	for_each_possible_cpu(i) {
		struct kernel_cpustat kcpustat;
		u64 *cpustat = kcpustat.cpustat;
		u64 user = 0;
		u64 sys = 0;

		kcpustat_cpu_fetch(&kcpustat, i);

		user += cpustat[CPUTIME_USER];
		user += cpustat[CPUTIME_NICE];
		cputime->utime += user;

		sys += cpustat[CPUTIME_SYSTEM];
		sys += cpustat[CPUTIME_IRQ];
		sys += cpustat[CPUTIME_SOFTIRQ];
		cputime->stime += sys;

		cputime->sum_exec_runtime += user;
		cputime->sum_exec_runtime += sys;

#ifdef CONFIG_SCHED_CORE
		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
	}
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	u64 usage, utime, stime;
	struct cgroup_base_stat bstat;
#ifdef CONFIG_SCHED_CORE
	u64 forceidle_time;
#endif

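	/*
	 * Non-root cgroups report flushed rstat totals; the root cgroup's
	 * cputime is assembled from the system-wide counters instead.
	 */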
	if (cgroup_parent(cgrp)) {
		cgroup_rstat_flush_hold(cgrp);
		usage = cgrp->bstat.cputime.sum_exec_runtime;
		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
			       &utime, &stime);
#ifdef CONFIG_SCHED_CORE
		forceidle_time = cgrp->bstat.forceidle_sum;
#endif
		cgroup_rstat_flush_release();
	} else {
		root_cgroup_cputime(&bstat);
		usage = bstat.cputime.sum_exec_runtime;
		utime = bstat.cputime.utime;
		stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
		forceidle_time = bstat.forceidle_sum;
#endif
	}

	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
	do_div(forceidle_time, NSEC_PER_USEC);
#endif

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);

#ifdef CONFIG_SCHED_CORE
	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}


/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
BTF_SET8_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
BTF_SET8_END(bpf_rstat_kfunc_ids)
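/*
 * cgroup_rstat_flush() may block (note the might_sleep() above), hence
 * KF_SLEEPABLE: only sleepable BPF programs may call it.
 */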

static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
	.owner          = THIS_MODULE,
	.set            = &bpf_rstat_kfunc_ids,
};

static int __init bpf_rstat_kfunc_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					 &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);