xref: /openbmc/linux/samples/bpf/cpustat_kern.c (revision 1ac731c529cd4d6adbce134754b51ff7d822b145)
1c5350777SLeo Yan // SPDX-License-Identifier: GPL-2.0
2c5350777SLeo Yan 
3c5350777SLeo Yan #include <linux/version.h>
4c5350777SLeo Yan #include <linux/ptrace.h>
5c5350777SLeo Yan #include <uapi/linux/bpf.h>
67cf245a3SToke Høiland-Jørgensen #include <bpf/bpf_helpers.h>
7c5350777SLeo Yan 
/*
 * The CPU count, cstate count and pstate count below are based on the
 * 96boards Hikey with octa CA53 CPUs.
 *
 * Every CPU has three idle states for cstate:
 *   WFI, CPU_OFF, CLUSTER_OFF
 *
 * Every CPU has 5 operating points:
 *   208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
 *
 * This code is based on these assumptions; other platforms need to
 * adjust these definitions.
 */
#define MAX_CPU			8
#define MAX_PSTATE_ENTRIES	5
#define MAX_CSTATE_ENTRIES	3

/*
 * The operating points listed above, expressed in kHz
 * (208000 kHz == 208 MHz).  The position of a frequency in this table
 * is its pstate index; the table must hold MAX_PSTATE_ENTRIES entries.
 */
static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 };
26c5350777SLeo Yan 
27c5350777SLeo Yan /*
28c5350777SLeo Yan  * my_map structure is used to record cstate and pstate index and
29c5350777SLeo Yan  * timestamp (Idx, Ts), when new event incoming we need to update
30c5350777SLeo Yan  * combination for new state index and timestamp (Idx`, Ts`).
31c5350777SLeo Yan  *
32c5350777SLeo Yan  * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
33c5350777SLeo Yan  * interval for the previous state: Duration(Idx) = Ts` - Ts.
34c5350777SLeo Yan  *
35c5350777SLeo Yan  * Every CPU has one below array for recording state index and
36c5350777SLeo Yan  * timestamp, and record for cstate and pstate saperately:
37c5350777SLeo Yan  *
38c5350777SLeo Yan  * +--------------------------+
39c5350777SLeo Yan  * | cstate timestamp         |
40c5350777SLeo Yan  * +--------------------------+
41c5350777SLeo Yan  * | cstate index             |
42c5350777SLeo Yan  * +--------------------------+
43c5350777SLeo Yan  * | pstate timestamp         |
44c5350777SLeo Yan  * +--------------------------+
45c5350777SLeo Yan  * | pstate index             |
46c5350777SLeo Yan  * +--------------------------+
47c5350777SLeo Yan  */
48c5350777SLeo Yan #define MAP_OFF_CSTATE_TIME	0
49c5350777SLeo Yan #define MAP_OFF_CSTATE_IDX	1
50c5350777SLeo Yan #define MAP_OFF_PSTATE_TIME	2
51c5350777SLeo Yan #define MAP_OFF_PSTATE_IDX	3
52c5350777SLeo Yan #define MAP_OFF_NUM		4
53c5350777SLeo Yan 
54f0c328f8SDaniel T. Lee struct {
55f0c328f8SDaniel T. Lee 	__uint(type, BPF_MAP_TYPE_ARRAY);
56f0c328f8SDaniel T. Lee 	__type(key, u32);
57f0c328f8SDaniel T. Lee 	__type(value, u64);
58f0c328f8SDaniel T. Lee 	__uint(max_entries, MAX_CPU * MAP_OFF_NUM);
59f0c328f8SDaniel T. Lee } my_map SEC(".maps");
60c5350777SLeo Yan 
61c5350777SLeo Yan /* cstate_duration records duration time for every idle state per CPU */
62f0c328f8SDaniel T. Lee struct {
63f0c328f8SDaniel T. Lee 	__uint(type, BPF_MAP_TYPE_ARRAY);
64f0c328f8SDaniel T. Lee 	__type(key, u32);
65f0c328f8SDaniel T. Lee 	__type(value, u64);
66f0c328f8SDaniel T. Lee 	__uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES);
67f0c328f8SDaniel T. Lee } cstate_duration SEC(".maps");
68c5350777SLeo Yan 
69c5350777SLeo Yan /* pstate_duration records duration time for every operating point per CPU */
70f0c328f8SDaniel T. Lee struct {
71f0c328f8SDaniel T. Lee 	__uint(type, BPF_MAP_TYPE_ARRAY);
72f0c328f8SDaniel T. Lee 	__type(key, u32);
73f0c328f8SDaniel T. Lee 	__type(value, u64);
74f0c328f8SDaniel T. Lee 	__uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES);
75f0c328f8SDaniel T. Lee } pstate_duration SEC(".maps");
76c5350777SLeo Yan 
77c5350777SLeo Yan /*
78c5350777SLeo Yan  * The trace events for cpu_idle and cpu_frequency are taken from:
79*27d7fdf0SRoss Zwisler  * /sys/kernel/tracing/events/power/cpu_idle/format
80*27d7fdf0SRoss Zwisler  * /sys/kernel/tracing/events/power/cpu_frequency/format
81c5350777SLeo Yan  *
82c5350777SLeo Yan  * These two events have same format, so define one common structure.
83c5350777SLeo Yan  */
84c5350777SLeo Yan struct cpu_args {
85c5350777SLeo Yan 	u64 pad;
86c5350777SLeo Yan 	u32 state;
87c5350777SLeo Yan 	u32 cpu_id;
88c5350777SLeo Yan };
89c5350777SLeo Yan 
90c5350777SLeo Yan /* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */
find_cpu_pstate_idx(u32 frequency)91c5350777SLeo Yan static u32 find_cpu_pstate_idx(u32 frequency)
92c5350777SLeo Yan {
93c5350777SLeo Yan 	u32 i;
94c5350777SLeo Yan 
95c5350777SLeo Yan 	for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) {
96c5350777SLeo Yan 		if (frequency == cpu_opps[i])
97c5350777SLeo Yan 			return i;
98c5350777SLeo Yan 	}
99c5350777SLeo Yan 
100c5350777SLeo Yan 	return i;
101c5350777SLeo Yan }
102c5350777SLeo Yan 
103c5350777SLeo Yan SEC("tracepoint/power/cpu_idle")
bpf_prog1(struct cpu_args * ctx)104c5350777SLeo Yan int bpf_prog1(struct cpu_args *ctx)
105c5350777SLeo Yan {
106c5350777SLeo Yan 	u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
107c5350777SLeo Yan 	u32 key, cpu, pstate_idx;
108c5350777SLeo Yan 	u64 *val;
109c5350777SLeo Yan 
110c5350777SLeo Yan 	if (ctx->cpu_id > MAX_CPU)
111c5350777SLeo Yan 		return 0;
112c5350777SLeo Yan 
113c5350777SLeo Yan 	cpu = ctx->cpu_id;
114c5350777SLeo Yan 
115c5350777SLeo Yan 	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
116c5350777SLeo Yan 	cts = bpf_map_lookup_elem(&my_map, &key);
117c5350777SLeo Yan 	if (!cts)
118c5350777SLeo Yan 		return 0;
119c5350777SLeo Yan 
120c5350777SLeo Yan 	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
121c5350777SLeo Yan 	cstate = bpf_map_lookup_elem(&my_map, &key);
122c5350777SLeo Yan 	if (!cstate)
123c5350777SLeo Yan 		return 0;
124c5350777SLeo Yan 
125c5350777SLeo Yan 	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
126c5350777SLeo Yan 	pts = bpf_map_lookup_elem(&my_map, &key);
127c5350777SLeo Yan 	if (!pts)
128c5350777SLeo Yan 		return 0;
129c5350777SLeo Yan 
130c5350777SLeo Yan 	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
131c5350777SLeo Yan 	pstate = bpf_map_lookup_elem(&my_map, &key);
132c5350777SLeo Yan 	if (!pstate)
133c5350777SLeo Yan 		return 0;
134c5350777SLeo Yan 
135c5350777SLeo Yan 	prev_state = *cstate;
136c5350777SLeo Yan 	*cstate = ctx->state;
137c5350777SLeo Yan 
138c5350777SLeo Yan 	if (!*cts) {
139c5350777SLeo Yan 		*cts = bpf_ktime_get_ns();
140c5350777SLeo Yan 		return 0;
141c5350777SLeo Yan 	}
142c5350777SLeo Yan 
143c5350777SLeo Yan 	cur_ts = bpf_ktime_get_ns();
144c5350777SLeo Yan 	delta = cur_ts - *cts;
145c5350777SLeo Yan 	*cts = cur_ts;
146c5350777SLeo Yan 
147c5350777SLeo Yan 	/*
148c5350777SLeo Yan 	 * When state doesn't equal to (u32)-1, the cpu will enter
149c5350777SLeo Yan 	 * one idle state; for this case we need to record interval
150c5350777SLeo Yan 	 * for the pstate.
151c5350777SLeo Yan 	 *
152c5350777SLeo Yan 	 *                 OPP2
153c5350777SLeo Yan 	 *            +---------------------+
154c5350777SLeo Yan 	 *     OPP1   |                     |
155c5350777SLeo Yan 	 *   ---------+                     |
156c5350777SLeo Yan 	 *                                  |  Idle state
157c5350777SLeo Yan 	 *                                  +---------------
158c5350777SLeo Yan 	 *
159c5350777SLeo Yan 	 *            |<- pstate duration ->|
160c5350777SLeo Yan 	 *            ^                     ^
161c5350777SLeo Yan 	 *           pts                  cur_ts
162c5350777SLeo Yan 	 */
163c5350777SLeo Yan 	if (ctx->state != (u32)-1) {
164c5350777SLeo Yan 
165c5350777SLeo Yan 		/* record pstate after have first cpu_frequency event */
166c5350777SLeo Yan 		if (!*pts)
167c5350777SLeo Yan 			return 0;
168c5350777SLeo Yan 
169c5350777SLeo Yan 		delta = cur_ts - *pts;
170c5350777SLeo Yan 
171c5350777SLeo Yan 		pstate_idx = find_cpu_pstate_idx(*pstate);
172c5350777SLeo Yan 		if (pstate_idx >= MAX_PSTATE_ENTRIES)
173c5350777SLeo Yan 			return 0;
174c5350777SLeo Yan 
175c5350777SLeo Yan 		key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
176c5350777SLeo Yan 		val = bpf_map_lookup_elem(&pstate_duration, &key);
177c5350777SLeo Yan 		if (val)
178c5350777SLeo Yan 			__sync_fetch_and_add((long *)val, delta);
179c5350777SLeo Yan 
180c5350777SLeo Yan 	/*
181c5350777SLeo Yan 	 * When state equal to (u32)-1, the cpu just exits from one
182c5350777SLeo Yan 	 * specific idle state; for this case we need to record
183c5350777SLeo Yan 	 * interval for the pstate.
184c5350777SLeo Yan 	 *
185c5350777SLeo Yan 	 *       OPP2
186c5350777SLeo Yan 	 *   -----------+
187c5350777SLeo Yan 	 *              |                          OPP1
188c5350777SLeo Yan 	 *              |                     +-----------
189c5350777SLeo Yan 	 *              |     Idle state      |
190c5350777SLeo Yan 	 *              +---------------------+
191c5350777SLeo Yan 	 *
192c5350777SLeo Yan 	 *              |<- cstate duration ->|
193c5350777SLeo Yan 	 *              ^                     ^
194c5350777SLeo Yan 	 *             cts                  cur_ts
195c5350777SLeo Yan 	 */
196c5350777SLeo Yan 	} else {
197c5350777SLeo Yan 
198c5350777SLeo Yan 		key = cpu * MAX_CSTATE_ENTRIES + prev_state;
199c5350777SLeo Yan 		val = bpf_map_lookup_elem(&cstate_duration, &key);
200c5350777SLeo Yan 		if (val)
201c5350777SLeo Yan 			__sync_fetch_and_add((long *)val, delta);
202c5350777SLeo Yan 	}
203c5350777SLeo Yan 
204c5350777SLeo Yan 	/* Update timestamp for pstate as new start time */
205c5350777SLeo Yan 	if (*pts)
206c5350777SLeo Yan 		*pts = cur_ts;
207c5350777SLeo Yan 
208c5350777SLeo Yan 	return 0;
209c5350777SLeo Yan }
210c5350777SLeo Yan 
211c5350777SLeo Yan SEC("tracepoint/power/cpu_frequency")
bpf_prog2(struct cpu_args * ctx)212c5350777SLeo Yan int bpf_prog2(struct cpu_args *ctx)
213c5350777SLeo Yan {
214c5350777SLeo Yan 	u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
215c5350777SLeo Yan 	u32 key, cpu, pstate_idx;
216c5350777SLeo Yan 	u64 *val;
217c5350777SLeo Yan 
218c5350777SLeo Yan 	cpu = ctx->cpu_id;
219c5350777SLeo Yan 
220c5350777SLeo Yan 	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
221c5350777SLeo Yan 	pts = bpf_map_lookup_elem(&my_map, &key);
222c5350777SLeo Yan 	if (!pts)
223c5350777SLeo Yan 		return 0;
224c5350777SLeo Yan 
225c5350777SLeo Yan 	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
226c5350777SLeo Yan 	pstate = bpf_map_lookup_elem(&my_map, &key);
227c5350777SLeo Yan 	if (!pstate)
228c5350777SLeo Yan 		return 0;
229c5350777SLeo Yan 
230c5350777SLeo Yan 	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
231c5350777SLeo Yan 	cstate = bpf_map_lookup_elem(&my_map, &key);
232c5350777SLeo Yan 	if (!cstate)
233c5350777SLeo Yan 		return 0;
234c5350777SLeo Yan 
235c5350777SLeo Yan 	prev_state = *pstate;
236c5350777SLeo Yan 	*pstate = ctx->state;
237c5350777SLeo Yan 
238c5350777SLeo Yan 	if (!*pts) {
239c5350777SLeo Yan 		*pts = bpf_ktime_get_ns();
240c5350777SLeo Yan 		return 0;
241c5350777SLeo Yan 	}
242c5350777SLeo Yan 
243c5350777SLeo Yan 	cur_ts = bpf_ktime_get_ns();
244c5350777SLeo Yan 	delta = cur_ts - *pts;
245c5350777SLeo Yan 	*pts = cur_ts;
246c5350777SLeo Yan 
247c5350777SLeo Yan 	/* When CPU is in idle, bail out to skip pstate statistics */
248c5350777SLeo Yan 	if (*cstate != (u32)(-1))
249c5350777SLeo Yan 		return 0;
250c5350777SLeo Yan 
251c5350777SLeo Yan 	/*
252c5350777SLeo Yan 	 * The cpu changes to another different OPP (in below diagram
253c5350777SLeo Yan 	 * change frequency from OPP3 to OPP1), need recording interval
254c5350777SLeo Yan 	 * for previous frequency OPP3 and update timestamp as start
255c5350777SLeo Yan 	 * time for new frequency OPP1.
256c5350777SLeo Yan 	 *
257c5350777SLeo Yan 	 *                 OPP3
258c5350777SLeo Yan 	 *            +---------------------+
259c5350777SLeo Yan 	 *     OPP2   |                     |
260c5350777SLeo Yan 	 *   ---------+                     |
261c5350777SLeo Yan 	 *                                  |    OPP1
262c5350777SLeo Yan 	 *                                  +---------------
263c5350777SLeo Yan 	 *
264c5350777SLeo Yan 	 *            |<- pstate duration ->|
265c5350777SLeo Yan 	 *            ^                     ^
266c5350777SLeo Yan 	 *           pts                  cur_ts
267c5350777SLeo Yan 	 */
268c5350777SLeo Yan 	pstate_idx = find_cpu_pstate_idx(*pstate);
269c5350777SLeo Yan 	if (pstate_idx >= MAX_PSTATE_ENTRIES)
270c5350777SLeo Yan 		return 0;
271c5350777SLeo Yan 
272c5350777SLeo Yan 	key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
273c5350777SLeo Yan 	val = bpf_map_lookup_elem(&pstate_duration, &key);
274c5350777SLeo Yan 	if (val)
275c5350777SLeo Yan 		__sync_fetch_and_add((long *)val, delta);
276c5350777SLeo Yan 
277c5350777SLeo Yan 	return 0;
278c5350777SLeo Yan }
279c5350777SLeo Yan 
280c5350777SLeo Yan char _license[] SEC("license") = "GPL";
281c5350777SLeo Yan u32 _version SEC("version") = LINUX_VERSION_CODE;
282