xref: /openbmc/linux/samples/bpf/cpustat_kern.c (revision bef7a78d)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/version.h>
4 #include <linux/ptrace.h>
5 #include <uapi/linux/bpf.h>
6 #include <bpf/bpf_helpers.h>
7 
/*
 * The CPU number, cstate number and pstate number are based
 * on 96boards Hikey with octa CA53 CPUs.
 *
 * Every CPU has three idle states for cstate:
 *   WFI, CPU_OFF, CLUSTER_OFF
 *
 * Every CPU has 5 operating points:
 *   208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
 *
 * This code is based on these assumptions; other platforms
 * need to adjust these definitions.
 */
#define MAX_CPU			8
#define MAX_PSTATE_ENTRIES	5
#define MAX_CSTATE_ENTRIES	3

/*
 * Operating points that find_cpu_pstate_idx() compares against the value
 * carried by the cpu_frequency event; the position of an entry in this
 * array is its pstate index. The values are the frequencies listed above
 * multiplied by 1000.
 */
static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 };
26 
/*
 * my_map structure is used to record cstate and pstate index and
 * timestamp (Idx, Ts); when a new event comes in we need to update
 * the combination with the new state index and timestamp (Idx`, Ts`).
 *
 * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
 * interval for the previous state: Duration(Idx) = Ts` - Ts.
 *
 * Every CPU has one array as below for recording state index and
 * timestamp, recorded for cstate and pstate separately:
 *
 * +--------------------------+
 * | cstate timestamp         |
 * +--------------------------+
 * | cstate index             |
 * +--------------------------+
 * | pstate timestamp         |
 * +--------------------------+
 * | pstate index             |
 * +--------------------------+
 */
#define MAP_OFF_CSTATE_TIME	0
#define MAP_OFF_CSTATE_IDX	1
#define MAP_OFF_PSTATE_TIME	2
#define MAP_OFF_PSTATE_IDX	3
#define MAP_OFF_NUM		4

/*
 * One u64 slot per (cpu, MAP_OFF_*) pair; the programs below compute the
 * key as cpu * MAP_OFF_NUM + MAP_OFF_*.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, u32);
	__type(value, u64);
	__uint(max_entries, MAX_CPU * MAP_OFF_NUM);
} my_map SEC(".maps");
60 
/*
 * cstate_duration records the accumulated duration of every idle state
 * per CPU. The key is cpu * MAX_CSTATE_ENTRIES + cstate index; the value
 * is a running sum of bpf_ktime_get_ns() timestamp differences.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, u32);
	__type(value, u64);
	__uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES);
} cstate_duration SEC(".maps");
68 
/*
 * pstate_duration records the accumulated duration of every operating
 * point per CPU. The key is cpu * MAX_PSTATE_ENTRIES + pstate index (the
 * position in cpu_opps[]); the value is a running sum of
 * bpf_ktime_get_ns() timestamp differences.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, u32);
	__type(value, u64);
	__uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES);
} pstate_duration SEC(".maps");
76 
/*
 * The trace events for cpu_idle and cpu_frequency are taken from:
 * /sys/kernel/debug/tracing/events/power/cpu_idle/format
 * /sys/kernel/debug/tracing/events/power/cpu_frequency/format
 *
 * These two events have the same format, so define one common structure.
 */
struct cpu_args {
	/* NOTE(review): presumably covers the common tracepoint header - confirm against the format files above */
	u64 pad;
	/* cpu_idle: idle state being entered, or (u32)-1 on idle exit; cpu_frequency: value looked up in cpu_opps[] */
	u32 state;
	/* CPU the event refers to; used to index the per-CPU map slots */
	u32 cpu_id;
};
89 
90 /* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */
91 static u32 find_cpu_pstate_idx(u32 frequency)
92 {
93 	u32 i;
94 
95 	for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) {
96 		if (frequency == cpu_opps[i])
97 			return i;
98 	}
99 
100 	return i;
101 }
102 
103 SEC("tracepoint/power/cpu_idle")
104 int bpf_prog1(struct cpu_args *ctx)
105 {
106 	u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
107 	u32 key, cpu, pstate_idx;
108 	u64 *val;
109 
110 	if (ctx->cpu_id > MAX_CPU)
111 		return 0;
112 
113 	cpu = ctx->cpu_id;
114 
115 	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
116 	cts = bpf_map_lookup_elem(&my_map, &key);
117 	if (!cts)
118 		return 0;
119 
120 	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
121 	cstate = bpf_map_lookup_elem(&my_map, &key);
122 	if (!cstate)
123 		return 0;
124 
125 	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
126 	pts = bpf_map_lookup_elem(&my_map, &key);
127 	if (!pts)
128 		return 0;
129 
130 	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
131 	pstate = bpf_map_lookup_elem(&my_map, &key);
132 	if (!pstate)
133 		return 0;
134 
135 	prev_state = *cstate;
136 	*cstate = ctx->state;
137 
138 	if (!*cts) {
139 		*cts = bpf_ktime_get_ns();
140 		return 0;
141 	}
142 
143 	cur_ts = bpf_ktime_get_ns();
144 	delta = cur_ts - *cts;
145 	*cts = cur_ts;
146 
147 	/*
148 	 * When state doesn't equal to (u32)-1, the cpu will enter
149 	 * one idle state; for this case we need to record interval
150 	 * for the pstate.
151 	 *
152 	 *                 OPP2
153 	 *            +---------------------+
154 	 *     OPP1   |                     |
155 	 *   ---------+                     |
156 	 *                                  |  Idle state
157 	 *                                  +---------------
158 	 *
159 	 *            |<- pstate duration ->|
160 	 *            ^                     ^
161 	 *           pts                  cur_ts
162 	 */
163 	if (ctx->state != (u32)-1) {
164 
165 		/* record pstate after have first cpu_frequency event */
166 		if (!*pts)
167 			return 0;
168 
169 		delta = cur_ts - *pts;
170 
171 		pstate_idx = find_cpu_pstate_idx(*pstate);
172 		if (pstate_idx >= MAX_PSTATE_ENTRIES)
173 			return 0;
174 
175 		key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
176 		val = bpf_map_lookup_elem(&pstate_duration, &key);
177 		if (val)
178 			__sync_fetch_and_add((long *)val, delta);
179 
180 	/*
181 	 * When state equal to (u32)-1, the cpu just exits from one
182 	 * specific idle state; for this case we need to record
183 	 * interval for the pstate.
184 	 *
185 	 *       OPP2
186 	 *   -----------+
187 	 *              |                          OPP1
188 	 *              |                     +-----------
189 	 *              |     Idle state      |
190 	 *              +---------------------+
191 	 *
192 	 *              |<- cstate duration ->|
193 	 *              ^                     ^
194 	 *             cts                  cur_ts
195 	 */
196 	} else {
197 
198 		key = cpu * MAX_CSTATE_ENTRIES + prev_state;
199 		val = bpf_map_lookup_elem(&cstate_duration, &key);
200 		if (val)
201 			__sync_fetch_and_add((long *)val, delta);
202 	}
203 
204 	/* Update timestamp for pstate as new start time */
205 	if (*pts)
206 		*pts = cur_ts;
207 
208 	return 0;
209 }
210 
211 SEC("tracepoint/power/cpu_frequency")
212 int bpf_prog2(struct cpu_args *ctx)
213 {
214 	u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
215 	u32 key, cpu, pstate_idx;
216 	u64 *val;
217 
218 	cpu = ctx->cpu_id;
219 
220 	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
221 	pts = bpf_map_lookup_elem(&my_map, &key);
222 	if (!pts)
223 		return 0;
224 
225 	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
226 	pstate = bpf_map_lookup_elem(&my_map, &key);
227 	if (!pstate)
228 		return 0;
229 
230 	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
231 	cstate = bpf_map_lookup_elem(&my_map, &key);
232 	if (!cstate)
233 		return 0;
234 
235 	prev_state = *pstate;
236 	*pstate = ctx->state;
237 
238 	if (!*pts) {
239 		*pts = bpf_ktime_get_ns();
240 		return 0;
241 	}
242 
243 	cur_ts = bpf_ktime_get_ns();
244 	delta = cur_ts - *pts;
245 	*pts = cur_ts;
246 
247 	/* When CPU is in idle, bail out to skip pstate statistics */
248 	if (*cstate != (u32)(-1))
249 		return 0;
250 
251 	/*
252 	 * The cpu changes to another different OPP (in below diagram
253 	 * change frequency from OPP3 to OPP1), need recording interval
254 	 * for previous frequency OPP3 and update timestamp as start
255 	 * time for new frequency OPP1.
256 	 *
257 	 *                 OPP3
258 	 *            +---------------------+
259 	 *     OPP2   |                     |
260 	 *   ---------+                     |
261 	 *                                  |    OPP1
262 	 *                                  +---------------
263 	 *
264 	 *            |<- pstate duration ->|
265 	 *            ^                     ^
266 	 *           pts                  cur_ts
267 	 */
268 	pstate_idx = find_cpu_pstate_idx(*pstate);
269 	if (pstate_idx >= MAX_PSTATE_ENTRIES)
270 		return 0;
271 
272 	key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
273 	val = bpf_map_lookup_elem(&pstate_duration, &key);
274 	if (val)
275 		__sync_fetch_and_add((long *)val, delta);
276 
277 	return 0;
278 }
279 
/* License string emitted into the "license" section of the object */
char _license[] SEC("license") = "GPL";
/* LINUX_VERSION_CODE of the headers this object was built with, emitted into the "version" section */
u32 _version SEC("version") = LINUX_VERSION_CODE;
282