// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD   0x00200000  /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

/* create a new thread */
#define CLONE_THREAD  0x10000

#define MAX_STACKS   32
#define MAX_ENTRIES  102400

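/*
 * Per-task data recorded when a task is scheduled out; kept in the
 * 'tstamp' task storage map below.
 */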
struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
};

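/* aggregation key for the 'off_cpu' map: off-CPU time is summed per pid, stack, state and cgroup */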
struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

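/* user stack traces of tasks going off-CPU, keyed by stack_id */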
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

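/* per-task storage holding the tstamp_data recorded at schedule-out time */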
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

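/* accumulated off-CPU time in nanoseconds, keyed by struct offcpu_key */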
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

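/*
 * cpu/task/cgroup filter maps.  They are populated from user space when
 * filtering is requested; the presence of a key means the entry passes the
 * filter.  max_entries of 1 is a placeholder that user space may resize
 * before loading.
 */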
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* new kernel task_struct definition */
struct task_struct___new {
	long __state;
} __attribute__((preserve_access_index));

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

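/* control knobs and feature flags, set from user space before the program runs */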
int enabled = 0;
int has_cpu = 0;
int has_task = 0;
int has_cgroup = 0;
int uses_tgid = 0;

const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

/*
 * Old kernels called this field task_struct->state; newer kernels renamed it
 * to '__state'.  Use the BPF CO-RE "ignored suffix rule" to handle both, as
 * described in:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	/* recast pointer to capture new type for compiler */
	struct task_struct___new *t_new = (void *)t;

	if (bpf_core_field_exists(t_new->__state)) {
		return BPF_CORE_READ(t_new, __state);
	} else {
		/* recast pointer to capture old type for compiler */
		struct task_struct___old *t_old = (void *)t;

		return BPF_CORE_READ(t_old, state);
	}
}

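/*
 * Return the task's cgroup id: the perf_event subsystem cgroup on cgroup v1,
 * or the default hierarchy cgroup on cgroup v2.
 */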
static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	if (uses_cgroup_v1)
		cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup);
	else
		cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp);

	return BPF_CORE_READ(cgrp, kn, id);
}

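/*
 * Return 1 if the task should be tracked: not a kernel thread, going to
 * sleep (TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE), and passing any
 * active cpu/task/cgroup filters.
 */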
static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have a user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid;

		if (uses_tgid)
			pid = t->tgid;
		else
			pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp_id = get_cgroup_id(t);

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
		if (!ok)
			return 0;
	}

	return 1;
}

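/*
 * Update off-CPU accounting on a context switch.  For @prev going off-CPU,
 * record the timestamp, user stack id and state in task storage.  For @next
 * coming back on-CPU, if a timestamp was recorded earlier, add the elapsed
 * time to the off_cpu map and clear the timestamp.
 */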
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* prevent the timestamp from being reused later */
		pelem->timestamp = 0;
	}

	return 0;
}

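/*
 * When filtering by process (TGID), propagate the filter to child processes
 * of an already-filtered task.  Threads created with CLONE_THREAD share the
 * parent's TGID and need no new entry.
 */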
SEC("tp_btf/task_newtask")
int on_newtask(u64 *ctx)
{
	struct task_struct *task;
	u64 clone_flags;
	u32 pid;
	u8 val = 1;

	if (!uses_tgid)
		return 0;

	task = (struct task_struct *)bpf_get_current_task();

	pid = BPF_CORE_READ(task, tgid);
	if (!bpf_map_lookup_elem(&task_filter, &pid))
		return 0;

	task = (struct task_struct *)ctx[0];
	clone_flags = ctx[1];

	pid = task->tgid;
	if (!(clone_flags & CLONE_THREAD))
		bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);

	return 0;
}

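/*
 * sched_switch handler: take the previous task's state from the tracepoint
 * argument when the kernel provides it, otherwise read it from task_struct,
 * then update the off-CPU accounting.
 */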
SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	if (!enabled)
		return 0;

	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	if (has_prev_state)
		prev_state = (int)ctx[3];
	else
		prev_state = get_task_state(prev);

	return off_cpu_stat(ctx, prev, next, prev_state);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";