// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD   0x00200000  /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

#define MAX_STACKS   32
#define MAX_ENTRIES  102400

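/* per-task bookkeeping recorded at sched-out: when, in what state, and with which stack the task went off-cpu */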
struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
};

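/* aggregation key for the off_cpu map: off-cpu time is summed per task, stack, state and cgroup */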
struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

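/* user stack traces captured at sched_switch, referenced by stack_id in the key above */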
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

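/* per-task storage holding the sched-out timestamp, state and stack id */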
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

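/* accumulated off-cpu time in nanoseconds, keyed by struct offcpu_key */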
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

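/* CPUs to profile; consulted only when has_cpu is set */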
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

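/* tasks (by pid) to profile; consulted only when has_task is set */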
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

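/* cgroups (by id) to profile; consulted only when has_cgroup is set */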
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* new kernel task_struct definition */
struct task_struct___new {
	long __state;
} __attribute__((preserve_access_index));

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

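/* enable/filter switches, written by user space before the program runs */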
int enabled = 0;
int has_cpu = 0;
int has_task = 0;
int has_cgroup = 0;

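/* knobs set by user space depending on what the running kernel supports */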
const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

/*
 * Old kernels called it 'task_struct->state'; newer kernels renamed it to
 * '__state'.  Use the BPF CO-RE "ignored suffix rule" to handle both, as
 * described in:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	/* recast pointer to capture new type for compiler */
	struct task_struct___new *t_new = (void *)t;

	if (bpf_core_field_exists(t_new->__state)) {
		return BPF_CORE_READ(t_new, __state);
	} else {
		/* recast pointer to capture old type for compiler */
		struct task_struct___old *t_old = (void *)t;

		return BPF_CORE_READ(t_old, state);
	}
}

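/* return the task's cgroup id: the perf_event subsystem cgroup on v1, the default hierarchy cgroup on v2 */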
static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	if (uses_cgroup_v1)
		cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup);
	else
		cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp);

	return BPF_CORE_READ(cgrp, kn, id);
}

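/* decide whether this sched-out should be recorded, applying the optional CPU, task and cgroup filters */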
static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have a user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

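	/* only record tasks sleeping in S (interruptible) or D (uninterruptible) state */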
	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp_id = get_cgroup_id(t);

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
		if (!ok)
			return 0;
	}

	return 1;
}

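/*
 * Called on every sched_switch: remember when 'prev' goes off-cpu (with its
 * user stack and state), and add the elapsed off-cpu time for 'next' if a
 * sched-out was recorded for it earlier.
 */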
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
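	/* charge the elapsed off-cpu time to the task being scheduled in */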
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* prevent the timestamp from being reused later */
		pelem->timestamp = 0;
	}

	return 0;
}

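/*
 * sched_switch BTF tracepoint: ctx[] carries the raw tracepoint arguments
 * (preempt, prev, next[, prev_state]); newer kernels pass the previous task
 * state as the 4th argument, hence the has_prev_state knob.
 */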
SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	if (!enabled)
		return 0;

	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	if (has_prev_state)
		prev_state = (int)ctx[3];
	else
		prev_state = get_task_state(prev);

	return off_cpu_stat(ctx, prev, next, prev_state);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";