// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
7edc41a10SNamhyung Kim
/* task->flags for off-cpu analysis */
#define PF_KTHREAD   0x00200000  /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

/* create a new thread */
#define CLONE_THREAD  0x10000

#define MAX_STACKS   32		/* max depth of a recorded user stack trace */
#define MAX_ENTRIES  102400	/* capacity of the stack and off-cpu maps */
20edc41a10SNamhyung Kim
/* per-task state saved when the task is switched out */
struct tstamp_data {
	__u32 stack_id;		/* id of the user stack in the 'stacks' map */
	__u32 state;		/* task state at switch-out */
	__u64 timestamp;	/* bpf_ktime_get_ns() at switch-out; 0 = not armed */
};
26edc41a10SNamhyung Kim
/* aggregation key for accumulated off-cpu time in the 'off_cpu' map */
struct offcpu_key {
	__u32 pid;		/* thread id */
	__u32 tgid;		/* process id */
	__u32 stack_id;		/* user stack trace id from the 'stacks' map */
	__u32 state;		/* task state at switch-out */
	__u64 cgroup_id;	/* cgroup id, 0 when cgroup mode is off */
};
34edc41a10SNamhyung Kim
/* user stack traces, keyed by the id returned from bpf_get_stackid() */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");
41edc41a10SNamhyung Kim
/* per-task storage holding the switch-out timestamp/state/stack */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");
48edc41a10SNamhyung Kim
/* total off-cpu time in ns, aggregated by struct offcpu_key */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");
55edc41a10SNamhyung Kim
/* cpus to track; membership test only, value is unused (user space sizes it) */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");
6210742d0cSNamhyung Kim
/* tasks to track, keyed by pid or tgid depending on 'uses_tgid' */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");
6910742d0cSNamhyung Kim
/* cgroups to track, keyed by cgroup id */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");
76685439a7SNamhyung Kim
/*
 * Minimal CO-RE shadow definitions used only to probe which name the
 * running kernel uses for the task state field (see get_task_state()).
 */
/* new kernel task_struct definition */
struct task_struct___new {
	long __state;
} __attribute__((preserve_access_index));

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));
86edc41a10SNamhyung Kim
/* runtime switches set from user space */
int enabled = 0;	/* nonzero while recording is active */
int has_cpu = 0;	/* cpu_filter map is populated */
int has_task = 0;	/* task_filter map is populated */
int has_cgroup = 0;	/* cgroup_filter map is populated */
int uses_tgid = 0;	/* task_filter is keyed by tgid instead of pid */

/* set before load; const volatile so the verifier sees them as rodata */
const volatile bool has_prev_state = false;	/* sched_switch has a prev_state arg */
const volatile bool needs_cgroup = false;	/* record cgroup id in the key */
const volatile bool uses_cgroup_v1 = false;	/* use cgroup v1 perf_event hierarchy */

int perf_subsys_id = -1;	/* cached cgroup v1 perf_event subsystem id */
98e42c9c54SNamhyung Kim
/*
 * Old kernel used to call it task_struct->state and now it's '__state'.
 * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
get_task_state(struct task_struct * t)105edc41a10SNamhyung Kim static inline int get_task_state(struct task_struct *t)
106edc41a10SNamhyung Kim {
107d6838ec4SNamhyung Kim /* recast pointer to capture new type for compiler */
108d6838ec4SNamhyung Kim struct task_struct___new *t_new = (void *)t;
109edc41a10SNamhyung Kim
110d6838ec4SNamhyung Kim if (bpf_core_field_exists(t_new->__state)) {
111d6838ec4SNamhyung Kim return BPF_CORE_READ(t_new, __state);
112d6838ec4SNamhyung Kim } else {
113d6838ec4SNamhyung Kim /* recast pointer to capture old type for compiler */
114edc41a10SNamhyung Kim struct task_struct___old *t_old = (void *)t;
115edc41a10SNamhyung Kim
116edc41a10SNamhyung Kim return BPF_CORE_READ(t_old, state);
117edc41a10SNamhyung Kim }
118d6838ec4SNamhyung Kim }
119edc41a10SNamhyung Kim
/* return the cgroup id of the task for the configured cgroup hierarchy */
static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	/* cgroup v2: id is the kernfs node id of the default hierarchy cgroup */
	if (!uses_cgroup_v1)
		return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);

	/* cgroup v1: resolve the perf_event subsystem id once and cache it */
	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		/* CO-RE relocates the enum value against the running kernel */
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}
139685439a7SNamhyung Kim
can_record(struct task_struct * t,int state)14010742d0cSNamhyung Kim static inline int can_record(struct task_struct *t, int state)
14110742d0cSNamhyung Kim {
14210742d0cSNamhyung Kim /* kernel threads don't have user stack */
14310742d0cSNamhyung Kim if (t->flags & PF_KTHREAD)
14410742d0cSNamhyung Kim return 0;
14510742d0cSNamhyung Kim
14610742d0cSNamhyung Kim if (state != TASK_INTERRUPTIBLE &&
14710742d0cSNamhyung Kim state != TASK_UNINTERRUPTIBLE)
14810742d0cSNamhyung Kim return 0;
14910742d0cSNamhyung Kim
15010742d0cSNamhyung Kim if (has_cpu) {
15110742d0cSNamhyung Kim __u32 cpu = bpf_get_smp_processor_id();
15210742d0cSNamhyung Kim __u8 *ok;
15310742d0cSNamhyung Kim
15410742d0cSNamhyung Kim ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
15510742d0cSNamhyung Kim if (!ok)
15610742d0cSNamhyung Kim return 0;
15710742d0cSNamhyung Kim }
15810742d0cSNamhyung Kim
15910742d0cSNamhyung Kim if (has_task) {
16010742d0cSNamhyung Kim __u8 *ok;
16107fc958bSNamhyung Kim __u32 pid;
16207fc958bSNamhyung Kim
16307fc958bSNamhyung Kim if (uses_tgid)
16407fc958bSNamhyung Kim pid = t->tgid;
16507fc958bSNamhyung Kim else
16607fc958bSNamhyung Kim pid = t->pid;
16710742d0cSNamhyung Kim
16810742d0cSNamhyung Kim ok = bpf_map_lookup_elem(&task_filter, &pid);
16910742d0cSNamhyung Kim if (!ok)
17010742d0cSNamhyung Kim return 0;
17110742d0cSNamhyung Kim }
17210742d0cSNamhyung Kim
173685439a7SNamhyung Kim if (has_cgroup) {
174685439a7SNamhyung Kim __u8 *ok;
175685439a7SNamhyung Kim __u64 cgrp_id = get_cgroup_id(t);
176685439a7SNamhyung Kim
177685439a7SNamhyung Kim ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
178685439a7SNamhyung Kim if (!ok)
179685439a7SNamhyung Kim return 0;
180685439a7SNamhyung Kim }
181685439a7SNamhyung Kim
18210742d0cSNamhyung Kim return 1;
18310742d0cSNamhyung Kim }
18410742d0cSNamhyung Kim
/*
 * Account off-cpu time at a context switch: arm a timestamp for the task
 * going to sleep ('prev') and, if the incoming task ('next') was armed
 * earlier, add its sleep duration to the off_cpu map.
 */
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	/* capture the user stack of the task being switched out */
	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	/* remember when, why and where 'prev' went off-cpu */
	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	/* no F_CREATE: only tasks armed at their own switch-out have an entry */
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* prevent to reuse the timestamp later */
		pelem->timestamp = 0;
	}

	return 0;
}
235edc41a10SNamhyung Kim
236d2347763SNamhyung Kim SEC("tp_btf/task_newtask")
on_newtask(u64 * ctx)237d2347763SNamhyung Kim int on_newtask(u64 *ctx)
238d2347763SNamhyung Kim {
239d2347763SNamhyung Kim struct task_struct *task;
240d2347763SNamhyung Kim u64 clone_flags;
241d2347763SNamhyung Kim u32 pid;
242d2347763SNamhyung Kim u8 val = 1;
243d2347763SNamhyung Kim
244d2347763SNamhyung Kim if (!uses_tgid)
245d2347763SNamhyung Kim return 0;
246d2347763SNamhyung Kim
247d2347763SNamhyung Kim task = (struct task_struct *)bpf_get_current_task();
248d2347763SNamhyung Kim
249d2347763SNamhyung Kim pid = BPF_CORE_READ(task, tgid);
250d2347763SNamhyung Kim if (!bpf_map_lookup_elem(&task_filter, &pid))
251d2347763SNamhyung Kim return 0;
252d2347763SNamhyung Kim
253d2347763SNamhyung Kim task = (struct task_struct *)ctx[0];
254d2347763SNamhyung Kim clone_flags = ctx[1];
255d2347763SNamhyung Kim
256d2347763SNamhyung Kim pid = task->tgid;
257d2347763SNamhyung Kim if (!(clone_flags & CLONE_THREAD))
258d2347763SNamhyung Kim bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);
259d2347763SNamhyung Kim
260d2347763SNamhyung Kim return 0;
261d2347763SNamhyung Kim }
262d2347763SNamhyung Kim
263b36888f7SNamhyung Kim SEC("tp_btf/sched_switch")
on_switch(u64 * ctx)264b36888f7SNamhyung Kim int on_switch(u64 *ctx)
265b36888f7SNamhyung Kim {
266b36888f7SNamhyung Kim struct task_struct *prev, *next;
267b36888f7SNamhyung Kim int prev_state;
268b36888f7SNamhyung Kim
269b36888f7SNamhyung Kim if (!enabled)
270b36888f7SNamhyung Kim return 0;
271b36888f7SNamhyung Kim
272b36888f7SNamhyung Kim prev = (struct task_struct *)ctx[1];
273b36888f7SNamhyung Kim next = (struct task_struct *)ctx[2];
274b36888f7SNamhyung Kim
275b36888f7SNamhyung Kim if (has_prev_state)
276b36888f7SNamhyung Kim prev_state = (int)ctx[3];
277b36888f7SNamhyung Kim else
278b36888f7SNamhyung Kim prev_state = get_task_state(prev);
279b36888f7SNamhyung Kim
280*b48279afSNamhyung Kim return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
281b36888f7SNamhyung Kim }
282b36888f7SNamhyung Kim
283edc41a10SNamhyung Kim char LICENSE[] SEC("license") = "Dual BSD/GPL";
284