// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

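/*
 * Record off-CPU time via the sched_switch tracepoint: when a task is
 * scheduled out in a sleeping state, save a timestamp and its user
 * stack trace; when it's scheduled back in, add the elapsed time to
 * the off_cpu map keyed by pid, tgid, stack id and state.
 */
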
/* task->flags for off-cpu analysis */
#define PF_KTHREAD   0x00200000  /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

#define MAX_STACKS   32
#define MAX_ENTRIES  102400

/* data saved in task-local storage while the task is off-cpu */
struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
};

/* aggregation key for the accumulated off-cpu time */
struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
};

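/* user stack traces, indexed by the id returned from bpf_get_stackid() */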
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

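/* per-task storage for the data saved when the task went off-cpu */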
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

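/* accumulated off-cpu time in nanoseconds, keyed by struct offcpu_key */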
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

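/*
 * CPUs to profile; filled from userspace when CPU filtering is in
 * effect (max_entries is a placeholder, presumably resized before load).
 */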
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

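/* tasks (by pid) to profile; filled from userspace like cpu_filter */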
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

/* task_struct as defined before the 'state' -> '__state' rename */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

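/* control knobs, set by the userspace loader */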
int enabled = 0;
int has_cpu = 0;
int has_task = 0;

/* whether the sched_switch tracepoint passes prev_state as its 4th argument */
const volatile bool has_prev_state = false;

/*
 * Old kernels called the field 'task_struct->state'; newer ones renamed
 * it to '__state'.  Use the BPF CO-RE "ignored suffix rule" to handle
 * both, as described in:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	/* the new name: read it directly if the field exists */
	if (bpf_core_field_exists(t->__state))
		return BPF_CORE_READ(t, __state);

	/* recast the pointer so the compiler sees the task_struct___old type */
	struct task_struct___old *t_old = (void *)t;

	/* now use the old 'state' name of the field */
	return BPF_CORE_READ(t_old, state);
}

static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have a user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	/* only record tasks going to sleep, not ones preempted while runnable */
	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	return 1;
}

static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	/* 'prev' is being scheduled out: save when and where it went to sleep */
	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	/* 'next' is being scheduled in: account the time it spent off-cpu */
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* zero the timestamp so this interval isn't counted again */
		pelem->timestamp = 0;
	}

	return 0;
}

SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	if (!enabled)
		return 0;

	/* ctx: preempt, prev, next (and prev_state appended on recent kernels) */
	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	if (has_prev_state)
		/* the tracepoint passes prev_state after the 'next' argument */
		prev_state = (int)ctx[3];
	else
		/* otherwise re-read the state from the task itself */
		prev_state = get_task_state(prev);

	return off_cpu_stat(ctx, prev, next, prev_state);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";