1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
2 // Copyright (c) 2022 Google
3 #include "vmlinux.h"
4 #include <bpf/bpf_helpers.h>
5 #include <bpf/bpf_tracing.h>
6 #include <bpf/bpf_core_read.h>
7 
8 #include "lock_data.h"
9 
/*
 * default buffer size
 *
 * Used as max_entries for the stacks, tstamp, lock_stat and task_data maps
 * defined below.
 */
#define MAX_ENTRIES  10240
12 
/*
 * lock contention flags from include/trace/events/lock.h
 *
 * One bit per flag; the values are spelled out as masks so they can be
 * compared directly against the raw flags word of the tracepoint.
 */
#define LCB_F_SPIN	0x01U	/* bit 0 */
#define LCB_F_READ	0x02U	/* bit 1 */
#define LCB_F_WRITE	0x04U	/* bit 2 */
#define LCB_F_RT	0x08U	/* bit 3 */
#define LCB_F_PERCPU	0x10U	/* bit 4 */
#define LCB_F_MUTEX	0x20U	/* bit 5 */
20 
/*
 * State of one in-flight contention, stored in the tstamp map.
 * It is filled in by contention_begin() and consumed by contention_end().
 */
struct tstamp_data {
	/* bpf_ktime_get_ns() value sampled in contention_begin() */
	__u64 timestamp;
	/* lock address taken from ctx[0]; non-zero while a contention is tracked */
	__u64 lock;
	/*
	 * lock flags taken from ctx[1]; contention_begin() may overwrite it
	 * with the owner pid (or -1U) when lock_owner is set
	 */
	__u32 flags;
	/* result of bpf_get_stackid(); negative when the helper failed */
	__s32 stack_id;
};
27 
/*
 * callstack storage
 *
 * Target of bpf_get_stackid() in contention_begin(); the returned id is kept
 * in tstamp_data.stack_id and later copied into contention_key.stack_id.
 * NOTE(review): value_size is a single __u64 here; presumably the loader
 * adjusts it to the wanted stack depth before loading -- confirm.
 */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");
35 
/*
 * maintain timestamp at the beginning of contention
 *
 * Keyed by the lower 32 bits of bpf_get_current_pid_tgid().  An entry is
 * created in contention_begin() and removed again in contention_end().
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, struct tstamp_data);
	__uint(max_entries, MAX_ENTRIES);
} tstamp SEC(".maps");
43 
/*
 * actual lock contention statistics
 *
 * Keyed by struct contention_key, whose stack_id / pid / lock_addr members
 * are filled in by contention_end() according to aggr_mode.  The value is a
 * struct contention_data accumulated by contention_end().
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct contention_key));
	__uint(value_size, sizeof(struct contention_data));
	__uint(max_entries, MAX_ENTRIES);
} lock_stat SEC(".maps");
51 
/*
 * Per-task information (the comm string) keyed by pid.
 * Entries are only ever added, by update_task_data().
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct contention_task_data));
	__uint(max_entries, MAX_ENTRIES);
} task_data SEC(".maps");
58 
/*
 * Set of CPU ids that may record events; consulted by can_record() when
 * has_cpu is set.  Only the presence of the key matters, the __u8 value is
 * never read.
 * NOTE(review): max_entries is 1 here; presumably the loader resizes the map
 * to the number of filter entries -- confirm.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");
65 
/*
 * Set of task ids that may record events; consulted by can_record() when
 * has_task is set.  The key is the lower 32 bits of
 * bpf_get_current_pid_tgid(); the __u8 value is never read.
 * NOTE(review): max_entries is 1 here; presumably the loader resizes the map
 * to the number of filter entries -- confirm.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");
72 
/*
 * Set of lock flag words that may record events; consulted by can_record()
 * when has_type is set.  The key is ctx[1] truncated to 32 bits, so the whole
 * flags word has to match; the __u8 value is never read.
 * NOTE(review): max_entries is 1 here; presumably the loader resizes the map
 * to the number of filter entries -- confirm.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} type_filter SEC(".maps");
79 
/*
 * Set of lock addresses that may record events; consulted by can_record()
 * when has_addr is set.  The key is ctx[0]; the __u8 value is never read.
 * NOTE(review): max_entries is 1 here; presumably the loader resizes the map
 * to the number of filter entries -- confirm.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} addr_filter SEC(".maps");
86 
/*
 * Minimal view of a rw_semaphore whose owner member is a task_struct pointer.
 * get_lock_owner() probes it with bpf_core_type_matches() and, on a match,
 * reads the owner as a plain pointer.
 */
struct rw_semaphore___old {
	struct task_struct *owner;
} __attribute__((preserve_access_index));
90 
/*
 * Minimal view of a rw_semaphore whose owner member is an atomic_long_t.
 * get_lock_owner() probes it with bpf_core_type_matches() and, on a match,
 * reads the owner through owner.counter.
 */
struct rw_semaphore___new {
	atomic_long_t owner;
} __attribute__((preserve_access_index));
94 
/*
 * control flags
 *
 * NOTE(review): nothing in this file writes these; presumably they are set
 * by the userspace loader -- confirm.
 */
int enabled;		/* both programs return immediately while this is 0 */
int has_cpu;		/* can_record(): current CPU must be in cpu_filter */
int has_task;		/* can_record(): current task must be in task_filter */
int has_type;		/* can_record(): lock flags must be in type_filter */
int has_addr;		/* can_record(): lock address must be in addr_filter */
int needs_callstack;	/* contention_begin(): save a stack id for the event */
int stack_skip;		/* OR-ed into the flags passed to bpf_get_stackid() */
int lock_owner;		/* LOCK_AGGR_TASK: account to the lock owner, not current */

/*
 * determine the key of lock stat
 *
 * contention_end() handles LOCK_AGGR_CALLER, LOCK_AGGR_TASK and
 * LOCK_AGGR_ADDR; any other value makes it drop the event.
 */
int aggr_mode;

/* error stat: number of events that could not be recorded */
int lost;
110 
111 static inline int can_record(u64 *ctx)
112 {
113 	if (has_cpu) {
114 		__u32 cpu = bpf_get_smp_processor_id();
115 		__u8 *ok;
116 
117 		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
118 		if (!ok)
119 			return 0;
120 	}
121 
122 	if (has_task) {
123 		__u8 *ok;
124 		__u32 pid = bpf_get_current_pid_tgid();
125 
126 		ok = bpf_map_lookup_elem(&task_filter, &pid);
127 		if (!ok)
128 			return 0;
129 	}
130 
131 	if (has_type) {
132 		__u8 *ok;
133 		__u32 flags = (__u32)ctx[1];
134 
135 		ok = bpf_map_lookup_elem(&type_filter, &flags);
136 		if (!ok)
137 			return 0;
138 	}
139 
140 	if (has_addr) {
141 		__u8 *ok;
142 		__u64 addr = ctx[0];
143 
144 		ok = bpf_map_lookup_elem(&addr_filter, &addr);
145 		if (!ok)
146 			return 0;
147 	}
148 
149 	return 1;
150 }
151 
152 static inline int update_task_data(struct task_struct *task)
153 {
154 	struct contention_task_data *p;
155 	int pid, err;
156 
157 	err = bpf_core_read(&pid, sizeof(pid), &task->pid);
158 	if (err)
159 		return -1;
160 
161 	p = bpf_map_lookup_elem(&task_data, &pid);
162 	if (p == NULL) {
163 		struct contention_task_data data = {};
164 
165 		BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
166 		bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST);
167 	}
168 
169 	return 0;
170 }
171 
/*
 * Fallback for compilers that lack __has_builtin(): report every builtin as
 * unavailable, so the "#if __has_builtin(bpf_core_type_matches)" check in
 * get_lock_owner() selects its #else branch.
 */
#ifndef __has_builtin
# define __has_builtin(x) 0
#endif
175 
176 static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
177 {
178 	struct task_struct *task;
179 	__u64 owner = 0;
180 
181 	if (flags & LCB_F_MUTEX) {
182 		struct mutex *mutex = (void *)lock;
183 		owner = BPF_CORE_READ(mutex, owner.counter);
184 	} else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
185 #if __has_builtin(bpf_core_type_matches)
186 		if (bpf_core_type_matches(struct rw_semaphore___old)) {
187 			struct rw_semaphore___old *rwsem = (void *)lock;
188 			owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
189 		} else if (bpf_core_type_matches(struct rw_semaphore___new)) {
190 			struct rw_semaphore___new *rwsem = (void *)lock;
191 			owner = BPF_CORE_READ(rwsem, owner.counter);
192 		}
193 #else
194 		/* assume new struct */
195 		struct rw_semaphore *rwsem = (void *)lock;
196 		owner = BPF_CORE_READ(rwsem, owner.counter);
197 #endif
198 	}
199 
200 	if (!owner)
201 		return NULL;
202 
203 	task = (void *)(owner & ~7UL);
204 	return task;
205 }
206 
207 SEC("tp_btf/contention_begin")
208 int contention_begin(u64 *ctx)
209 {
210 	__u32 pid;
211 	struct tstamp_data *pelem;
212 
213 	if (!enabled || !can_record(ctx))
214 		return 0;
215 
216 	pid = bpf_get_current_pid_tgid();
217 	pelem = bpf_map_lookup_elem(&tstamp, &pid);
218 	if (pelem && pelem->lock)
219 		return 0;
220 
221 	if (pelem == NULL) {
222 		struct tstamp_data zero = {};
223 
224 		bpf_map_update_elem(&tstamp, &pid, &zero, BPF_ANY);
225 		pelem = bpf_map_lookup_elem(&tstamp, &pid);
226 		if (pelem == NULL) {
227 			lost++;
228 			return 0;
229 		}
230 	}
231 
232 	pelem->timestamp = bpf_ktime_get_ns();
233 	pelem->lock = (__u64)ctx[0];
234 	pelem->flags = (__u32)ctx[1];
235 
236 	if (needs_callstack) {
237 		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
238 						  BPF_F_FAST_STACK_CMP | stack_skip);
239 		if (pelem->stack_id < 0)
240 			lost++;
241 	} else if (aggr_mode == LOCK_AGGR_TASK) {
242 		struct task_struct *task;
243 
244 		if (lock_owner) {
245 			task = get_lock_owner(pelem->lock, pelem->flags);
246 
247 			/* The flags is not used anymore.  Pass the owner pid. */
248 			if (task)
249 				pelem->flags = BPF_CORE_READ(task, pid);
250 			else
251 				pelem->flags = -1U;
252 
253 		} else {
254 			task = bpf_get_current_task_btf();
255 		}
256 
257 		if (task) {
258 			if (update_task_data(task) < 0 && lock_owner)
259 				pelem->flags = -1U;
260 		}
261 	}
262 
263 	return 0;
264 }
265 
266 SEC("tp_btf/contention_end")
267 int contention_end(u64 *ctx)
268 {
269 	__u32 pid;
270 	struct tstamp_data *pelem;
271 	struct contention_key key = {};
272 	struct contention_data *data;
273 	__u64 duration;
274 
275 	if (!enabled)
276 		return 0;
277 
278 	pid = bpf_get_current_pid_tgid();
279 	pelem = bpf_map_lookup_elem(&tstamp, &pid);
280 	if (!pelem || pelem->lock != ctx[0])
281 		return 0;
282 
283 	duration = bpf_ktime_get_ns() - pelem->timestamp;
284 
285 	switch (aggr_mode) {
286 	case LOCK_AGGR_CALLER:
287 		key.stack_id = pelem->stack_id;
288 		break;
289 	case LOCK_AGGR_TASK:
290 		if (lock_owner)
291 			key.pid = pelem->flags;
292 		else
293 			key.pid = pid;
294 		if (needs_callstack)
295 			key.stack_id = pelem->stack_id;
296 		break;
297 	case LOCK_AGGR_ADDR:
298 		key.lock_addr = pelem->lock;
299 		if (needs_callstack)
300 			key.stack_id = pelem->stack_id;
301 		break;
302 	default:
303 		/* should not happen */
304 		return 0;
305 	}
306 
307 	data = bpf_map_lookup_elem(&lock_stat, &key);
308 	if (!data) {
309 		struct contention_data first = {
310 			.total_time = duration,
311 			.max_time = duration,
312 			.min_time = duration,
313 			.count = 1,
314 			.flags = pelem->flags,
315 		};
316 
317 		bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
318 		bpf_map_delete_elem(&tstamp, &pid);
319 		return 0;
320 	}
321 
322 	__sync_fetch_and_add(&data->total_time, duration);
323 	__sync_fetch_and_add(&data->count, 1);
324 
325 	/* FIXME: need atomic operations */
326 	if (data->max_time < duration)
327 		data->max_time = duration;
328 	if (data->min_time > duration)
329 		data->min_time = duration;
330 
331 	bpf_map_delete_elem(&tstamp, &pid);
332 	return 0;
333 }
334 
/* license string of this BPF object, emitted into the "license" section */
char LICENSE[] SEC("license") = "Dual BSD/GPL";
336