// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#include "lock_data.h"

/* default buffer size */
#define MAX_ENTRIES  10240

/* for collect_lock_syms().  4096 was rejected by the verifier */
#define MAX_CPUS  1024

/* lock contention flags from include/trace/events/lock.h */
#define LCB_F_SPIN	(1U << 0)
#define LCB_F_READ	(1U << 1)
#define LCB_F_WRITE	(1U << 2)
#define LCB_F_RT	(1U << 3)
#define LCB_F_PERCPU	(1U << 4)
#define LCB_F_MUTEX	(1U << 5)

struct tstamp_data {
	__u64 timestamp;
	__u64 lock;
	__u32 flags;
	__s32 stack_id;
};

/* callstack storage */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

/* maintain timestamp at the beginning of contention */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, struct tstamp_data);
	__uint(max_entries, MAX_ENTRIES);
} tstamp SEC(".maps");

/* actual lock contention statistics */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct contention_key));
	__uint(value_size, sizeof(struct contention_data));
	__uint(max_entries, MAX_ENTRIES);
} lock_stat SEC(".maps");

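/* per-task data (e.g. comm) keyed by pid, used in LOCK_AGGR_TASK mode */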
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct contention_task_data));
	__uint(max_entries, MAX_ENTRIES);
} task_data SEC(".maps");

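/* addresses of well-known kernel locks, filled by collect_lock_syms() */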
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 16384);
} lock_syms SEC(".maps");

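/* CPUs allowed to record; checked in can_record() when has_cpu is set */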
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

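/* tasks allowed to record; checked in can_record() when has_task is set */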
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

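/* lock types (flags) allowed to record; checked in can_record() when has_type is set */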
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} type_filter SEC(".maps");

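/* lock addresses allowed to record; checked in can_record() when has_addr is set */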
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} addr_filter SEC(".maps");

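/*
 * Old and new variants of kernel structs whose layout changed across
 * versions; CO-RE matches the right definition at load time.
 */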
struct rw_semaphore___old {
	struct task_struct *owner;
} __attribute__((preserve_access_index));

struct rw_semaphore___new {
	atomic_long_t owner;
} __attribute__((preserve_access_index));

struct mm_struct___old {
	struct rw_semaphore mmap_sem;
} __attribute__((preserve_access_index));

struct mm_struct___new {
	struct rw_semaphore mmap_lock;
} __attribute__((preserve_access_index));

/* control flags */
int enabled;
int has_cpu;
int has_task;
int has_type;
int has_addr;
int needs_callstack;
int stack_skip;
int lock_owner;

/* determine the key of lock stat */
int aggr_mode;

/* error stat */
int lost;

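/* apply the CPU/task/type/addr filters to the contention tracepoint args */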
static inline int can_record(u64 *ctx)
{
	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = bpf_get_current_pid_tgid();

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_type) {
		__u8 *ok;
		__u32 flags = (__u32)ctx[1];

		ok = bpf_map_lookup_elem(&type_filter, &flags);
		if (!ok)
			return 0;
	}

	if (has_addr) {
		__u8 *ok;
		__u64 addr = ctx[0];

		ok = bpf_map_lookup_elem(&addr_filter, &addr);
		if (!ok)
			return 0;
	}

	return 1;
}

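/* save the task's comm in task_data (keyed by pid) if not already present */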
static inline int update_task_data(struct task_struct *task)
{
	struct contention_task_data *p;
	int pid, err;

	err = bpf_core_read(&pid, sizeof(pid), &task->pid);
	if (err)
		return -1;

	p = bpf_map_lookup_elem(&task_data, &pid);
	if (p == NULL) {
		struct contention_task_data data = {};

		BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
		bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST);
	}

	return 0;
}

#ifndef __has_builtin
# define __has_builtin(x) 0
#endif

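/* return the owner task encoded in a mutex or rwsem owner field, if any */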
static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
{
	struct task_struct *task;
	__u64 owner = 0;

	if (flags & LCB_F_MUTEX) {
		struct mutex *mutex = (void *)lock;
		owner = BPF_CORE_READ(mutex, owner.counter);
	} else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
	/*
	 * Support for the BPF_TYPE_MATCHES argument to the
	 * __builtin_preserve_type_info builtin was added at some point during
	 * development of clang 15 and it's what is needed for
	 * bpf_core_type_matches.
	 */
#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
		if (bpf_core_type_matches(struct rw_semaphore___old)) {
			struct rw_semaphore___old *rwsem = (void *)lock;
			owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
		} else if (bpf_core_type_matches(struct rw_semaphore___new)) {
			struct rw_semaphore___new *rwsem = (void *)lock;
			owner = BPF_CORE_READ(rwsem, owner.counter);
		}
#else
		/* assume new struct */
		struct rw_semaphore *rwsem = (void *)lock;
		owner = BPF_CORE_READ(rwsem, owner.counter);
#endif
	}

	if (!owner)
		return NULL;

	task = (void *)(owner & ~7UL);
	return task;
}

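/* mark well-known locks (mmap_lock, siglock) when aggregating by address */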
static inline __u32 check_lock_type(__u64 lock, __u32 flags)
{
	struct task_struct *curr;
	struct mm_struct___old *mm_old;
	struct mm_struct___new *mm_new;

	switch (flags) {
	case LCB_F_READ:  /* rwsem */
	case LCB_F_WRITE:
		curr = bpf_get_current_task_btf();
		if (curr->mm == NULL)
			break;
		mm_new = (void *)curr->mm;
		if (bpf_core_field_exists(mm_new->mmap_lock)) {
			if (&mm_new->mmap_lock == (void *)lock)
				return LCD_F_MMAP_LOCK;
			break;
		}
		mm_old = (void *)curr->mm;
		if (bpf_core_field_exists(mm_old->mmap_sem)) {
			if (&mm_old->mmap_sem == (void *)lock)
				return LCD_F_MMAP_LOCK;
		}
		break;
	case LCB_F_SPIN:  /* spinlock */
		curr = bpf_get_current_task_btf();
		if (&curr->sighand->siglock == (void *)lock)
			return LCD_F_SIGHAND_LOCK;
		break;
	default:
		break;
	}
	return 0;
}

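/* record the start timestamp (and callstack or owner) when contention begins */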
SEC("tp_btf/contention_begin")
int contention_begin(u64 *ctx)
{
	__u32 pid;
	struct tstamp_data *pelem;

	if (!enabled || !can_record(ctx))
		return 0;

	pid = bpf_get_current_pid_tgid();
	pelem = bpf_map_lookup_elem(&tstamp, &pid);
	if (pelem && pelem->lock)
		return 0;

	if (pelem == NULL) {
		struct tstamp_data zero = {};

		bpf_map_update_elem(&tstamp, &pid, &zero, BPF_ANY);
		pelem = bpf_map_lookup_elem(&tstamp, &pid);
		if (pelem == NULL) {
			lost++;
			return 0;
		}
	}

	pelem->timestamp = bpf_ktime_get_ns();
	pelem->lock = (__u64)ctx[0];
	pelem->flags = (__u32)ctx[1];

	if (needs_callstack) {
		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
						  BPF_F_FAST_STACK_CMP | stack_skip);
		if (pelem->stack_id < 0)
			lost++;
	} else if (aggr_mode == LOCK_AGGR_TASK) {
		struct task_struct *task;

		if (lock_owner) {
			task = get_lock_owner(pelem->lock, pelem->flags);

			/* The flags field is not used anymore.  Pass the owner pid instead. */
			if (task)
				pelem->flags = BPF_CORE_READ(task, pid);
			else
				pelem->flags = -1U;

		} else {
			task = bpf_get_current_task_btf();
		}

		if (task) {
			if (update_task_data(task) < 0 && lock_owner)
				pelem->flags = -1U;
		}
	}

	return 0;
}

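/* accumulate the contention duration into lock_stat when contention ends */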
SEC("tp_btf/contention_end")
int contention_end(u64 *ctx)
{
	__u32 pid;
	struct tstamp_data *pelem;
	struct contention_key key = {};
	struct contention_data *data;
	__u64 duration;

	if (!enabled)
		return 0;

	pid = bpf_get_current_pid_tgid();
	pelem = bpf_map_lookup_elem(&tstamp, &pid);
	if (!pelem || pelem->lock != ctx[0])
		return 0;

	duration = bpf_ktime_get_ns() - pelem->timestamp;

	switch (aggr_mode) {
	case LOCK_AGGR_CALLER:
		key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_TASK:
		if (lock_owner)
			key.pid = pelem->flags;
		else
			key.pid = pid;
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_ADDR:
		key.lock_addr = pelem->lock;
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	default:
		/* should not happen */
		return 0;
	}

	data = bpf_map_lookup_elem(&lock_stat, &key);
	if (!data) {
		struct contention_data first = {
			.total_time = duration,
			.max_time = duration,
			.min_time = duration,
			.count = 1,
			.flags = pelem->flags,
		};

		if (aggr_mode == LOCK_AGGR_ADDR)
			first.flags |= check_lock_type(pelem->lock, pelem->flags);

		bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
		bpf_map_delete_elem(&tstamp, &pid);
		return 0;
	}

	__sync_fetch_and_add(&data->total_time, duration);
	__sync_fetch_and_add(&data->count, 1);

	/* FIXME: need atomic operations */
	if (data->max_time < duration)
		data->max_time = duration;
	if (data->min_time > duration)
		data->min_time = duration;

	bpf_map_delete_elem(&tstamp, &pid);
	return 0;
}

extern struct rq runqueues __ksym;

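/* record per-cpu runqueue lock addresses in the lock_syms map */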
SEC("raw_tp/bpf_test_finish")
int BPF_PROG(collect_lock_syms)
{
	__u64 lock_addr;
	__u32 lock_flag;

	for (int i = 0; i < MAX_CPUS; i++) {
		struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);

		if (rq == NULL)
			break;

		lock_addr = (__u64)&rq->__lock;
		lock_flag = LOCK_CLASS_RQLOCK;
		bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
	}
	return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";