1 // SPDX-License-Identifier: GPL-2.0
2 // Copyright (c) 2019 Facebook
3 
4 #include <stdint.h>
5 #include <stddef.h>
6 #include <stdbool.h>
7 #include <linux/bpf.h>
8 #include <linux/ptrace.h>
9 #include <linux/sched.h>
10 #include <linux/types.h>
11 #include <bpf/bpf_helpers.h>
12 
/* Local stand-ins: a 32-bit pid type and an empty task_struct placeholder. */
typedef uint32_t pid_t;
struct task_struct {};

/* Size of the comm buffer in struct strobelight_bpf_sample. */
#define TASK_COMM_LEN 16
/* Number of 64-bit slots in each value of the stack trace maps. */
#define PERF_MAX_STACK_DEPTH 127

/* Metavariable type tags (invalid / int / string / map). */
#define STROBE_TYPE_INVALID 0
#define STROBE_TYPE_INT 1
#define STROBE_TYPE_STR 2
#define STROBE_TYPE_MAP 3

/* Bit of the ktime timestamp that on_event() tests to pick a stack map. */
#define STACK_TABLE_EPOCH_SHIFT 20
/* Size limit passed to every bpf_probe_read_user_str() call in this file. */
#define STROBE_MAX_STR_LEN 1
/* max_entries of the strobemeta_cfgs map. */
#define STROBE_MAX_CFGS 32
/*
 * Upper bound on the packed string area: one string per str metavar plus,
 * for every map metavar, one tag and a key and a value per map entry.
 *
 * NOTE(review): STROBE_MAX_INTS, STROBE_MAX_STRS, STROBE_MAX_MAPS and
 * STROBE_MAX_MAP_ENTRIES are not defined in this file; presumably the
 * including source defines them before this point -- confirm.
 */
#define STROBE_MAX_PAYLOAD						\
	(STROBE_MAX_STRS * STROBE_MAX_STR_LEN +				\
	STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
30 
/*
 * Common header that is the first member of every strobe_value_* struct
 * declared in this file.
 */
struct strobe_value_header {
	/*
	 * meaning depends on type:
	 * 1. int: 0, if value not set, 1 otherwise
	 * 2. str: 1 always, whether value is set or not is determined by ptr
	 * 3. map: 1 always, pointer points to additional struct with number
	 *    of entries (up to STROBE_MAX_MAP_ENTRIES)
	 */
	uint16_t len;
	/*
	 * _reserved might be used for some future fields/flags, but we always
	 * want to keep strobe_value_header to be 8 bytes, so BPF can read 16
	 * bytes in one go and get both header and value
	 */
	uint8_t _reserved[6];
};
47 
/*
 * strobe_value_generic is used from BPF probe only, but needs to be a union
 * of strobe_value_int/strobe_value_str/strobe_value_map.
 *
 * The read_*_var() helpers copy a metavariable into one of these and then
 * use either val (integers) or ptr (strings and maps).
 */
struct strobe_value_generic {
	struct strobe_value_header header;
	union {
		/* integer payload, consumed by read_int_var() */
		int64_t val;
		/* user-space pointer, consumed by read_str_var()/read_map_var() */
		void *ptr;
	};
};
59 
/* Integer metavariable: header followed by the 64-bit value itself. */
struct strobe_value_int {
	struct strobe_value_header header;
	int64_t value;
};
64 
/* String metavariable: header followed by a pointer to a C string. */
struct strobe_value_str {
	struct strobe_value_header header;
	const char* value;
};
69 
/* Map metavariable: header followed by a pointer to a strobe_map_raw. */
struct strobe_value_map {
	struct strobe_value_header header;
	const struct strobe_map_raw* value;
};
74 
/* One key/value pair of a strobe_map_raw; both members point to C strings. */
struct strobe_map_entry {
	const char* key;
	const char* val;
};
79 
/*
 * Map of C-string key/value pairs with fixed maximum capacity. Each map has
 * corresponding int64 ID, which application can use (or ignore) in whatever
 * way appropriate. Map is "write-only", there is no way to get data out of
 * map. Map is intended to be used to provide metadata for profilers and is
 * not to be used for internal in-app communication. All methods are
 * thread-safe.
 */
struct strobe_map_raw {
	/*
	 * general purpose unique ID that's up to application to decide
	 * whether and how to use; for request metadata use case id is unique
	 * request ID that's used to match metadata with stack traces on
	 * Strobelight backend side
	 */
	int64_t id;
	/* number of used entries in map */
	int64_t cnt;
	/*
	 * tag is declared without volatile: the qualifier wouldn't change
	 * anything on the BPF side, and clang emits warnings when a
	 * `volatile const char *` is passed into bpf_probe_read_user_str,
	 * which expects just `const char *`
	 */
	const char* tag;
	/*
	 * key/value entries, each consisting of 2 pointers to key and value
	 * C strings
	 */
	struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES];
};
110 
/*
 * Supported values of strobe_value_loc.tls_mode. The numeric values matter:
 * calc_location() compares against TLS_LOCAL_EXEC and multiplies by
 * (tls_mode + 1).
 */
#define TLS_NOT_SET -1
#define TLS_LOCAL_EXEC 0
#define TLS_IMM_EXEC 1
#define TLS_GENERAL_DYN 2
116 
/*
 * structure that universally represents TLS location (both for static
 * executables and shared libraries)
 */
struct strobe_value_loc {
	/*
	 * tls_mode defines what TLS mode was used for particular metavariable:
	 * - -1 (TLS_NOT_SET) - no metavariable;
	 * - 0 (TLS_LOCAL_EXEC) - Local Executable mode;
	 * - 1 (TLS_IMM_EXEC) - Immediate Executable mode;
	 * - 2 (TLS_GENERAL_DYN) - General Dynamic mode;
	 * Local Dynamic mode is not yet supported, because it has never been
	 * seen in practice. Mode defines how the offset field is interpreted.
	 * See calc_location() below for details.
	 */
	int64_t tls_mode;
	/*
	 * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64,
	 * tpidr_el0 for aarch64).
	 * TLS_IMM_EXEC: absolute address of GOT entry containing offset
	 * from thread pointer;
	 * TLS_GENERAL_DYN: absolute address of double GOT entry
	 * containing tls_index_t struct;
	 */
	int64_t offset;
};
143 
/*
 * Per-process configuration stored in the strobemeta_cfgs map: where each
 * metavariable lives in the target's TLS.
 */
struct strobemeta_cfg {
	/*
	 * index of the map metavariable whose id read_map_var() copies into
	 * strobemeta_payload.req_id
	 */
	int64_t req_meta_idx;
	/* TLS locations of int/str/map metavariables, fed to calc_location() */
	struct strobe_value_loc int_locs[STROBE_MAX_INTS];
	struct strobe_value_loc str_locs[STROBE_MAX_STRS];
	struct strobe_value_loc map_locs[STROBE_MAX_MAPS];
};
150 
/*
 * Description of one map metavariable as emitted in the sample; the strings
 * themselves are packed into strobemeta_payload.payload.
 */
struct strobe_map_descr {
	/* copied from strobe_map_raw.id */
	uint64_t id;
	/* length stored for the map's tag string; 0 if no tag was read */
	int16_t tag_len;
	/*
	 * cnt <0 - map value isn't set;
	 * 0 - map has id set, but no key/value entries
	 */
	int16_t cnt;
	/*
	 * both key_lens[i] and val_lens[i] should be >0 for present key/value
	 * entry
	 */
	uint16_t key_lens[STROBE_MAX_MAP_ENTRIES];
	uint16_t val_lens[STROBE_MAX_MAP_ENTRIES];
};
166 
/*
 * Metadata portion of a sample, filled in by read_strobe_meta(): fixed-size
 * descriptors first, then the variable-length packed strings.
 */
struct strobemeta_payload {
	/* req_id has valid request ID, if req_meta_valid == 1 */
	int64_t req_id;
	uint8_t req_meta_valid;
	/*
	 * mask has Nth bit set to 1, if Nth metavar was present and
	 * successfully read
	 */
	uint64_t int_vals_set_mask;
	int64_t int_vals[STROBE_MAX_INTS];
	/* len is >0 for present values */
	uint16_t str_lens[STROBE_MAX_STRS];
	/* if map_descrs[i].cnt == -1, metavar is not present/set */
	struct strobe_map_descr map_descrs[STROBE_MAX_MAPS];
	/*
	 * payload has compactly packed values of str and map variables in the
	 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0
	 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines
	 * value length
	 */
	char payload[STROBE_MAX_PAYLOAD];
};
189 
/* Record that on_event() builds in sample_heap and emits via samples. */
struct strobelight_bpf_sample {
	/* bpf_ktime_get_ns() timestamp taken in on_event() */
	uint64_t ktime;
	/* filled by bpf_get_current_comm() */
	char comm[TASK_COMM_LEN];
	/* upper 32 bits of bpf_get_current_pid_tgid() */
	pid_t pid;
	/* results of bpf_get_stackid() with and without BPF_F_USER_STACK */
	int user_stack_id;
	int kernel_stack_id;
	/* 1 if read_strobe_meta() returned non-NULL, 0 otherwise */
	int has_meta;
	struct strobemeta_payload metadata;
	/*
	 * makes it possible to pass (<real payload size> + 1) as data size to
	 * perf_submit() to avoid perf_submit's paranoia about passing zero as
	 * size, as it deduces that <real payload size> might be
	 * **theoretically** zero
	 */
	char dummy_safeguard;
};
206 
/* Perf event array that on_event() writes samples to. */
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(max_entries, 32);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
} samples SEC(".maps");
213 
/*
 * Stack trace map used by on_event() when bit STACK_TABLE_EPOCH_SHIFT of the
 * ktime timestamp is clear.
 */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(max_entries, 16);
	__uint(key_size, sizeof(uint32_t));
	__uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
} stacks_0 SEC(".maps");
220 
/*
 * Stack trace map used by on_event() when bit STACK_TABLE_EPOCH_SHIFT of the
 * ktime timestamp is set.
 */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(max_entries, 16);
	__uint(key_size, sizeof(uint32_t));
	__uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
} stacks_1 SEC(".maps");
227 
/*
 * Single-entry per-CPU array; on_event() looks up key 0 and builds the
 * outgoing strobelight_bpf_sample in that slot.
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, uint32_t);
	__type(value, struct strobelight_bpf_sample);
} sample_heap SEC(".maps");
234 
235 struct {
236 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
237 	__uint(max_entries, STROBE_MAX_CFGS);
238 	__type(key, pid_t);
239 	__type(value, struct strobemeta_cfg);
240 } strobemeta_cfgs SEC(".maps");
241 
/* Type for the dtv.  */
/* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */
/*
 * calc_location() indexes an array of these by TLS module number and reads
 * the first pointer-sized word of the selected element.
 */
typedef union dtv {
	size_t counter;
	struct {
		void* val;
		bool is_static;
	} pointer;
} dtv_t;
251 
/* Partial definition for tcbhead_t */
/* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */
/* Only the dtv member is read here (by calc_location(), at tls_base). */
struct tcbhead {
	void* tcb;
	dtv_t* dtv;
};
258 
/*
 * TLS module/offset information for shared library case.
 * For x86-64, this is mapped onto two entries in GOT.
 * For aarch64, this is pointed to by second GOT entry.
 */
struct tls_index {
	/* index into the dtv array; calc_location() treats 0 as invalid */
	uint64_t module;
	/* offset of the variable within that module's TLS block */
	uint64_t offset;
};
268 
269 #ifdef SUBPROGS
270 __noinline
271 #else
272 __always_inline
273 #endif
274 static void *calc_location(struct strobe_value_loc *loc, void *tls_base)
275 {
276 	/*
277 	 * tls_mode value is:
278 	 * - -1 (TLS_NOT_SET), if no metavar is present;
279 	 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS
280 	 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64);
281 	 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS;
282 	 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS;
283 	 * This schema allows to use something like:
284 	 * (tls_mode + 1) * (tls_base + offset)
285 	 * to get NULL for "no metavar" location, or correct pointer for local
286 	 * executable mode without doing extra ifs.
287 	 */
288 	if (loc->tls_mode <= TLS_LOCAL_EXEC) {
289 		/* static executable is simple, we just have offset from
290 		 * tls_base */
291 		void *addr = tls_base + loc->offset;
292 		/* multiply by (tls_mode + 1) to get NULL, if we have no
293 		 * metavar in this slot */
294 		return (void *)((loc->tls_mode + 1) * (int64_t)addr);
295 	}
296 	/*
297 	 * Other modes are more complicated, we need to jump through few hoops.
298 	 *
299 	 * For immediate executable mode (currently supported only for aarch64):
300 	 *  - loc->offset is pointing to a GOT entry containing fixed offset
301 	 *  relative to tls_base;
302 	 *
303 	 * For general dynamic mode:
304 	 *  - loc->offset is pointing to a beginning of double GOT entries;
305 	 *  - (for aarch64 only) second entry points to tls_index_t struct;
306 	 *  - (for x86-64 only) two GOT entries are already tls_index_t;
307 	 *  - tls_index_t->module is used to find start of TLS section in
308 	 *  which variable resides;
309 	 *  - tls_index_t->offset provides offset within that TLS section,
310 	 *  pointing to value of variable.
311 	 */
312 	struct tls_index tls_index;
313 	dtv_t *dtv;
314 	void *tls_ptr;
315 
316 	bpf_probe_read_user(&tls_index, sizeof(struct tls_index),
317 			    (void *)loc->offset);
318 	/* valid module index is always positive */
319 	if (tls_index.module > 0) {
320 		/* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */
321 		bpf_probe_read_user(&dtv, sizeof(dtv),
322 				    &((struct tcbhead *)tls_base)->dtv);
323 		dtv += tls_index.module;
324 	} else {
325 		dtv = NULL;
326 	}
327 	bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv);
328 	/* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
329 	return tls_ptr && tls_ptr != (void *)-1
330 		? tls_ptr + tls_index.offset
331 		: NULL;
332 }
333 
334 #ifdef SUBPROGS
335 __noinline
336 #else
337 __always_inline
338 #endif
339 static void read_int_var(struct strobemeta_cfg *cfg,
340 			 size_t idx, void *tls_base,
341 			 struct strobe_value_generic *value,
342 			 struct strobemeta_payload *data)
343 {
344 	void *location = calc_location(&cfg->int_locs[idx], tls_base);
345 	if (!location)
346 		return;
347 
348 	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
349 	data->int_vals[idx] = value->val;
350 	if (value->header.len)
351 		data->int_vals_set_mask |= (1 << idx);
352 }
353 
354 static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
355 					     size_t idx, void *tls_base,
356 					     struct strobe_value_generic *value,
357 					     struct strobemeta_payload *data,
358 					     void *payload)
359 {
360 	void *location;
361 	uint64_t len;
362 
363 	data->str_lens[idx] = 0;
364 	location = calc_location(&cfg->str_locs[idx], tls_base);
365 	if (!location)
366 		return 0;
367 
368 	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
369 	len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, value->ptr);
370 	/*
371 	 * if bpf_probe_read_user_str returns error (<0), due to casting to
372 	 * unsinged int, it will become big number, so next check is
373 	 * sufficient to check for errors AND prove to BPF verifier, that
374 	 * bpf_probe_read_user_str won't return anything bigger than
375 	 * STROBE_MAX_STR_LEN
376 	 */
377 	if (len > STROBE_MAX_STR_LEN)
378 		return 0;
379 
380 	data->str_lens[idx] = len;
381 	return len;
382 }
383 
384 static __always_inline void *read_map_var(struct strobemeta_cfg *cfg,
385 					  size_t idx, void *tls_base,
386 					  struct strobe_value_generic *value,
387 					  struct strobemeta_payload *data,
388 					  void *payload)
389 {
390 	struct strobe_map_descr* descr = &data->map_descrs[idx];
391 	struct strobe_map_raw map;
392 	void *location;
393 	uint64_t len;
394 	int i;
395 
396 	descr->tag_len = 0; /* presume no tag is set */
397 	descr->cnt = -1; /* presume no value is set */
398 
399 	location = calc_location(&cfg->map_locs[idx], tls_base);
400 	if (!location)
401 		return payload;
402 
403 	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
404 	if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr))
405 		return payload;
406 
407 	descr->id = map.id;
408 	descr->cnt = map.cnt;
409 	if (cfg->req_meta_idx == idx) {
410 		data->req_id = map.id;
411 		data->req_meta_valid = 1;
412 	}
413 
414 	len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, map.tag);
415 	if (len <= STROBE_MAX_STR_LEN) {
416 		descr->tag_len = len;
417 		payload += len;
418 	}
419 
420 #ifdef NO_UNROLL
421 #pragma clang loop unroll(disable)
422 #else
423 #pragma unroll
424 #endif
425 	for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) {
426 		if (i >= map.cnt)
427 			break;
428 
429 		descr->key_lens[i] = 0;
430 		len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN,
431 					      map.entries[i].key);
432 		if (len <= STROBE_MAX_STR_LEN) {
433 			descr->key_lens[i] = len;
434 			payload += len;
435 		}
436 		descr->val_lens[i] = 0;
437 		len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN,
438 					      map.entries[i].val);
439 		if (len <= STROBE_MAX_STR_LEN) {
440 			descr->val_lens[i] = len;
441 			payload += len;
442 		}
443 	}
444 
445 	return payload;
446 }
447 
448 #ifdef USE_BPF_LOOP
/* Selects which read_*_var() helper read_var_callback() dispatches to. */
enum read_type {
	READ_INT_VAR,
	READ_MAP_VAR,
	READ_STR_VAR,
};
454 
/* State shared between read_strobe_meta() and read_var_callback(). */
struct read_var_ctx {
	/* output payload being filled */
	struct strobemeta_payload *data;
	/* TLS base address forwarded to the read_*_var() helpers */
	void *tls_base;
	/* per-process metavariable configuration */
	struct strobemeta_cfg *cfg;
	/* write cursor into data->payload; advanced by the callback */
	void *payload;
	/* value gets mutated */
	struct strobe_value_generic *value;
	/* kind of metavariable the current bpf_loop() pass reads */
	enum read_type type;
};
464 
465 static int read_var_callback(__u32 index, struct read_var_ctx *ctx)
466 {
467 	switch (ctx->type) {
468 	case READ_INT_VAR:
469 		if (index >= STROBE_MAX_INTS)
470 			return 1;
471 		read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data);
472 		break;
473 	case READ_MAP_VAR:
474 		if (index >= STROBE_MAX_MAPS)
475 			return 1;
476 		ctx->payload = read_map_var(ctx->cfg, index, ctx->tls_base,
477 					    ctx->value, ctx->data, ctx->payload);
478 		break;
479 	case READ_STR_VAR:
480 		if (index >= STROBE_MAX_STRS)
481 			return 1;
482 		ctx->payload += read_str_var(ctx->cfg, index, ctx->tls_base,
483 					     ctx->value, ctx->data, ctx->payload);
484 		break;
485 	}
486 	return 0;
487 }
488 #endif /* USE_BPF_LOOP */
489 
490 /*
491  * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
492  * pointer to *right after* payload ends
493  */
494 #ifdef SUBPROGS
495 __noinline
496 #else
497 __always_inline
498 #endif
499 static void *read_strobe_meta(struct task_struct *task,
500 			      struct strobemeta_payload *data)
501 {
502 	pid_t pid = bpf_get_current_pid_tgid() >> 32;
503 	struct strobe_value_generic value = {0};
504 	struct strobemeta_cfg *cfg;
505 	void *tls_base, *payload;
506 
507 	cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid);
508 	if (!cfg)
509 		return NULL;
510 
511 	data->int_vals_set_mask = 0;
512 	data->req_meta_valid = 0;
513 	payload = data->payload;
514 	/*
515 	 * we don't have struct task_struct definition, it should be:
516 	 * tls_base = (void *)task->thread.fsbase;
517 	 */
518 	tls_base = (void *)task;
519 
520 #ifdef USE_BPF_LOOP
521 	struct read_var_ctx ctx = {
522 		.cfg = cfg,
523 		.tls_base = tls_base,
524 		.value = &value,
525 		.data = data,
526 		.payload = payload,
527 	};
528 	int err;
529 
530 	ctx.type = READ_INT_VAR;
531 	err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0);
532 	if (err != STROBE_MAX_INTS)
533 		return NULL;
534 
535 	ctx.type = READ_STR_VAR;
536 	err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0);
537 	if (err != STROBE_MAX_STRS)
538 		return NULL;
539 
540 	ctx.type = READ_MAP_VAR;
541 	err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0);
542 	if (err != STROBE_MAX_MAPS)
543 		return NULL;
544 #else
545 #ifdef NO_UNROLL
546 #pragma clang loop unroll(disable)
547 #else
548 #pragma unroll
549 #endif /* NO_UNROLL */
550 	for (int i = 0; i < STROBE_MAX_INTS; ++i) {
551 		read_int_var(cfg, i, tls_base, &value, data);
552 	}
553 #ifdef NO_UNROLL
554 #pragma clang loop unroll(disable)
555 #else
556 #pragma unroll
557 #endif /* NO_UNROLL */
558 	for (int i = 0; i < STROBE_MAX_STRS; ++i) {
559 		payload += read_str_var(cfg, i, tls_base, &value, data, payload);
560 	}
561 #ifdef NO_UNROLL
562 #pragma clang loop unroll(disable)
563 #else
564 #pragma unroll
565 #endif /* NO_UNROLL */
566 	for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
567 		payload = read_map_var(cfg, i, tls_base, &value, data, payload);
568 	}
569 #endif /* USE_BPF_LOOP */
570 
571 	/*
572 	 * return pointer right after end of payload, so it's possible to
573 	 * calculate exact amount of useful data that needs to be sent
574 	 */
575 	return payload;
576 }
577 
578 SEC("raw_tracepoint/kfree_skb")
579 int on_event(struct pt_regs *ctx) {
580 	pid_t pid =  bpf_get_current_pid_tgid() >> 32;
581 	struct strobelight_bpf_sample* sample;
582 	struct task_struct *task;
583 	uint32_t zero = 0;
584 	uint64_t ktime_ns;
585 	void *sample_end;
586 
587 	sample = bpf_map_lookup_elem(&sample_heap, &zero);
588 	if (!sample)
589 		return 0; /* this will never happen */
590 
591 	sample->pid = pid;
592 	bpf_get_current_comm(&sample->comm, TASK_COMM_LEN);
593 	ktime_ns = bpf_ktime_get_ns();
594 	sample->ktime = ktime_ns;
595 
596 	task = (struct task_struct *)bpf_get_current_task();
597 	sample_end = read_strobe_meta(task, &sample->metadata);
598 	sample->has_meta = sample_end != NULL;
599 	sample_end = sample_end ? : &sample->metadata;
600 
601 	if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) {
602 		sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0);
603 		sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK);
604 	} else {
605 		sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0);
606 		sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK);
607 	}
608 
609 	uint64_t sample_size = sample_end - (void *)sample;
610 	/* should always be true */
611 	if (sample_size < sizeof(struct strobelight_bpf_sample))
612 		bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size);
613 	return 0;
614 }
615 
/* License string emitted into the object's "license" section. */
char _license[] SEC("license") = "GPL";
617