1 // SPDX-License-Identifier: GPL-2.0 2 // Copyright (c) 2019 Facebook 3 4 #include <stdint.h> 5 #include <stddef.h> 6 #include <stdbool.h> 7 #include <linux/bpf.h> 8 #include <linux/ptrace.h> 9 #include <linux/sched.h> 10 #include <linux/types.h> 11 #include "bpf_helpers.h" 12 13 typedef uint32_t pid_t; 14 struct task_struct {}; 15 16 #define TASK_COMM_LEN 16 17 #define PERF_MAX_STACK_DEPTH 127 18 19 #define STROBE_TYPE_INVALID 0 20 #define STROBE_TYPE_INT 1 21 #define STROBE_TYPE_STR 2 22 #define STROBE_TYPE_MAP 3 23 24 #define STACK_TABLE_EPOCH_SHIFT 20 25 #define STROBE_MAX_STR_LEN 1 26 #define STROBE_MAX_CFGS 32 27 #define STROBE_MAX_PAYLOAD \ 28 (STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \ 29 STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN) 30 31 struct strobe_value_header { 32 /* 33 * meaning depends on type: 34 * 1. int: 0, if value not set, 1 otherwise 35 * 2. str: 1 always, whether value is set or not is determined by ptr 36 * 3. map: 1 always, pointer points to additional struct with number 37 * of entries (up to STROBE_MAX_MAP_ENTRIES) 38 */ 39 uint16_t len; 40 /* 41 * _reserved might be used for some future fields/flags, but we always 42 * want to keep strobe_value_header to be 8 bytes, so BPF can read 16 43 * bytes in one go and get both header and value 44 */ 45 uint8_t _reserved[6]; 46 }; 47 48 /* 49 * strobe_value_generic is used from BPF probe only, but needs to be a union 50 * of strobe_value_int/strobe_value_str/strobe_value_map 51 */ 52 struct strobe_value_generic { 53 struct strobe_value_header header; 54 union { 55 int64_t val; 56 void *ptr; 57 }; 58 }; 59 60 struct strobe_value_int { 61 struct strobe_value_header header; 62 int64_t value; 63 }; 64 65 struct strobe_value_str { 66 struct strobe_value_header header; 67 const char* value; 68 }; 69 70 struct strobe_value_map { 71 struct strobe_value_header header; 72 const struct strobe_map_raw* value; 73 }; 74 75 struct strobe_map_entry { 76 const char* key; 77 const char* val; 78 }; 79 80 /* 81 * Map of C-string key/value pairs with fixed maximum capacity. Each map has 82 * corresponding int64 ID, which application can use (or ignore) in whatever 83 * way appropriate. Map is "write-only", there is no way to get data out of 84 * map. Map is intended to be used to provide metadata for profilers and is 85 * not to be used for internal in-app communication. All methods are 86 * thread-safe. 87 */ 88 struct strobe_map_raw { 89 /* 90 * general purpose unique ID that's up to application to decide 91 * whether and how to use; for request metadata use case id is unique 92 * request ID that's used to match metadata with stack traces on 93 * Strobelight backend side 94 */ 95 int64_t id; 96 /* number of used entries in map */ 97 int64_t cnt; 98 /* 99 * having volatile doesn't change anything on BPF side, but clang 100 * emits warnings for passing `volatile const char *` into 101 * bpf_probe_read_str that expects just `const char *` 102 */ 103 const char* tag; 104 /* 105 * key/value entries, each consisting of 2 pointers to key and value 106 * C strings 107 */ 108 struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES]; 109 }; 110 111 /* Following values define supported values of TLS mode */ 112 #define TLS_NOT_SET -1 113 #define TLS_LOCAL_EXEC 0 114 #define TLS_IMM_EXEC 1 115 #define TLS_GENERAL_DYN 2 116 117 /* 118 * structure that universally represents TLS location (both for static 119 * executables and shared libraries) 120 */ 121 struct strobe_value_loc { 122 /* 123 * tls_mode defines what TLS mode was used for particular metavariable: 124 * - -1 (TLS_NOT_SET) - no metavariable; 125 * - 0 (TLS_LOCAL_EXEC) - Local Executable mode; 126 * - 1 (TLS_IMM_EXEC) - Immediate Executable mode; 127 * - 2 (TLS_GENERAL_DYN) - General Dynamic mode; 128 * Local Dynamic mode is not yet supported, because never seen in 129 * practice. Mode defines how offset field is interpreted. See 130 * calc_location() in below for details. 131 */ 132 int64_t tls_mode; 133 /* 134 * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64, 135 * tpidr_el0 for aarch64). 136 * TLS_IMM_EXEC: absolute address of GOT entry containing offset 137 * from thread pointer; 138 * TLS_GENERAL_DYN: absolute addres of double GOT entry 139 * containing tls_index_t struct; 140 */ 141 int64_t offset; 142 }; 143 144 struct strobemeta_cfg { 145 int64_t req_meta_idx; 146 struct strobe_value_loc int_locs[STROBE_MAX_INTS]; 147 struct strobe_value_loc str_locs[STROBE_MAX_STRS]; 148 struct strobe_value_loc map_locs[STROBE_MAX_MAPS]; 149 }; 150 151 struct strobe_map_descr { 152 uint64_t id; 153 int16_t tag_len; 154 /* 155 * cnt <0 - map value isn't set; 156 * 0 - map has id set, but no key/value entries 157 */ 158 int16_t cnt; 159 /* 160 * both key_lens[i] and val_lens[i] should be >0 for present key/value 161 * entry 162 */ 163 uint16_t key_lens[STROBE_MAX_MAP_ENTRIES]; 164 uint16_t val_lens[STROBE_MAX_MAP_ENTRIES]; 165 }; 166 167 struct strobemeta_payload { 168 /* req_id has valid request ID, if req_meta_valid == 1 */ 169 int64_t req_id; 170 uint8_t req_meta_valid; 171 /* 172 * mask has Nth bit set to 1, if Nth metavar was present and 173 * successfully read 174 */ 175 uint64_t int_vals_set_mask; 176 int64_t int_vals[STROBE_MAX_INTS]; 177 /* len is >0 for present values */ 178 uint16_t str_lens[STROBE_MAX_STRS]; 179 /* if map_descrs[i].cnt == -1, metavar is not present/set */ 180 struct strobe_map_descr map_descrs[STROBE_MAX_MAPS]; 181 /* 182 * payload has compactly packed values of str and map variables in the 183 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0 184 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines 185 * value length 186 */ 187 char payload[STROBE_MAX_PAYLOAD]; 188 }; 189 190 struct strobelight_bpf_sample { 191 uint64_t ktime; 192 char comm[TASK_COMM_LEN]; 193 pid_t pid; 194 int user_stack_id; 195 int kernel_stack_id; 196 int has_meta; 197 struct strobemeta_payload metadata; 198 /* 199 * makes it possible to pass (<real payload size> + 1) as data size to 200 * perf_submit() to avoid perf_submit's paranoia about passing zero as 201 * size, as it deduces that <real payload size> might be 202 * **theoretically** zero 203 */ 204 char dummy_safeguard; 205 }; 206 207 struct { 208 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 209 __uint(max_entries, 32); 210 __uint(key_size, sizeof(int)); 211 __uint(value_size, sizeof(int)); 212 } samples SEC(".maps"); 213 214 struct { 215 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 216 __uint(max_entries, 16); 217 __uint(key_size, sizeof(uint32_t)); 218 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH); 219 } stacks_0 SEC(".maps"); 220 221 struct { 222 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 223 __uint(max_entries, 16); 224 __uint(key_size, sizeof(uint32_t)); 225 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH); 226 } stacks_1 SEC(".maps"); 227 228 struct { 229 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 230 __uint(max_entries, 1); 231 __type(key, uint32_t); 232 __type(value, struct strobelight_bpf_sample); 233 } sample_heap SEC(".maps"); 234 235 struct { 236 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 237 __uint(max_entries, STROBE_MAX_CFGS); 238 __type(key, pid_t); 239 __type(value, struct strobemeta_cfg); 240 } strobemeta_cfgs SEC(".maps"); 241 242 /* Type for the dtv. */ 243 /* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */ 244 typedef union dtv { 245 size_t counter; 246 struct { 247 void* val; 248 bool is_static; 249 } pointer; 250 } dtv_t; 251 252 /* Partial definition for tcbhead_t */ 253 /* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */ 254 struct tcbhead { 255 void* tcb; 256 dtv_t* dtv; 257 }; 258 259 /* 260 * TLS module/offset information for shared library case. 261 * For x86-64, this is mapped onto two entries in GOT. 262 * For aarch64, this is pointed to by second GOT entry. 263 */ 264 struct tls_index { 265 uint64_t module; 266 uint64_t offset; 267 }; 268 269 static __always_inline void *calc_location(struct strobe_value_loc *loc, 270 void *tls_base) 271 { 272 /* 273 * tls_mode value is: 274 * - -1 (TLS_NOT_SET), if no metavar is present; 275 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS 276 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64); 277 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS; 278 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS; 279 * This schema allows to use something like: 280 * (tls_mode + 1) * (tls_base + offset) 281 * to get NULL for "no metavar" location, or correct pointer for local 282 * executable mode without doing extra ifs. 283 */ 284 if (loc->tls_mode <= TLS_LOCAL_EXEC) { 285 /* static executable is simple, we just have offset from 286 * tls_base */ 287 void *addr = tls_base + loc->offset; 288 /* multiply by (tls_mode + 1) to get NULL, if we have no 289 * metavar in this slot */ 290 return (void *)((loc->tls_mode + 1) * (int64_t)addr); 291 } 292 /* 293 * Other modes are more complicated, we need to jump through few hoops. 294 * 295 * For immediate executable mode (currently supported only for aarch64): 296 * - loc->offset is pointing to a GOT entry containing fixed offset 297 * relative to tls_base; 298 * 299 * For general dynamic mode: 300 * - loc->offset is pointing to a beginning of double GOT entries; 301 * - (for aarch64 only) second entry points to tls_index_t struct; 302 * - (for x86-64 only) two GOT entries are already tls_index_t; 303 * - tls_index_t->module is used to find start of TLS section in 304 * which variable resides; 305 * - tls_index_t->offset provides offset within that TLS section, 306 * pointing to value of variable. 307 */ 308 struct tls_index tls_index; 309 dtv_t *dtv; 310 void *tls_ptr; 311 312 bpf_probe_read(&tls_index, sizeof(struct tls_index), 313 (void *)loc->offset); 314 /* valid module index is always positive */ 315 if (tls_index.module > 0) { 316 /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */ 317 bpf_probe_read(&dtv, sizeof(dtv), 318 &((struct tcbhead *)tls_base)->dtv); 319 dtv += tls_index.module; 320 } else { 321 dtv = NULL; 322 } 323 bpf_probe_read(&tls_ptr, sizeof(void *), dtv); 324 /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ 325 return tls_ptr && tls_ptr != (void *)-1 326 ? tls_ptr + tls_index.offset 327 : NULL; 328 } 329 330 static __always_inline void read_int_var(struct strobemeta_cfg *cfg, 331 size_t idx, void *tls_base, 332 struct strobe_value_generic *value, 333 struct strobemeta_payload *data) 334 { 335 void *location = calc_location(&cfg->int_locs[idx], tls_base); 336 if (!location) 337 return; 338 339 bpf_probe_read(value, sizeof(struct strobe_value_generic), location); 340 data->int_vals[idx] = value->val; 341 if (value->header.len) 342 data->int_vals_set_mask |= (1 << idx); 343 } 344 345 static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, 346 size_t idx, void *tls_base, 347 struct strobe_value_generic *value, 348 struct strobemeta_payload *data, 349 void *payload) 350 { 351 void *location; 352 uint32_t len; 353 354 data->str_lens[idx] = 0; 355 location = calc_location(&cfg->str_locs[idx], tls_base); 356 if (!location) 357 return 0; 358 359 bpf_probe_read(value, sizeof(struct strobe_value_generic), location); 360 len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, value->ptr); 361 /* 362 * if bpf_probe_read_str returns error (<0), due to casting to 363 * unsinged int, it will become big number, so next check is 364 * sufficient to check for errors AND prove to BPF verifier, that 365 * bpf_probe_read_str won't return anything bigger than 366 * STROBE_MAX_STR_LEN 367 */ 368 if (len > STROBE_MAX_STR_LEN) 369 return 0; 370 371 data->str_lens[idx] = len; 372 return len; 373 } 374 375 static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, 376 size_t idx, void *tls_base, 377 struct strobe_value_generic *value, 378 struct strobemeta_payload *data, 379 void *payload) 380 { 381 struct strobe_map_descr* descr = &data->map_descrs[idx]; 382 struct strobe_map_raw map; 383 void *location; 384 uint32_t len; 385 int i; 386 387 descr->tag_len = 0; /* presume no tag is set */ 388 descr->cnt = -1; /* presume no value is set */ 389 390 location = calc_location(&cfg->map_locs[idx], tls_base); 391 if (!location) 392 return payload; 393 394 bpf_probe_read(value, sizeof(struct strobe_value_generic), location); 395 if (bpf_probe_read(&map, sizeof(struct strobe_map_raw), value->ptr)) 396 return payload; 397 398 descr->id = map.id; 399 descr->cnt = map.cnt; 400 if (cfg->req_meta_idx == idx) { 401 data->req_id = map.id; 402 data->req_meta_valid = 1; 403 } 404 405 len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, map.tag); 406 if (len <= STROBE_MAX_STR_LEN) { 407 descr->tag_len = len; 408 payload += len; 409 } 410 411 #ifdef NO_UNROLL 412 #pragma clang loop unroll(disable) 413 #else 414 #pragma unroll 415 #endif 416 for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { 417 if (i >= map.cnt) 418 break; 419 420 descr->key_lens[i] = 0; 421 len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, 422 map.entries[i].key); 423 if (len <= STROBE_MAX_STR_LEN) { 424 descr->key_lens[i] = len; 425 payload += len; 426 } 427 descr->val_lens[i] = 0; 428 len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, 429 map.entries[i].val); 430 if (len <= STROBE_MAX_STR_LEN) { 431 descr->val_lens[i] = len; 432 payload += len; 433 } 434 } 435 436 return payload; 437 } 438 439 /* 440 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns 441 * pointer to *right after* payload ends 442 */ 443 static __always_inline void *read_strobe_meta(struct task_struct *task, 444 struct strobemeta_payload *data) 445 { 446 pid_t pid = bpf_get_current_pid_tgid() >> 32; 447 struct strobe_value_generic value = {0}; 448 struct strobemeta_cfg *cfg; 449 void *tls_base, *payload; 450 451 cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid); 452 if (!cfg) 453 return NULL; 454 455 data->int_vals_set_mask = 0; 456 data->req_meta_valid = 0; 457 payload = data->payload; 458 /* 459 * we don't have struct task_struct definition, it should be: 460 * tls_base = (void *)task->thread.fsbase; 461 */ 462 tls_base = (void *)task; 463 464 #ifdef NO_UNROLL 465 #pragma clang loop unroll(disable) 466 #else 467 #pragma unroll 468 #endif 469 for (int i = 0; i < STROBE_MAX_INTS; ++i) { 470 read_int_var(cfg, i, tls_base, &value, data); 471 } 472 #ifdef NO_UNROLL 473 #pragma clang loop unroll(disable) 474 #else 475 #pragma unroll 476 #endif 477 for (int i = 0; i < STROBE_MAX_STRS; ++i) { 478 payload += read_str_var(cfg, i, tls_base, &value, data, payload); 479 } 480 #ifdef NO_UNROLL 481 #pragma clang loop unroll(disable) 482 #else 483 #pragma unroll 484 #endif 485 for (int i = 0; i < STROBE_MAX_MAPS; ++i) { 486 payload = read_map_var(cfg, i, tls_base, &value, data, payload); 487 } 488 /* 489 * return pointer right after end of payload, so it's possible to 490 * calculate exact amount of useful data that needs to be sent 491 */ 492 return payload; 493 } 494 495 SEC("raw_tracepoint/kfree_skb") 496 int on_event(struct pt_regs *ctx) { 497 pid_t pid = bpf_get_current_pid_tgid() >> 32; 498 struct strobelight_bpf_sample* sample; 499 struct task_struct *task; 500 uint32_t zero = 0; 501 uint64_t ktime_ns; 502 void *sample_end; 503 504 sample = bpf_map_lookup_elem(&sample_heap, &zero); 505 if (!sample) 506 return 0; /* this will never happen */ 507 508 sample->pid = pid; 509 bpf_get_current_comm(&sample->comm, TASK_COMM_LEN); 510 ktime_ns = bpf_ktime_get_ns(); 511 sample->ktime = ktime_ns; 512 513 task = (struct task_struct *)bpf_get_current_task(); 514 sample_end = read_strobe_meta(task, &sample->metadata); 515 sample->has_meta = sample_end != NULL; 516 sample_end = sample_end ? : &sample->metadata; 517 518 if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) { 519 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0); 520 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK); 521 } else { 522 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0); 523 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK); 524 } 525 526 uint64_t sample_size = sample_end - (void *)sample; 527 /* should always be true */ 528 if (sample_size < sizeof(struct strobelight_bpf_sample)) 529 bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size); 530 return 0; 531 } 532 533 char _license[] SEC("license") = "GPL"; 534