1 // SPDX-License-Identifier: GPL-2.0 2 // Copyright (c) 2019 Facebook 3 4 #include <stdint.h> 5 #include <stddef.h> 6 #include <stdbool.h> 7 #include <linux/bpf.h> 8 #include <linux/ptrace.h> 9 #include <linux/sched.h> 10 #include <linux/types.h> 11 #include <bpf/bpf_helpers.h> 12 13 typedef uint32_t pid_t; 14 struct task_struct {}; 15 16 #define TASK_COMM_LEN 16 17 #define PERF_MAX_STACK_DEPTH 127 18 19 #define STROBE_TYPE_INVALID 0 20 #define STROBE_TYPE_INT 1 21 #define STROBE_TYPE_STR 2 22 #define STROBE_TYPE_MAP 3 23 24 #define STACK_TABLE_EPOCH_SHIFT 20 25 #define STROBE_MAX_STR_LEN 1 26 #define STROBE_MAX_CFGS 32 27 #define READ_MAP_VAR_PAYLOAD_CAP \ 28 ((1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN) 29 #define STROBE_MAX_PAYLOAD \ 30 (STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \ 31 STROBE_MAX_MAPS * READ_MAP_VAR_PAYLOAD_CAP) 32 33 struct strobe_value_header { 34 /* 35 * meaning depends on type: 36 * 1. int: 0, if value not set, 1 otherwise 37 * 2. str: 1 always, whether value is set or not is determined by ptr 38 * 3. map: 1 always, pointer points to additional struct with number 39 * of entries (up to STROBE_MAX_MAP_ENTRIES) 40 */ 41 uint16_t len; 42 /* 43 * _reserved might be used for some future fields/flags, but we always 44 * want to keep strobe_value_header to be 8 bytes, so BPF can read 16 45 * bytes in one go and get both header and value 46 */ 47 uint8_t _reserved[6]; 48 }; 49 50 /* 51 * strobe_value_generic is used from BPF probe only, but needs to be a union 52 * of strobe_value_int/strobe_value_str/strobe_value_map 53 */ 54 struct strobe_value_generic { 55 struct strobe_value_header header; 56 union { 57 int64_t val; 58 void *ptr; 59 }; 60 }; 61 62 struct strobe_value_int { 63 struct strobe_value_header header; 64 int64_t value; 65 }; 66 67 struct strobe_value_str { 68 struct strobe_value_header header; 69 const char* value; 70 }; 71 72 struct strobe_value_map { 73 struct strobe_value_header header; 74 const struct strobe_map_raw* value; 75 }; 76 77 struct strobe_map_entry { 78 const char* key; 79 const char* val; 80 }; 81 82 /* 83 * Map of C-string key/value pairs with fixed maximum capacity. Each map has 84 * corresponding int64 ID, which application can use (or ignore) in whatever 85 * way appropriate. Map is "write-only", there is no way to get data out of 86 * map. Map is intended to be used to provide metadata for profilers and is 87 * not to be used for internal in-app communication. All methods are 88 * thread-safe. 89 */ 90 struct strobe_map_raw { 91 /* 92 * general purpose unique ID that's up to application to decide 93 * whether and how to use; for request metadata use case id is unique 94 * request ID that's used to match metadata with stack traces on 95 * Strobelight backend side 96 */ 97 int64_t id; 98 /* number of used entries in map */ 99 int64_t cnt; 100 /* 101 * having volatile doesn't change anything on BPF side, but clang 102 * emits warnings for passing `volatile const char *` into 103 * bpf_probe_read_user_str that expects just `const char *` 104 */ 105 const char* tag; 106 /* 107 * key/value entries, each consisting of 2 pointers to key and value 108 * C strings 109 */ 110 struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES]; 111 }; 112 113 /* Following values define supported values of TLS mode */ 114 #define TLS_NOT_SET -1 115 #define TLS_LOCAL_EXEC 0 116 #define TLS_IMM_EXEC 1 117 #define TLS_GENERAL_DYN 2 118 119 /* 120 * structure that universally represents TLS location (both for static 121 * executables and shared libraries) 122 */ 123 struct strobe_value_loc { 124 /* 125 * tls_mode defines what TLS mode was used for particular metavariable: 126 * - -1 (TLS_NOT_SET) - no metavariable; 127 * - 0 (TLS_LOCAL_EXEC) - Local Executable mode; 128 * - 1 (TLS_IMM_EXEC) - Immediate Executable mode; 129 * - 2 (TLS_GENERAL_DYN) - General Dynamic mode; 130 * Local Dynamic mode is not yet supported, because never seen in 131 * practice. Mode defines how offset field is interpreted. See 132 * calc_location() in below for details. 133 */ 134 int64_t tls_mode; 135 /* 136 * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64, 137 * tpidr_el0 for aarch64). 138 * TLS_IMM_EXEC: absolute address of GOT entry containing offset 139 * from thread pointer; 140 * TLS_GENERAL_DYN: absolute address of double GOT entry 141 * containing tls_index_t struct; 142 */ 143 int64_t offset; 144 }; 145 146 struct strobemeta_cfg { 147 int64_t req_meta_idx; 148 struct strobe_value_loc int_locs[STROBE_MAX_INTS]; 149 struct strobe_value_loc str_locs[STROBE_MAX_STRS]; 150 struct strobe_value_loc map_locs[STROBE_MAX_MAPS]; 151 }; 152 153 struct strobe_map_descr { 154 uint64_t id; 155 int16_t tag_len; 156 /* 157 * cnt <0 - map value isn't set; 158 * 0 - map has id set, but no key/value entries 159 */ 160 int16_t cnt; 161 /* 162 * both key_lens[i] and val_lens[i] should be >0 for present key/value 163 * entry 164 */ 165 uint16_t key_lens[STROBE_MAX_MAP_ENTRIES]; 166 uint16_t val_lens[STROBE_MAX_MAP_ENTRIES]; 167 }; 168 169 struct strobemeta_payload { 170 /* req_id has valid request ID, if req_meta_valid == 1 */ 171 int64_t req_id; 172 uint8_t req_meta_valid; 173 /* 174 * mask has Nth bit set to 1, if Nth metavar was present and 175 * successfully read 176 */ 177 uint64_t int_vals_set_mask; 178 int64_t int_vals[STROBE_MAX_INTS]; 179 /* len is >0 for present values */ 180 uint16_t str_lens[STROBE_MAX_STRS]; 181 /* if map_descrs[i].cnt == -1, metavar is not present/set */ 182 struct strobe_map_descr map_descrs[STROBE_MAX_MAPS]; 183 /* 184 * payload has compactly packed values of str and map variables in the 185 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0 186 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines 187 * value length 188 */ 189 char payload[STROBE_MAX_PAYLOAD]; 190 }; 191 192 struct strobelight_bpf_sample { 193 uint64_t ktime; 194 char comm[TASK_COMM_LEN]; 195 pid_t pid; 196 int user_stack_id; 197 int kernel_stack_id; 198 int has_meta; 199 struct strobemeta_payload metadata; 200 /* 201 * makes it possible to pass (<real payload size> + 1) as data size to 202 * perf_submit() to avoid perf_submit's paranoia about passing zero as 203 * size, as it deduces that <real payload size> might be 204 * **theoretically** zero 205 */ 206 char dummy_safeguard; 207 }; 208 209 struct { 210 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 211 __uint(max_entries, 32); 212 __uint(key_size, sizeof(int)); 213 __uint(value_size, sizeof(int)); 214 } samples SEC(".maps"); 215 216 struct { 217 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 218 __uint(max_entries, 16); 219 __uint(key_size, sizeof(uint32_t)); 220 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH); 221 } stacks_0 SEC(".maps"); 222 223 struct { 224 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 225 __uint(max_entries, 16); 226 __uint(key_size, sizeof(uint32_t)); 227 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH); 228 } stacks_1 SEC(".maps"); 229 230 struct { 231 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 232 __uint(max_entries, 1); 233 __type(key, uint32_t); 234 __type(value, struct strobelight_bpf_sample); 235 } sample_heap SEC(".maps"); 236 237 struct { 238 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 239 __uint(max_entries, STROBE_MAX_CFGS); 240 __type(key, pid_t); 241 __type(value, struct strobemeta_cfg); 242 } strobemeta_cfgs SEC(".maps"); 243 244 /* Type for the dtv. */ 245 /* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */ 246 typedef union dtv { 247 size_t counter; 248 struct { 249 void* val; 250 bool is_static; 251 } pointer; 252 } dtv_t; 253 254 /* Partial definition for tcbhead_t */ 255 /* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */ 256 struct tcbhead { 257 void* tcb; 258 dtv_t* dtv; 259 }; 260 261 /* 262 * TLS module/offset information for shared library case. 263 * For x86-64, this is mapped onto two entries in GOT. 264 * For aarch64, this is pointed to by second GOT entry. 265 */ 266 struct tls_index { 267 uint64_t module; 268 uint64_t offset; 269 }; 270 271 #ifdef SUBPROGS 272 __noinline 273 #else 274 __always_inline 275 #endif 276 static void *calc_location(struct strobe_value_loc *loc, void *tls_base) 277 { 278 /* 279 * tls_mode value is: 280 * - -1 (TLS_NOT_SET), if no metavar is present; 281 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS 282 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64); 283 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS; 284 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS; 285 * This schema allows to use something like: 286 * (tls_mode + 1) * (tls_base + offset) 287 * to get NULL for "no metavar" location, or correct pointer for local 288 * executable mode without doing extra ifs. 289 */ 290 if (loc->tls_mode <= TLS_LOCAL_EXEC) { 291 /* static executable is simple, we just have offset from 292 * tls_base */ 293 void *addr = tls_base + loc->offset; 294 /* multiply by (tls_mode + 1) to get NULL, if we have no 295 * metavar in this slot */ 296 return (void *)((loc->tls_mode + 1) * (int64_t)addr); 297 } 298 /* 299 * Other modes are more complicated, we need to jump through few hoops. 300 * 301 * For immediate executable mode (currently supported only for aarch64): 302 * - loc->offset is pointing to a GOT entry containing fixed offset 303 * relative to tls_base; 304 * 305 * For general dynamic mode: 306 * - loc->offset is pointing to a beginning of double GOT entries; 307 * - (for aarch64 only) second entry points to tls_index_t struct; 308 * - (for x86-64 only) two GOT entries are already tls_index_t; 309 * - tls_index_t->module is used to find start of TLS section in 310 * which variable resides; 311 * - tls_index_t->offset provides offset within that TLS section, 312 * pointing to value of variable. 313 */ 314 struct tls_index tls_index; 315 dtv_t *dtv; 316 void *tls_ptr; 317 318 bpf_probe_read_user(&tls_index, sizeof(struct tls_index), 319 (void *)loc->offset); 320 /* valid module index is always positive */ 321 if (tls_index.module > 0) { 322 /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */ 323 bpf_probe_read_user(&dtv, sizeof(dtv), 324 &((struct tcbhead *)tls_base)->dtv); 325 dtv += tls_index.module; 326 } else { 327 dtv = NULL; 328 } 329 bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); 330 /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ 331 return tls_ptr && tls_ptr != (void *)-1 332 ? tls_ptr + tls_index.offset 333 : NULL; 334 } 335 336 #ifdef SUBPROGS 337 __noinline 338 #else 339 __always_inline 340 #endif 341 static void read_int_var(struct strobemeta_cfg *cfg, 342 size_t idx, void *tls_base, 343 struct strobe_value_generic *value, 344 struct strobemeta_payload *data) 345 { 346 void *location = calc_location(&cfg->int_locs[idx], tls_base); 347 if (!location) 348 return; 349 350 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 351 data->int_vals[idx] = value->val; 352 if (value->header.len) 353 data->int_vals_set_mask |= (1 << idx); 354 } 355 356 static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, 357 size_t idx, void *tls_base, 358 struct strobe_value_generic *value, 359 struct strobemeta_payload *data, 360 size_t off) 361 { 362 void *location; 363 uint64_t len; 364 365 data->str_lens[idx] = 0; 366 location = calc_location(&cfg->str_locs[idx], tls_base); 367 if (!location) 368 return 0; 369 370 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 371 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr); 372 /* 373 * if bpf_probe_read_user_str returns error (<0), due to casting to 374 * unsinged int, it will become big number, so next check is 375 * sufficient to check for errors AND prove to BPF verifier, that 376 * bpf_probe_read_user_str won't return anything bigger than 377 * STROBE_MAX_STR_LEN 378 */ 379 if (len > STROBE_MAX_STR_LEN) 380 return 0; 381 382 data->str_lens[idx] = len; 383 return off + len; 384 } 385 386 static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg, 387 size_t idx, void *tls_base, 388 struct strobe_value_generic *value, 389 struct strobemeta_payload *data, 390 size_t off) 391 { 392 struct strobe_map_descr* descr = &data->map_descrs[idx]; 393 struct strobe_map_raw map; 394 void *location; 395 uint64_t len; 396 397 descr->tag_len = 0; /* presume no tag is set */ 398 descr->cnt = -1; /* presume no value is set */ 399 400 location = calc_location(&cfg->map_locs[idx], tls_base); 401 if (!location) 402 return off; 403 404 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 405 if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr)) 406 return off; 407 408 descr->id = map.id; 409 descr->cnt = map.cnt; 410 if (cfg->req_meta_idx == idx) { 411 data->req_id = map.id; 412 data->req_meta_valid = 1; 413 } 414 415 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.tag); 416 if (len <= STROBE_MAX_STR_LEN) { 417 descr->tag_len = len; 418 off += len; 419 } 420 421 #ifdef NO_UNROLL 422 #pragma clang loop unroll(disable) 423 #else 424 #pragma unroll 425 #endif 426 for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { 427 if (i >= map.cnt) 428 break; 429 430 descr->key_lens[i] = 0; 431 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, 432 map.entries[i].key); 433 if (len <= STROBE_MAX_STR_LEN) { 434 descr->key_lens[i] = len; 435 off += len; 436 } 437 descr->val_lens[i] = 0; 438 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, 439 map.entries[i].val); 440 if (len <= STROBE_MAX_STR_LEN) { 441 descr->val_lens[i] = len; 442 off += len; 443 } 444 } 445 446 return off; 447 } 448 449 #ifdef USE_BPF_LOOP 450 enum read_type { 451 READ_INT_VAR, 452 READ_MAP_VAR, 453 READ_STR_VAR, 454 }; 455 456 struct read_var_ctx { 457 struct strobemeta_payload *data; 458 void *tls_base; 459 struct strobemeta_cfg *cfg; 460 size_t payload_off; 461 /* value gets mutated */ 462 struct strobe_value_generic *value; 463 enum read_type type; 464 }; 465 466 static int read_var_callback(__u64 index, struct read_var_ctx *ctx) 467 { 468 /* lose precision info for ctx->payload_off, verifier won't track 469 * double xor, barrier_var() is needed to force clang keep both xors. 470 */ 471 ctx->payload_off ^= index; 472 barrier_var(ctx->payload_off); 473 ctx->payload_off ^= index; 474 switch (ctx->type) { 475 case READ_INT_VAR: 476 if (index >= STROBE_MAX_INTS) 477 return 1; 478 read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data); 479 break; 480 case READ_MAP_VAR: 481 if (index >= STROBE_MAX_MAPS) 482 return 1; 483 if (ctx->payload_off > sizeof(ctx->data->payload) - READ_MAP_VAR_PAYLOAD_CAP) 484 return 1; 485 ctx->payload_off = read_map_var(ctx->cfg, index, ctx->tls_base, 486 ctx->value, ctx->data, ctx->payload_off); 487 break; 488 case READ_STR_VAR: 489 if (index >= STROBE_MAX_STRS) 490 return 1; 491 if (ctx->payload_off > sizeof(ctx->data->payload) - STROBE_MAX_STR_LEN) 492 return 1; 493 ctx->payload_off = read_str_var(ctx->cfg, index, ctx->tls_base, 494 ctx->value, ctx->data, ctx->payload_off); 495 break; 496 } 497 return 0; 498 } 499 #endif /* USE_BPF_LOOP */ 500 501 /* 502 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns 503 * pointer to *right after* payload ends 504 */ 505 #ifdef SUBPROGS 506 __noinline 507 #else 508 __always_inline 509 #endif 510 static void *read_strobe_meta(struct task_struct *task, 511 struct strobemeta_payload *data) 512 { 513 pid_t pid = bpf_get_current_pid_tgid() >> 32; 514 struct strobe_value_generic value = {0}; 515 struct strobemeta_cfg *cfg; 516 size_t payload_off; 517 void *tls_base; 518 519 cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid); 520 if (!cfg) 521 return NULL; 522 523 data->int_vals_set_mask = 0; 524 data->req_meta_valid = 0; 525 payload_off = 0; 526 /* 527 * we don't have struct task_struct definition, it should be: 528 * tls_base = (void *)task->thread.fsbase; 529 */ 530 tls_base = (void *)task; 531 532 #ifdef USE_BPF_LOOP 533 struct read_var_ctx ctx = { 534 .cfg = cfg, 535 .tls_base = tls_base, 536 .value = &value, 537 .data = data, 538 .payload_off = 0, 539 }; 540 int err; 541 542 ctx.type = READ_INT_VAR; 543 err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0); 544 if (err != STROBE_MAX_INTS) 545 return NULL; 546 547 ctx.type = READ_STR_VAR; 548 err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0); 549 if (err != STROBE_MAX_STRS) 550 return NULL; 551 552 ctx.type = READ_MAP_VAR; 553 err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0); 554 if (err != STROBE_MAX_MAPS) 555 return NULL; 556 557 payload_off = ctx.payload_off; 558 /* this should not really happen, here only to satisfy verifer */ 559 if (payload_off > sizeof(data->payload)) 560 payload_off = sizeof(data->payload); 561 #else 562 #ifdef NO_UNROLL 563 #pragma clang loop unroll(disable) 564 #else 565 #pragma unroll 566 #endif /* NO_UNROLL */ 567 for (int i = 0; i < STROBE_MAX_INTS; ++i) { 568 read_int_var(cfg, i, tls_base, &value, data); 569 } 570 #ifdef NO_UNROLL 571 #pragma clang loop unroll(disable) 572 #else 573 #pragma unroll 574 #endif /* NO_UNROLL */ 575 for (int i = 0; i < STROBE_MAX_STRS; ++i) { 576 payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off); 577 } 578 #ifdef NO_UNROLL 579 #pragma clang loop unroll(disable) 580 #else 581 #pragma unroll 582 #endif /* NO_UNROLL */ 583 for (int i = 0; i < STROBE_MAX_MAPS; ++i) { 584 payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off); 585 } 586 #endif /* USE_BPF_LOOP */ 587 588 /* 589 * return pointer right after end of payload, so it's possible to 590 * calculate exact amount of useful data that needs to be sent 591 */ 592 return &data->payload[payload_off]; 593 } 594 595 SEC("raw_tracepoint/kfree_skb") 596 int on_event(struct pt_regs *ctx) { 597 pid_t pid = bpf_get_current_pid_tgid() >> 32; 598 struct strobelight_bpf_sample* sample; 599 struct task_struct *task; 600 uint32_t zero = 0; 601 uint64_t ktime_ns; 602 void *sample_end; 603 604 sample = bpf_map_lookup_elem(&sample_heap, &zero); 605 if (!sample) 606 return 0; /* this will never happen */ 607 608 sample->pid = pid; 609 bpf_get_current_comm(&sample->comm, TASK_COMM_LEN); 610 ktime_ns = bpf_ktime_get_ns(); 611 sample->ktime = ktime_ns; 612 613 task = (struct task_struct *)bpf_get_current_task(); 614 sample_end = read_strobe_meta(task, &sample->metadata); 615 sample->has_meta = sample_end != NULL; 616 sample_end = sample_end ? : &sample->metadata; 617 618 if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) { 619 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0); 620 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK); 621 } else { 622 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0); 623 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK); 624 } 625 626 uint64_t sample_size = sample_end - (void *)sample; 627 /* should always be true */ 628 if (sample_size < sizeof(struct strobelight_bpf_sample)) 629 bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size); 630 return 0; 631 } 632 633 char _license[] SEC("license") = "GPL"; 634