1 // SPDX-License-Identifier: GPL-2.0 2 // Copyright (c) 2019 Facebook 3 4 #include <stdint.h> 5 #include <stddef.h> 6 #include <stdbool.h> 7 #include <linux/bpf.h> 8 #include <linux/ptrace.h> 9 #include <linux/sched.h> 10 #include <linux/types.h> 11 #include <bpf/bpf_helpers.h> 12 13 typedef uint32_t pid_t; 14 struct task_struct {}; 15 16 #define TASK_COMM_LEN 16 17 #define PERF_MAX_STACK_DEPTH 127 18 19 #define STROBE_TYPE_INVALID 0 20 #define STROBE_TYPE_INT 1 21 #define STROBE_TYPE_STR 2 22 #define STROBE_TYPE_MAP 3 23 24 #define STACK_TABLE_EPOCH_SHIFT 20 25 #define STROBE_MAX_STR_LEN 1 26 #define STROBE_MAX_CFGS 32 27 #define STROBE_MAX_PAYLOAD \ 28 (STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \ 29 STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN) 30 31 struct strobe_value_header { 32 /* 33 * meaning depends on type: 34 * 1. int: 0, if value not set, 1 otherwise 35 * 2. str: 1 always, whether value is set or not is determined by ptr 36 * 3. map: 1 always, pointer points to additional struct with number 37 * of entries (up to STROBE_MAX_MAP_ENTRIES) 38 */ 39 uint16_t len; 40 /* 41 * _reserved might be used for some future fields/flags, but we always 42 * want to keep strobe_value_header to be 8 bytes, so BPF can read 16 43 * bytes in one go and get both header and value 44 */ 45 uint8_t _reserved[6]; 46 }; 47 48 /* 49 * strobe_value_generic is used from BPF probe only, but needs to be a union 50 * of strobe_value_int/strobe_value_str/strobe_value_map 51 */ 52 struct strobe_value_generic { 53 struct strobe_value_header header; 54 union { 55 int64_t val; 56 void *ptr; 57 }; 58 }; 59 60 struct strobe_value_int { 61 struct strobe_value_header header; 62 int64_t value; 63 }; 64 65 struct strobe_value_str { 66 struct strobe_value_header header; 67 const char* value; 68 }; 69 70 struct strobe_value_map { 71 struct strobe_value_header header; 72 const struct strobe_map_raw* value; 73 }; 74 75 struct strobe_map_entry { 76 const char* key; 77 const char* val; 78 }; 79 80 /* 81 * Map of C-string key/value pairs with fixed maximum capacity. Each map has 82 * corresponding int64 ID, which application can use (or ignore) in whatever 83 * way appropriate. Map is "write-only", there is no way to get data out of 84 * map. Map is intended to be used to provide metadata for profilers and is 85 * not to be used for internal in-app communication. All methods are 86 * thread-safe. 87 */ 88 struct strobe_map_raw { 89 /* 90 * general purpose unique ID that's up to application to decide 91 * whether and how to use; for request metadata use case id is unique 92 * request ID that's used to match metadata with stack traces on 93 * Strobelight backend side 94 */ 95 int64_t id; 96 /* number of used entries in map */ 97 int64_t cnt; 98 /* 99 * having volatile doesn't change anything on BPF side, but clang 100 * emits warnings for passing `volatile const char *` into 101 * bpf_probe_read_user_str that expects just `const char *` 102 */ 103 const char* tag; 104 /* 105 * key/value entries, each consisting of 2 pointers to key and value 106 * C strings 107 */ 108 struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES]; 109 }; 110 111 /* Following values define supported values of TLS mode */ 112 #define TLS_NOT_SET -1 113 #define TLS_LOCAL_EXEC 0 114 #define TLS_IMM_EXEC 1 115 #define TLS_GENERAL_DYN 2 116 117 /* 118 * structure that universally represents TLS location (both for static 119 * executables and shared libraries) 120 */ 121 struct strobe_value_loc { 122 /* 123 * tls_mode defines what TLS mode was used for particular metavariable: 124 * - -1 (TLS_NOT_SET) - no metavariable; 125 * - 0 (TLS_LOCAL_EXEC) - Local Executable mode; 126 * - 1 (TLS_IMM_EXEC) - Immediate Executable mode; 127 * - 2 (TLS_GENERAL_DYN) - General Dynamic mode; 128 * Local Dynamic mode is not yet supported, because never seen in 129 * practice. Mode defines how offset field is interpreted. See 130 * calc_location() in below for details. 131 */ 132 int64_t tls_mode; 133 /* 134 * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64, 135 * tpidr_el0 for aarch64). 136 * TLS_IMM_EXEC: absolute address of GOT entry containing offset 137 * from thread pointer; 138 * TLS_GENERAL_DYN: absolute address of double GOT entry 139 * containing tls_index_t struct; 140 */ 141 int64_t offset; 142 }; 143 144 struct strobemeta_cfg { 145 int64_t req_meta_idx; 146 struct strobe_value_loc int_locs[STROBE_MAX_INTS]; 147 struct strobe_value_loc str_locs[STROBE_MAX_STRS]; 148 struct strobe_value_loc map_locs[STROBE_MAX_MAPS]; 149 }; 150 151 struct strobe_map_descr { 152 uint64_t id; 153 int16_t tag_len; 154 /* 155 * cnt <0 - map value isn't set; 156 * 0 - map has id set, but no key/value entries 157 */ 158 int16_t cnt; 159 /* 160 * both key_lens[i] and val_lens[i] should be >0 for present key/value 161 * entry 162 */ 163 uint16_t key_lens[STROBE_MAX_MAP_ENTRIES]; 164 uint16_t val_lens[STROBE_MAX_MAP_ENTRIES]; 165 }; 166 167 struct strobemeta_payload { 168 /* req_id has valid request ID, if req_meta_valid == 1 */ 169 int64_t req_id; 170 uint8_t req_meta_valid; 171 /* 172 * mask has Nth bit set to 1, if Nth metavar was present and 173 * successfully read 174 */ 175 uint64_t int_vals_set_mask; 176 int64_t int_vals[STROBE_MAX_INTS]; 177 /* len is >0 for present values */ 178 uint16_t str_lens[STROBE_MAX_STRS]; 179 /* if map_descrs[i].cnt == -1, metavar is not present/set */ 180 struct strobe_map_descr map_descrs[STROBE_MAX_MAPS]; 181 /* 182 * payload has compactly packed values of str and map variables in the 183 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0 184 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines 185 * value length 186 */ 187 char payload[STROBE_MAX_PAYLOAD]; 188 }; 189 190 struct strobelight_bpf_sample { 191 uint64_t ktime; 192 char comm[TASK_COMM_LEN]; 193 pid_t pid; 194 int user_stack_id; 195 int kernel_stack_id; 196 int has_meta; 197 struct strobemeta_payload metadata; 198 /* 199 * makes it possible to pass (<real payload size> + 1) as data size to 200 * perf_submit() to avoid perf_submit's paranoia about passing zero as 201 * size, as it deduces that <real payload size> might be 202 * **theoretically** zero 203 */ 204 char dummy_safeguard; 205 }; 206 207 struct { 208 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 209 __uint(max_entries, 32); 210 __uint(key_size, sizeof(int)); 211 __uint(value_size, sizeof(int)); 212 } samples SEC(".maps"); 213 214 struct { 215 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 216 __uint(max_entries, 16); 217 __uint(key_size, sizeof(uint32_t)); 218 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH); 219 } stacks_0 SEC(".maps"); 220 221 struct { 222 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 223 __uint(max_entries, 16); 224 __uint(key_size, sizeof(uint32_t)); 225 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH); 226 } stacks_1 SEC(".maps"); 227 228 struct { 229 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 230 __uint(max_entries, 1); 231 __type(key, uint32_t); 232 __type(value, struct strobelight_bpf_sample); 233 } sample_heap SEC(".maps"); 234 235 struct { 236 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 237 __uint(max_entries, STROBE_MAX_CFGS); 238 __type(key, pid_t); 239 __type(value, struct strobemeta_cfg); 240 } strobemeta_cfgs SEC(".maps"); 241 242 /* Type for the dtv. */ 243 /* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */ 244 typedef union dtv { 245 size_t counter; 246 struct { 247 void* val; 248 bool is_static; 249 } pointer; 250 } dtv_t; 251 252 /* Partial definition for tcbhead_t */ 253 /* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */ 254 struct tcbhead { 255 void* tcb; 256 dtv_t* dtv; 257 }; 258 259 /* 260 * TLS module/offset information for shared library case. 261 * For x86-64, this is mapped onto two entries in GOT. 262 * For aarch64, this is pointed to by second GOT entry. 263 */ 264 struct tls_index { 265 uint64_t module; 266 uint64_t offset; 267 }; 268 269 #ifdef SUBPROGS 270 __noinline 271 #else 272 __always_inline 273 #endif 274 static void *calc_location(struct strobe_value_loc *loc, void *tls_base) 275 { 276 /* 277 * tls_mode value is: 278 * - -1 (TLS_NOT_SET), if no metavar is present; 279 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS 280 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64); 281 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS; 282 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS; 283 * This schema allows to use something like: 284 * (tls_mode + 1) * (tls_base + offset) 285 * to get NULL for "no metavar" location, or correct pointer for local 286 * executable mode without doing extra ifs. 287 */ 288 if (loc->tls_mode <= TLS_LOCAL_EXEC) { 289 /* static executable is simple, we just have offset from 290 * tls_base */ 291 void *addr = tls_base + loc->offset; 292 /* multiply by (tls_mode + 1) to get NULL, if we have no 293 * metavar in this slot */ 294 return (void *)((loc->tls_mode + 1) * (int64_t)addr); 295 } 296 /* 297 * Other modes are more complicated, we need to jump through few hoops. 298 * 299 * For immediate executable mode (currently supported only for aarch64): 300 * - loc->offset is pointing to a GOT entry containing fixed offset 301 * relative to tls_base; 302 * 303 * For general dynamic mode: 304 * - loc->offset is pointing to a beginning of double GOT entries; 305 * - (for aarch64 only) second entry points to tls_index_t struct; 306 * - (for x86-64 only) two GOT entries are already tls_index_t; 307 * - tls_index_t->module is used to find start of TLS section in 308 * which variable resides; 309 * - tls_index_t->offset provides offset within that TLS section, 310 * pointing to value of variable. 311 */ 312 struct tls_index tls_index; 313 dtv_t *dtv; 314 void *tls_ptr; 315 316 bpf_probe_read_user(&tls_index, sizeof(struct tls_index), 317 (void *)loc->offset); 318 /* valid module index is always positive */ 319 if (tls_index.module > 0) { 320 /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */ 321 bpf_probe_read_user(&dtv, sizeof(dtv), 322 &((struct tcbhead *)tls_base)->dtv); 323 dtv += tls_index.module; 324 } else { 325 dtv = NULL; 326 } 327 bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); 328 /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ 329 return tls_ptr && tls_ptr != (void *)-1 330 ? tls_ptr + tls_index.offset 331 : NULL; 332 } 333 334 #ifdef SUBPROGS 335 __noinline 336 #else 337 __always_inline 338 #endif 339 static void read_int_var(struct strobemeta_cfg *cfg, 340 size_t idx, void *tls_base, 341 struct strobe_value_generic *value, 342 struct strobemeta_payload *data) 343 { 344 void *location = calc_location(&cfg->int_locs[idx], tls_base); 345 if (!location) 346 return; 347 348 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 349 data->int_vals[idx] = value->val; 350 if (value->header.len) 351 data->int_vals_set_mask |= (1 << idx); 352 } 353 354 static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, 355 size_t idx, void *tls_base, 356 struct strobe_value_generic *value, 357 struct strobemeta_payload *data, 358 void *payload) 359 { 360 void *location; 361 uint64_t len; 362 363 data->str_lens[idx] = 0; 364 location = calc_location(&cfg->str_locs[idx], tls_base); 365 if (!location) 366 return 0; 367 368 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 369 len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, value->ptr); 370 /* 371 * if bpf_probe_read_user_str returns error (<0), due to casting to 372 * unsinged int, it will become big number, so next check is 373 * sufficient to check for errors AND prove to BPF verifier, that 374 * bpf_probe_read_user_str won't return anything bigger than 375 * STROBE_MAX_STR_LEN 376 */ 377 if (len > STROBE_MAX_STR_LEN) 378 return 0; 379 380 data->str_lens[idx] = len; 381 return len; 382 } 383 384 static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, 385 size_t idx, void *tls_base, 386 struct strobe_value_generic *value, 387 struct strobemeta_payload *data, 388 void *payload) 389 { 390 struct strobe_map_descr* descr = &data->map_descrs[idx]; 391 struct strobe_map_raw map; 392 void *location; 393 uint64_t len; 394 int i; 395 396 descr->tag_len = 0; /* presume no tag is set */ 397 descr->cnt = -1; /* presume no value is set */ 398 399 location = calc_location(&cfg->map_locs[idx], tls_base); 400 if (!location) 401 return payload; 402 403 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 404 if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr)) 405 return payload; 406 407 descr->id = map.id; 408 descr->cnt = map.cnt; 409 if (cfg->req_meta_idx == idx) { 410 data->req_id = map.id; 411 data->req_meta_valid = 1; 412 } 413 414 len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, map.tag); 415 if (len <= STROBE_MAX_STR_LEN) { 416 descr->tag_len = len; 417 payload += len; 418 } 419 420 #ifdef NO_UNROLL 421 #pragma clang loop unroll(disable) 422 #else 423 #pragma unroll 424 #endif 425 for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { 426 if (i >= map.cnt) 427 break; 428 429 descr->key_lens[i] = 0; 430 len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, 431 map.entries[i].key); 432 if (len <= STROBE_MAX_STR_LEN) { 433 descr->key_lens[i] = len; 434 payload += len; 435 } 436 descr->val_lens[i] = 0; 437 len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, 438 map.entries[i].val); 439 if (len <= STROBE_MAX_STR_LEN) { 440 descr->val_lens[i] = len; 441 payload += len; 442 } 443 } 444 445 return payload; 446 } 447 448 #ifdef USE_BPF_LOOP 449 enum read_type { 450 READ_INT_VAR, 451 READ_MAP_VAR, 452 READ_STR_VAR, 453 }; 454 455 struct read_var_ctx { 456 struct strobemeta_payload *data; 457 void *tls_base; 458 struct strobemeta_cfg *cfg; 459 void *payload; 460 /* value gets mutated */ 461 struct strobe_value_generic *value; 462 enum read_type type; 463 }; 464 465 static int read_var_callback(__u32 index, struct read_var_ctx *ctx) 466 { 467 switch (ctx->type) { 468 case READ_INT_VAR: 469 if (index >= STROBE_MAX_INTS) 470 return 1; 471 read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data); 472 break; 473 case READ_MAP_VAR: 474 if (index >= STROBE_MAX_MAPS) 475 return 1; 476 ctx->payload = read_map_var(ctx->cfg, index, ctx->tls_base, 477 ctx->value, ctx->data, ctx->payload); 478 break; 479 case READ_STR_VAR: 480 if (index >= STROBE_MAX_STRS) 481 return 1; 482 ctx->payload += read_str_var(ctx->cfg, index, ctx->tls_base, 483 ctx->value, ctx->data, ctx->payload); 484 break; 485 } 486 return 0; 487 } 488 #endif /* USE_BPF_LOOP */ 489 490 /* 491 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns 492 * pointer to *right after* payload ends 493 */ 494 #ifdef SUBPROGS 495 __noinline 496 #else 497 __always_inline 498 #endif 499 static void *read_strobe_meta(struct task_struct *task, 500 struct strobemeta_payload *data) 501 { 502 pid_t pid = bpf_get_current_pid_tgid() >> 32; 503 struct strobe_value_generic value = {0}; 504 struct strobemeta_cfg *cfg; 505 void *tls_base, *payload; 506 507 cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid); 508 if (!cfg) 509 return NULL; 510 511 data->int_vals_set_mask = 0; 512 data->req_meta_valid = 0; 513 payload = data->payload; 514 /* 515 * we don't have struct task_struct definition, it should be: 516 * tls_base = (void *)task->thread.fsbase; 517 */ 518 tls_base = (void *)task; 519 520 #ifdef USE_BPF_LOOP 521 struct read_var_ctx ctx = { 522 .cfg = cfg, 523 .tls_base = tls_base, 524 .value = &value, 525 .data = data, 526 .payload = payload, 527 }; 528 int err; 529 530 ctx.type = READ_INT_VAR; 531 err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0); 532 if (err != STROBE_MAX_INTS) 533 return NULL; 534 535 ctx.type = READ_STR_VAR; 536 err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0); 537 if (err != STROBE_MAX_STRS) 538 return NULL; 539 540 ctx.type = READ_MAP_VAR; 541 err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0); 542 if (err != STROBE_MAX_MAPS) 543 return NULL; 544 #else 545 #ifdef NO_UNROLL 546 #pragma clang loop unroll(disable) 547 #else 548 #pragma unroll 549 #endif /* NO_UNROLL */ 550 for (int i = 0; i < STROBE_MAX_INTS; ++i) { 551 read_int_var(cfg, i, tls_base, &value, data); 552 } 553 #ifdef NO_UNROLL 554 #pragma clang loop unroll(disable) 555 #else 556 #pragma unroll 557 #endif /* NO_UNROLL */ 558 for (int i = 0; i < STROBE_MAX_STRS; ++i) { 559 payload += read_str_var(cfg, i, tls_base, &value, data, payload); 560 } 561 #ifdef NO_UNROLL 562 #pragma clang loop unroll(disable) 563 #else 564 #pragma unroll 565 #endif /* NO_UNROLL */ 566 for (int i = 0; i < STROBE_MAX_MAPS; ++i) { 567 payload = read_map_var(cfg, i, tls_base, &value, data, payload); 568 } 569 #endif /* USE_BPF_LOOP */ 570 571 /* 572 * return pointer right after end of payload, so it's possible to 573 * calculate exact amount of useful data that needs to be sent 574 */ 575 return payload; 576 } 577 578 SEC("raw_tracepoint/kfree_skb") 579 int on_event(struct pt_regs *ctx) { 580 pid_t pid = bpf_get_current_pid_tgid() >> 32; 581 struct strobelight_bpf_sample* sample; 582 struct task_struct *task; 583 uint32_t zero = 0; 584 uint64_t ktime_ns; 585 void *sample_end; 586 587 sample = bpf_map_lookup_elem(&sample_heap, &zero); 588 if (!sample) 589 return 0; /* this will never happen */ 590 591 sample->pid = pid; 592 bpf_get_current_comm(&sample->comm, TASK_COMM_LEN); 593 ktime_ns = bpf_ktime_get_ns(); 594 sample->ktime = ktime_ns; 595 596 task = (struct task_struct *)bpf_get_current_task(); 597 sample_end = read_strobe_meta(task, &sample->metadata); 598 sample->has_meta = sample_end != NULL; 599 sample_end = sample_end ? : &sample->metadata; 600 601 if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) { 602 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0); 603 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK); 604 } else { 605 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0); 606 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK); 607 } 608 609 uint64_t sample_size = sample_end - (void *)sample; 610 /* should always be true */ 611 if (sample_size < sizeof(struct strobelight_bpf_sample)) 612 bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size); 613 return 0; 614 } 615 616 char _license[] SEC("license") = "GPL"; 617