1 // SPDX-License-Identifier: GPL-2.0 2 // Copyright (c) 2019 Facebook 3 4 #include <stdint.h> 5 #include <stddef.h> 6 #include <stdbool.h> 7 #include <linux/bpf.h> 8 #include <linux/ptrace.h> 9 #include <linux/sched.h> 10 #include <linux/types.h> 11 #include <bpf/bpf_helpers.h> 12 13 typedef uint32_t pid_t; 14 struct task_struct {}; 15 16 #define TASK_COMM_LEN 16 17 #define PERF_MAX_STACK_DEPTH 127 18 19 #define STROBE_TYPE_INVALID 0 20 #define STROBE_TYPE_INT 1 21 #define STROBE_TYPE_STR 2 22 #define STROBE_TYPE_MAP 3 23 24 #define STACK_TABLE_EPOCH_SHIFT 20 25 #define STROBE_MAX_STR_LEN 1 26 #define STROBE_MAX_CFGS 32 27 #define STROBE_MAX_PAYLOAD \ 28 (STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \ 29 STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN) 30 31 struct strobe_value_header { 32 /* 33 * meaning depends on type: 34 * 1. int: 0, if value not set, 1 otherwise 35 * 2. str: 1 always, whether value is set or not is determined by ptr 36 * 3. map: 1 always, pointer points to additional struct with number 37 * of entries (up to STROBE_MAX_MAP_ENTRIES) 38 */ 39 uint16_t len; 40 /* 41 * _reserved might be used for some future fields/flags, but we always 42 * want to keep strobe_value_header to be 8 bytes, so BPF can read 16 43 * bytes in one go and get both header and value 44 */ 45 uint8_t _reserved[6]; 46 }; 47 48 /* 49 * strobe_value_generic is used from BPF probe only, but needs to be a union 50 * of strobe_value_int/strobe_value_str/strobe_value_map 51 */ 52 struct strobe_value_generic { 53 struct strobe_value_header header; 54 union { 55 int64_t val; 56 void *ptr; 57 }; 58 }; 59 60 struct strobe_value_int { 61 struct strobe_value_header header; 62 int64_t value; 63 }; 64 65 struct strobe_value_str { 66 struct strobe_value_header header; 67 const char* value; 68 }; 69 70 struct strobe_value_map { 71 struct strobe_value_header header; 72 const struct strobe_map_raw* value; 73 }; 74 75 struct strobe_map_entry { 76 const char* key; 77 const char* val; 78 }; 79 80 /* 81 * Map of C-string key/value pairs with fixed maximum capacity. Each map has 82 * corresponding int64 ID, which application can use (or ignore) in whatever 83 * way appropriate. Map is "write-only", there is no way to get data out of 84 * map. Map is intended to be used to provide metadata for profilers and is 85 * not to be used for internal in-app communication. All methods are 86 * thread-safe. 87 */ 88 struct strobe_map_raw { 89 /* 90 * general purpose unique ID that's up to application to decide 91 * whether and how to use; for request metadata use case id is unique 92 * request ID that's used to match metadata with stack traces on 93 * Strobelight backend side 94 */ 95 int64_t id; 96 /* number of used entries in map */ 97 int64_t cnt; 98 /* 99 * having volatile doesn't change anything on BPF side, but clang 100 * emits warnings for passing `volatile const char *` into 101 * bpf_probe_read_user_str that expects just `const char *` 102 */ 103 const char* tag; 104 /* 105 * key/value entries, each consisting of 2 pointers to key and value 106 * C strings 107 */ 108 struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES]; 109 }; 110 111 /* Following values define supported values of TLS mode */ 112 #define TLS_NOT_SET -1 113 #define TLS_LOCAL_EXEC 0 114 #define TLS_IMM_EXEC 1 115 #define TLS_GENERAL_DYN 2 116 117 /* 118 * structure that universally represents TLS location (both for static 119 * executables and shared libraries) 120 */ 121 struct strobe_value_loc { 122 /* 123 * tls_mode defines what TLS mode was used for particular metavariable: 124 * - -1 (TLS_NOT_SET) - no metavariable; 125 * - 0 (TLS_LOCAL_EXEC) - Local Executable mode; 126 * - 1 (TLS_IMM_EXEC) - Immediate Executable mode; 127 * - 2 (TLS_GENERAL_DYN) - General Dynamic mode; 128 * Local Dynamic mode is not yet supported, because never seen in 129 * practice. Mode defines how offset field is interpreted. See 130 * calc_location() in below for details. 131 */ 132 int64_t tls_mode; 133 /* 134 * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64, 135 * tpidr_el0 for aarch64). 136 * TLS_IMM_EXEC: absolute address of GOT entry containing offset 137 * from thread pointer; 138 * TLS_GENERAL_DYN: absolute address of double GOT entry 139 * containing tls_index_t struct; 140 */ 141 int64_t offset; 142 }; 143 144 struct strobemeta_cfg { 145 int64_t req_meta_idx; 146 struct strobe_value_loc int_locs[STROBE_MAX_INTS]; 147 struct strobe_value_loc str_locs[STROBE_MAX_STRS]; 148 struct strobe_value_loc map_locs[STROBE_MAX_MAPS]; 149 }; 150 151 struct strobe_map_descr { 152 uint64_t id; 153 int16_t tag_len; 154 /* 155 * cnt <0 - map value isn't set; 156 * 0 - map has id set, but no key/value entries 157 */ 158 int16_t cnt; 159 /* 160 * both key_lens[i] and val_lens[i] should be >0 for present key/value 161 * entry 162 */ 163 uint16_t key_lens[STROBE_MAX_MAP_ENTRIES]; 164 uint16_t val_lens[STROBE_MAX_MAP_ENTRIES]; 165 }; 166 167 struct strobemeta_payload { 168 /* req_id has valid request ID, if req_meta_valid == 1 */ 169 int64_t req_id; 170 uint8_t req_meta_valid; 171 /* 172 * mask has Nth bit set to 1, if Nth metavar was present and 173 * successfully read 174 */ 175 uint64_t int_vals_set_mask; 176 int64_t int_vals[STROBE_MAX_INTS]; 177 /* len is >0 for present values */ 178 uint16_t str_lens[STROBE_MAX_STRS]; 179 /* if map_descrs[i].cnt == -1, metavar is not present/set */ 180 struct strobe_map_descr map_descrs[STROBE_MAX_MAPS]; 181 /* 182 * payload has compactly packed values of str and map variables in the 183 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0 184 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines 185 * value length 186 */ 187 char payload[STROBE_MAX_PAYLOAD]; 188 }; 189 190 struct strobelight_bpf_sample { 191 uint64_t ktime; 192 char comm[TASK_COMM_LEN]; 193 pid_t pid; 194 int user_stack_id; 195 int kernel_stack_id; 196 int has_meta; 197 struct strobemeta_payload metadata; 198 /* 199 * makes it possible to pass (<real payload size> + 1) as data size to 200 * perf_submit() to avoid perf_submit's paranoia about passing zero as 201 * size, as it deduces that <real payload size> might be 202 * **theoretically** zero 203 */ 204 char dummy_safeguard; 205 }; 206 207 struct { 208 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 209 __uint(max_entries, 32); 210 __uint(key_size, sizeof(int)); 211 __uint(value_size, sizeof(int)); 212 } samples SEC(".maps"); 213 214 struct { 215 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 216 __uint(max_entries, 16); 217 __uint(key_size, sizeof(uint32_t)); 218 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH); 219 } stacks_0 SEC(".maps"); 220 221 struct { 222 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 223 __uint(max_entries, 16); 224 __uint(key_size, sizeof(uint32_t)); 225 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH); 226 } stacks_1 SEC(".maps"); 227 228 struct { 229 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 230 __uint(max_entries, 1); 231 __type(key, uint32_t); 232 __type(value, struct strobelight_bpf_sample); 233 } sample_heap SEC(".maps"); 234 235 struct { 236 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 237 __uint(max_entries, STROBE_MAX_CFGS); 238 __type(key, pid_t); 239 __type(value, struct strobemeta_cfg); 240 } strobemeta_cfgs SEC(".maps"); 241 242 /* Type for the dtv. */ 243 /* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */ 244 typedef union dtv { 245 size_t counter; 246 struct { 247 void* val; 248 bool is_static; 249 } pointer; 250 } dtv_t; 251 252 /* Partial definition for tcbhead_t */ 253 /* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */ 254 struct tcbhead { 255 void* tcb; 256 dtv_t* dtv; 257 }; 258 259 /* 260 * TLS module/offset information for shared library case. 261 * For x86-64, this is mapped onto two entries in GOT. 262 * For aarch64, this is pointed to by second GOT entry. 263 */ 264 struct tls_index { 265 uint64_t module; 266 uint64_t offset; 267 }; 268 269 #ifdef SUBPROGS 270 __noinline 271 #else 272 __always_inline 273 #endif 274 static void *calc_location(struct strobe_value_loc *loc, void *tls_base) 275 { 276 /* 277 * tls_mode value is: 278 * - -1 (TLS_NOT_SET), if no metavar is present; 279 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS 280 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64); 281 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS; 282 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS; 283 * This schema allows to use something like: 284 * (tls_mode + 1) * (tls_base + offset) 285 * to get NULL for "no metavar" location, or correct pointer for local 286 * executable mode without doing extra ifs. 287 */ 288 if (loc->tls_mode <= TLS_LOCAL_EXEC) { 289 /* static executable is simple, we just have offset from 290 * tls_base */ 291 void *addr = tls_base + loc->offset; 292 /* multiply by (tls_mode + 1) to get NULL, if we have no 293 * metavar in this slot */ 294 return (void *)((loc->tls_mode + 1) * (int64_t)addr); 295 } 296 /* 297 * Other modes are more complicated, we need to jump through few hoops. 298 * 299 * For immediate executable mode (currently supported only for aarch64): 300 * - loc->offset is pointing to a GOT entry containing fixed offset 301 * relative to tls_base; 302 * 303 * For general dynamic mode: 304 * - loc->offset is pointing to a beginning of double GOT entries; 305 * - (for aarch64 only) second entry points to tls_index_t struct; 306 * - (for x86-64 only) two GOT entries are already tls_index_t; 307 * - tls_index_t->module is used to find start of TLS section in 308 * which variable resides; 309 * - tls_index_t->offset provides offset within that TLS section, 310 * pointing to value of variable. 311 */ 312 struct tls_index tls_index; 313 dtv_t *dtv; 314 void *tls_ptr; 315 316 bpf_probe_read_user(&tls_index, sizeof(struct tls_index), 317 (void *)loc->offset); 318 /* valid module index is always positive */ 319 if (tls_index.module > 0) { 320 /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */ 321 bpf_probe_read_user(&dtv, sizeof(dtv), 322 &((struct tcbhead *)tls_base)->dtv); 323 dtv += tls_index.module; 324 } else { 325 dtv = NULL; 326 } 327 bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); 328 /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ 329 return tls_ptr && tls_ptr != (void *)-1 330 ? tls_ptr + tls_index.offset 331 : NULL; 332 } 333 334 #ifdef SUBPROGS 335 __noinline 336 #else 337 __always_inline 338 #endif 339 static void read_int_var(struct strobemeta_cfg *cfg, 340 size_t idx, void *tls_base, 341 struct strobe_value_generic *value, 342 struct strobemeta_payload *data) 343 { 344 void *location = calc_location(&cfg->int_locs[idx], tls_base); 345 if (!location) 346 return; 347 348 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 349 data->int_vals[idx] = value->val; 350 if (value->header.len) 351 data->int_vals_set_mask |= (1 << idx); 352 } 353 354 static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, 355 size_t idx, void *tls_base, 356 struct strobe_value_generic *value, 357 struct strobemeta_payload *data, 358 void *payload) 359 { 360 void *location; 361 uint64_t len; 362 363 data->str_lens[idx] = 0; 364 location = calc_location(&cfg->str_locs[idx], tls_base); 365 if (!location) 366 return 0; 367 368 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 369 len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, value->ptr); 370 /* 371 * if bpf_probe_read_user_str returns error (<0), due to casting to 372 * unsinged int, it will become big number, so next check is 373 * sufficient to check for errors AND prove to BPF verifier, that 374 * bpf_probe_read_user_str won't return anything bigger than 375 * STROBE_MAX_STR_LEN 376 */ 377 if (len > STROBE_MAX_STR_LEN) 378 return 0; 379 380 data->str_lens[idx] = len; 381 return len; 382 } 383 384 static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, 385 size_t idx, void *tls_base, 386 struct strobe_value_generic *value, 387 struct strobemeta_payload *data, 388 void *payload) 389 { 390 struct strobe_map_descr* descr = &data->map_descrs[idx]; 391 struct strobe_map_raw map; 392 void *location; 393 uint64_t len; 394 395 descr->tag_len = 0; /* presume no tag is set */ 396 descr->cnt = -1; /* presume no value is set */ 397 398 location = calc_location(&cfg->map_locs[idx], tls_base); 399 if (!location) 400 return payload; 401 402 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 403 if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr)) 404 return payload; 405 406 descr->id = map.id; 407 descr->cnt = map.cnt; 408 if (cfg->req_meta_idx == idx) { 409 data->req_id = map.id; 410 data->req_meta_valid = 1; 411 } 412 413 len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, map.tag); 414 if (len <= STROBE_MAX_STR_LEN) { 415 descr->tag_len = len; 416 payload += len; 417 } 418 419 #ifdef NO_UNROLL 420 #pragma clang loop unroll(disable) 421 #else 422 #pragma unroll 423 #endif 424 for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { 425 if (i >= map.cnt) 426 break; 427 428 descr->key_lens[i] = 0; 429 len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, 430 map.entries[i].key); 431 if (len <= STROBE_MAX_STR_LEN) { 432 descr->key_lens[i] = len; 433 payload += len; 434 } 435 descr->val_lens[i] = 0; 436 len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, 437 map.entries[i].val); 438 if (len <= STROBE_MAX_STR_LEN) { 439 descr->val_lens[i] = len; 440 payload += len; 441 } 442 } 443 444 return payload; 445 } 446 447 #ifdef USE_BPF_LOOP 448 enum read_type { 449 READ_INT_VAR, 450 READ_MAP_VAR, 451 READ_STR_VAR, 452 }; 453 454 struct read_var_ctx { 455 struct strobemeta_payload *data; 456 void *tls_base; 457 struct strobemeta_cfg *cfg; 458 void *payload; 459 /* value gets mutated */ 460 struct strobe_value_generic *value; 461 enum read_type type; 462 }; 463 464 static int read_var_callback(__u32 index, struct read_var_ctx *ctx) 465 { 466 switch (ctx->type) { 467 case READ_INT_VAR: 468 if (index >= STROBE_MAX_INTS) 469 return 1; 470 read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data); 471 break; 472 case READ_MAP_VAR: 473 if (index >= STROBE_MAX_MAPS) 474 return 1; 475 ctx->payload = read_map_var(ctx->cfg, index, ctx->tls_base, 476 ctx->value, ctx->data, ctx->payload); 477 break; 478 case READ_STR_VAR: 479 if (index >= STROBE_MAX_STRS) 480 return 1; 481 ctx->payload += read_str_var(ctx->cfg, index, ctx->tls_base, 482 ctx->value, ctx->data, ctx->payload); 483 break; 484 } 485 return 0; 486 } 487 #endif /* USE_BPF_LOOP */ 488 489 /* 490 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns 491 * pointer to *right after* payload ends 492 */ 493 #ifdef SUBPROGS 494 __noinline 495 #else 496 __always_inline 497 #endif 498 static void *read_strobe_meta(struct task_struct *task, 499 struct strobemeta_payload *data) 500 { 501 pid_t pid = bpf_get_current_pid_tgid() >> 32; 502 struct strobe_value_generic value = {0}; 503 struct strobemeta_cfg *cfg; 504 void *tls_base, *payload; 505 506 cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid); 507 if (!cfg) 508 return NULL; 509 510 data->int_vals_set_mask = 0; 511 data->req_meta_valid = 0; 512 payload = data->payload; 513 /* 514 * we don't have struct task_struct definition, it should be: 515 * tls_base = (void *)task->thread.fsbase; 516 */ 517 tls_base = (void *)task; 518 519 #ifdef USE_BPF_LOOP 520 struct read_var_ctx ctx = { 521 .cfg = cfg, 522 .tls_base = tls_base, 523 .value = &value, 524 .data = data, 525 .payload = payload, 526 }; 527 int err; 528 529 ctx.type = READ_INT_VAR; 530 err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0); 531 if (err != STROBE_MAX_INTS) 532 return NULL; 533 534 ctx.type = READ_STR_VAR; 535 err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0); 536 if (err != STROBE_MAX_STRS) 537 return NULL; 538 539 ctx.type = READ_MAP_VAR; 540 err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0); 541 if (err != STROBE_MAX_MAPS) 542 return NULL; 543 #else 544 #ifdef NO_UNROLL 545 #pragma clang loop unroll(disable) 546 #else 547 #pragma unroll 548 #endif /* NO_UNROLL */ 549 for (int i = 0; i < STROBE_MAX_INTS; ++i) { 550 read_int_var(cfg, i, tls_base, &value, data); 551 } 552 #ifdef NO_UNROLL 553 #pragma clang loop unroll(disable) 554 #else 555 #pragma unroll 556 #endif /* NO_UNROLL */ 557 for (int i = 0; i < STROBE_MAX_STRS; ++i) { 558 payload += read_str_var(cfg, i, tls_base, &value, data, payload); 559 } 560 #ifdef NO_UNROLL 561 #pragma clang loop unroll(disable) 562 #else 563 #pragma unroll 564 #endif /* NO_UNROLL */ 565 for (int i = 0; i < STROBE_MAX_MAPS; ++i) { 566 payload = read_map_var(cfg, i, tls_base, &value, data, payload); 567 } 568 #endif /* USE_BPF_LOOP */ 569 570 /* 571 * return pointer right after end of payload, so it's possible to 572 * calculate exact amount of useful data that needs to be sent 573 */ 574 return payload; 575 } 576 577 SEC("raw_tracepoint/kfree_skb") 578 int on_event(struct pt_regs *ctx) { 579 pid_t pid = bpf_get_current_pid_tgid() >> 32; 580 struct strobelight_bpf_sample* sample; 581 struct task_struct *task; 582 uint32_t zero = 0; 583 uint64_t ktime_ns; 584 void *sample_end; 585 586 sample = bpf_map_lookup_elem(&sample_heap, &zero); 587 if (!sample) 588 return 0; /* this will never happen */ 589 590 sample->pid = pid; 591 bpf_get_current_comm(&sample->comm, TASK_COMM_LEN); 592 ktime_ns = bpf_ktime_get_ns(); 593 sample->ktime = ktime_ns; 594 595 task = (struct task_struct *)bpf_get_current_task(); 596 sample_end = read_strobe_meta(task, &sample->metadata); 597 sample->has_meta = sample_end != NULL; 598 sample_end = sample_end ? : &sample->metadata; 599 600 if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) { 601 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0); 602 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK); 603 } else { 604 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0); 605 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK); 606 } 607 608 uint64_t sample_size = sample_end - (void *)sample; 609 /* should always be true */ 610 if (sample_size < sizeof(struct strobelight_bpf_sample)) 611 bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size); 612 return 0; 613 } 614 615 char _license[] SEC("license") = "GPL"; 616