/* Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include <linux/elf.h>
#include <linux/pagemap.h>
#include "percpu_freelist.h"

#define STACK_CREATE_FLAG_MASK \
        (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
         BPF_F_STACK_BUILD_ID)

struct stack_map_bucket {
        struct pcpu_freelist_node fnode;
        u32 hash;
        u32 nr;
        u64 data[];
};

struct bpf_stack_map {
        struct bpf_map map;
        void *elems;
        struct pcpu_freelist freelist;
        u32 n_buckets;
        struct stack_map_bucket *buckets[];
};

static inline bool stack_map_use_build_id(struct bpf_map *map)
{
        return (map->map_flags & BPF_F_STACK_BUILD_ID);
}

static inline int stack_map_data_size(struct bpf_map *map)
{
        return stack_map_use_build_id(map) ?
                sizeof(struct bpf_stack_build_id) : sizeof(u64);
}

static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
        u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
        int err;

        smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
                                         smap->map.numa_node);
        if (!smap->elems)
                return -ENOMEM;

        err = pcpu_freelist_init(&smap->freelist);
        if (err)
                goto free_elems;

        pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
                               smap->map.max_entries);
        return 0;

free_elems:
        bpf_map_area_free(smap->elems);
        return err;
}

/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
        u32 value_size = attr->value_size;
        struct bpf_stack_map *smap;
        u64 cost, n_buckets;
        int err;

        if (!capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);

        if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
                return ERR_PTR(-EINVAL);

        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
            value_size < 8 || value_size % 8)
                return ERR_PTR(-EINVAL);

        BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
        if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
                if (value_size % sizeof(struct bpf_stack_build_id) ||
                    value_size / sizeof(struct bpf_stack_build_id)
                    > sysctl_perf_event_max_stack)
                        return ERR_PTR(-EINVAL);
        } else if (value_size / 8 > sysctl_perf_event_max_stack)
                return ERR_PTR(-EINVAL);

        /* hash table size must be power of 2 */
        n_buckets = roundup_pow_of_two(attr->max_entries);

        cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
        if (cost >= U32_MAX - PAGE_SIZE)
                return ERR_PTR(-E2BIG);

        smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
        if (!smap)
                return ERR_PTR(-ENOMEM);

        err = -E2BIG;
        cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
        if (cost >= U32_MAX - PAGE_SIZE)
                goto free_smap;

        bpf_map_init_from_attr(&smap->map, attr);
        smap->map.value_size = value_size;
        smap->n_buckets = n_buckets;
        smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

        err = bpf_map_precharge_memlock(smap->map.pages);
        if (err)
                goto free_smap;

        err = get_callchain_buffers(sysctl_perf_event_max_stack);
        if (err)
                goto free_smap;

        err = prealloc_elems_and_freelist(smap);
        if (err)
                goto put_buffers;

        return &smap->map;

put_buffers:
        put_callchain_buffers();
free_smap:
        bpf_map_area_free(smap);
        return ERR_PTR(err);
}

#define BPF_BUILD_ID 3
/*
 * Parse build id from the note segment. This logic can be shared between
 * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
 * identical.
 */
static inline int stack_map_parse_build_id(void *page_addr,
                                           unsigned char *build_id,
                                           void *note_start,
                                           Elf32_Word note_size)
{
        Elf32_Word note_offs = 0, new_offs;

        /* check for overflow */
        if (note_start < page_addr || note_start + note_size < note_start)
                return -EINVAL;

        /* only supports note that fits in the first page */
        if (note_start + note_size > page_addr + PAGE_SIZE)
                return -EINVAL;

        while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
                Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);

                if (nhdr->n_type == BPF_BUILD_ID &&
                    nhdr->n_namesz == sizeof("GNU") &&
                    nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
                        memcpy(build_id,
                               note_start + note_offs +
                               ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
                               BPF_BUILD_ID_SIZE);
                        return 0;
                }
                new_offs = note_offs + sizeof(Elf32_Nhdr) +
                        ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
                if (new_offs <= note_offs) /* overflow */
                        break;
                note_offs = new_offs;
        }
        return -EINVAL;
}

/* Parse build ID from 32-bit ELF */
static int stack_map_get_build_id_32(void *page_addr,
                                     unsigned char *build_id)
{
        Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
        Elf32_Phdr *phdr;
        int i;

        /* only supports phdr that fits in one page */
        if (ehdr->e_phnum >
            (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
                return -EINVAL;

        phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));

        for (i = 0; i < ehdr->e_phnum; ++i)
                if (phdr[i].p_type == PT_NOTE)
                        return stack_map_parse_build_id(page_addr, build_id,
                                        page_addr + phdr[i].p_offset,
                                        phdr[i].p_filesz);
        return -EINVAL;
}

/* Parse build ID from 64-bit ELF */
static int stack_map_get_build_id_64(void *page_addr,
                                     unsigned char *build_id)
{
        Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
        Elf64_Phdr *phdr;
        int i;

        /* only supports phdr that fits in one page */
        if (ehdr->e_phnum >
            (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
                return -EINVAL;

        phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));

        for (i = 0; i < ehdr->e_phnum; ++i)
                if (phdr[i].p_type == PT_NOTE)
                        return stack_map_parse_build_id(page_addr, build_id,
                                        page_addr + phdr[i].p_offset,
                                        phdr[i].p_filesz);
        return -EINVAL;
}

/* Parse build ID of ELF file mapped to vma */
static int stack_map_get_build_id(struct vm_area_struct *vma,
                                  unsigned char *build_id)
{
        Elf32_Ehdr *ehdr;
        struct page *page;
        void *page_addr;
        int ret;

        /* only works for page backed storage */
        if (!vma->vm_file)
                return -EINVAL;

        page = find_get_page(vma->vm_file->f_mapping, 0);
        if (!page)
                return -EFAULT; /* page not mapped */

        ret = -EINVAL;
        page_addr = page_address(page);
        ehdr = (Elf32_Ehdr *)page_addr;

        /* compare magic x7f "ELF" */
        if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
                goto out;

        /* only support executable file and shared object file */
        if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
                goto out;

        if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
                ret = stack_map_get_build_id_32(page_addr, build_id);
        else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
                ret = stack_map_get_build_id_64(page_addr, build_id);
out:
        put_page(page);
        return ret;
}

static void stack_map_get_build_id_offset(struct bpf_map *map,
                                          struct stack_map_bucket *bucket,
                                          u64 *ips, u32 trace_nr, bool user)
{
        int i;
        struct vm_area_struct *vma;
        struct bpf_stack_build_id *id_offs;

        bucket->nr = trace_nr;
        id_offs = (struct bpf_stack_build_id *)bucket->data;

        /*
         * We cannot do up_read() in nmi context, so build_id lookup is
         * only supported for non-nmi events. If at some point, it is
         * possible to run find_vma() without taking the semaphore, we
         * would like to allow build_id lookup in nmi context.
         *
         * Same fallback is used for kernel stack (!user) on a stackmap
         * with build_id.
         */
        if (!user || !current || !current->mm || in_nmi() ||
            down_read_trylock(&current->mm->mmap_sem) == 0) {
                /* cannot access current->mm, fall back to ips */
                for (i = 0; i < trace_nr; i++) {
                        id_offs[i].status = BPF_STACK_BUILD_ID_IP;
                        id_offs[i].ip = ips[i];
                }
                return;
        }

        for (i = 0; i < trace_nr; i++) {
                vma = find_vma(current->mm, ips[i]);
                if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
                        /* per entry fall back to ips */
                        id_offs[i].status = BPF_STACK_BUILD_ID_IP;
                        id_offs[i].ip = ips[i];
                        continue;
                }
                id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
                        - vma->vm_start;
                id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
        }
        up_read(&current->mm->mmap_sem);
}

BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
           u64, flags)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct perf_callchain_entry *trace;
        struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
        u32 max_depth = map->value_size / stack_map_data_size(map);
        /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
        u32 init_nr = sysctl_perf_event_max_stack - max_depth;
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        u32 hash, id, trace_nr, trace_len;
        bool user = flags & BPF_F_USER_STACK;
        bool kernel = !user;
        u64 *ips;
        bool hash_matches;

        if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                               BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
                return -EINVAL;

        trace = get_perf_callchain(regs, init_nr, kernel, user,
                                   sysctl_perf_event_max_stack, false, false);

        if (unlikely(!trace))
                /* couldn't fetch the stack trace */
                return -EFAULT;

        /* get_perf_callchain() guarantees that trace->nr >= init_nr
         * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
         */
        trace_nr = trace->nr - init_nr;

        if (trace_nr <= skip)
                /* skipping more than usable stack trace */
                return -EFAULT;

        trace_nr -= skip;
        trace_len = trace_nr * sizeof(u64);
        ips = trace->ip + skip + init_nr;
        hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
        id = hash & (smap->n_buckets - 1);
        bucket = READ_ONCE(smap->buckets[id]);

        hash_matches = bucket && bucket->hash == hash;
        /* fast cmp */
        if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
                return id;

        if (stack_map_use_build_id(map)) {
                /* for build_id+offset, pop a bucket before slow cmp */
                new_bucket = (struct stack_map_bucket *)
                        pcpu_freelist_pop(&smap->freelist);
                if (unlikely(!new_bucket))
                        return -ENOMEM;
                stack_map_get_build_id_offset(map, new_bucket, ips,
                                              trace_nr, user);
                trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
                if (hash_matches && bucket->nr == trace_nr &&
                    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
                        pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
                        return id;
                }
                if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
                        pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
                        return -EEXIST;
                }
        } else {
                if (hash_matches && bucket->nr == trace_nr &&
                    memcmp(bucket->data, ips, trace_len) == 0)
                        return id;
                if (bucket && !(flags & BPF_F_REUSE_STACKID))
                        return -EEXIST;

                new_bucket = (struct stack_map_bucket *)
                        pcpu_freelist_pop(&smap->freelist);
                if (unlikely(!new_bucket))
                        return -ENOMEM;
                memcpy(new_bucket->data, ips, trace_len);
        }

        new_bucket->hash = hash;
        new_bucket->nr = trace_nr;

        old_bucket = xchg(&smap->buckets[id], new_bucket);
        if (old_bucket)
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
        return id;
}

const struct bpf_func_proto bpf_get_stackid_proto = {
        .func           = bpf_get_stackid,
        .gpl_only       = true,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_CONST_MAP_PTR,
        .arg3_type      = ARG_ANYTHING,
};

/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
        return NULL;
}

/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *bucket, *old_bucket;
        u32 id = *(u32 *)key, trace_len;

        if (unlikely(id >= smap->n_buckets))
                return -ENOENT;

        bucket = xchg(&smap->buckets[id], NULL);
        if (!bucket)
                return -ENOENT;

        trace_len = bucket->nr * stack_map_data_size(map);
        memcpy(value, bucket->data, trace_len);
        memset(value + trace_len, 0, map->value_size - trace_len);

        old_bucket = xchg(&smap->buckets[id], bucket);
        if (old_bucket)
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
        return 0;
}

static int stack_map_get_next_key(struct bpf_map *map, void *key,
                                  void *next_key)
{
        struct bpf_stack_map *smap = container_of(map,
                                                  struct bpf_stack_map, map);
        u32 id;

        WARN_ON_ONCE(!rcu_read_lock_held());

        if (!key) {
                id = 0;
        } else {
                id = *(u32 *)key;
                if (id >= smap->n_buckets || !smap->buckets[id])
                        id = 0;
                else
                        id++;
        }

        while (id < smap->n_buckets && !smap->buckets[id])
                id++;

        if (id >= smap->n_buckets)
                return -ENOENT;

        *(u32 *)next_key = id;
        return 0;
}

static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
                                 u64 map_flags)
{
        return -EINVAL;
}

/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *old_bucket;
        u32 id = *(u32 *)key;

        if (unlikely(id >= smap->n_buckets))
                return -E2BIG;

        old_bucket = xchg(&smap->buckets[id], NULL);
        if (old_bucket) {
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
                return 0;
        } else {
                return -ENOENT;
        }
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

        /* wait for bpf programs to complete before freeing stack map */
        synchronize_rcu();

        bpf_map_area_free(smap->elems);
        pcpu_freelist_destroy(&smap->freelist);
        bpf_map_area_free(smap);
        put_callchain_buffers();
}

const struct bpf_map_ops stack_map_ops = {
        .map_alloc = stack_map_alloc,
        .map_free = stack_map_free,
        .map_get_next_key = stack_map_get_next_key,
        .map_lookup_elem = stack_map_lookup_elem,
        .map_update_elem = stack_map_update_elem,
        .map_delete_elem = stack_map_delete_elem,
};
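
/*
 * Usage sketch (illustrative only, kept inside a comment so it is not part
 * of this translation unit): a minimal BPF-program-side example of how a
 * BPF_MAP_TYPE_STACK_TRACE map is fed by the bpf_get_stackid() helper
 * implemented above. The map name "stackmap", the kprobe target, the
 * samples/bpf-style bpf_map_def/SEC("maps") declaration and the
 * PERF_MAX_STACK_DEPTH sizing are assumptions for the sketch; the map type,
 * the helper and the BPF_F_USER_STACK flag are the ones handled in this
 * file. User space later fetches the trace for a returned id with the
 * BPF_MAP_LOOKUP_ELEM syscall command, which ends up in bpf_stackmap_copy().
 *
 *	struct bpf_map_def SEC("maps") stackmap = {
 *		.type = BPF_MAP_TYPE_STACK_TRACE,
 *		.key_size = sizeof(u32),
 *		.value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
 *		.max_entries = 10000,
 *	};
 *
 *	SEC("kprobe/sys_write")
 *	int trace_sys_write(struct pt_regs *ctx)
 *	{
 *		int id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);
 *
 *		return id < 0;
 *	}
 *
 * A real program would record id (for example as a key in a counting map)
 * so that user space can correlate events with the stored stack traces.
 */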