// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016,2017 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>

#include "map_in_map.h"

#define ARRAY_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK)

static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		free_percpu(array->pptrs[i]);
		cond_resched();
	}
}

static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
		cond_resched();
	}

	return 0;
}

/* Called from syscall */
int array_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	int numa_node = bpf_map_attr_numa_node(attr);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size == 0 ||
	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags) ||
	    (percpu && numa_node != NUMA_NO_NODE))
		return -EINVAL;

	if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
	    attr->map_flags & BPF_F_MMAPABLE)
		return -EINVAL;

	if (attr->value_size > KMALLOC_MAX_SIZE)
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return -E2BIG;

	return 0;
}

static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	int ret, numa_node = bpf_map_attr_numa_node(attr);
	u32 elem_size, index_mask, max_entries;
	bool bypass_spec_v1 = bpf_bypass_spec_v1();
	u64 cost, array_size, mask64;
	struct bpf_map_memory mem;
	struct bpf_array *array;

	elem_size = round_up(attr->value_size, 8);

	max_entries = attr->max_entries;

	/* On 32 bit archs roundup_pow_of_two() with max_entries that has the
	 * uppermost bit set in u32 space is undefined behavior due to the
	 * resulting 1U << 32, so do it manually here in u64 space.
	 */
	mask64 = fls_long(max_entries - 1);
	mask64 = 1ULL << mask64;
	mask64 -= 1;

	index_mask = mask64;
	if (!bypass_spec_v1) {
		/* round up array size to nearest power of 2,
		 * since cpu will speculate within index_mask limits
		 */
		max_entries = index_mask + 1;
		/* Check for overflows. */
		if (max_entries < attr->max_entries)
			return ERR_PTR(-E2BIG);
	}

	array_size = sizeof(*array);
	if (percpu) {
		array_size += (u64) max_entries * sizeof(void *);
	} else {
		/* rely on vmalloc() to return page-aligned memory and
		 * ensure array->value is exactly page-aligned
		 */
		if (attr->map_flags & BPF_F_MMAPABLE) {
			array_size = PAGE_ALIGN(array_size);
			array_size += PAGE_ALIGN((u64) max_entries * elem_size);
		} else {
			array_size += (u64) max_entries * elem_size;
		}
	}

	/* make sure there is no u32 overflow later in round_up() */
	cost = array_size;
	if (percpu)
		cost += (u64)attr->max_entries * elem_size * num_possible_cpus();

	ret = bpf_map_charge_init(&mem, cost);
	if (ret < 0)
		return ERR_PTR(ret);

	/* allocate all map elements and zero-initialize them */
	if (attr->map_flags & BPF_F_MMAPABLE) {
		void *data;

		/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
		data = bpf_map_area_mmapable_alloc(array_size, numa_node);
		if (!data) {
			bpf_map_charge_finish(&mem);
			return ERR_PTR(-ENOMEM);
		}
		array = data + PAGE_ALIGN(sizeof(struct bpf_array))
			- offsetof(struct bpf_array, value);
	} else {
		array = bpf_map_area_alloc(array_size, numa_node);
	}
	if (!array) {
		bpf_map_charge_finish(&mem);
		return ERR_PTR(-ENOMEM);
	}
	array->index_mask = index_mask;
	array->map.bypass_spec_v1 = bypass_spec_v1;

	/* copy mandatory map attributes */
	bpf_map_init_from_attr(&array->map, attr);
	bpf_map_charge_move(&array->map.memory, &mem);
	array->elem_size = elem_size;

	if (percpu && bpf_array_alloc_percpu(array)) {
		bpf_map_charge_finish(&array->map.memory);
		bpf_map_area_free(array);
		return ERR_PTR(-ENOMEM);
	}

	return &array->map;
}

/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * (index & array->index_mask);
}

static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
				       u32 off)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	if (map->max_entries != 1)
		return -ENOTSUPP;
	if (off >= map->value_size)
		return -EINVAL;

	*imm = (unsigned long)array->value;
	return 0;
}

static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
				       u32 *off)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u64 base = (unsigned long)array->value;
	u64 range = array->elem_size;

	if (map->max_entries != 1)
		return -ENOTSUPP;
	if (imm < base || imm >= base + range)
		return -ENOENT;

	*off = imm - base;
	return 0;
}

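/* The inline sequence emitted below corresponds roughly to the following C
 * (a readability sketch only; the register assignments are the ones noted
 * in the function body):
 *
 *	elem = &array->value[0];		// map_ptr += offsetof(value)
 *	index = *(u32 *)key;
 *	if (index >= max_entries)
 *		return NULL;
 *	index &= array->index_mask;		// only when Spectre v1 mitigation is active
 *	return elem + index * elem_size;
 *
 * The multiply is strength-reduced to a shift when elem_size is a power
 * of two.
 */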
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_insn *insn = insn_buf;
	u32 elem_size = round_up(map->value_size, 8);
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	if (!map->bypass_spec_v1) {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
	} else {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
	}

	if (is_power_of_2(elem_size)) {
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	} else {
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	}
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);
	return insn - insn_buf;
}

/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}

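/* Note on the layout of @value below: the syscall lookup path supplies one
 * slot of round_up(value_size, 8) bytes per possible CPU, and the per-CPU
 * values are copied out back-to-back in possible-CPU order.
 * bpf_percpu_array_update() consumes the same layout in the other
 * direction.
 */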
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	char *val;

	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags & BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	if (unlikely((map_flags & BPF_F_LOCK) &&
		     !map_value_has_spin_lock(map)))
		return -EINVAL;

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
		       value, map->value_size);
	} else {
		val = array->value +
			array->elem_size * (index & array->index_mask);
		if (map_flags & BPF_F_LOCK)
			copy_map_value_locked(map, val, value, false);
		else
			copy_map_value(map, val, value);
	}
	return 0;
}

int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into the per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned, or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks are possible
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}

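/* For BPF_F_MMAPABLE arrays, struct bpf_array itself is placed so that
 * array->value lands exactly on a page boundary (see array_map_alloc()).
 * Since the struct header is smaller than a page, rounding the array
 * pointer down to a page boundary recovers the address originally
 * returned by bpf_map_area_mmapable_alloc(), which is what must be
 * handed back to bpf_map_area_free() and remap_vmalloc_range().
 */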
static void *array_map_vmalloc_addr(struct bpf_array *array)
{
	return (void *)round_down((unsigned long)array, PAGE_SIZE);
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

	if (array->map.map_flags & BPF_F_MMAPABLE)
		bpf_map_area_free(array_map_vmalloc_addr(array));
	else
		bpf_map_area_free(array);
}

static void array_map_seq_show_elem(struct bpf_map *map, void *key,
				    struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = array_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	if (map->btf_key_type_id)
		seq_printf(m, "%u: ", *(u32 *)key);
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_puts(m, "\n");

	rcu_read_unlock();
}

static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
					   struct seq_file *m)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	seq_printf(m, "%u: {\n", *(u32 *)key);
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}

static int array_map_check_btf(const struct bpf_map *map,
			       const struct btf *btf,
			       const struct btf_type *key_type,
			       const struct btf_type *value_type)
{
	u32 int_data;

	/* One exception for keyless BTF: .bss/.data/.rodata map */
	if (btf_type_is_void(key_type)) {
		if (map->map_type != BPF_MAP_TYPE_ARRAY ||
		    map->max_entries != 1)
			return -EINVAL;

		if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC)
			return -EINVAL;

		return 0;
	}

	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
		return -EINVAL;

	int_data = *(u32 *)(key_type + 1);
	/* bpf array can only take a u32 key. This check makes sure
	 * that the btf matches the attr used during map_create.
	 */
	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
		return -EINVAL;

	return 0;
}

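/* Userspace can access a BPF_F_MMAPABLE array directly instead of going
 * through the lookup/update syscalls. A minimal userspace sketch (not part
 * of this file; error handling omitted):
 *
 *	int map_fd = ...;	// fd of a BPF_MAP_TYPE_ARRAY created with BPF_F_MMAPABLE
 *	size_t len = ...;	// up to PAGE_ALIGN(max_entries * round_up(value_size, 8))
 *	void *vals = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
 *
 * array_map_mmap() below validates that the requested range stays within
 * the page-aligned value area before handing the vmalloc'ed region to
 * remap_vmalloc_range().
 */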
static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;

	if (!(map->map_flags & BPF_F_MMAPABLE))
		return -EINVAL;

	if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) >
	    PAGE_ALIGN((u64)array->map.max_entries * array->elem_size))
		return -EINVAL;

	return remap_vmalloc_range(vma, array_map_vmalloc_addr(array),
				   vma->vm_pgoff + pgoff);
}

static bool array_map_meta_equal(const struct bpf_map *meta0,
				 const struct bpf_map *meta1)
{
	return meta0->max_entries == meta1->max_entries &&
	       bpf_map_meta_equal(meta0, meta1);
}

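/* State for the bpf_iter seq_file walk over array elements. For per-cpu
 * arrays, percpu_value_buf is allocated in bpf_iter_init_array_map() and
 * used to flatten the per-CPU values into one contiguous buffer for the
 * iterator program; for regular arrays it stays NULL and the element is
 * handed to the program directly.
 */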
struct bpf_iter_seq_array_map_info {
	struct bpf_map *map;
	void *percpu_value_buf;
	u32 index;
};

static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_map *map = info->map;
	struct bpf_array *array;
	u32 index;

	if (info->index >= map->max_entries)
		return NULL;

	if (*pos == 0)
		++*pos;
	array = container_of(map, struct bpf_array, map);
	index = info->index & array->index_mask;
	if (info->percpu_value_buf)
		return array->pptrs[index];
	return array->value + array->elem_size * index;
}

static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_map *map = info->map;
	struct bpf_array *array;
	u32 index;

	++*pos;
	++info->index;
	if (info->index >= map->max_entries)
		return NULL;

	array = container_of(map, struct bpf_array, map);
	index = info->index & array->index_mask;
	if (info->percpu_value_buf)
		return array->pptrs[index];
	return array->value + array->elem_size * index;
}

static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_iter__bpf_map_elem ctx = {};
	struct bpf_map *map = info->map;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	int off = 0, cpu = 0;
	void __percpu **pptr;
	u32 size;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, v == NULL);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.map = info->map;
	if (v) {
		ctx.key = &info->index;

		if (!info->percpu_value_buf) {
			ctx.value = v;
		} else {
			pptr = v;
			size = round_up(map->value_size, 8);
			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(info->percpu_value_buf + off,
						per_cpu_ptr(pptr, cpu),
						size);
				off += size;
			}
			ctx.value = info->percpu_value_buf;
		}
	}

	return bpf_iter_run_prog(prog, &ctx);
}

static int bpf_array_map_seq_show(struct seq_file *seq, void *v)
{
	return __bpf_array_map_seq_show(seq, v);
}

static void bpf_array_map_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__bpf_array_map_seq_show(seq, NULL);
}

static int bpf_iter_init_array_map(void *priv_data,
				   struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_array_map_info *seq_info = priv_data;
	struct bpf_map *map = aux->map;
	void *value_buf;
	u32 buf_size;

	if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		buf_size = round_up(map->value_size, 8) * num_possible_cpus();
		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
		if (!value_buf)
			return -ENOMEM;

		seq_info->percpu_value_buf = value_buf;
	}

	seq_info->map = map;
	return 0;
}

static void bpf_iter_fini_array_map(void *priv_data)
{
	struct bpf_iter_seq_array_map_info *seq_info = priv_data;

	kfree(seq_info->percpu_value_buf);
}

static const struct seq_operations bpf_array_map_seq_ops = {
	.start = bpf_array_map_seq_start,
	.next = bpf_array_map_seq_next,
	.stop = bpf_array_map_seq_stop,
	.show = bpf_array_map_seq_show,
};

static const struct bpf_iter_seq_info iter_seq_info = {
	.seq_ops = &bpf_array_map_seq_ops,
	.init_seq_private = bpf_iter_init_array_map,
	.fini_seq_private = bpf_iter_fini_array_map,
	.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};

static int array_map_btf_id;
const struct bpf_map_ops array_map_ops = {
	.map_meta_equal = array_map_meta_equal,
	.map_alloc_check = array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_gen_lookup = array_map_gen_lookup,
	.map_direct_value_addr = array_map_direct_value_addr,
	.map_direct_value_meta = array_map_direct_value_meta,
	.map_mmap = array_map_mmap,
	.map_seq_show_elem = array_map_seq_show_elem,
	.map_check_btf = array_map_check_btf,
	.map_lookup_batch = generic_map_lookup_batch,
	.map_update_batch = generic_map_update_batch,
	.map_btf_name = "bpf_array",
	.map_btf_id = &array_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};

static int percpu_array_map_btf_id;
const struct bpf_map_ops percpu_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_seq_show_elem = percpu_array_map_seq_show_elem,
	.map_check_btf = array_map_check_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &percpu_array_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};

static int fd_array_map_alloc_check(union bpf_attr *attr)
{
	/* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	/* Program read-only/write-only not supported for special maps yet. */
	if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG))
		return -EINVAL;
	return array_map_alloc_check(attr);
}

static void fd_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->ptrs[i] != NULL);

	bpf_map_area_free(array);
}

static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EOPNOTSUPP);
}

/* only called from syscall */
int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **elem, *ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	elem = array_map_lookup_elem(map, key);
	if (elem && (ptr = READ_ONCE(*elem)))
		*value = map->ops->map_fd_sys_lookup_elem(ptr);
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
				 void *key, void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *new_ptr, *old_ptr;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	if (map->ops->map_poke_run) {
		mutex_lock(&array->aux->poke_mutex);
		old_ptr = xchg(array->ptrs + index, new_ptr);
		map->ops->map_poke_run(map, index, old_ptr, new_ptr);
		mutex_unlock(&array->aux->poke_mutex);
	} else {
		old_ptr = xchg(array->ptrs + index, new_ptr);
	}

	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);
	return 0;
}

static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *old_ptr;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	if (map->ops->map_poke_run) {
		mutex_lock(&array->aux->poke_mutex);
		old_ptr = xchg(array->ptrs + index, NULL);
		map->ops->map_poke_run(map, index, old_ptr, NULL);
		mutex_unlock(&array->aux->poke_mutex);
	} else {
		old_ptr = xchg(array->ptrs + index, NULL);
	}

	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
	} else {
		return -ENOENT;
	}
}

static void *prog_fd_array_get_ptr(struct bpf_map *map,
				   struct file *map_file, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);

	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}

static u32 prog_fd_array_sys_lookup_elem(void *ptr)
{
	return ((struct bpf_prog *)ptr)->aux->id;
}

/* decrement refcnt of all bpf_progs that are stored in this map */
static void bpf_fd_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		fd_array_map_delete_elem(map, &i);
}

static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
					 struct seq_file *m)
{
	void **elem, *ptr;
	u32 prog_id;

	rcu_read_lock();

	elem = array_map_lookup_elem(map, key);
	if (elem) {
		ptr = READ_ONCE(*elem);
		if (ptr) {
			seq_printf(m, "%u: ", *(u32 *)key);
			prog_id = prog_fd_array_sys_lookup_elem(ptr);
			btf_type_seq_show(map->btf, map->btf_value_type_id,
					  &prog_id, m);
			seq_puts(m, "\n");
		}
	}

	rcu_read_unlock();
}

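/* Each program that has a tail call site pointing into this prog_array
 * registers its bpf_prog_aux here via map_poke_track(), so that
 * prog_array_map_poke_run() can walk all tracked programs and patch their
 * JITed tail call targets when a map slot is updated or deleted.
 */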
struct prog_poke_elem {
	struct list_head list;
	struct bpf_prog_aux *aux;
};

static int prog_array_map_poke_track(struct bpf_map *map,
				     struct bpf_prog_aux *prog_aux)
{
	struct prog_poke_elem *elem;
	struct bpf_array_aux *aux;
	int ret = 0;

	aux = container_of(map, struct bpf_array, map)->aux;
	mutex_lock(&aux->poke_mutex);
	list_for_each_entry(elem, &aux->poke_progs, list) {
		if (elem->aux == prog_aux)
			goto out;
	}

	elem = kmalloc(sizeof(*elem), GFP_KERNEL);
	if (!elem) {
		ret = -ENOMEM;
		goto out;
	}

	INIT_LIST_HEAD(&elem->list);
	/* We must track the program's aux info at this point in time
	 * since the program pointer itself may not be stable yet, see
	 * also comment in prog_array_map_poke_run().
	 */
	elem->aux = prog_aux;

	list_add_tail(&elem->list, &aux->poke_progs);
out:
	mutex_unlock(&aux->poke_mutex);
	return ret;
}

static void prog_array_map_poke_untrack(struct bpf_map *map,
					struct bpf_prog_aux *prog_aux)
{
	struct prog_poke_elem *elem, *tmp;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	mutex_lock(&aux->poke_mutex);
	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
		if (elem->aux == prog_aux) {
			list_del_init(&elem->list);
			kfree(elem);
			break;
		}
	}
	mutex_unlock(&aux->poke_mutex);
}

static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
				    struct bpf_prog *old,
				    struct bpf_prog *new)
{
	u8 *old_addr, *new_addr, *old_bypass_addr;
	struct prog_poke_elem *elem;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));

	list_for_each_entry(elem, &aux->poke_progs, list) {
		struct bpf_jit_poke_descriptor *poke;
		int i, ret;

		for (i = 0; i < elem->aux->size_poke_tab; i++) {
			poke = &elem->aux->poke_tab[i];

			/* A few things to be aware of:
			 *
			 * 1) We can only ever access aux in this context, but
			 *    not aux->prog since it might not be stable yet and
			 *    there could be danger of use after free otherwise.
			 * 2) Initially when we start tracking aux, the program
			 *    is not JITed yet and also does not have a kallsyms
			 *    entry. We skip these as poke->tailcall_target_stable
			 *    is not active yet. The JIT will do the final fixup
			 *    before setting it stable. The various
			 *    poke->tailcall_target_stable are successively
			 *    activated, so tail call updates can arrive from here
			 *    while JIT is still finishing its final fixup for
			 *    non-activated poke entries.
			 * 3) On program teardown, the program's kallsym entry gets
			 *    removed out of RCU callback, but we can only untrack
			 *    from sleepable context, therefore bpf_arch_text_poke()
			 *    might not see that this is in BPF text section and
			 *    bails out with -EINVAL. As these are unreachable since
			 *    the RCU grace period already passed, we simply skip them.
			 * 4) Also, programs reaching a refcount of zero while patching
			 *    is in progress are okay since we're protected under
			 *    poke_mutex and untrack the programs before the JIT
			 *    buffer is freed. When we're still in the middle of
			 *    patching and the kallsyms entry of the program suddenly
			 *    gets evicted, we just skip the rest, which is fine due
			 *    to point 3).
			 * 5) Any other error happening below from bpf_arch_text_poke()
			 *    is an unexpected bug.
			 */
			if (!READ_ONCE(poke->tailcall_target_stable))
				continue;
			if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
				continue;
			if (poke->tail_call.map != map ||
			    poke->tail_call.key != key)
				continue;

			old_bypass_addr = old ? NULL : poke->bypass_addr;
			old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
			new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;

			if (new) {
				ret = bpf_arch_text_poke(poke->tailcall_target,
							 BPF_MOD_JUMP,
							 old_addr, new_addr);
				BUG_ON(ret < 0 && ret != -EINVAL);
				if (!old) {
					ret = bpf_arch_text_poke(poke->tailcall_bypass,
								 BPF_MOD_JUMP,
								 poke->bypass_addr,
								 NULL);
					BUG_ON(ret < 0 && ret != -EINVAL);
				}
			} else {
				ret = bpf_arch_text_poke(poke->tailcall_bypass,
							 BPF_MOD_JUMP,
							 old_bypass_addr,
							 poke->bypass_addr);
				BUG_ON(ret < 0 && ret != -EINVAL);
				/* let other CPUs finish the execution of the program
				 * so that it will not be possible to expose them
				 * to an invalid nop, stack unwind, nop state
				 */
				if (!ret)
					synchronize_rcu();
				ret = bpf_arch_text_poke(poke->tailcall_target,
							 BPF_MOD_JUMP,
							 old_addr, NULL);
				BUG_ON(ret < 0 && ret != -EINVAL);
			}
		}
	}
}

static void prog_array_map_clear_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_array_aux,
					   work)->map;
	bpf_fd_array_map_clear(map);
	bpf_map_put(map);
}

static void prog_array_map_clear(struct bpf_map *map)
{
	struct bpf_array_aux *aux = container_of(map, struct bpf_array,
						 map)->aux;
	bpf_map_inc(map);
	schedule_work(&aux->work);
}

static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
{
	struct bpf_array_aux *aux;
	struct bpf_map *map;

	aux = kzalloc(sizeof(*aux), GFP_KERNEL);
	if (!aux)
		return ERR_PTR(-ENOMEM);

	INIT_WORK(&aux->work, prog_array_map_clear_deferred);
	INIT_LIST_HEAD(&aux->poke_progs);
	mutex_init(&aux->poke_mutex);

	map = array_map_alloc(attr);
	if (IS_ERR(map)) {
		kfree(aux);
		return map;
	}

	container_of(map, struct bpf_array, map)->aux = aux;
	aux->map = map;

	return map;
}

static void prog_array_map_free(struct bpf_map *map)
{
	struct prog_poke_elem *elem, *tmp;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
		list_del_init(&elem->list);
		kfree(elem);
	}
	kfree(aux);
	fd_array_map_free(map);
}

/* prog_array->aux->{type,jited} is a runtime binding.
 * Doing static check alone in the verifier is not enough.
 * Thus, prog_array_map cannot be used as an inner_map
 * and map_meta_equal is not implemented.
 */
static int prog_array_map_btf_id;
const struct bpf_map_ops prog_array_map_ops = {
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = prog_array_map_alloc,
	.map_free = prog_array_map_free,
	.map_poke_track = prog_array_map_poke_track,
	.map_poke_untrack = prog_array_map_poke_untrack,
	.map_poke_run = prog_array_map_poke_run,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
	.map_release_uref = prog_array_map_clear,
	.map_seq_show_elem = prog_array_map_seq_show_elem,
	.map_btf_name = "bpf_array",
	.map_btf_id = &prog_array_map_btf_id,
};

static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
						   struct file *map_file)
{
	struct bpf_event_entry *ee;

	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
	if (ee) {
		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;
	}

	return ee;
}

static void __bpf_event_entry_free(struct rcu_head *rcu)
{
	struct bpf_event_entry *ee;

	ee = container_of(rcu, struct bpf_event_entry, rcu);
	fput(ee->perf_file);
	kfree(ee);
}

static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
	call_rcu(&ee->rcu, __bpf_event_entry_free);
}

static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
					 struct file *map_file, int fd)
{
	struct bpf_event_entry *ee;
	struct perf_event *event;
	struct file *perf_file;
	u64 value;

	perf_file = perf_event_get(fd);
	if (IS_ERR(perf_file))
		return perf_file;

	ee = ERR_PTR(-EOPNOTSUPP);
	event = perf_file->private_data;
	if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
		goto err_out;

	ee = bpf_event_entry_gen(perf_file, map_file);
	if (ee)
		return ee;
	ee = ERR_PTR(-ENOMEM);
err_out:
	fput(perf_file);
	return ee;
}

static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}

static void perf_event_fd_array_release(struct bpf_map *map,
					struct file *map_file)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_event_entry *ee;
	int i;

	rcu_read_lock();
	for (i = 0; i < array->map.max_entries; i++) {
		ee = READ_ONCE(array->ptrs[i]);
		if (ee && ee->map_file == map_file)
			fd_array_map_delete_elem(map, &i);
	}
	rcu_read_unlock();
}

static int perf_event_array_map_btf_id;
const struct bpf_map_ops perf_event_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	.map_release = perf_event_fd_array_release,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &perf_event_array_map_btf_id,
};

#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file /* not used */,
				     int fd)
{
	return cgroup_get_from_fd(fd);
}

static void cgroup_fd_array_put_ptr(void *ptr)
{
	/* cgroup_put() frees cgrp after an RCU grace period */
	cgroup_put(ptr);
}

static void cgroup_fd_array_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static int cgroup_array_map_btf_id;
const struct bpf_map_ops cgroup_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = cgroup_fd_array_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &cgroup_array_map_btf_id,
};
#endif

static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = array_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void array_of_map_free(struct bpf_map *map)
{
	/* map->inner_map_meta is only accessed by syscall which
	 * is protected by fdget/fdput.
	 */
	bpf_map_meta_free(map->inner_map_meta);
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = array_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

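/* Like array_map_gen_lookup(), but the element stores a pointer to an
 * inner map, so the generated code additionally loads that pointer and
 * returns NULL when the slot is empty.
 */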
static u32 array_of_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 elem_size = round_up(map->value_size, 8);
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	if (!map->bypass_spec_v1) {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
	} else {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
	}
	if (is_power_of_2(elem_size))
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	else
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);

	return insn - insn_buf;
}

static int array_of_maps_map_btf_id;
const struct bpf_map_ops array_of_maps_map_ops = {
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_of_map_alloc,
	.map_free = array_of_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_of_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = array_of_map_gen_lookup,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &array_of_maps_map_btf_id,
};