1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 3 * Copyright (c) 2016,2017 Facebook 4 */ 5 #include <linux/bpf.h> 6 #include <linux/btf.h> 7 #include <linux/err.h> 8 #include <linux/slab.h> 9 #include <linux/mm.h> 10 #include <linux/filter.h> 11 #include <linux/perf_event.h> 12 #include <uapi/linux/btf.h> 13 #include <linux/rcupdate_trace.h> 14 15 #include "map_in_map.h" 16 17 #define ARRAY_CREATE_FLAG_MASK \ 18 (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ 19 BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP) 20 21 static void bpf_array_free_percpu(struct bpf_array *array) 22 { 23 int i; 24 25 for (i = 0; i < array->map.max_entries; i++) { 26 free_percpu(array->pptrs[i]); 27 cond_resched(); 28 } 29 } 30 31 static int bpf_array_alloc_percpu(struct bpf_array *array) 32 { 33 void __percpu *ptr; 34 int i; 35 36 for (i = 0; i < array->map.max_entries; i++) { 37 ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8, 38 GFP_USER | __GFP_NOWARN); 39 if (!ptr) { 40 bpf_array_free_percpu(array); 41 return -ENOMEM; 42 } 43 array->pptrs[i] = ptr; 44 cond_resched(); 45 } 46 47 return 0; 48 } 49 50 /* Called from syscall */ 51 int array_map_alloc_check(union bpf_attr *attr) 52 { 53 bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; 54 int numa_node = bpf_map_attr_numa_node(attr); 55 56 /* check sanity of attributes */ 57 if (attr->max_entries == 0 || attr->key_size != 4 || 58 attr->value_size == 0 || 59 attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || 60 !bpf_map_flags_access_ok(attr->map_flags) || 61 (percpu && numa_node != NUMA_NO_NODE)) 62 return -EINVAL; 63 64 if (attr->map_type != BPF_MAP_TYPE_ARRAY && 65 attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) 66 return -EINVAL; 67 68 if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && 69 attr->map_flags & BPF_F_PRESERVE_ELEMS) 70 return -EINVAL; 71 72 if (attr->value_size > KMALLOC_MAX_SIZE) 73 /* if value_size is bigger, the user space won't be able to 74 * access the elements. 75 */ 76 return -E2BIG; 77 78 return 0; 79 } 80 81 static struct bpf_map *array_map_alloc(union bpf_attr *attr) 82 { 83 bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; 84 int numa_node = bpf_map_attr_numa_node(attr); 85 u32 elem_size, index_mask, max_entries; 86 bool bypass_spec_v1 = bpf_bypass_spec_v1(); 87 u64 array_size, mask64; 88 struct bpf_array *array; 89 90 elem_size = round_up(attr->value_size, 8); 91 92 max_entries = attr->max_entries; 93 94 /* On 32 bit archs roundup_pow_of_two() with max_entries that has 95 * upper most bit set in u32 space is undefined behavior due to 96 * resulting 1U << 32, so do it manually here in u64 space. 97 */ 98 mask64 = fls_long(max_entries - 1); 99 mask64 = 1ULL << mask64; 100 mask64 -= 1; 101 102 index_mask = mask64; 103 if (!bypass_spec_v1) { 104 /* round up array size to nearest power of 2, 105 * since cpu will speculate within index_mask limits 106 */ 107 max_entries = index_mask + 1; 108 /* Check for overflows. */ 109 if (max_entries < attr->max_entries) 110 return ERR_PTR(-E2BIG); 111 } 112 113 array_size = sizeof(*array); 114 if (percpu) { 115 array_size += (u64) max_entries * sizeof(void *); 116 } else { 117 /* rely on vmalloc() to return page-aligned memory and 118 * ensure array->value is exactly page-aligned 119 */ 120 if (attr->map_flags & BPF_F_MMAPABLE) { 121 array_size = PAGE_ALIGN(array_size); 122 array_size += PAGE_ALIGN((u64) max_entries * elem_size); 123 } else { 124 array_size += (u64) max_entries * elem_size; 125 } 126 } 127 128 /* allocate all map elements and zero-initialize them */ 129 if (attr->map_flags & BPF_F_MMAPABLE) { 130 void *data; 131 132 /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ 133 data = bpf_map_area_mmapable_alloc(array_size, numa_node); 134 if (!data) 135 return ERR_PTR(-ENOMEM); 136 array = data + PAGE_ALIGN(sizeof(struct bpf_array)) 137 - offsetof(struct bpf_array, value); 138 } else { 139 array = bpf_map_area_alloc(array_size, numa_node); 140 } 141 if (!array) 142 return ERR_PTR(-ENOMEM); 143 array->index_mask = index_mask; 144 array->map.bypass_spec_v1 = bypass_spec_v1; 145 146 /* copy mandatory map attributes */ 147 bpf_map_init_from_attr(&array->map, attr); 148 array->elem_size = elem_size; 149 150 if (percpu && bpf_array_alloc_percpu(array)) { 151 bpf_map_area_free(array); 152 return ERR_PTR(-ENOMEM); 153 } 154 155 return &array->map; 156 } 157 158 /* Called from syscall or from eBPF program */ 159 static void *array_map_lookup_elem(struct bpf_map *map, void *key) 160 { 161 struct bpf_array *array = container_of(map, struct bpf_array, map); 162 u32 index = *(u32 *)key; 163 164 if (unlikely(index >= array->map.max_entries)) 165 return NULL; 166 167 return array->value + array->elem_size * (index & array->index_mask); 168 } 169 170 static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, 171 u32 off) 172 { 173 struct bpf_array *array = container_of(map, struct bpf_array, map); 174 175 if (map->max_entries != 1) 176 return -ENOTSUPP; 177 if (off >= map->value_size) 178 return -EINVAL; 179 180 *imm = (unsigned long)array->value; 181 return 0; 182 } 183 184 static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, 185 u32 *off) 186 { 187 struct bpf_array *array = container_of(map, struct bpf_array, map); 188 u64 base = (unsigned long)array->value; 189 u64 range = array->elem_size; 190 191 if (map->max_entries != 1) 192 return -ENOTSUPP; 193 if (imm < base || imm >= base + range) 194 return -ENOENT; 195 196 *off = imm - base; 197 return 0; 198 } 199 200 /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ 201 static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) 202 { 203 struct bpf_array *array = container_of(map, struct bpf_array, map); 204 struct bpf_insn *insn = insn_buf; 205 u32 elem_size = round_up(map->value_size, 8); 206 const int ret = BPF_REG_0; 207 const int map_ptr = BPF_REG_1; 208 const int index = BPF_REG_2; 209 210 if (map->map_flags & BPF_F_INNER_MAP) 211 return -EOPNOTSUPP; 212 213 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); 214 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); 215 if (!map->bypass_spec_v1) { 216 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); 217 *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); 218 } else { 219 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); 220 } 221 222 if (is_power_of_2(elem_size)) { 223 *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); 224 } else { 225 *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); 226 } 227 *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); 228 *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); 229 *insn++ = BPF_MOV64_IMM(ret, 0); 230 return insn - insn_buf; 231 } 232 233 /* Called from eBPF program */ 234 static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) 235 { 236 struct bpf_array *array = container_of(map, struct bpf_array, map); 237 u32 index = *(u32 *)key; 238 239 if (unlikely(index >= array->map.max_entries)) 240 return NULL; 241 242 return this_cpu_ptr(array->pptrs[index & array->index_mask]); 243 } 244 245 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) 246 { 247 struct bpf_array *array = container_of(map, struct bpf_array, map); 248 u32 index = *(u32 *)key; 249 void __percpu *pptr; 250 int cpu, off = 0; 251 u32 size; 252 253 if (unlikely(index >= array->map.max_entries)) 254 return -ENOENT; 255 256 /* per_cpu areas are zero-filled and bpf programs can only 257 * access 'value_size' of them, so copying rounded areas 258 * will not leak any kernel data 259 */ 260 size = round_up(map->value_size, 8); 261 rcu_read_lock(); 262 pptr = array->pptrs[index & array->index_mask]; 263 for_each_possible_cpu(cpu) { 264 bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); 265 off += size; 266 } 267 rcu_read_unlock(); 268 return 0; 269 } 270 271 /* Called from syscall */ 272 static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) 273 { 274 struct bpf_array *array = container_of(map, struct bpf_array, map); 275 u32 index = key ? *(u32 *)key : U32_MAX; 276 u32 *next = (u32 *)next_key; 277 278 if (index >= array->map.max_entries) { 279 *next = 0; 280 return 0; 281 } 282 283 if (index == array->map.max_entries - 1) 284 return -ENOENT; 285 286 *next = index + 1; 287 return 0; 288 } 289 290 /* Called from syscall or from eBPF program */ 291 static int array_map_update_elem(struct bpf_map *map, void *key, void *value, 292 u64 map_flags) 293 { 294 struct bpf_array *array = container_of(map, struct bpf_array, map); 295 u32 index = *(u32 *)key; 296 char *val; 297 298 if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) 299 /* unknown flags */ 300 return -EINVAL; 301 302 if (unlikely(index >= array->map.max_entries)) 303 /* all elements were pre-allocated, cannot insert a new one */ 304 return -E2BIG; 305 306 if (unlikely(map_flags & BPF_NOEXIST)) 307 /* all elements already exist */ 308 return -EEXIST; 309 310 if (unlikely((map_flags & BPF_F_LOCK) && 311 !map_value_has_spin_lock(map))) 312 return -EINVAL; 313 314 if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 315 memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), 316 value, map->value_size); 317 } else { 318 val = array->value + 319 array->elem_size * (index & array->index_mask); 320 if (map_flags & BPF_F_LOCK) 321 copy_map_value_locked(map, val, value, false); 322 else 323 copy_map_value(map, val, value); 324 } 325 return 0; 326 } 327 328 int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, 329 u64 map_flags) 330 { 331 struct bpf_array *array = container_of(map, struct bpf_array, map); 332 u32 index = *(u32 *)key; 333 void __percpu *pptr; 334 int cpu, off = 0; 335 u32 size; 336 337 if (unlikely(map_flags > BPF_EXIST)) 338 /* unknown flags */ 339 return -EINVAL; 340 341 if (unlikely(index >= array->map.max_entries)) 342 /* all elements were pre-allocated, cannot insert a new one */ 343 return -E2BIG; 344 345 if (unlikely(map_flags == BPF_NOEXIST)) 346 /* all elements already exist */ 347 return -EEXIST; 348 349 /* the user space will provide round_up(value_size, 8) bytes that 350 * will be copied into per-cpu area. bpf programs can only access 351 * value_size of it. During lookup the same extra bytes will be 352 * returned or zeros which were zero-filled by percpu_alloc, 353 * so no kernel data leaks possible 354 */ 355 size = round_up(map->value_size, 8); 356 rcu_read_lock(); 357 pptr = array->pptrs[index & array->index_mask]; 358 for_each_possible_cpu(cpu) { 359 bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); 360 off += size; 361 } 362 rcu_read_unlock(); 363 return 0; 364 } 365 366 /* Called from syscall or from eBPF program */ 367 static int array_map_delete_elem(struct bpf_map *map, void *key) 368 { 369 return -EINVAL; 370 } 371 372 static void *array_map_vmalloc_addr(struct bpf_array *array) 373 { 374 return (void *)round_down((unsigned long)array, PAGE_SIZE); 375 } 376 377 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ 378 static void array_map_free(struct bpf_map *map) 379 { 380 struct bpf_array *array = container_of(map, struct bpf_array, map); 381 382 if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) 383 bpf_array_free_percpu(array); 384 385 if (array->map.map_flags & BPF_F_MMAPABLE) 386 bpf_map_area_free(array_map_vmalloc_addr(array)); 387 else 388 bpf_map_area_free(array); 389 } 390 391 static void array_map_seq_show_elem(struct bpf_map *map, void *key, 392 struct seq_file *m) 393 { 394 void *value; 395 396 rcu_read_lock(); 397 398 value = array_map_lookup_elem(map, key); 399 if (!value) { 400 rcu_read_unlock(); 401 return; 402 } 403 404 if (map->btf_key_type_id) 405 seq_printf(m, "%u: ", *(u32 *)key); 406 btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); 407 seq_puts(m, "\n"); 408 409 rcu_read_unlock(); 410 } 411 412 static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, 413 struct seq_file *m) 414 { 415 struct bpf_array *array = container_of(map, struct bpf_array, map); 416 u32 index = *(u32 *)key; 417 void __percpu *pptr; 418 int cpu; 419 420 rcu_read_lock(); 421 422 seq_printf(m, "%u: {\n", *(u32 *)key); 423 pptr = array->pptrs[index & array->index_mask]; 424 for_each_possible_cpu(cpu) { 425 seq_printf(m, "\tcpu%d: ", cpu); 426 btf_type_seq_show(map->btf, map->btf_value_type_id, 427 per_cpu_ptr(pptr, cpu), m); 428 seq_puts(m, "\n"); 429 } 430 seq_puts(m, "}\n"); 431 432 rcu_read_unlock(); 433 } 434 435 static int array_map_check_btf(const struct bpf_map *map, 436 const struct btf *btf, 437 const struct btf_type *key_type, 438 const struct btf_type *value_type) 439 { 440 u32 int_data; 441 442 /* One exception for keyless BTF: .bss/.data/.rodata map */ 443 if (btf_type_is_void(key_type)) { 444 if (map->map_type != BPF_MAP_TYPE_ARRAY || 445 map->max_entries != 1) 446 return -EINVAL; 447 448 if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) 449 return -EINVAL; 450 451 return 0; 452 } 453 454 if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) 455 return -EINVAL; 456 457 int_data = *(u32 *)(key_type + 1); 458 /* bpf array can only take a u32 key. This check makes sure 459 * that the btf matches the attr used during map_create. 460 */ 461 if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) 462 return -EINVAL; 463 464 return 0; 465 } 466 467 static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) 468 { 469 struct bpf_array *array = container_of(map, struct bpf_array, map); 470 pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT; 471 472 if (!(map->map_flags & BPF_F_MMAPABLE)) 473 return -EINVAL; 474 475 if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > 476 PAGE_ALIGN((u64)array->map.max_entries * array->elem_size)) 477 return -EINVAL; 478 479 return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), 480 vma->vm_pgoff + pgoff); 481 } 482 483 static bool array_map_meta_equal(const struct bpf_map *meta0, 484 const struct bpf_map *meta1) 485 { 486 if (!bpf_map_meta_equal(meta0, meta1)) 487 return false; 488 return meta0->map_flags & BPF_F_INNER_MAP ? true : 489 meta0->max_entries == meta1->max_entries; 490 } 491 492 struct bpf_iter_seq_array_map_info { 493 struct bpf_map *map; 494 void *percpu_value_buf; 495 u32 index; 496 }; 497 498 static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) 499 { 500 struct bpf_iter_seq_array_map_info *info = seq->private; 501 struct bpf_map *map = info->map; 502 struct bpf_array *array; 503 u32 index; 504 505 if (info->index >= map->max_entries) 506 return NULL; 507 508 if (*pos == 0) 509 ++*pos; 510 array = container_of(map, struct bpf_array, map); 511 index = info->index & array->index_mask; 512 if (info->percpu_value_buf) 513 return array->pptrs[index]; 514 return array->value + array->elem_size * index; 515 } 516 517 static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) 518 { 519 struct bpf_iter_seq_array_map_info *info = seq->private; 520 struct bpf_map *map = info->map; 521 struct bpf_array *array; 522 u32 index; 523 524 ++*pos; 525 ++info->index; 526 if (info->index >= map->max_entries) 527 return NULL; 528 529 array = container_of(map, struct bpf_array, map); 530 index = info->index & array->index_mask; 531 if (info->percpu_value_buf) 532 return array->pptrs[index]; 533 return array->value + array->elem_size * index; 534 } 535 536 static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) 537 { 538 struct bpf_iter_seq_array_map_info *info = seq->private; 539 struct bpf_iter__bpf_map_elem ctx = {}; 540 struct bpf_map *map = info->map; 541 struct bpf_iter_meta meta; 542 struct bpf_prog *prog; 543 int off = 0, cpu = 0; 544 void __percpu **pptr; 545 u32 size; 546 547 meta.seq = seq; 548 prog = bpf_iter_get_info(&meta, v == NULL); 549 if (!prog) 550 return 0; 551 552 ctx.meta = &meta; 553 ctx.map = info->map; 554 if (v) { 555 ctx.key = &info->index; 556 557 if (!info->percpu_value_buf) { 558 ctx.value = v; 559 } else { 560 pptr = v; 561 size = round_up(map->value_size, 8); 562 for_each_possible_cpu(cpu) { 563 bpf_long_memcpy(info->percpu_value_buf + off, 564 per_cpu_ptr(pptr, cpu), 565 size); 566 off += size; 567 } 568 ctx.value = info->percpu_value_buf; 569 } 570 } 571 572 return bpf_iter_run_prog(prog, &ctx); 573 } 574 575 static int bpf_array_map_seq_show(struct seq_file *seq, void *v) 576 { 577 return __bpf_array_map_seq_show(seq, v); 578 } 579 580 static void bpf_array_map_seq_stop(struct seq_file *seq, void *v) 581 { 582 if (!v) 583 (void)__bpf_array_map_seq_show(seq, NULL); 584 } 585 586 static int bpf_iter_init_array_map(void *priv_data, 587 struct bpf_iter_aux_info *aux) 588 { 589 struct bpf_iter_seq_array_map_info *seq_info = priv_data; 590 struct bpf_map *map = aux->map; 591 void *value_buf; 592 u32 buf_size; 593 594 if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 595 buf_size = round_up(map->value_size, 8) * num_possible_cpus(); 596 value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); 597 if (!value_buf) 598 return -ENOMEM; 599 600 seq_info->percpu_value_buf = value_buf; 601 } 602 603 seq_info->map = map; 604 return 0; 605 } 606 607 static void bpf_iter_fini_array_map(void *priv_data) 608 { 609 struct bpf_iter_seq_array_map_info *seq_info = priv_data; 610 611 kfree(seq_info->percpu_value_buf); 612 } 613 614 static const struct seq_operations bpf_array_map_seq_ops = { 615 .start = bpf_array_map_seq_start, 616 .next = bpf_array_map_seq_next, 617 .stop = bpf_array_map_seq_stop, 618 .show = bpf_array_map_seq_show, 619 }; 620 621 static const struct bpf_iter_seq_info iter_seq_info = { 622 .seq_ops = &bpf_array_map_seq_ops, 623 .init_seq_private = bpf_iter_init_array_map, 624 .fini_seq_private = bpf_iter_fini_array_map, 625 .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), 626 }; 627 628 static int array_map_btf_id; 629 const struct bpf_map_ops array_map_ops = { 630 .map_meta_equal = array_map_meta_equal, 631 .map_alloc_check = array_map_alloc_check, 632 .map_alloc = array_map_alloc, 633 .map_free = array_map_free, 634 .map_get_next_key = array_map_get_next_key, 635 .map_lookup_elem = array_map_lookup_elem, 636 .map_update_elem = array_map_update_elem, 637 .map_delete_elem = array_map_delete_elem, 638 .map_gen_lookup = array_map_gen_lookup, 639 .map_direct_value_addr = array_map_direct_value_addr, 640 .map_direct_value_meta = array_map_direct_value_meta, 641 .map_mmap = array_map_mmap, 642 .map_seq_show_elem = array_map_seq_show_elem, 643 .map_check_btf = array_map_check_btf, 644 .map_lookup_batch = generic_map_lookup_batch, 645 .map_update_batch = generic_map_update_batch, 646 .map_btf_name = "bpf_array", 647 .map_btf_id = &array_map_btf_id, 648 .iter_seq_info = &iter_seq_info, 649 }; 650 651 static int percpu_array_map_btf_id; 652 const struct bpf_map_ops percpu_array_map_ops = { 653 .map_meta_equal = bpf_map_meta_equal, 654 .map_alloc_check = array_map_alloc_check, 655 .map_alloc = array_map_alloc, 656 .map_free = array_map_free, 657 .map_get_next_key = array_map_get_next_key, 658 .map_lookup_elem = percpu_array_map_lookup_elem, 659 .map_update_elem = array_map_update_elem, 660 .map_delete_elem = array_map_delete_elem, 661 .map_seq_show_elem = percpu_array_map_seq_show_elem, 662 .map_check_btf = array_map_check_btf, 663 .map_btf_name = "bpf_array", 664 .map_btf_id = &percpu_array_map_btf_id, 665 .iter_seq_info = &iter_seq_info, 666 }; 667 668 static int fd_array_map_alloc_check(union bpf_attr *attr) 669 { 670 /* only file descriptors can be stored in this type of map */ 671 if (attr->value_size != sizeof(u32)) 672 return -EINVAL; 673 /* Program read-only/write-only not supported for special maps yet. */ 674 if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) 675 return -EINVAL; 676 return array_map_alloc_check(attr); 677 } 678 679 static void fd_array_map_free(struct bpf_map *map) 680 { 681 struct bpf_array *array = container_of(map, struct bpf_array, map); 682 int i; 683 684 /* make sure it's empty */ 685 for (i = 0; i < array->map.max_entries; i++) 686 BUG_ON(array->ptrs[i] != NULL); 687 688 bpf_map_area_free(array); 689 } 690 691 static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) 692 { 693 return ERR_PTR(-EOPNOTSUPP); 694 } 695 696 /* only called from syscall */ 697 int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) 698 { 699 void **elem, *ptr; 700 int ret = 0; 701 702 if (!map->ops->map_fd_sys_lookup_elem) 703 return -ENOTSUPP; 704 705 rcu_read_lock(); 706 elem = array_map_lookup_elem(map, key); 707 if (elem && (ptr = READ_ONCE(*elem))) 708 *value = map->ops->map_fd_sys_lookup_elem(ptr); 709 else 710 ret = -ENOENT; 711 rcu_read_unlock(); 712 713 return ret; 714 } 715 716 /* only called from syscall */ 717 int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, 718 void *key, void *value, u64 map_flags) 719 { 720 struct bpf_array *array = container_of(map, struct bpf_array, map); 721 void *new_ptr, *old_ptr; 722 u32 index = *(u32 *)key, ufd; 723 724 if (map_flags != BPF_ANY) 725 return -EINVAL; 726 727 if (index >= array->map.max_entries) 728 return -E2BIG; 729 730 ufd = *(u32 *)value; 731 new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); 732 if (IS_ERR(new_ptr)) 733 return PTR_ERR(new_ptr); 734 735 if (map->ops->map_poke_run) { 736 mutex_lock(&array->aux->poke_mutex); 737 old_ptr = xchg(array->ptrs + index, new_ptr); 738 map->ops->map_poke_run(map, index, old_ptr, new_ptr); 739 mutex_unlock(&array->aux->poke_mutex); 740 } else { 741 old_ptr = xchg(array->ptrs + index, new_ptr); 742 } 743 744 if (old_ptr) 745 map->ops->map_fd_put_ptr(old_ptr); 746 return 0; 747 } 748 749 static int fd_array_map_delete_elem(struct bpf_map *map, void *key) 750 { 751 struct bpf_array *array = container_of(map, struct bpf_array, map); 752 void *old_ptr; 753 u32 index = *(u32 *)key; 754 755 if (index >= array->map.max_entries) 756 return -E2BIG; 757 758 if (map->ops->map_poke_run) { 759 mutex_lock(&array->aux->poke_mutex); 760 old_ptr = xchg(array->ptrs + index, NULL); 761 map->ops->map_poke_run(map, index, old_ptr, NULL); 762 mutex_unlock(&array->aux->poke_mutex); 763 } else { 764 old_ptr = xchg(array->ptrs + index, NULL); 765 } 766 767 if (old_ptr) { 768 map->ops->map_fd_put_ptr(old_ptr); 769 return 0; 770 } else { 771 return -ENOENT; 772 } 773 } 774 775 static void *prog_fd_array_get_ptr(struct bpf_map *map, 776 struct file *map_file, int fd) 777 { 778 struct bpf_array *array = container_of(map, struct bpf_array, map); 779 struct bpf_prog *prog = bpf_prog_get(fd); 780 781 if (IS_ERR(prog)) 782 return prog; 783 784 if (!bpf_prog_array_compatible(array, prog)) { 785 bpf_prog_put(prog); 786 return ERR_PTR(-EINVAL); 787 } 788 789 return prog; 790 } 791 792 static void prog_fd_array_put_ptr(void *ptr) 793 { 794 bpf_prog_put(ptr); 795 } 796 797 static u32 prog_fd_array_sys_lookup_elem(void *ptr) 798 { 799 return ((struct bpf_prog *)ptr)->aux->id; 800 } 801 802 /* decrement refcnt of all bpf_progs that are stored in this map */ 803 static void bpf_fd_array_map_clear(struct bpf_map *map) 804 { 805 struct bpf_array *array = container_of(map, struct bpf_array, map); 806 int i; 807 808 for (i = 0; i < array->map.max_entries; i++) 809 fd_array_map_delete_elem(map, &i); 810 } 811 812 static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, 813 struct seq_file *m) 814 { 815 void **elem, *ptr; 816 u32 prog_id; 817 818 rcu_read_lock(); 819 820 elem = array_map_lookup_elem(map, key); 821 if (elem) { 822 ptr = READ_ONCE(*elem); 823 if (ptr) { 824 seq_printf(m, "%u: ", *(u32 *)key); 825 prog_id = prog_fd_array_sys_lookup_elem(ptr); 826 btf_type_seq_show(map->btf, map->btf_value_type_id, 827 &prog_id, m); 828 seq_puts(m, "\n"); 829 } 830 } 831 832 rcu_read_unlock(); 833 } 834 835 struct prog_poke_elem { 836 struct list_head list; 837 struct bpf_prog_aux *aux; 838 }; 839 840 static int prog_array_map_poke_track(struct bpf_map *map, 841 struct bpf_prog_aux *prog_aux) 842 { 843 struct prog_poke_elem *elem; 844 struct bpf_array_aux *aux; 845 int ret = 0; 846 847 aux = container_of(map, struct bpf_array, map)->aux; 848 mutex_lock(&aux->poke_mutex); 849 list_for_each_entry(elem, &aux->poke_progs, list) { 850 if (elem->aux == prog_aux) 851 goto out; 852 } 853 854 elem = kmalloc(sizeof(*elem), GFP_KERNEL); 855 if (!elem) { 856 ret = -ENOMEM; 857 goto out; 858 } 859 860 INIT_LIST_HEAD(&elem->list); 861 /* We must track the program's aux info at this point in time 862 * since the program pointer itself may not be stable yet, see 863 * also comment in prog_array_map_poke_run(). 864 */ 865 elem->aux = prog_aux; 866 867 list_add_tail(&elem->list, &aux->poke_progs); 868 out: 869 mutex_unlock(&aux->poke_mutex); 870 return ret; 871 } 872 873 static void prog_array_map_poke_untrack(struct bpf_map *map, 874 struct bpf_prog_aux *prog_aux) 875 { 876 struct prog_poke_elem *elem, *tmp; 877 struct bpf_array_aux *aux; 878 879 aux = container_of(map, struct bpf_array, map)->aux; 880 mutex_lock(&aux->poke_mutex); 881 list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { 882 if (elem->aux == prog_aux) { 883 list_del_init(&elem->list); 884 kfree(elem); 885 break; 886 } 887 } 888 mutex_unlock(&aux->poke_mutex); 889 } 890 891 static void prog_array_map_poke_run(struct bpf_map *map, u32 key, 892 struct bpf_prog *old, 893 struct bpf_prog *new) 894 { 895 u8 *old_addr, *new_addr, *old_bypass_addr; 896 struct prog_poke_elem *elem; 897 struct bpf_array_aux *aux; 898 899 aux = container_of(map, struct bpf_array, map)->aux; 900 WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); 901 902 list_for_each_entry(elem, &aux->poke_progs, list) { 903 struct bpf_jit_poke_descriptor *poke; 904 int i, ret; 905 906 for (i = 0; i < elem->aux->size_poke_tab; i++) { 907 poke = &elem->aux->poke_tab[i]; 908 909 /* Few things to be aware of: 910 * 911 * 1) We can only ever access aux in this context, but 912 * not aux->prog since it might not be stable yet and 913 * there could be danger of use after free otherwise. 914 * 2) Initially when we start tracking aux, the program 915 * is not JITed yet and also does not have a kallsyms 916 * entry. We skip these as poke->tailcall_target_stable 917 * is not active yet. The JIT will do the final fixup 918 * before setting it stable. The various 919 * poke->tailcall_target_stable are successively 920 * activated, so tail call updates can arrive from here 921 * while JIT is still finishing its final fixup for 922 * non-activated poke entries. 923 * 3) On program teardown, the program's kallsym entry gets 924 * removed out of RCU callback, but we can only untrack 925 * from sleepable context, therefore bpf_arch_text_poke() 926 * might not see that this is in BPF text section and 927 * bails out with -EINVAL. As these are unreachable since 928 * RCU grace period already passed, we simply skip them. 929 * 4) Also programs reaching refcount of zero while patching 930 * is in progress is okay since we're protected under 931 * poke_mutex and untrack the programs before the JIT 932 * buffer is freed. When we're still in the middle of 933 * patching and suddenly kallsyms entry of the program 934 * gets evicted, we just skip the rest which is fine due 935 * to point 3). 936 * 5) Any other error happening below from bpf_arch_text_poke() 937 * is a unexpected bug. 938 */ 939 if (!READ_ONCE(poke->tailcall_target_stable)) 940 continue; 941 if (poke->reason != BPF_POKE_REASON_TAIL_CALL) 942 continue; 943 if (poke->tail_call.map != map || 944 poke->tail_call.key != key) 945 continue; 946 947 old_bypass_addr = old ? NULL : poke->bypass_addr; 948 old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; 949 new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; 950 951 if (new) { 952 ret = bpf_arch_text_poke(poke->tailcall_target, 953 BPF_MOD_JUMP, 954 old_addr, new_addr); 955 BUG_ON(ret < 0 && ret != -EINVAL); 956 if (!old) { 957 ret = bpf_arch_text_poke(poke->tailcall_bypass, 958 BPF_MOD_JUMP, 959 poke->bypass_addr, 960 NULL); 961 BUG_ON(ret < 0 && ret != -EINVAL); 962 } 963 } else { 964 ret = bpf_arch_text_poke(poke->tailcall_bypass, 965 BPF_MOD_JUMP, 966 old_bypass_addr, 967 poke->bypass_addr); 968 BUG_ON(ret < 0 && ret != -EINVAL); 969 /* let other CPUs finish the execution of program 970 * so that it will not possible to expose them 971 * to invalid nop, stack unwind, nop state 972 */ 973 if (!ret) 974 synchronize_rcu(); 975 ret = bpf_arch_text_poke(poke->tailcall_target, 976 BPF_MOD_JUMP, 977 old_addr, NULL); 978 BUG_ON(ret < 0 && ret != -EINVAL); 979 } 980 } 981 } 982 } 983 984 static void prog_array_map_clear_deferred(struct work_struct *work) 985 { 986 struct bpf_map *map = container_of(work, struct bpf_array_aux, 987 work)->map; 988 bpf_fd_array_map_clear(map); 989 bpf_map_put(map); 990 } 991 992 static void prog_array_map_clear(struct bpf_map *map) 993 { 994 struct bpf_array_aux *aux = container_of(map, struct bpf_array, 995 map)->aux; 996 bpf_map_inc(map); 997 schedule_work(&aux->work); 998 } 999 1000 static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) 1001 { 1002 struct bpf_array_aux *aux; 1003 struct bpf_map *map; 1004 1005 aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT); 1006 if (!aux) 1007 return ERR_PTR(-ENOMEM); 1008 1009 INIT_WORK(&aux->work, prog_array_map_clear_deferred); 1010 INIT_LIST_HEAD(&aux->poke_progs); 1011 mutex_init(&aux->poke_mutex); 1012 1013 map = array_map_alloc(attr); 1014 if (IS_ERR(map)) { 1015 kfree(aux); 1016 return map; 1017 } 1018 1019 container_of(map, struct bpf_array, map)->aux = aux; 1020 aux->map = map; 1021 1022 return map; 1023 } 1024 1025 static void prog_array_map_free(struct bpf_map *map) 1026 { 1027 struct prog_poke_elem *elem, *tmp; 1028 struct bpf_array_aux *aux; 1029 1030 aux = container_of(map, struct bpf_array, map)->aux; 1031 list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { 1032 list_del_init(&elem->list); 1033 kfree(elem); 1034 } 1035 kfree(aux); 1036 fd_array_map_free(map); 1037 } 1038 1039 /* prog_array->aux->{type,jited} is a runtime binding. 1040 * Doing static check alone in the verifier is not enough. 1041 * Thus, prog_array_map cannot be used as an inner_map 1042 * and map_meta_equal is not implemented. 1043 */ 1044 static int prog_array_map_btf_id; 1045 const struct bpf_map_ops prog_array_map_ops = { 1046 .map_alloc_check = fd_array_map_alloc_check, 1047 .map_alloc = prog_array_map_alloc, 1048 .map_free = prog_array_map_free, 1049 .map_poke_track = prog_array_map_poke_track, 1050 .map_poke_untrack = prog_array_map_poke_untrack, 1051 .map_poke_run = prog_array_map_poke_run, 1052 .map_get_next_key = array_map_get_next_key, 1053 .map_lookup_elem = fd_array_map_lookup_elem, 1054 .map_delete_elem = fd_array_map_delete_elem, 1055 .map_fd_get_ptr = prog_fd_array_get_ptr, 1056 .map_fd_put_ptr = prog_fd_array_put_ptr, 1057 .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, 1058 .map_release_uref = prog_array_map_clear, 1059 .map_seq_show_elem = prog_array_map_seq_show_elem, 1060 .map_btf_name = "bpf_array", 1061 .map_btf_id = &prog_array_map_btf_id, 1062 }; 1063 1064 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, 1065 struct file *map_file) 1066 { 1067 struct bpf_event_entry *ee; 1068 1069 ee = kzalloc(sizeof(*ee), GFP_ATOMIC); 1070 if (ee) { 1071 ee->event = perf_file->private_data; 1072 ee->perf_file = perf_file; 1073 ee->map_file = map_file; 1074 } 1075 1076 return ee; 1077 } 1078 1079 static void __bpf_event_entry_free(struct rcu_head *rcu) 1080 { 1081 struct bpf_event_entry *ee; 1082 1083 ee = container_of(rcu, struct bpf_event_entry, rcu); 1084 fput(ee->perf_file); 1085 kfree(ee); 1086 } 1087 1088 static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) 1089 { 1090 call_rcu(&ee->rcu, __bpf_event_entry_free); 1091 } 1092 1093 static void *perf_event_fd_array_get_ptr(struct bpf_map *map, 1094 struct file *map_file, int fd) 1095 { 1096 struct bpf_event_entry *ee; 1097 struct perf_event *event; 1098 struct file *perf_file; 1099 u64 value; 1100 1101 perf_file = perf_event_get(fd); 1102 if (IS_ERR(perf_file)) 1103 return perf_file; 1104 1105 ee = ERR_PTR(-EOPNOTSUPP); 1106 event = perf_file->private_data; 1107 if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) 1108 goto err_out; 1109 1110 ee = bpf_event_entry_gen(perf_file, map_file); 1111 if (ee) 1112 return ee; 1113 ee = ERR_PTR(-ENOMEM); 1114 err_out: 1115 fput(perf_file); 1116 return ee; 1117 } 1118 1119 static void perf_event_fd_array_put_ptr(void *ptr) 1120 { 1121 bpf_event_entry_free_rcu(ptr); 1122 } 1123 1124 static void perf_event_fd_array_release(struct bpf_map *map, 1125 struct file *map_file) 1126 { 1127 struct bpf_array *array = container_of(map, struct bpf_array, map); 1128 struct bpf_event_entry *ee; 1129 int i; 1130 1131 if (map->map_flags & BPF_F_PRESERVE_ELEMS) 1132 return; 1133 1134 rcu_read_lock(); 1135 for (i = 0; i < array->map.max_entries; i++) { 1136 ee = READ_ONCE(array->ptrs[i]); 1137 if (ee && ee->map_file == map_file) 1138 fd_array_map_delete_elem(map, &i); 1139 } 1140 rcu_read_unlock(); 1141 } 1142 1143 static void perf_event_fd_array_map_free(struct bpf_map *map) 1144 { 1145 if (map->map_flags & BPF_F_PRESERVE_ELEMS) 1146 bpf_fd_array_map_clear(map); 1147 fd_array_map_free(map); 1148 } 1149 1150 static int perf_event_array_map_btf_id; 1151 const struct bpf_map_ops perf_event_array_map_ops = { 1152 .map_meta_equal = bpf_map_meta_equal, 1153 .map_alloc_check = fd_array_map_alloc_check, 1154 .map_alloc = array_map_alloc, 1155 .map_free = perf_event_fd_array_map_free, 1156 .map_get_next_key = array_map_get_next_key, 1157 .map_lookup_elem = fd_array_map_lookup_elem, 1158 .map_delete_elem = fd_array_map_delete_elem, 1159 .map_fd_get_ptr = perf_event_fd_array_get_ptr, 1160 .map_fd_put_ptr = perf_event_fd_array_put_ptr, 1161 .map_release = perf_event_fd_array_release, 1162 .map_check_btf = map_check_no_btf, 1163 .map_btf_name = "bpf_array", 1164 .map_btf_id = &perf_event_array_map_btf_id, 1165 }; 1166 1167 #ifdef CONFIG_CGROUPS 1168 static void *cgroup_fd_array_get_ptr(struct bpf_map *map, 1169 struct file *map_file /* not used */, 1170 int fd) 1171 { 1172 return cgroup_get_from_fd(fd); 1173 } 1174 1175 static void cgroup_fd_array_put_ptr(void *ptr) 1176 { 1177 /* cgroup_put free cgrp after a rcu grace period */ 1178 cgroup_put(ptr); 1179 } 1180 1181 static void cgroup_fd_array_free(struct bpf_map *map) 1182 { 1183 bpf_fd_array_map_clear(map); 1184 fd_array_map_free(map); 1185 } 1186 1187 static int cgroup_array_map_btf_id; 1188 const struct bpf_map_ops cgroup_array_map_ops = { 1189 .map_meta_equal = bpf_map_meta_equal, 1190 .map_alloc_check = fd_array_map_alloc_check, 1191 .map_alloc = array_map_alloc, 1192 .map_free = cgroup_fd_array_free, 1193 .map_get_next_key = array_map_get_next_key, 1194 .map_lookup_elem = fd_array_map_lookup_elem, 1195 .map_delete_elem = fd_array_map_delete_elem, 1196 .map_fd_get_ptr = cgroup_fd_array_get_ptr, 1197 .map_fd_put_ptr = cgroup_fd_array_put_ptr, 1198 .map_check_btf = map_check_no_btf, 1199 .map_btf_name = "bpf_array", 1200 .map_btf_id = &cgroup_array_map_btf_id, 1201 }; 1202 #endif 1203 1204 static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) 1205 { 1206 struct bpf_map *map, *inner_map_meta; 1207 1208 inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); 1209 if (IS_ERR(inner_map_meta)) 1210 return inner_map_meta; 1211 1212 map = array_map_alloc(attr); 1213 if (IS_ERR(map)) { 1214 bpf_map_meta_free(inner_map_meta); 1215 return map; 1216 } 1217 1218 map->inner_map_meta = inner_map_meta; 1219 1220 return map; 1221 } 1222 1223 static void array_of_map_free(struct bpf_map *map) 1224 { 1225 /* map->inner_map_meta is only accessed by syscall which 1226 * is protected by fdget/fdput. 1227 */ 1228 bpf_map_meta_free(map->inner_map_meta); 1229 bpf_fd_array_map_clear(map); 1230 fd_array_map_free(map); 1231 } 1232 1233 static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) 1234 { 1235 struct bpf_map **inner_map = array_map_lookup_elem(map, key); 1236 1237 if (!inner_map) 1238 return NULL; 1239 1240 return READ_ONCE(*inner_map); 1241 } 1242 1243 static int array_of_map_gen_lookup(struct bpf_map *map, 1244 struct bpf_insn *insn_buf) 1245 { 1246 struct bpf_array *array = container_of(map, struct bpf_array, map); 1247 u32 elem_size = round_up(map->value_size, 8); 1248 struct bpf_insn *insn = insn_buf; 1249 const int ret = BPF_REG_0; 1250 const int map_ptr = BPF_REG_1; 1251 const int index = BPF_REG_2; 1252 1253 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); 1254 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); 1255 if (!map->bypass_spec_v1) { 1256 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); 1257 *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); 1258 } else { 1259 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); 1260 } 1261 if (is_power_of_2(elem_size)) 1262 *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); 1263 else 1264 *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); 1265 *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); 1266 *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); 1267 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); 1268 *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); 1269 *insn++ = BPF_MOV64_IMM(ret, 0); 1270 1271 return insn - insn_buf; 1272 } 1273 1274 static int array_of_maps_map_btf_id; 1275 const struct bpf_map_ops array_of_maps_map_ops = { 1276 .map_alloc_check = fd_array_map_alloc_check, 1277 .map_alloc = array_of_map_alloc, 1278 .map_free = array_of_map_free, 1279 .map_get_next_key = array_map_get_next_key, 1280 .map_lookup_elem = array_of_map_lookup_elem, 1281 .map_delete_elem = fd_array_map_delete_elem, 1282 .map_fd_get_ptr = bpf_map_fd_get_ptr, 1283 .map_fd_put_ptr = bpf_map_fd_put_ptr, 1284 .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, 1285 .map_gen_lookup = array_of_map_gen_lookup, 1286 .map_check_btf = map_check_no_btf, 1287 .map_btf_name = "bpf_array", 1288 .map_btf_id = &array_of_maps_map_btf_id, 1289 }; 1290