// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016,2017 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>

#include "map_in_map.h"

#define ARRAY_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
	 BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP)

static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		free_percpu(array->pptrs[i]);
		cond_resched();
	}
}

static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
		cond_resched();
	}

	return 0;
}

/* Called from syscall */
int array_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	int numa_node = bpf_map_attr_numa_node(attr);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size == 0 ||
	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags) ||
	    (percpu && numa_node != NUMA_NO_NODE))
		return -EINVAL;

	if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
	    attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
		return -EINVAL;

	if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
	    attr->map_flags & BPF_F_PRESERVE_ELEMS)
		return -EINVAL;

	if (attr->value_size > KMALLOC_MAX_SIZE)
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return -E2BIG;

	return 0;
}

static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	int ret, numa_node = bpf_map_attr_numa_node(attr);
	u32 elem_size, index_mask, max_entries;
	bool bypass_spec_v1 = bpf_bypass_spec_v1();
	u64 cost, array_size, mask64;
	struct bpf_map_memory mem;
	struct bpf_array *array;

	elem_size = round_up(attr->value_size, 8);

	max_entries = attr->max_entries;

	/* On 32 bit archs roundup_pow_of_two() with max_entries that has
	 * the uppermost bit set in u32 space is undefined behavior due to
	 * resulting 1U << 32, so do it manually here in u64 space.
	 */
	mask64 = fls_long(max_entries - 1);
	mask64 = 1ULL << mask64;
	mask64 -= 1;

	index_mask = mask64;
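	/* Worked example (values are illustrative): for max_entries == 1000,
	 * fls_long(999) is 10, so mask64 becomes (1ULL << 10) - 1 = 1023.
	 * index_mask is then 1023 and, when the Spectre v1 mitigation below
	 * is active, max_entries is rounded up to 1024.
	 */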
	if (!bypass_spec_v1) {
		/* round up array size to nearest power of 2,
		 * since cpu will speculate within index_mask limits
		 */
		max_entries = index_mask + 1;
		/* Check for overflows. */
		if (max_entries < attr->max_entries)
			return ERR_PTR(-E2BIG);
	}

	array_size = sizeof(*array);
	if (percpu) {
		array_size += (u64) max_entries * sizeof(void *);
	} else {
		/* rely on vmalloc() to return page-aligned memory and
		 * ensure array->value is exactly page-aligned
		 */
		if (attr->map_flags & BPF_F_MMAPABLE) {
			array_size = PAGE_ALIGN(array_size);
			array_size += PAGE_ALIGN((u64) max_entries * elem_size);
		} else {
			array_size += (u64) max_entries * elem_size;
		}
	}

	/* make sure there is no u32 overflow later in round_up() */
	cost = array_size;
	if (percpu)
		cost += (u64)attr->max_entries * elem_size * num_possible_cpus();

	ret = bpf_map_charge_init(&mem, cost);
	if (ret < 0)
		return ERR_PTR(ret);

	/* allocate all map elements and zero-initialize them */
	if (attr->map_flags & BPF_F_MMAPABLE) {
		void *data;

		/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
		data = bpf_map_area_mmapable_alloc(array_size, numa_node);
		if (!data) {
			bpf_map_charge_finish(&mem);
			return ERR_PTR(-ENOMEM);
		}
		array = data + PAGE_ALIGN(sizeof(struct bpf_array))
			- offsetof(struct bpf_array, value);
	} else {
		array = bpf_map_area_alloc(array_size, numa_node);
	}
	if (!array) {
		bpf_map_charge_finish(&mem);
		return ERR_PTR(-ENOMEM);
	}
	array->index_mask = index_mask;
	array->map.bypass_spec_v1 = bypass_spec_v1;

	/* copy mandatory map attributes */
	bpf_map_init_from_attr(&array->map, attr);
	bpf_map_charge_move(&array->map.memory, &mem);
	array->elem_size = elem_size;

	if (percpu && bpf_array_alloc_percpu(array)) {
		bpf_map_charge_finish(&array->map.memory);
		bpf_map_area_free(array);
		return ERR_PTR(-ENOMEM);
	}

	return &array->map;
}

/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * (index & array->index_mask);
}

static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
				       u32 off)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	if (map->max_entries != 1)
		return -ENOTSUPP;
	if (off >= map->value_size)
		return -EINVAL;

	*imm = (unsigned long)array->value;
	return 0;
}

static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
				       u32 *off)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u64 base = (unsigned long)array->value;
	u64 range = array->elem_size;

	if (map->max_entries != 1)
		return -ENOTSUPP;
	if (imm < base || imm >= base + range)
		return -ENOENT;

	*off = imm - base;
	return 0;
}

/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_insn *insn = insn_buf;
	u32 elem_size = round_up(map->value_size, 8);
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	if (map->map_flags & BPF_F_INNER_MAP)
		return -EOPNOTSUPP;
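
	/* The sequence emitted below is roughly the following (sketch in
	 * pseudo BPF assembly; R1 = map pointer, R2 = pointer to the u32 key,
	 * R0 = result):
	 *
	 *	r1 += offsetof(struct bpf_array, value)
	 *	r0  = *(u32 *)(r2 + 0)		// index
	 *	if r0 >= max_entries goto miss
	 *	r0 &= index_mask		// only with Spectre v1 mitigation
	 *	r0 <<= ilog2(elem_size)		// or r0 *= elem_size
	 *	r0 += r1			// &array->value[index * elem_size]
	 *	goto out
	 * miss:
	 *	r0  = 0				// NULL
	 * out:
	 */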
	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	if (!map->bypass_spec_v1) {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
	} else {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
	}

	if (is_power_of_2(elem_size)) {
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	} else {
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	}
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);
	return insn - insn_buf;
}

/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}

int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}
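
/* The 'value' buffer passed in from the syscall path above must hold
 * round_up(value_size, 8) bytes for every possible CPU. A minimal user space
 * sketch (libbpf wrappers assumed, error handling omitted):
 *
 *	__u32 key = 0;
 *	size_t vsz = (value_size + 7) & ~7;
 *	void *buf = malloc(vsz * libbpf_num_possible_cpus());
 *
 *	bpf_map_lookup_elem(map_fd, &key, buf);
 *	// the copy for CPU n starts at buf + n * vsz
 */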

/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	char *val;

	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags & BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	if (unlikely((map_flags & BPF_F_LOCK) &&
		     !map_value_has_spin_lock(map)))
		return -EINVAL;

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
		       value, map->value_size);
	} else {
		val = array->value +
			array->elem_size * (index & array->index_mask);
		if (map_flags & BPF_F_LOCK)
			copy_map_value_locked(map, val, value, false);
		else
			copy_map_value(map, val, value);
	}
	return 0;
}
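
/* Flag semantics for pre-allocated arrays, as enforced above: BPF_ANY and
 * BPF_EXIST both overwrite the slot in place, BPF_NOEXIST always fails with
 * -EEXIST, and BPF_F_LOCK is only accepted when the value contains a
 * struct bpf_spin_lock. Minimal user space sketch (libbpf wrapper assumed,
 * value layout illustrative):
 *
 *	__u32 key = 3;
 *	struct my_val v = { .counter = 1 };
 *
 *	bpf_map_update_elem(map_fd, &key, &v, BPF_ANY);
 */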

int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into the per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned, or the zeros percpu_alloc originally filled them with,
	 * so no kernel data can leak.
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}

static void *array_map_vmalloc_addr(struct bpf_array *array)
{
	return (void *)round_down((unsigned long)array, PAGE_SIZE);
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

	if (array->map.map_flags & BPF_F_MMAPABLE)
		bpf_map_area_free(array_map_vmalloc_addr(array));
	else
		bpf_map_area_free(array);
}

static void array_map_seq_show_elem(struct bpf_map *map, void *key,
				    struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = array_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	if (map->btf_key_type_id)
		seq_printf(m, "%u: ", *(u32 *)key);
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_puts(m, "\n");

	rcu_read_unlock();
}

static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
					   struct seq_file *m)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	seq_printf(m, "%u: {\n", *(u32 *)key);
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}
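
/* For a BTF-annotated per-cpu array pinned in bpffs, the callback above
 * produces a dump roughly along these lines (illustrative int values,
 * two CPUs):
 *
 *	0: {
 *		cpu0: 42
 *		cpu1: 7
 *	}
 */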

static int array_map_check_btf(const struct bpf_map *map,
			       const struct btf *btf,
			       const struct btf_type *key_type,
			       const struct btf_type *value_type)
{
	u32 int_data;

	/* One exception for keyless BTF: .bss/.data/.rodata map */
	if (btf_type_is_void(key_type)) {
		if (map->map_type != BPF_MAP_TYPE_ARRAY ||
		    map->max_entries != 1)
			return -EINVAL;

		if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC)
			return -EINVAL;

		return 0;
	}

	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
		return -EINVAL;

	int_data = *(u32 *)(key_type + 1);
	/* bpf array can only take a u32 key. This check makes sure
	 * that the btf matches the attr used during map_create.
	 */
	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
		return -EINVAL;

	return 0;
}

static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;

	if (!(map->map_flags & BPF_F_MMAPABLE))
		return -EINVAL;

	if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) >
	    PAGE_ALIGN((u64)array->map.max_entries * array->elem_size))
		return -EINVAL;

	return remap_vmalloc_range(vma, array_map_vmalloc_addr(array),
				   vma->vm_pgoff + pgoff);
}

static bool array_map_meta_equal(const struct bpf_map *meta0,
				 const struct bpf_map *meta1)
{
	if (!bpf_map_meta_equal(meta0, meta1))
		return false;
	return meta0->map_flags & BPF_F_INNER_MAP ? true :
	       meta0->max_entries == meta1->max_entries;
}

struct bpf_iter_seq_array_map_info {
	struct bpf_map *map;
	void *percpu_value_buf;
	u32 index;
};

static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_map *map = info->map;
	struct bpf_array *array;
	u32 index;

	if (info->index >= map->max_entries)
		return NULL;

	if (*pos == 0)
		++*pos;
	array = container_of(map, struct bpf_array, map);
	index = info->index & array->index_mask;
	if (info->percpu_value_buf)
		return array->pptrs[index];
	return array->value + array->elem_size * index;
}

static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_map *map = info->map;
	struct bpf_array *array;
	u32 index;

	++*pos;
	++info->index;
	if (info->index >= map->max_entries)
		return NULL;

	array = container_of(map, struct bpf_array, map);
	index = info->index & array->index_mask;
	if (info->percpu_value_buf)
		return array->pptrs[index];
	return array->value + array->elem_size * index;
}

static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_iter__bpf_map_elem ctx = {};
	struct bpf_map *map = info->map;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	int off = 0, cpu = 0;
	void __percpu **pptr;
	u32 size;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, v == NULL);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.map = info->map;
	if (v) {
		ctx.key = &info->index;

		if (!info->percpu_value_buf) {
			ctx.value = v;
		} else {
			pptr = v;
			size = round_up(map->value_size, 8);
			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(info->percpu_value_buf + off,
						per_cpu_ptr(pptr, cpu),
						size);
				off += size;
			}
			ctx.value = info->percpu_value_buf;
		}
	}

	return bpf_iter_run_prog(prog, &ctx);
}

static int bpf_array_map_seq_show(struct seq_file *seq, void *v)
{
	return __bpf_array_map_seq_show(seq, v);
}

static void bpf_array_map_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__bpf_array_map_seq_show(seq, NULL);
}
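
/* The iterator walks indices 0..max_entries-1. For per-cpu arrays the seq
 * callbacks hand the iterator program a flat copy of all per-CPU values, so
 * init below sizes a scratch buffer of round_up(value_size, 8) bytes per
 * possible CPU.
 */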

static int bpf_iter_init_array_map(void *priv_data,
				   struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_array_map_info *seq_info = priv_data;
	struct bpf_map *map = aux->map;
	void *value_buf;
	u32 buf_size;

	if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		buf_size = round_up(map->value_size, 8) * num_possible_cpus();
		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
		if (!value_buf)
			return -ENOMEM;

		seq_info->percpu_value_buf = value_buf;
	}

	seq_info->map = map;
	return 0;
}

static void bpf_iter_fini_array_map(void *priv_data)
{
	struct bpf_iter_seq_array_map_info *seq_info = priv_data;

	kfree(seq_info->percpu_value_buf);
}

static const struct seq_operations bpf_array_map_seq_ops = {
	.start = bpf_array_map_seq_start,
	.next = bpf_array_map_seq_next,
	.stop = bpf_array_map_seq_stop,
	.show = bpf_array_map_seq_show,
};

static const struct bpf_iter_seq_info iter_seq_info = {
	.seq_ops = &bpf_array_map_seq_ops,
	.init_seq_private = bpf_iter_init_array_map,
	.fini_seq_private = bpf_iter_fini_array_map,
	.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};

static int array_map_btf_id;
const struct bpf_map_ops array_map_ops = {
	.map_meta_equal = array_map_meta_equal,
	.map_alloc_check = array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_gen_lookup = array_map_gen_lookup,
	.map_direct_value_addr = array_map_direct_value_addr,
	.map_direct_value_meta = array_map_direct_value_meta,
	.map_mmap = array_map_mmap,
	.map_seq_show_elem = array_map_seq_show_elem,
	.map_check_btf = array_map_check_btf,
	.map_lookup_batch = generic_map_lookup_batch,
	.map_update_batch = generic_map_update_batch,
	.map_btf_name = "bpf_array",
	.map_btf_id = &array_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};

static int percpu_array_map_btf_id;
const struct bpf_map_ops percpu_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_seq_show_elem = percpu_array_map_seq_show_elem,
	.map_check_btf = array_map_check_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &percpu_array_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};
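
/* The map types below (prog array, perf event array, cgroup array, array of
 * maps) store kernel objects rather than plain data: user space passes a
 * file descriptor as the u32 value and ->map_fd_get_ptr() converts it into a
 * reference-counted kernel pointer, which ->map_fd_put_ptr() releases again
 * on overwrite or delete.
 */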

static int fd_array_map_alloc_check(union bpf_attr *attr)
{
	/* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	/* Program read-only/write-only not supported for special maps yet. */
	if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG))
		return -EINVAL;
	return array_map_alloc_check(attr);
}

static void fd_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->ptrs[i] != NULL);

	bpf_map_area_free(array);
}

static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EOPNOTSUPP);
}

/* only called from syscall */
int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **elem, *ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	elem = array_map_lookup_elem(map, key);
	if (elem && (ptr = READ_ONCE(*elem)))
		*value = map->ops->map_fd_sys_lookup_elem(ptr);
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
				 void *key, void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *new_ptr, *old_ptr;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	if (map->ops->map_poke_run) {
		mutex_lock(&array->aux->poke_mutex);
		old_ptr = xchg(array->ptrs + index, new_ptr);
		map->ops->map_poke_run(map, index, old_ptr, new_ptr);
		mutex_unlock(&array->aux->poke_mutex);
	} else {
		old_ptr = xchg(array->ptrs + index, new_ptr);
	}

	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);
	return 0;
}

static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *old_ptr;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	if (map->ops->map_poke_run) {
		mutex_lock(&array->aux->poke_mutex);
		old_ptr = xchg(array->ptrs + index, NULL);
		map->ops->map_poke_run(map, index, old_ptr, NULL);
		mutex_unlock(&array->aux->poke_mutex);
	} else {
		old_ptr = xchg(array->ptrs + index, NULL);
	}

	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
	} else {
		return -ENOENT;
	}
}

static void *prog_fd_array_get_ptr(struct bpf_map *map,
				   struct file *map_file, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);

	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}

static u32 prog_fd_array_sys_lookup_elem(void *ptr)
{
	return ((struct bpf_prog *)ptr)->aux->id;
}
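
/* On the BPF side a prog array is consumed exclusively through tail calls,
 * roughly (sketch of a BTF-defined map in a BPF program; names and sizes
 * are illustrative):
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
 *		__uint(max_entries, 8);
 *		__uint(key_size, sizeof(__u32));
 *		__uint(value_size, sizeof(__u32));
 *	} jmp_table SEC(".maps");
 *
 *	bpf_tail_call(ctx, &jmp_table, slot);
 *	// execution continues here only if the slot was empty or out of range
 */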

/* decrement refcnt of all bpf_progs that are stored in this map */
static void bpf_fd_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		fd_array_map_delete_elem(map, &i);
}

static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
					 struct seq_file *m)
{
	void **elem, *ptr;
	u32 prog_id;

	rcu_read_lock();

	elem = array_map_lookup_elem(map, key);
	if (elem) {
		ptr = READ_ONCE(*elem);
		if (ptr) {
			seq_printf(m, "%u: ", *(u32 *)key);
			prog_id = prog_fd_array_sys_lookup_elem(ptr);
			btf_type_seq_show(map->btf, map->btf_value_type_id,
					  &prog_id, m);
			seq_puts(m, "\n");
		}
	}

	rcu_read_unlock();
}

struct prog_poke_elem {
	struct list_head list;
	struct bpf_prog_aux *aux;
};

static int prog_array_map_poke_track(struct bpf_map *map,
				     struct bpf_prog_aux *prog_aux)
{
	struct prog_poke_elem *elem;
	struct bpf_array_aux *aux;
	int ret = 0;

	aux = container_of(map, struct bpf_array, map)->aux;
	mutex_lock(&aux->poke_mutex);
	list_for_each_entry(elem, &aux->poke_progs, list) {
		if (elem->aux == prog_aux)
			goto out;
	}

	elem = kmalloc(sizeof(*elem), GFP_KERNEL);
	if (!elem) {
		ret = -ENOMEM;
		goto out;
	}

	INIT_LIST_HEAD(&elem->list);
	/* We must track the program's aux info at this point in time
	 * since the program pointer itself may not be stable yet, see
	 * also comment in prog_array_map_poke_run().
	 */
	elem->aux = prog_aux;

	list_add_tail(&elem->list, &aux->poke_progs);
out:
	mutex_unlock(&aux->poke_mutex);
	return ret;
}

static void prog_array_map_poke_untrack(struct bpf_map *map,
					struct bpf_prog_aux *prog_aux)
{
	struct prog_poke_elem *elem, *tmp;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	mutex_lock(&aux->poke_mutex);
	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
		if (elem->aux == prog_aux) {
			list_del_init(&elem->list);
			kfree(elem);
			break;
		}
	}
	mutex_unlock(&aux->poke_mutex);
}
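
/* Called with aux->poke_mutex held from the update/delete paths above: walk
 * every tracked program's poke descriptors and retarget the tail call jumps
 * for this map slot via bpf_arch_text_poke().
 */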
static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
				    struct bpf_prog *old,
				    struct bpf_prog *new)
{
	u8 *old_addr, *new_addr, *old_bypass_addr;
	struct prog_poke_elem *elem;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));

	list_for_each_entry(elem, &aux->poke_progs, list) {
		struct bpf_jit_poke_descriptor *poke;
		int i, ret;

		for (i = 0; i < elem->aux->size_poke_tab; i++) {
			poke = &elem->aux->poke_tab[i];

			/* Few things to be aware of:
			 *
			 * 1) We can only ever access aux in this context, but
			 *    not aux->prog since it might not be stable yet and
			 *    there could be danger of use after free otherwise.
			 * 2) Initially when we start tracking aux, the program
			 *    is not JITed yet and also does not have a kallsyms
			 *    entry. We skip these as poke->tailcall_target_stable
			 *    is not active yet. The JIT will do the final fixup
			 *    before setting it stable. The various
			 *    poke->tailcall_target_stable are successively
			 *    activated, so tail call updates can arrive from here
			 *    while JIT is still finishing its final fixup for
			 *    non-activated poke entries.
			 * 3) On program teardown, the program's kallsyms entry gets
			 *    removed from an RCU callback, but we can only untrack
			 *    from sleepable context, therefore bpf_arch_text_poke()
			 *    might not see that this is in BPF text section and
			 *    bails out with -EINVAL. As these are unreachable since
			 *    the RCU grace period has already passed, we simply
			 *    skip them.
			 * 4) Also, programs reaching a refcount of zero while
			 *    patching is in progress are okay since we're protected
			 *    under poke_mutex and untrack the programs before the
			 *    JIT buffer is freed. When we're still in the middle of
			 *    patching and suddenly the kallsyms entry of the program
			 *    gets evicted, we just skip the rest, which is fine due
			 *    to point 3).
			 * 5) Any other error happening below from bpf_arch_text_poke()
			 *    is an unexpected bug.
			 */
			if (!READ_ONCE(poke->tailcall_target_stable))
				continue;
			if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
				continue;
			if (poke->tail_call.map != map ||
			    poke->tail_call.key != key)
				continue;

			old_bypass_addr = old ? NULL : poke->bypass_addr;
			old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
			new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;

			if (new) {
				ret = bpf_arch_text_poke(poke->tailcall_target,
							 BPF_MOD_JUMP,
							 old_addr, new_addr);
				BUG_ON(ret < 0 && ret != -EINVAL);
				if (!old) {
					ret = bpf_arch_text_poke(poke->tailcall_bypass,
								 BPF_MOD_JUMP,
								 poke->bypass_addr,
								 NULL);
					BUG_ON(ret < 0 && ret != -EINVAL);
				}
			} else {
				ret = bpf_arch_text_poke(poke->tailcall_bypass,
							 BPF_MOD_JUMP,
							 old_bypass_addr,
							 poke->bypass_addr);
				BUG_ON(ret < 0 && ret != -EINVAL);
				/* let other CPUs finish the execution of the
				 * program so that it will not be possible to
				 * expose them to an invalid nop, stack unwind,
				 * nop state
				 */
				if (!ret)
					synchronize_rcu();
				ret = bpf_arch_text_poke(poke->tailcall_target,
							 BPF_MOD_JUMP,
							 old_addr, NULL);
				BUG_ON(ret < 0 && ret != -EINVAL);
			}
		}
	}
}

static void prog_array_map_clear_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_array_aux,
					   work)->map;
	bpf_fd_array_map_clear(map);
	bpf_map_put(map);
}

static void prog_array_map_clear(struct bpf_map *map)
{
	struct bpf_array_aux *aux = container_of(map, struct bpf_array,
						 map)->aux;
	bpf_map_inc(map);
	schedule_work(&aux->work);
}
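
/* Clearing is deferred to a workqueue; the reference taken in
 * prog_array_map_clear() keeps the map alive until the deferred
 * bpf_fd_array_map_clear() has dropped all program references.
 */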

static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
{
	struct bpf_array_aux *aux;
	struct bpf_map *map;

	aux = kzalloc(sizeof(*aux), GFP_KERNEL);
	if (!aux)
		return ERR_PTR(-ENOMEM);

	INIT_WORK(&aux->work, prog_array_map_clear_deferred);
	INIT_LIST_HEAD(&aux->poke_progs);
	mutex_init(&aux->poke_mutex);

	map = array_map_alloc(attr);
	if (IS_ERR(map)) {
		kfree(aux);
		return map;
	}

	container_of(map, struct bpf_array, map)->aux = aux;
	aux->map = map;

	return map;
}

static void prog_array_map_free(struct bpf_map *map)
{
	struct prog_poke_elem *elem, *tmp;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
		list_del_init(&elem->list);
		kfree(elem);
	}
	kfree(aux);
	fd_array_map_free(map);
}

/* prog_array->aux->{type,jited} is a runtime binding.
 * Doing static check alone in the verifier is not enough.
 * Thus, prog_array_map cannot be used as an inner_map
 * and map_meta_equal is not implemented.
 */
static int prog_array_map_btf_id;
const struct bpf_map_ops prog_array_map_ops = {
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = prog_array_map_alloc,
	.map_free = prog_array_map_free,
	.map_poke_track = prog_array_map_poke_track,
	.map_poke_untrack = prog_array_map_poke_untrack,
	.map_poke_run = prog_array_map_poke_run,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
	.map_release_uref = prog_array_map_clear,
	.map_seq_show_elem = prog_array_map_seq_show_elem,
	.map_btf_name = "bpf_array",
	.map_btf_id = &prog_array_map_btf_id,
};

static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
						   struct file *map_file)
{
	struct bpf_event_entry *ee;

	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
	if (ee) {
		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;
	}

	return ee;
}

static void __bpf_event_entry_free(struct rcu_head *rcu)
{
	struct bpf_event_entry *ee;

	ee = container_of(rcu, struct bpf_event_entry, rcu);
	fput(ee->perf_file);
	kfree(ee);
}

static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
	call_rcu(&ee->rcu, __bpf_event_entry_free);
}

static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
					 struct file *map_file, int fd)
{
	struct bpf_event_entry *ee;
	struct perf_event *event;
	struct file *perf_file;
	u64 value;

	perf_file = perf_event_get(fd);
	if (IS_ERR(perf_file))
		return perf_file;

	ee = ERR_PTR(-EOPNOTSUPP);
	event = perf_file->private_data;
	if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
		goto err_out;

	ee = bpf_event_entry_gen(perf_file, map_file);
	if (ee)
		return ee;
	ee = ERR_PTR(-ENOMEM);
err_out:
	fput(perf_file);
	return ee;
}

static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}

static void perf_event_fd_array_release(struct bpf_map *map,
					struct file *map_file)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_event_entry *ee;
	int i;

	if (map->map_flags & BPF_F_PRESERVE_ELEMS)
		return;

	rcu_read_lock();
	for (i = 0; i < array->map.max_entries; i++) {
		ee = READ_ONCE(array->ptrs[i]);
		if (ee && ee->map_file == map_file)
			fd_array_map_delete_elem(map, &i);
	}
	rcu_read_unlock();
}

static void perf_event_fd_array_map_free(struct bpf_map *map)
{
	if (map->map_flags & BPF_F_PRESERVE_ELEMS)
		bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}
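
/* Unless BPF_F_PRESERVE_ELEMS was set at map creation, entries installed
 * through a given map file are removed again in ->map_release() when that
 * file goes away; with the flag they stay until the map itself is freed.
 */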

static int perf_event_array_map_btf_id;
const struct bpf_map_ops perf_event_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = perf_event_fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	.map_release = perf_event_fd_array_release,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &perf_event_array_map_btf_id,
};

#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file /* not used */,
				     int fd)
{
	return cgroup_get_from_fd(fd);
}

static void cgroup_fd_array_put_ptr(void *ptr)
{
	/* cgroup_put() frees the cgrp after an RCU grace period */
	cgroup_put(ptr);
}

static void cgroup_fd_array_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static int cgroup_array_map_btf_id;
const struct bpf_map_ops cgroup_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = cgroup_fd_array_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &cgroup_array_map_btf_id,
};
#endif
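
/* BPF_MAP_TYPE_ARRAY_OF_MAPS: values are map FDs from user space, stored as
 * struct bpf_map pointers in the kernel. Inserted maps must match the
 * inner_map_meta template captured at creation time. From a BPF program the
 * outer lookup yields the inner map, which is then used like any other map,
 * roughly (sketch, key names illustrative):
 *
 *	void *inner;
 *
 *	inner = bpf_map_lookup_elem(&outer_array, &outer_key);
 *	if (inner)
 *		val = bpf_map_lookup_elem(inner, &inner_key);
 */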

static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = array_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void array_of_map_free(struct bpf_map *map)
{
	/* map->inner_map_meta is only accessed by syscall which
	 * is protected by fdget/fdput.
	 */
	bpf_map_meta_free(map->inner_map_meta);
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = array_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

static int array_of_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 elem_size = round_up(map->value_size, 8);
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	if (!map->bypass_spec_v1) {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
	} else {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
	}
	if (is_power_of_2(elem_size))
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	else
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);

	return insn - insn_buf;
}

static int array_of_maps_map_btf_id;
const struct bpf_map_ops array_of_maps_map_ops = {
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_of_map_alloc,
	.map_free = array_of_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_of_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = array_of_map_gen_lookup,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &array_of_maps_map_btf_id,
};