// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"

#define HTAB_CREATE_FLAG_MASK						\
	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)

#define BATCH_OPS(_name)			\
	.map_lookup_batch =			\
	_name##_map_lookup_batch,		\
	.map_lookup_and_delete_batch =		\
	_name##_map_lookup_and_delete_batch,	\
	.map_update_batch =			\
	generic_map_update_batch,		\
	.map_delete_batch =			\
	generic_map_delete_batch

struct bucket {
	struct hlist_nulls_head head;
	raw_spinlock_t lock;
};

struct bpf_htab {
	struct bpf_map map;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	struct htab_elem *__percpu *extra_elems;
	atomic_t count;	/* number of elements in this hashtable */
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
	u32 hashrnd;
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_nulls_node hash_node;
		struct {
			void *padding;
			union {
				struct bpf_htab *htab;
				struct pcpu_freelist_node fnode;
			};
		};
	};
	union {
		struct rcu_head rcu;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[0] __aligned(8);
};

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_prealloc(const struct bpf_htab *htab)
{
	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
{
	return *(void **)(l->key + roundup(map->key_size, 8));
}

static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * htab->elem_size);
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
		cond_resched();
	}
free_elems:
	bpf_map_area_free(htab->elems);
}

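/* A minimal sketch (compiled out) of how one element is laid out and
 * addressed by the helpers above.  example_value_addr() is a hypothetical
 * helper used only for illustration; it is not part of this file.
 */
#if 0
/* Memory layout of one element:
 *
 *   struct htab_elem | key (key_size, padded to 8) | value area
 *
 * For per-CPU maps the "value area" holds a single pointer returned by
 * __alloc_percpu_gfp(); otherwise it holds the value itself, padded to 8.
 */
static void *example_value_addr(struct bpf_htab *htab, struct htab_elem *l)
{
	if (htab_is_percpu(htab))
		/* this CPU's copy of the value */
		return this_cpu_ptr(htab_elem_get_ptr(l, htab->map.key_size));

	/* plain maps store the value right after the rounded-up key */
	return l->key + round_up(htab->map.key_size, 8);
}
#endif
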
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}

static int prealloc_init(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int err = -ENOMEM, i;

	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
		num_entries += num_possible_cpus();

	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
					 htab->map.numa_node);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < num_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
		cond_resched();
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, num_entries);
	else
		pcpu_freelist_populate(&htab->freelist,
				       htab->elems + offsetof(struct htab_elem, fnode),
				       htab->elem_size, num_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
	struct htab_elem *__percpu *pptr, *l_new;
	struct pcpu_freelist_node *l;
	int cpu;

	pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
				  GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		l = pcpu_freelist_pop(&htab->freelist);
		/* pop will succeed, since prealloc_init()
		 * preallocated extra num_possible_cpus elements
		 */
		l_new = container_of(l, struct htab_elem, fnode);
		*per_cpu_ptr(pptr, cpu) = l_new;
	}
	htab->extra_elems = pptr;
	return 0;
}

/* Called from syscall */
static int htab_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
	int numa_node = bpf_map_attr_numa_node(attr);

	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
		     offsetof(struct htab_elem, hash_node.pprev));
	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (lru && !capable(CAP_SYS_ADMIN))
		/* The LRU implementation is much more complicated than
		 * other maps.  Hence, limit to CAP_SYS_ADMIN for now.
		 */
		return -EPERM;

	if (zero_seed && !capable(CAP_SYS_ADMIN))
		/* Guard against local DoS, and discourage production use. */
		return -EPERM;

	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags))
		return -EINVAL;

	if (!lru && percpu_lru)
		return -EINVAL;

	if (lru && !prealloc)
		return -ENOTSUPP;

	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
		return -EINVAL;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	if (attr->max_entries == 0 || attr->key_size == 0 ||
	    attr->value_size == 0)
		return -EINVAL;

	if (attr->key_size > MAX_BPF_STACK)
		/* eBPF programs initialize keys on stack, so they cannot be
		 * larger than max stack size
		 */
		return -E2BIG;

	if (attr->value_size >= KMALLOC_MAX_SIZE -
	    MAX_BPF_STACK - sizeof(struct htab_elem))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements via bpf syscall. This check also makes
		 * sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		return -E2BIG;

	return 0;
}

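/* A minimal user-space sketch (compiled out) of a map creation request that
 * passes the checks above.  It assumes a libc that exposes __NR_bpf;
 * example_create_lru_map() and the sizes chosen are illustrative only.
 */
#if 0
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Request an LRU hash with one LRU list per CPU; htab_map_alloc_check()
 * accepts BPF_F_NO_COMMON_LRU only on an LRU map type with preallocation
 * left enabled.
 */
static int example_create_lru_map(void)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_LRU_HASH;
	attr.key_size = sizeof(__u32);
	attr.value_size = sizeof(__u64);
	attr.max_entries = 4096;
	attr.map_flags = BPF_F_NO_COMMON_LRU;

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
#endif
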
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;
	u64 cost;

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&htab->map, attr);

	if (percpu_lru) {
		/* ensure each CPU's lru list has >=1 elements.
		 * since we are at it, make each lru list have the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	err = -E2BIG;
	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;

	if (percpu)
		cost += (u64) round_up(htab->map.value_size, 8) *
			num_possible_cpus() * htab->map.max_entries;
	else
		cost += (u64) htab->elem_size * num_possible_cpus();

	/* if map size is larger than memlock limit, reject it */
	err = bpf_map_charge_init(&htab->map.memory, cost);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket),
					   htab->map.numa_node);
	if (!htab->buckets)
		goto free_charge;

	if (htab->map.map_flags & BPF_F_ZERO_SEED)
		htab->hashrnd = 0;
	else
		htab->hashrnd = get_random_int();

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		raw_spin_lock_init(&htab->buckets[i].lock);
	}

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_buckets;

		if (!percpu && !lru) {
			/* lru itself can remove the least used element, so
			 * there is no need for an extra elem during map_update.
			 */
			err = alloc_extra_elems(htab);
			if (err)
				goto free_prealloc;
		}
	}

	return &htab->map;

free_prealloc:
	prealloc_destroy(htab);
free_buckets:
	bpf_map_area_free(htab->buckets);
free_charge:
	bpf_map_charge_finish(&htab->map.memory);
free_htab:
	kfree(htab);
	return ERR_PTR(err);
}

static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
	return jhash(key, key_len, hashrnd);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

/* this lookup function can only be called with bucket lock taken */
static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

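/* Note on the nulls-based retry below: the buckets were initialized with
 * INIT_HLIST_NULLS_HEAD(&head, i), so the 'nulls' value that terminates each
 * list encodes the bucket index.  A lock-free walker can therefore detect
 * that it ended up on a list whose nulls value does not match
 * hash & (n_buckets - 1), which means the element it started from was freed
 * and reused in a different bucket mid-walk, and it must restart.
 */
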
/* can be called without bucket lock. it will repeat the loop in
 * the unlikely event when elements moved from one bucket into another
 * while the linked list is being walked
 */
static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
					       u32 hash, void *key,
					       u32 key_size, u32 n_buckets)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

again:
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
		goto again;

	return NULL;
}

/* Called from syscall or from eBPF program directly, so
 * arguments have to match bpf_map_lookup_elem() exactly.
 * The return value is adjusted by BPF instructions
 * in htab_map_gen_lookup().
 */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	/* Must be called with rcu_read_lock. */
	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

/* inline bpf_map_lookup_elem() call.
 * Instead of:
 * bpf_prog
 *   bpf_map_lookup_elem
 *     map->ops->map_lookup_elem
 *       htab_map_lookup_elem
 *         __htab_map_lookup_elem
 * do:
 * bpf_prog
 *   __htab_map_lookup_elem
 */
static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

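/* Roughly, the three instructions emitted above leave the program looking
 * like the pseudo-code below (a sketch, not literal emitted text):
 *
 *	r0 = __htab_map_lookup_elem(map, key)
 *	if (r0 == NULL) goto out
 *	r0 += offsetof(struct htab_elem, key) + round_up(key_size, 8)
 * out:
 *	// r0 now points at the value, or is NULL
 */
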
static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
							void *key, const bool mark)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		if (mark)
			bpf_lru_node_set_ref(&l->lru_node);
		return l->key + round_up(map->key_size, 8);
	}

	return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, true);
}

static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, false);
}

static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int ref_reg = BPF_REG_1;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
			      offsetof(struct htab_elem, lru_node) +
			      offsetof(struct bpf_lru_node, ref));
	*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
	*insn++ = BPF_ST_MEM(BPF_B, ret,
			     offsetof(struct htab_elem, lru_node) +
			     offsetof(struct bpf_lru_node, ref),
			     1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = (struct bpf_htab *)arg;
	struct htab_elem *l = NULL, *tgt_l;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags;
	struct bucket *b;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l == tgt_l) {
			hlist_nulls_del_rcu(&l->hash_node);
			break;
		}

	raw_spin_unlock_irqrestore(&b->lock, flags);

	return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i = 0;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	if (!key)
		goto find_first_elem;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	if (!l)
		goto find_first_elem;

	/* key was found, get next key in the same bucket */
	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
					struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
						struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
	kfree(l);
}

static void htab_elem_free_rcu(struct rcu_head *head)
{
	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
	struct bpf_htab *htab = l->htab;

	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
	 * we're calling kfree, otherwise deadlock is possible if kprobes
	 * are placed somewhere inside of slub
	 */
	preempt_disable();
	__this_cpu_inc(bpf_prog_active);
	htab_elem_free(htab, l);
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	struct bpf_map *map = &htab->map;

	if (map->ops->map_fd_put_ptr) {
		void *ptr = fd_htab_map_get_ptr(map, l);

		map->ops->map_fd_put_ptr(ptr);
	}

	if (htab_is_prealloc(htab)) {
		__pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		atomic_dec(&htab->count);
		l->htab = htab;
		call_rcu(&l->rcu, htab_elem_free_rcu);
	}
}

static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
					value + off, size);
			off += size;
		}
	}
}

static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
	       BITS_PER_LONG == 64;
}

static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 struct htab_elem *old_elem)
{
	u32 size = htab->map.value_size;
	bool prealloc = htab_is_prealloc(htab);
	struct htab_elem *l_new, **pl_new;
	void __percpu *pptr;

	if (prealloc) {
		if (old_elem) {
			/* if we're updating the existing element,
			 * use per-cpu extra elems to avoid freelist_pop/push
			 */
			pl_new = this_cpu_ptr(htab->extra_elems);
			l_new = *pl_new;
			*pl_new = old_elem;
		} else {
			struct pcpu_freelist_node *l;

			l = __pcpu_freelist_pop(&htab->freelist);
			if (!l)
				return ERR_PTR(-E2BIG);
			l_new = container_of(l, struct htab_elem, fnode);
		}
	} else {
		if (atomic_inc_return(&htab->count) > htab->map.max_entries)
			if (!old_elem) {
				/* when map is full and update() is replacing
				 * old element, it's ok to allocate, since
				 * old element will be freed immediately.
				 * Otherwise return an error
				 */
				l_new = ERR_PTR(-E2BIG);
				goto dec_count;
			}
		l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
				     htab->map.numa_node);
		if (!l_new) {
			l_new = ERR_PTR(-ENOMEM);
			goto dec_count;
		}
		check_and_init_map_lock(&htab->map,
					l_new->key + round_up(key_size, 8));
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		size = round_up(size, 8);
		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = __alloc_percpu_gfp(size, 8,
						  GFP_ATOMIC | __GFP_NOWARN);
			if (!pptr) {
				kfree(l_new);
				l_new = ERR_PTR(-ENOMEM);
				goto dec_count;
			}
		}

		pcpu_copy_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else if (fd_htab_map_needs_adjust(htab)) {
		size = round_up(size, 8);
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	} else {
		copy_map_value(&htab->map,
			       l_new->key + round_up(key_size, 8),
			       value);
	}

	l_new->hash = hash;
	return l_new;
dec_count:
	atomic_dec(&htab->count);
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}

/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	if (unlikely(map_flags & BPF_F_LOCK)) {
		if (unlikely(!map_value_has_spin_lock(map)))
			return -EINVAL;
		/* find an element without taking the bucket lock */
		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
					      htab->n_buckets);
		ret = check_flags(htab, l_old, map_flags);
		if (ret)
			return ret;
		if (l_old) {
			/* grab the element lock and update value in place */
			copy_map_value_locked(map,
					      l_old->key + round_up(key_size, 8),
					      value, false);
			return 0;
		}
		/* fall through, grab the bucket lock and lookup again.
		 * 99.9% chance that the element won't be found,
		 * but second lookup under lock has to be done.
		 */
	}

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
		/* first lookup without the bucket lock didn't find the element,
		 * but second lookup with the bucket lock found it.
		 * This case is highly unlikely, but has to be dealt with:
		 * grab the element lock in addition to the bucket lock
		 * and update element in place
		 */
		copy_map_value_locked(map,
				      l_old->key + round_up(key_size, 8),
				      value, false);
		ret = 0;
		goto err;
	}

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
				l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_nulls_del_rcu(&l_old->hash_node);
		if (!htab_is_prealloc(htab))
			free_htab_elem(htab, l_old);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				    u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_nulls_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	raw_spin_unlock_irqrestore(&b->lock, flags);

	if (ret)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	else if (l_old)
		bpf_lru_push_free(&htab->lru, &l_old->lru_node);

	return ret;
}

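/* A minimal user-space sketch (compiled out) of the update flags enforced by
 * check_flags() above.  It assumes a libc that exposes __NR_bpf;
 * example_update() and its arguments are illustrative only.
 */
#if 0
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* BPF_NOEXIST fails with -EEXIST if the key is already present;
 * BPF_EXIST fails with -ENOENT if it is not; BPF_ANY accepts both.
 */
static int example_update(int map_fd, __u32 key, __u64 value, __u64 flags)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&value;
	attr.flags = flags;	/* BPF_ANY, BPF_NOEXIST or BPF_EXIST */

	return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
#endif
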
static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					 void *value, u64 map_flags,
					 bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus, NULL);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					     void *value, u64 map_flags,
					     bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l_new)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	return ret;
}

static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_nulls_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_nulls_del_rcu(&l->hash_node);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l)
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			hlist_nulls_del_rcu(&l->hash_node);
			htab_elem_free(htab, l);
		}
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	/* some of free_htab_elem() callbacks for elements of this map may
	 * not have executed. Wait for them.
	 */
	rcu_barrier();
	if (!htab_is_prealloc(htab))
		delete_all_elements(htab);
	else
		prealloc_destroy(htab);

	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	kfree(htab);
}

static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
				   struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = htab_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": ");
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_puts(m, "\n");

	rcu_read_unlock();
}

static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
				   const union bpf_attr *attr,
				   union bpf_attr __user *uattr,
				   bool do_delete, bool is_lru_map,
				   bool is_percpu)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
	void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
	u32 batch, max_count, size, bucket_size;
	u64 elem_map_flags, map_flags;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags;
	struct htab_elem *l;
	struct bucket *b;
	int ret = 0;

	elem_map_flags = attr->batch.elem_flags;
	if ((elem_map_flags & ~BPF_F_LOCK) ||
	    ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
		return -EINVAL;

	map_flags = attr->batch.flags;
	if (map_flags)
		return -EINVAL;

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	if (put_user(0, &uattr->batch.count))
		return -EFAULT;

	batch = 0;
	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
		return -EFAULT;

	if (batch >= htab->n_buckets)
		return -ENOENT;

	key_size = htab->map.key_size;
	roundup_key_size = round_up(htab->map.key_size, 8);
	value_size = htab->map.value_size;
	size = round_up(value_size, 8);
	if (is_percpu)
		value_size = size * num_possible_cpus();
	total = 0;
	/* while experimenting with hash tables with sizes ranging from 10 to
	 * 1000, it was observed that a bucket can have up to 5 entries.
	 */
	bucket_size = 5;

alloc:
	/* We cannot do copy_from_user or copy_to_user inside
	 * the rcu_read_lock. Allocate enough space here.
	 */
	keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
	values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
	if (!keys || !values) {
		ret = -ENOMEM;
		goto after_loop;
	}

again:
	preempt_disable();
	this_cpu_inc(bpf_prog_active);
	rcu_read_lock();
again_nocopy:
	dst_key = keys;
	dst_val = values;
	b = &htab->buckets[batch];
	head = &b->head;
	raw_spin_lock_irqsave(&b->lock, flags);

	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		bucket_cnt++;

	if (bucket_cnt > (max_count - total)) {
		if (total == 0)
			ret = -ENOSPC;
		raw_spin_unlock_irqrestore(&b->lock, flags);
		rcu_read_unlock();
		this_cpu_dec(bpf_prog_active);
		preempt_enable();
		goto after_loop;
	}

	if (bucket_cnt > bucket_size) {
		bucket_size = bucket_cnt;
		raw_spin_unlock_irqrestore(&b->lock, flags);
		rcu_read_unlock();
		this_cpu_dec(bpf_prog_active);
		preempt_enable();
		kvfree(keys);
		kvfree(values);
		goto alloc;
	}

	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
		memcpy(dst_key, l->key, key_size);

		if (is_percpu) {
			int off = 0, cpu;
			void __percpu *pptr;

			pptr = htab_elem_get_ptr(l, map->key_size);
			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(dst_val + off,
						per_cpu_ptr(pptr, cpu), size);
				off += size;
			}
		} else {
			value = l->key + roundup_key_size;
			if (elem_map_flags & BPF_F_LOCK)
				copy_map_value_locked(map, dst_val, value,
						      true);
			else
				copy_map_value(map, dst_val, value);
			check_and_init_map_lock(map, dst_val);
		}
		if (do_delete) {
			hlist_nulls_del_rcu(&l->hash_node);
			if (is_lru_map)
				bpf_lru_push_free(&htab->lru, &l->lru_node);
			else
				free_htab_elem(htab, l);
		}
		dst_key += key_size;
		dst_val += value_size;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	/* If we are not copying data, we can go to next bucket and avoid
	 * unlocking the rcu.
	 */
	if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
		batch++;
		goto again_nocopy;
	}

	rcu_read_unlock();
	this_cpu_dec(bpf_prog_active);
	preempt_enable();
	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
	    key_size * bucket_cnt) ||
	    copy_to_user(uvalues + total * value_size, values,
	    value_size * bucket_cnt))) {
		ret = -EFAULT;
		goto after_loop;
	}

	total += bucket_cnt;
	batch++;
	if (batch >= htab->n_buckets) {
		ret = -ENOENT;
		goto after_loop;
	}
	goto again;

after_loop:
	if (ret == -EFAULT)
		goto out;

	/* copy # of entries and next batch */
	ubatch = u64_to_user_ptr(attr->batch.out_batch);
	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
	    put_user(total, &uattr->batch.count))
		ret = -EFAULT;

out:
	kvfree(keys);
	kvfree(values);
	return ret;
}

static int
htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, true);
}

static int
htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					const union bpf_attr *attr,
					union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, true);
}

static int
htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
		      union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, false);
}

static int
htab_map_lookup_and_delete_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, false);
}

static int
htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, true);
}

static int
htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					    const union bpf_attr *attr,
					    union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, true);
}

static int
htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, false);
}

static int
htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
				     const union bpf_attr *attr,
				     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, false);
}

const struct bpf_map_ops htab_map_ops = {
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_gen_lookup = htab_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	BATCH_OPS(htab),
};

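/* A minimal user-space sketch (compiled out) of driving the batch interface
 * wired up through BATCH_OPS() above.  It assumes a libc that exposes
 * __NR_bpf; example_lookup_batch() and its buffer sizing are illustrative
 * only.
 */
#if 0
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Walk a plain hash map one BPF_MAP_LOOKUP_BATCH call at a time.
 * in_batch is omitted on the first call; out_batch is fed back in on
 * later calls until the kernel returns -ENOENT (all buckets visited).
 */
static int example_lookup_batch(int map_fd, __u32 *keys, __u64 *values,
				__u32 count, __u32 *in_out_batch, int first)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.batch.map_fd = map_fd;
	attr.batch.in_batch = first ? 0 : (__u64)(unsigned long)in_out_batch;
	attr.batch.out_batch = (__u64)(unsigned long)in_out_batch;
	attr.batch.keys = (__u64)(unsigned long)keys;
	attr.batch.values = (__u64)(unsigned long)values;
	attr.batch.count = count;	/* in: capacity, out: entries copied */

	return syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &attr, sizeof(attr));
}
#endif
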
const struct bpf_map_ops htab_lru_map_ops = {
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_map_lookup_elem,
	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
	.map_update_elem = htab_lru_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_gen_lookup = htab_lru_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	BATCH_OPS(htab_lru),
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}

static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	}

	return NULL;
}

int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	/* We do not mark LRU map element here in order to not mess up
	 * eviction heuristics when user space does a map walk.
	 */
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
				per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
	rcu_read_unlock();

	return ret;
}

static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
					  struct seq_file *m)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	l = __htab_map_lookup_elem(map, key);
	if (!l) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": {\n");
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}

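/* A minimal user-space sketch (compiled out) of reading one per-CPU value
 * through the syscall path that ends in bpf_percpu_hash_copy() above.  The
 * buffer must hold round_up(value_size, 8) bytes per possible CPU;
 * possible_cpus is assumed to have been read from
 * /sys/devices/system/cpu/possible by the caller, and an 8-byte value is
 * assumed for illustration.
 */
#if 0
#include <linux/bpf.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int example_percpu_lookup(int map_fd, __u32 key, int possible_cpus)
{
	/* one 8-byte-aligned slot per possible CPU, as copied above */
	__u64 *values = calloc(possible_cpus, sizeof(__u64));
	union bpf_attr attr;
	int err;

	if (!values)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)values;

	err = syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
	free(values);
	return err;
}
#endif
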
const struct bpf_map_ops htab_percpu_map_ops = {
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	BATCH_OPS(htab_percpu),
};

const struct bpf_map_ops htab_lru_percpu_map_ops = {
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	BATCH_OPS(htab_lru_percpu),
};

static int fd_htab_map_alloc_check(union bpf_attr *attr)
{
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	return htab_map_alloc_check(attr);
}

static void fd_htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_node *n;
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			void *ptr = fd_htab_map_get_ptr(map, l);

			map->ops->map_fd_put_ptr(ptr);
		}
	}

	htab_map_free(map);
}

/* only called from syscall */
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	ptr = htab_map_lookup_elem(map, key);
	if (ptr)
		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

/* only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
				void *key, void *value, u64 map_flags)
{
	void *ptr;
	int ret;
	u32 ufd = *(u32 *)value;

	ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	ret = htab_map_update_elem(map, key, &ptr, map_flags);
	if (ret)
		map->ops->map_fd_put_ptr(ptr);

	return ret;
}

static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = htab_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = htab_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

static u32 htab_of_map_gen_lookup(struct bpf_map *map,
				  struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);

	return insn - insn_buf;
}

static void htab_of_map_free(struct bpf_map *map)
{
	bpf_map_meta_free(map->inner_map_meta);
	fd_htab_map_free(map);
}

const struct bpf_map_ops htab_of_maps_map_ops = {
	.map_alloc_check = fd_htab_map_alloc_check,
	.map_alloc = htab_of_map_alloc,
	.map_free = htab_of_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_of_map_lookup_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = htab_of_map_gen_lookup,
	.map_check_btf = map_check_no_btf,
};