// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"

#define HTAB_CREATE_FLAG_MASK						\
	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)

#define BATCH_OPS(_name)			\
	.map_lookup_batch =			\
	_name##_map_lookup_batch,		\
	.map_lookup_and_delete_batch =		\
	_name##_map_lookup_and_delete_batch,	\
	.map_update_batch =			\
	generic_map_update_batch,		\
	.map_delete_batch =			\
	generic_map_delete_batch

struct bucket {
	struct hlist_nulls_head head;
	raw_spinlock_t lock;
};

struct bpf_htab {
	struct bpf_map map;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	struct htab_elem *__percpu *extra_elems;
	atomic_t count;	/* number of elements in this hashtable */
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
	u32 hashrnd;
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_nulls_node hash_node;
		struct {
			void *padding;
			union {
				struct bpf_htab *htab;
				struct pcpu_freelist_node fnode;
				struct htab_elem *batch_flink;
			};
		};
	};
	union {
		struct rcu_head rcu;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[0] __aligned(8);
};

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_prealloc(const struct bpf_htab *htab)
{
	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
{
	return *(void **)(l->key + roundup(map->key_size, 8));
}

static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * htab->elem_size);
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
		cond_resched();
	}
free_elems:
	bpf_map_area_free(htab->elems);
}

/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
 * (bucket_lock). If both locks need to be acquired together, the lock
 * order is always lru_lock -> bucket_lock, and this only happens in
 * bpf_lru_list.c logic. For example, certain code paths of
 * bpf_lru_pop_free(), which is called by prealloc_lru_pop(), acquire
 * lru_lock first and then bucket_lock.
 *
 * In hashtab.c, to avoid deadlock, acquiring bucket_lock followed by
 * lru_lock is not allowed. In such cases, bucket_lock must be released
 * first before acquiring lru_lock.
 */
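/*
 * Illustration of the only allowed nesting (it lives entirely inside
 * bpf_lru_list.c, e.g. eviction calling back into
 * htab_lru_map_delete_node()):
 *
 *	raw_spin_lock(lru_lock);
 *		raw_spin_lock(bucket_lock);
 *
 * hashtab.c never nests the other way around: htab_lru_map_update_elem()
 * pops a free LRU node via prealloc_lru_pop() *before* taking the bucket
 * lock, and the batch ops defer bpf_lru_push_free() until after the
 * bucket lock has been released.
 */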
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}

static int prealloc_init(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int err = -ENOMEM, i;

	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
		num_entries += num_possible_cpus();

	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
					 htab->map.numa_node);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < num_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
		cond_resched();
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, num_entries);
	else
		pcpu_freelist_populate(&htab->freelist,
				       htab->elems + offsetof(struct htab_elem, fnode),
				       htab->elem_size, num_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
	struct htab_elem *__percpu *pptr, *l_new;
	struct pcpu_freelist_node *l;
	int cpu;

	pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
				  GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		l = pcpu_freelist_pop(&htab->freelist);
		/* pop will succeed, since prealloc_init()
		 * preallocated extra num_possible_cpus elements
		 */
		l_new = container_of(l, struct htab_elem, fnode);
		*per_cpu_ptr(pptr, cpu) = l_new;
	}
	htab->extra_elems = pptr;
	return 0;
}

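/*
 * Note on extra_elems: each CPU gets one spare preallocated element.
 * When an update replaces an existing key in a preallocated non-LRU,
 * non-percpu map, alloc_htab_elem() takes the new element from this
 * per-CPU slot and parks the old one there, so the common update path
 * never has to pop/push the freelist while holding the bucket lock.
 */
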
/* Called from syscall */
static int htab_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
	int numa_node = bpf_map_attr_numa_node(attr);

	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
		     offsetof(struct htab_elem, hash_node.pprev));
	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (lru && !capable(CAP_SYS_ADMIN))
		/* The LRU implementation is much more complicated than
		 * other maps, hence limit it to CAP_SYS_ADMIN for now.
		 */
		return -EPERM;

	if (zero_seed && !capable(CAP_SYS_ADMIN))
		/* Guard against local DoS, and discourage production use. */
		return -EPERM;

	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags))
		return -EINVAL;

	if (!lru && percpu_lru)
		return -EINVAL;

	if (lru && !prealloc)
		return -ENOTSUPP;

	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
		return -EINVAL;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	if (attr->max_entries == 0 || attr->key_size == 0 ||
	    attr->value_size == 0)
		return -EINVAL;

	if (attr->key_size > MAX_BPF_STACK)
		/* eBPF programs initialize keys on stack, so they cannot be
		 * larger than max stack size
		 */
		return -E2BIG;

	if (attr->value_size >= KMALLOC_MAX_SIZE -
	    MAX_BPF_STACK - sizeof(struct htab_elem))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements via bpf syscall. This check also makes
		 * sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		return -E2BIG;

	return 0;
}

static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;
	u64 cost;

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&htab->map, attr);

	if (percpu_lru) {
		/* ensure each CPU's LRU list has >= 1 element.
		 * Since we are at it, make each LRU list have the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	err = -E2BIG;
	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;

	if (percpu)
		cost += (u64) round_up(htab->map.value_size, 8) *
			num_possible_cpus() * htab->map.max_entries;
	else
		cost += (u64) htab->elem_size * num_possible_cpus();

	/* if map size is larger than memlock limit, reject it */
	err = bpf_map_charge_init(&htab->map.memory, cost);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket),
					   htab->map.numa_node);
	if (!htab->buckets)
		goto free_charge;

	if (htab->map.map_flags & BPF_F_ZERO_SEED)
		htab->hashrnd = 0;
	else
		htab->hashrnd = get_random_int();

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		raw_spin_lock_init(&htab->buckets[i].lock);
	}

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_buckets;

		if (!percpu && !lru) {
			/* lru itself can remove the least used element, so
			 * there is no need for an extra elem during map_update.
			 */
			err = alloc_extra_elems(htab);
			if (err)
				goto free_prealloc;
		}
	}

	return &htab->map;

free_prealloc:
	prealloc_destroy(htab);
free_buckets:
	bpf_map_area_free(htab->buckets);
free_charge:
	bpf_map_charge_finish(&htab->map.memory);
free_htab:
	kfree(htab);
	return ERR_PTR(err);
}

static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
	return jhash(key, key_len, hashrnd);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

/* this lookup function can only be called with bucket lock taken */
static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* can be called without bucket lock. it will repeat the loop in
 * the unlikely event that elements moved from one bucket into another
 * while the linked list is being walked
 */
static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
					       u32 hash, void *key,
					       u32 key_size, u32 n_buckets)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

again:
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
		goto again;

	return NULL;
}

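/*
 * The retry in lookup_nulls_elem_raw() works because each bucket's nulls
 * marker encodes its bucket index (see INIT_HLIST_NULLS_HEAD(..., i) in
 * htab_map_alloc()).  If a preallocated element is freed and reinserted
 * into a different bucket while the list is walked without the bucket
 * lock, the walk can end on the wrong bucket's nulls marker; comparing
 * the marker against hash & (n_buckets - 1) detects this and restarts
 * the lookup.
 */
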
/* Called from syscall or from eBPF program directly, so
 * arguments have to match bpf_map_lookup_elem() exactly.
 * The return value is adjusted by BPF instructions
 * in htab_map_gen_lookup().
 */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	/* Must be called with rcu_read_lock. */
	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

/* inline bpf_map_lookup_elem() call.
 * Instead of:
 * bpf_prog
 *   bpf_map_lookup_elem
 *     map->ops->map_lookup_elem
 *       htab_map_lookup_elem
 *         __htab_map_lookup_elem
 * do:
 * bpf_prog
 *   __htab_map_lookup_elem
 */
static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

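/*
 * For reference, the sequence emitted by htab_map_gen_lookup() above is,
 * in BPF pseudo-assembly:
 *
 *	call __htab_map_lookup_elem
 *	if r0 == 0 goto +1
 *	r0 += offsetof(struct htab_elem, key) + round_up(key_size, 8)
 *
 * i.e. on a hit r0 is advanced from the htab_elem to its value, matching
 * what htab_map_lookup_elem() returns.
 */
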
static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
							void *key, const bool mark)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		if (mark)
			bpf_lru_node_set_ref(&l->lru_node);
		return l->key + round_up(map->key_size, 8);
	}

	return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, true);
}

static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, false);
}

static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int ref_reg = BPF_REG_1;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
			      offsetof(struct htab_elem, lru_node) +
			      offsetof(struct bpf_lru_node, ref));
	*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
	*insn++ = BPF_ST_MEM(BPF_B, ret,
			     offsetof(struct htab_elem, lru_node) +
			     offsetof(struct bpf_lru_node, ref),
			     1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = (struct bpf_htab *)arg;
	struct htab_elem *l = NULL, *tgt_l;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags;
	struct bucket *b;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l == tgt_l) {
			hlist_nulls_del_rcu(&l->hash_node);
			break;
		}

	raw_spin_unlock_irqrestore(&b->lock, flags);

	return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i = 0;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	if (!key)
		goto find_first_elem;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	if (!l)
		goto find_first_elem;

	/* key was found, get next key in the same bucket */
	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
					struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
						struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

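/*
 * Note that htab_map_get_next_key() walks elements in bucket order, not
 * in key or insertion order: it returns the next element in the current
 * key's bucket and then falls through to the first element of each
 * following bucket.  If the given key is no longer present, iteration
 * restarts from the first non-empty bucket.
 */
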
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
	kfree(l);
}

static void htab_elem_free_rcu(struct rcu_head *head)
{
	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
	struct bpf_htab *htab = l->htab;

	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
	 * we're calling kfree, otherwise deadlock is possible if kprobes
	 * are placed somewhere inside of slub
	 */
	preempt_disable();
	__this_cpu_inc(bpf_prog_active);
	htab_elem_free(htab, l);
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	struct bpf_map *map = &htab->map;

	if (map->ops->map_fd_put_ptr) {
		void *ptr = fd_htab_map_get_ptr(map, l);

		map->ops->map_fd_put_ptr(ptr);
	}

	if (htab_is_prealloc(htab)) {
		__pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		atomic_dec(&htab->count);
		l->htab = htab;
		call_rcu(&l->rcu, htab_elem_free_rcu);
	}
}

static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
					value + off, size);
			off += size;
		}
	}
}

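/*
 * pcpu_copy_value() has two modes: from a BPF program (onallcpus ==
 * false) only the current CPU's slot is written, with exactly
 * value_size bytes; from the syscall path (onallcpus == true) the
 * caller supplies one value per possible CPU, laid out back to back at
 * a stride of round_up(value_size, 8), and every CPU's slot is filled.
 */
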
static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
	       BITS_PER_LONG == 64;
}

static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 struct htab_elem *old_elem)
{
	u32 size = htab->map.value_size;
	bool prealloc = htab_is_prealloc(htab);
	struct htab_elem *l_new, **pl_new;
	void __percpu *pptr;

	if (prealloc) {
		if (old_elem) {
			/* if we're updating the existing element,
			 * use per-cpu extra elems to avoid freelist_pop/push
			 */
			pl_new = this_cpu_ptr(htab->extra_elems);
			l_new = *pl_new;
			*pl_new = old_elem;
		} else {
			struct pcpu_freelist_node *l;

			l = __pcpu_freelist_pop(&htab->freelist);
			if (!l)
				return ERR_PTR(-E2BIG);
			l_new = container_of(l, struct htab_elem, fnode);
		}
	} else {
		if (atomic_inc_return(&htab->count) > htab->map.max_entries)
			if (!old_elem) {
				/* when map is full and update() is replacing
				 * old element, it's ok to allocate, since
				 * old element will be freed immediately.
				 * Otherwise return an error
				 */
				l_new = ERR_PTR(-E2BIG);
				goto dec_count;
			}
		l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
				     htab->map.numa_node);
		if (!l_new) {
			l_new = ERR_PTR(-ENOMEM);
			goto dec_count;
		}
		check_and_init_map_lock(&htab->map,
					l_new->key + round_up(key_size, 8));
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		size = round_up(size, 8);
		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = __alloc_percpu_gfp(size, 8,
						  GFP_ATOMIC | __GFP_NOWARN);
			if (!pptr) {
				kfree(l_new);
				l_new = ERR_PTR(-ENOMEM);
				goto dec_count;
			}
		}

		pcpu_copy_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else if (fd_htab_map_needs_adjust(htab)) {
		size = round_up(size, 8);
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	} else {
		copy_map_value(&htab->map,
			       l_new->key + round_up(key_size, 8),
			       value);
	}

	l_new->hash = hash;
	return l_new;
dec_count:
	atomic_dec(&htab->count);
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}

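/*
 * Update flag semantics, as enforced by check_flags(): BPF_NOEXIST only
 * creates a new element (-EEXIST if the key is already present),
 * BPF_EXIST only replaces an existing one (-ENOENT if it is not), and
 * BPF_ANY accepts either case.  BPF_F_LOCK is masked off first since it
 * merely selects locked value copying and may be combined with any of
 * the above.
 */
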
/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	if (unlikely(map_flags & BPF_F_LOCK)) {
		if (unlikely(!map_value_has_spin_lock(map)))
			return -EINVAL;
		/* find an element without taking the bucket lock */
		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
					      htab->n_buckets);
		ret = check_flags(htab, l_old, map_flags);
		if (ret)
			return ret;
		if (l_old) {
			/* grab the element lock and update value in place */
			copy_map_value_locked(map,
					      l_old->key + round_up(key_size, 8),
					      value, false);
			return 0;
		}
		/* fall through, grab the bucket lock and lookup again.
		 * 99.9% chance that the element won't be found,
		 * but second lookup under lock has to be done.
		 */
	}

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
		/* first lookup without the bucket lock didn't find the element,
		 * but second lookup with the bucket lock found it.
		 * This case is highly unlikely, but has to be dealt with:
		 * grab the element lock in addition to the bucket lock
		 * and update element in place
		 */
		copy_map_value_locked(map,
				      l_old->key + round_up(key_size, 8),
				      value, false);
		ret = 0;
		goto err;
	}

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
				l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_nulls_del_rcu(&l_old->hash_node);
		if (!htab_is_prealloc(htab))
			free_htab_elem(htab, l_old);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				    u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_nulls_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	raw_spin_unlock_irqrestore(&b->lock, flags);

	if (ret)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	else if (l_old)
		bpf_lru_push_free(&htab->lru, &l_old->lru_node);

	return ret;
}

static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					 void *value, u64 map_flags,
					 bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus, NULL);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					     void *value, u64 map_flags,
					     bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l_new)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	return ret;
}

static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_nulls_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_nulls_del_rcu(&l->hash_node);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l)
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			hlist_nulls_del_rcu(&l->hash_node);
			htab_elem_free(htab, l);
		}
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	/* some of free_htab_elem() callbacks for elements of this map may
	 * not have executed. Wait for them.
	 */
	rcu_barrier();
	if (!htab_is_prealloc(htab))
		delete_all_elements(htab);
	else
		prealloc_destroy(htab);

	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	kfree(htab);
}

static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
				   struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = htab_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": ");
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_puts(m, "\n");

	rcu_read_unlock();
}

static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
				   const union bpf_attr *attr,
				   union bpf_attr __user *uattr,
				   bool do_delete, bool is_lru_map,
				   bool is_percpu)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
	void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
	u32 batch, max_count, size, bucket_size;
	struct htab_elem *node_to_free = NULL;
	u64 elem_map_flags, map_flags;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags = 0;
	bool locked = false;
	struct htab_elem *l;
	struct bucket *b;
	int ret = 0;

	elem_map_flags = attr->batch.elem_flags;
	if ((elem_map_flags & ~BPF_F_LOCK) ||
	    ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
		return -EINVAL;

	map_flags = attr->batch.flags;
	if (map_flags)
		return -EINVAL;

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	if (put_user(0, &uattr->batch.count))
		return -EFAULT;

	batch = 0;
	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
		return -EFAULT;

	if (batch >= htab->n_buckets)
		return -ENOENT;

	key_size = htab->map.key_size;
	roundup_key_size = round_up(htab->map.key_size, 8);
	value_size = htab->map.value_size;
	size = round_up(value_size, 8);
	if (is_percpu)
		value_size = size * num_possible_cpus();
	total = 0;
	/* while experimenting with hash tables with sizes ranging from 10 to
	 * 1000, it was observed that a bucket can have up to 5 entries.
	 */
	bucket_size = 5;

alloc:
	/* We cannot do copy_from_user or copy_to_user inside
	 * the rcu_read_lock. Allocate enough space here.
	 */
	keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
	values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
	if (!keys || !values) {
		ret = -ENOMEM;
		goto after_loop;
	}

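/*
 * The loop below first counts the bucket under RCU without taking the
 * bucket lock.  Only when the bucket turns out to be non-empty is the
 * lock taken and the count redone (again_nocopy with locked == true).
 * If the bucket does not fit into the remaining user buffer, the walk
 * stops (-ENOSPC if nothing was copied yet); if it does not fit into
 * the local buffers, they are reallocated at the larger size and the
 * bucket is retried (goto alloc).
 */
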
again:
	preempt_disable();
	this_cpu_inc(bpf_prog_active);
	rcu_read_lock();
again_nocopy:
	dst_key = keys;
	dst_val = values;
	b = &htab->buckets[batch];
	head = &b->head;
	/* do not grab the lock unless we need it (bucket_cnt > 0) */
	if (locked)
		raw_spin_lock_irqsave(&b->lock, flags);

	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		bucket_cnt++;

	if (bucket_cnt && !locked) {
		locked = true;
		goto again_nocopy;
	}

	if (bucket_cnt > (max_count - total)) {
		if (total == 0)
			ret = -ENOSPC;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		raw_spin_unlock_irqrestore(&b->lock, flags);
		rcu_read_unlock();
		this_cpu_dec(bpf_prog_active);
		preempt_enable();
		goto after_loop;
	}

	if (bucket_cnt > bucket_size) {
		bucket_size = bucket_cnt;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		raw_spin_unlock_irqrestore(&b->lock, flags);
		rcu_read_unlock();
		this_cpu_dec(bpf_prog_active);
		preempt_enable();
		kvfree(keys);
		kvfree(values);
		goto alloc;
	}

	/* Next block is only safe to run if you have grabbed the lock */
	if (!locked)
		goto next_batch;

	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
		memcpy(dst_key, l->key, key_size);

		if (is_percpu) {
			int off = 0, cpu;
			void __percpu *pptr;

			pptr = htab_elem_get_ptr(l, map->key_size);
			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(dst_val + off,
						per_cpu_ptr(pptr, cpu), size);
				off += size;
			}
		} else {
			value = l->key + roundup_key_size;
			if (elem_map_flags & BPF_F_LOCK)
				copy_map_value_locked(map, dst_val, value,
						      true);
			else
				copy_map_value(map, dst_val, value);
			check_and_init_map_lock(map, dst_val);
		}
		if (do_delete) {
			hlist_nulls_del_rcu(&l->hash_node);

			/* bpf_lru_push_free() will acquire lru_lock, which
			 * may cause deadlock. See comments in function
			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
			 * after releasing the bucket lock.
			 */
			if (is_lru_map) {
				l->batch_flink = node_to_free;
				node_to_free = l;
			} else {
				free_htab_elem(htab, l);
			}
		}
		dst_key += key_size;
		dst_val += value_size;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	locked = false;

	while (node_to_free) {
		l = node_to_free;
		node_to_free = node_to_free->batch_flink;
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	}

next_batch:
	/* If we are not copying data, we can go to the next bucket and
	 * avoid dropping the RCU read lock.
	 */
	if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
		batch++;
		goto again_nocopy;
	}

	rcu_read_unlock();
	this_cpu_dec(bpf_prog_active);
	preempt_enable();
	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
	    key_size * bucket_cnt) ||
	    copy_to_user(uvalues + total * value_size, values,
	    value_size * bucket_cnt))) {
		ret = -EFAULT;
		goto after_loop;
	}

	total += bucket_cnt;
	batch++;
	if (batch >= htab->n_buckets) {
		ret = -ENOENT;
		goto after_loop;
	}
	goto again;

after_loop:
	if (ret == -EFAULT)
		goto out;

	/* copy # of entries and next batch */
	ubatch = u64_to_user_ptr(attr->batch.out_batch);
	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
	    put_user(total, &uattr->batch.count))
		ret = -EFAULT;

out:
	kvfree(keys);
	kvfree(values);
	return ret;
}

static int
htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, true);
}

static int
htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					const union bpf_attr *attr,
					union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, true);
}

static int
htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
		      union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, false);
}

static int
htab_map_lookup_and_delete_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, false);
}

static int
htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, true);
}

static int
htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					    const union bpf_attr *attr,
					    union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, true);
}

static int
htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, false);
}

static int
htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
				     const union bpf_attr *attr,
				     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, false);
}

const struct bpf_map_ops htab_map_ops = {
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_gen_lookup = htab_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	BATCH_OPS(htab),
};

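/* Purely illustrative (libbpf-side, not part of this file): a program
 * typically reaches htab_map_ops through a BTF-defined map of type
 * BPF_MAP_TYPE_HASH, e.g.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_HASH);
 *		__uint(max_entries, 1024);
 *		__type(key, u32);
 *		__type(value, u64);
 *	} my_map SEC(".maps");
 *
 *	u64 *val = bpf_map_lookup_elem(&my_map, &key);
 *
 * and the verifier may inline that lookup via .map_gen_lookup above.
 */
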
const struct bpf_map_ops htab_lru_map_ops = {
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_map_lookup_elem,
	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
	.map_update_elem = htab_lru_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_gen_lookup = htab_lru_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	BATCH_OPS(htab_lru),
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}

static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	}

	return NULL;
}

int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	/* We do not mark LRU map element here in order to not mess up
	 * eviction heuristics when user space does a map walk.
	 */
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
				per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
	rcu_read_unlock();

	return ret;
}

static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
					  struct seq_file *m)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	l = __htab_map_lookup_elem(map, key);
	if (!l) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": {\n");
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}

const struct bpf_map_ops htab_percpu_map_ops = {
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	BATCH_OPS(htab_percpu),
};

const struct bpf_map_ops htab_lru_percpu_map_ops = {
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	BATCH_OPS(htab_lru_percpu),
};

static int fd_htab_map_alloc_check(union bpf_attr *attr)
{
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	return htab_map_alloc_check(attr);
}

static void fd_htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_node *n;
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			void *ptr = fd_htab_map_get_ptr(map, l);

			map->ops->map_fd_put_ptr(ptr);
		}
	}

	htab_map_free(map);
}

/* only called from syscall */
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	ptr = htab_map_lookup_elem(map, key);
	if (ptr)
		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

/* only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
				void *key, void *value, u64 map_flags)
{
	void *ptr;
	int ret;
	u32 ufd = *(u32 *)value;

	ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	ret = htab_map_update_elem(map, key, &ptr, map_flags);
	if (ret)
		map->ops->map_fd_put_ptr(ptr);

	return ret;
}

static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = htab_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = htab_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

static u32 htab_of_map_gen_lookup(struct bpf_map *map,
				  struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);

	return insn - insn_buf;
}

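/*
 * Compared with htab_map_gen_lookup(), the sequence above ends with one
 * extra BPF_LDX_MEM: the value stored in a hash-of-maps element is a
 * pointer to the inner bpf_map, so after advancing r0 to the value the
 * inlined code loads that pointer, mirroring htab_of_map_lookup_elem().
 */
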
static void htab_of_map_free(struct bpf_map *map)
{
	bpf_map_meta_free(map->inner_map_meta);
	fd_htab_map_free(map);
}

const struct bpf_map_ops htab_of_maps_map_ops = {
	.map_alloc_check = fd_htab_map_alloc_check,
	.map_alloc = htab_of_map_alloc,
	.map_free = htab_of_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_of_map_lookup_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = htab_of_map_gen_lookup,
	.map_check_btf = map_check_no_btf,
};