// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
#include <linux/bpf_mem_alloc.h>

#define HTAB_CREATE_FLAG_MASK						\
	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)

#define BATCH_OPS(_name)			\
	.map_lookup_batch =			\
	_name##_map_lookup_batch,		\
	.map_lookup_and_delete_batch =		\
	_name##_map_lookup_and_delete_batch,	\
	.map_update_batch =			\
	generic_map_update_batch,		\
	.map_delete_batch =			\
	generic_map_delete_batch

/*
 * The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits where perf, kprobes and tracing
 * can be invoked from, the lock operations need to be protected against
 * deadlocks. Deadlocks can be caused by recursion and by an invocation in
 * the lock held section when functions which acquire this lock are invoked
 * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
 * variable bpf_prog_active, which prevents BPF programs attached to perf
 * events, kprobes and tracing from being invoked before the prior invocation
 * from one of these contexts completed. sys_bpf() uses the same mechanism
 * by pinning the task to the current CPU and incrementing the recursion
 * protection across the map operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure the deterministic behaviour these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
 * it was only safe to use a raw spinlock for a preallocated hash map on an
 * RT kernel, because there was no memory allocation within the lock held
 * sections. However, now that the hash map has been fully converted to use
 * bpf_mem_alloc, memory allocation for a non-preallocated hash map is
 * non-synchronous, so it is safe to always use a raw spinlock for the
 * bucket lock.
 */
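/*
 * Each bucket embeds its own raw spinlock and an hlist_nulls list head.
 * The nulls value of every list is initialized to the bucket index (see
 * htab_init_buckets() below), which lets the lockless walk in
 * lookup_nulls_elem_raw() detect that an element migrated to another
 * bucket while the list was being traversed and restart the walk.
 */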
struct bucket {
	struct hlist_nulls_head head;
	raw_spinlock_t raw_lock;
};

#define HASHTAB_MAP_LOCK_COUNT 8
#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)

struct bpf_htab {
	struct bpf_map map;
	struct bpf_mem_alloc ma;
	struct bpf_mem_alloc pcpu_ma;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	struct htab_elem *__percpu *extra_elems;
	/* number of elements in non-preallocated hashtable are kept
	 * in either pcount or count
	 */
	struct percpu_counter pcount;
	atomic_t count;
	bool use_percpu_counter;
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
	u32 hashrnd;
	struct lock_class_key lockdep_key;
	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_nulls_node hash_node;
		struct {
			void *padding;
			union {
				struct pcpu_freelist_node fnode;
				struct htab_elem *batch_flink;
			};
		};
	};
	union {
		/* pointer to per-cpu pointer */
		void *ptr_to_pptr;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[] __aligned(8);
};

static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}

static void htab_init_buckets(struct bpf_htab *htab)
{
	unsigned int i;

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		raw_spin_lock_init(&htab->buckets[i].raw_lock);
		lockdep_set_class(&htab->buckets[i].raw_lock,
				  &htab->lockdep_key);
		cond_resched();
	}
}

static inline int htab_lock_bucket(const struct bpf_htab *htab,
				   struct bucket *b, u32 hash,
				   unsigned long *pflags)
{
	unsigned long flags;

	hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);

	preempt_disable();
	local_irq_save(flags);
	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
		__this_cpu_dec(*(htab->map_locked[hash]));
		local_irq_restore(flags);
		preempt_enable();
		return -EBUSY;
	}

	raw_spin_lock(&b->raw_lock);
	*pflags = flags;

	return 0;
}

static inline void htab_unlock_bucket(const struct bpf_htab *htab,
				      struct bucket *b, u32 hash,
				      unsigned long flags)
{
	hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
	raw_spin_unlock(&b->raw_lock);
	__this_cpu_dec(*(htab->map_locked[hash]));
	local_irq_restore(flags);
	preempt_enable();
}

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}
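/*
 * Note on element layout: for per-cpu maps the value area of a htab_elem
 * does not hold the value itself but a pointer to the per-cpu allocation,
 * stored and loaded by htab_elem_set_ptr()/htab_elem_get_ptr() above.
 * Regular maps keep the value inline at l->key + round_up(key_size, 8).
 */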
static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
{
	return *(void **)(l->key + roundup(map->key_size, 8));
}

static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}

static bool htab_has_extra_elems(struct bpf_htab *htab)
{
	return !htab_is_percpu(htab) && !htab_is_lru(htab);
}

static void htab_free_prealloced_timers(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (!btf_record_has_field(htab->map.record, BPF_TIMER))
		return;
	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
		cond_resched();
	}
}

static void htab_free_prealloced_fields(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (IS_ERR_OR_NULL(htab->map.record))
		return;
	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();
	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		if (htab_is_percpu(htab)) {
			void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
			int cpu;

			for_each_possible_cpu(cpu) {
				bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
				cond_resched();
			}
		} else {
			bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
			cond_resched();
		}
		cond_resched();
	}
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
		cond_resched();
	}
free_elems:
	bpf_map_area_free(htab->elems);
}

/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
 * (bucket_lock). If both locks need to be acquired together, the lock
 * order is always lru_lock -> bucket_lock and this only happens in
 * bpf_lru_list.c logic. For example, certain code path of
 * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
 * will acquire lru_lock first followed by acquiring bucket_lock.
 *
 * In hashtab.c, to avoid deadlock, lock acquisition of
 * bucket_lock followed by lru_lock is not allowed. In such cases,
 * bucket_lock needs to be released first before acquiring lru_lock.
 */
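/*
 * This ordering is why the LRU update and batch paths below pop a free
 * node via prealloc_lru_pop() before taking the bucket lock, and defer
 * htab_lru_push_free()/bpf_lru_push_free() until after the bucket lock
 * has been released.
 */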
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		bpf_map_inc_elem_count(&htab->map);
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}

static int prealloc_init(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int err = -ENOMEM, i;

	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
					 htab->map.numa_node);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < num_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
					    GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
		cond_resched();
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, num_entries);
	else
		pcpu_freelist_populate(&htab->freelist,
				       htab->elems + offsetof(struct htab_elem, fnode),
				       htab->elem_size, num_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
	struct htab_elem *__percpu *pptr, *l_new;
	struct pcpu_freelist_node *l;
	int cpu;

	pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
				    GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		l = pcpu_freelist_pop(&htab->freelist);
		/* pop will succeed, since prealloc_init()
		 * preallocated extra num_possible_cpus elements
		 */
		l_new = container_of(l, struct htab_elem, fnode);
		*per_cpu_ptr(pptr, cpu) = l_new;
	}
	htab->extra_elems = pptr;
	return 0;
}

/* Called from syscall */
static int htab_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu. percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
	int numa_node = bpf_map_attr_numa_node(attr);

	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (zero_seed && !capable(CAP_SYS_ADMIN))
		/* Guard against local DoS, and discourage production use. */
		return -EPERM;

	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags))
		return -EINVAL;

	if (!lru && percpu_lru)
		return -EINVAL;

	if (lru && !prealloc)
		return -ENOTSUPP;

	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
		return -EINVAL;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	if (attr->max_entries == 0 || attr->key_size == 0 ||
	    attr->value_size == 0)
		return -EINVAL;

	if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
	    sizeof(struct htab_elem))
		/* if key_size + value_size is bigger, the user space won't be
		 * able to access the elements via bpf syscall. This check
		 * also makes sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		return -E2BIG;

	return 0;
}

static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu. percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;

	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	lockdep_register_key(&htab->lockdep_key);

	bpf_map_init_from_attr(&htab->map, attr);

	if (percpu_lru) {
		/* ensure each CPU's lru list has >=1 elements.
		 * since we are at it, make each lru list have the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2; roundup_pow_of_two() can overflow
	 * into UB on 32-bit arches, so check that first
	 */
	err = -E2BIG;
	if (htab->map.max_entries > 1UL << 31)
		goto free_htab;

	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	/* check for u32 overflow */
	if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	err = bpf_map_init_elem_count(&htab->map);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket),
					   htab->map.numa_node);
	if (!htab->buckets)
		goto free_elem_count;

	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
		htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
							   sizeof(int),
							   sizeof(int),
							   GFP_USER);
		if (!htab->map_locked[i])
			goto free_map_locked;
	}

	if (htab->map.map_flags & BPF_F_ZERO_SEED)
		htab->hashrnd = 0;
	else
		htab->hashrnd = get_random_u32();

	htab_init_buckets(htab);

	/* compute_batch_value() computes batch value as num_online_cpus() * 2
	 * and __percpu_counter_compare() needs
	 * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
	 * for percpu_counter to be faster than atomic_t. In practice the average bpf
	 * hash map size is 10k, which means that a system with 64 cpus will fill
	 * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
	 * define our own batch count as 32 then 10k hash map can be filled up to 80%:
	 * 10k - 8k > 32 _batch_ * 64 _cpus_
	 * and __percpu_counter_compare() will still be fast. At that point hash map
	 * collisions will dominate its performance anyway. Assume that hash map filled
	 * to 50+% isn't going to be O(1) and use the following formula to choose
	 * between percpu_counter and atomic_t.
	 */
#define PERCPU_COUNTER_BATCH 32
	if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
		htab->use_percpu_counter = true;

	if (htab->use_percpu_counter) {
		err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
		if (err)
			goto free_map_locked;
	}

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_map_locked;

		if (!percpu && !lru) {
			/* lru itself can remove the least used element, so
			 * there is no need for an extra elem during map_update.
			 */
			err = alloc_extra_elems(htab);
			if (err)
				goto free_prealloc;
		}
	} else {
		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
		if (err)
			goto free_map_locked;
		if (percpu) {
			err = bpf_mem_alloc_init(&htab->pcpu_ma,
						 round_up(htab->map.value_size, 8), true);
			if (err)
				goto free_map_locked;
		}
	}

	return &htab->map;

free_prealloc:
	prealloc_destroy(htab);
free_map_locked:
	if (htab->use_percpu_counter)
		percpu_counter_destroy(&htab->pcount);
	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
		free_percpu(htab->map_locked[i]);
	bpf_map_area_free(htab->buckets);
	bpf_mem_alloc_destroy(&htab->pcpu_ma);
	bpf_mem_alloc_destroy(&htab->ma);
free_elem_count:
	bpf_map_free_elem_count(&htab->map);
free_htab:
	lockdep_unregister_key(&htab->lockdep_key);
	bpf_map_area_free(htab);
	return ERR_PTR(err);
}

static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
	if (likely(key_len % 4 == 0))
		return jhash2(key, key_len / 4, hashrnd);
	return jhash(key, key_len, hashrnd);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

/* this lookup function can only be called with bucket lock taken */
static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* can be called without bucket lock. it will repeat the loop in
 * the unlikely event when elements moved from one bucket into another
 * while link list is being walked
 */
static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
					       u32 hash, void *key,
					       u32 key_size, u32 n_buckets)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

again:
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
		goto again;

	return NULL;
}

/* Called from syscall or from eBPF program directly, so
 * arguments have to match bpf_map_lookup_elem() exactly.
 * The return value is adjusted by BPF instructions
 * in htab_map_gen_lookup().
 */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

/* inline bpf_map_lookup_elem() call.
 * Instead of:
 * bpf_prog
 *   bpf_map_lookup_elem
 *     map->ops->map_lookup_elem
 *       htab_map_lookup_elem
 *         __htab_map_lookup_elem
 * do:
 * bpf_prog
 *   __htab_map_lookup_elem
 */
static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
							void *key, const bool mark)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		if (mark)
			bpf_lru_node_set_ref(&l->lru_node);
		return l->key + round_up(map->key_size, 8);
	}

	return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, true);
}

static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, false);
}

static int htab_lru_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int ref_reg = BPF_REG_1;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
			      offsetof(struct htab_elem, lru_node) +
			      offsetof(struct bpf_lru_node, ref));
	*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
	*insn++ = BPF_ST_MEM(BPF_B, ret,
			     offsetof(struct htab_elem, lru_node) +
			     offsetof(struct bpf_lru_node, ref),
			     1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

static void check_and_free_fields(struct bpf_htab *htab,
				  struct htab_elem *elem)
{
	if (htab_is_percpu(htab)) {
		void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
		int cpu;

		for_each_possible_cpu(cpu)
			bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
	} else {
		void *map_value = elem->key + round_up(htab->map.key_size, 8);

		bpf_obj_free_fields(htab->map.record, map_value);
	}
}
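/*
 * Note: check_and_free_fields() releases the special fields recorded in
 * map->record (kptrs, timers, etc.). For per-cpu maps every CPU's copy of
 * the value carries its own special fields, hence the per-CPU loop above.
 */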
/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = arg;
	struct htab_elem *l = NULL, *tgt_l;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags;
	struct bucket *b;
	int ret;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
	if (ret)
		return false;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l == tgt_l) {
			hlist_nulls_del_rcu(&l->hash_node);
			check_and_free_fields(htab, l);
			bpf_map_dec_elem_count(&htab->map);
			break;
		}

	htab_unlock_bucket(htab, b, tgt_l->hash, flags);

	return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i = 0;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	if (!key)
		goto find_first_elem;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	if (!l)
		goto find_first_elem;

	/* key was found, get next key in the same bucket */
	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
					struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
						struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	check_and_free_fields(htab, l);
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
	bpf_mem_cache_free(&htab->ma, l);
}

static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
{
	struct bpf_map *map = &htab->map;
	void *ptr;

	if (map->ops->map_fd_put_ptr) {
		ptr = fd_htab_map_get_ptr(map, l);
		map->ops->map_fd_put_ptr(map, ptr, true);
	}
}

static bool is_map_full(struct bpf_htab *htab)
{
	if (htab->use_percpu_counter)
		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
						PERCPU_COUNTER_BATCH) >= 0;
	return atomic_read(&htab->count) >= htab->map.max_entries;
}

static void inc_elem_count(struct bpf_htab *htab)
{
	bpf_map_inc_elem_count(&htab->map);

	if (htab->use_percpu_counter)
		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
	else
		atomic_inc(&htab->count);
}

static void dec_elem_count(struct bpf_htab *htab)
{
	bpf_map_dec_elem_count(&htab->map);

	if (htab->use_percpu_counter)
		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
	else
		atomic_dec(&htab->count);
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	htab_put_fd_value(htab, l);

	if (htab_is_prealloc(htab)) {
		bpf_map_dec_elem_count(&htab->map);
		check_and_free_fields(htab, l);
		__pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		dec_elem_count(htab);
		htab_elem_free(htab, l);
	}
}

static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
			off += size;
		}
	}
}

static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	/* When not setting the initial value on all cpus, zero-fill element
	 * values for other cpus. Otherwise, bpf program has no way to ensure
	 * known initial values for cpus other than current one
	 * (onallcpus=false always when coming from bpf prog).
	 */
	if (!onallcpus) {
		int current_cpu = raw_smp_processor_id();
		int cpu;

		for_each_possible_cpu(cpu) {
			if (cpu == current_cpu)
				copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
			else /* Since elem is preallocated, we cannot touch special fields */
				zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
		}
	} else {
		pcpu_copy_value(htab, pptr, value, onallcpus);
	}
}

static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
	       BITS_PER_LONG == 64;
}

static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 struct htab_elem *old_elem)
{
	u32 size = htab->map.value_size;
	bool prealloc = htab_is_prealloc(htab);
	struct htab_elem *l_new, **pl_new;
	void __percpu *pptr;

	if (prealloc) {
		if (old_elem) {
			/* if we're updating the existing element,
			 * use per-cpu extra elems to avoid freelist_pop/push
			 */
			pl_new = this_cpu_ptr(htab->extra_elems);
			l_new = *pl_new;
			htab_put_fd_value(htab, old_elem);
			*pl_new = old_elem;
		} else {
			struct pcpu_freelist_node *l;

			l = __pcpu_freelist_pop(&htab->freelist);
			if (!l)
				return ERR_PTR(-E2BIG);
			l_new = container_of(l, struct htab_elem, fnode);
			bpf_map_inc_elem_count(&htab->map);
		}
	} else {
		if (is_map_full(htab))
			if (!old_elem)
				/* when map is full and update() is replacing
				 * old element, it's ok to allocate, since
				 * old element will be freed immediately.
				 * Otherwise return an error
				 */
				return ERR_PTR(-E2BIG);
		inc_elem_count(htab);
		l_new = bpf_mem_cache_alloc(&htab->ma);
		if (!l_new) {
			l_new = ERR_PTR(-ENOMEM);
			goto dec_count;
		}
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
			if (!pptr) {
				bpf_mem_cache_free(&htab->ma, l_new);
				l_new = ERR_PTR(-ENOMEM);
				goto dec_count;
			}
			l_new->ptr_to_pptr = pptr;
			pptr = *(void **)pptr;
		}

		pcpu_init_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else if (fd_htab_map_needs_adjust(htab)) {
		size = round_up(size, 8);
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	} else {
		copy_map_value(&htab->map,
			       l_new->key + round_up(key_size, 8),
			       value);
	}

	l_new->hash = hash;
	return l_new;
dec_count:
	dec_elem_count(htab);
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}

/* Called from syscall or from eBPF program */
static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	if (unlikely(map_flags & BPF_F_LOCK)) {
		if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
			return -EINVAL;
		/* find an element without taking the bucket lock */
		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
					      htab->n_buckets);
		ret = check_flags(htab, l_old, map_flags);
		if (ret)
			return ret;
		if (l_old) {
			/* grab the element lock and update value in place */
			copy_map_value_locked(map,
					      l_old->key + round_up(key_size, 8),
					      value, false);
			return 0;
		}
		/* fall through, grab the bucket lock and lookup again.
		 * 99.9% chance that the element won't be found,
		 * but second lookup under lock has to be done.
		 */
	}

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
		/* first lookup without the bucket lock didn't find the element,
		 * but second lookup with the bucket lock found it.
		 * This case is highly unlikely, but has to be dealt with:
		 * grab the element lock in addition to the bucket lock
		 * and update element in place
		 */
		copy_map_value_locked(map,
				      l_old->key + round_up(key_size, 8),
				      value, false);
		ret = 0;
		goto err;
	}

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
				l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_nulls_del_rcu(&l_old->hash_node);
		if (!htab_is_prealloc(htab))
			free_htab_elem(htab, l_old);
		else
			check_and_free_fields(htab, l_old);
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
{
	check_and_free_fields(htab, elem);
	bpf_map_dec_elem_count(&htab->map);
	bpf_lru_push_free(&htab->lru, &elem->lru_node);
}

static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				     u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	copy_map_value(&htab->map,
		       l_new->key + round_up(map->key_size, 8), value);

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		goto err_lock_bucket;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_nulls_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	htab_unlock_bucket(htab, b, hash, flags);

err_lock_bucket:
	if (ret)
		htab_lru_push_free(htab, l_new);
	else if (l_old)
		htab_lru_push_free(htab, l_old);

	return ret;
}

static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					  void *value, u64 map_flags,
					  bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus, NULL);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					      void *value, u64 map_flags,
					      bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		goto err_lock_bucket;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
err_lock_bucket:
	if (l_new) {
		bpf_map_dec_elem_count(&htab->map);
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	}
	return ret;
}

static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					    void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}

/* Called from syscall or from eBPF program */
static long htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_nulls_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
	} else {
		ret = -ENOENT;
	}

	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l)
		hlist_nulls_del_rcu(&l->hash_node);
	else
		ret = -ENOENT;

	htab_unlock_bucket(htab, b, hash, flags);
	if (l)
		htab_lru_push_free(htab, l);
	return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	/* It's called from a worker thread, so disable migration here,
	 * since bpf_mem_cache_free() relies on that.
	 */
	migrate_disable();
	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			hlist_nulls_del_rcu(&l->hash_node);
			htab_elem_free(htab, l);
		}
	}
	migrate_enable();
}

static void htab_free_malloced_timers(struct bpf_htab *htab)
{
	int i;

	rcu_read_lock();
	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry(l, n, head, hash_node) {
			/* We only free timer on uref dropping to zero */
			bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
		}
		cond_resched_rcu();
	}
	rcu_read_unlock();
}

static void htab_map_free_timers(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* We only free timer on uref dropping to zero */
	if (!btf_record_has_field(htab->map.record, BPF_TIMER))
		return;
	if (!htab_is_prealloc(htab))
		htab_free_malloced_timers(htab);
	else
		htab_free_prealloced_timers(htab);
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int i;

	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
	 * There is no need to synchronize_rcu() here to protect map elements.
	 */

	/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
	 * underneath and is responsible for waiting for callbacks to finish
	 * during bpf_mem_alloc_destroy().
	 */
	if (!htab_is_prealloc(htab)) {
		delete_all_elements(htab);
	} else {
		htab_free_prealloced_fields(htab);
		prealloc_destroy(htab);
	}

	bpf_map_free_elem_count(map);
	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	bpf_mem_alloc_destroy(&htab->pcpu_ma);
	bpf_mem_alloc_destroy(&htab->ma);
	if (htab->use_percpu_counter)
		percpu_counter_destroy(&htab->pcount);
	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
		free_percpu(htab->map_locked[i]);
	lockdep_unregister_key(&htab->lockdep_key);
	bpf_map_area_free(htab);
}

static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
				   struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = htab_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": ");
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_puts(m, "\n");

	rcu_read_unlock();
}

static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
					     void *value, bool is_lru_map,
					     bool is_percpu, u64 flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	unsigned long bflags;
	struct htab_elem *l;
	u32 hash, key_size;
	struct bucket *b;
	int ret;

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &bflags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);
	if (!l) {
		ret = -ENOENT;
	} else {
		if (is_percpu) {
			u32 roundup_value_size = round_up(map->value_size, 8);
			void __percpu *pptr;
			int off = 0, cpu;

			pptr = htab_elem_get_ptr(l, key_size);
			for_each_possible_cpu(cpu) {
				copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
				check_and_init_map_value(&htab->map, value + off);
				off += roundup_value_size;
			}
		} else {
			u32 roundup_key_size = round_up(map->key_size, 8);

			if (flags & BPF_F_LOCK)
				copy_map_value_locked(map, value, l->key +
						      roundup_key_size,
						      true);
			else
				copy_map_value(map, value, l->key +
					       roundup_key_size);
			/* Zeroing special fields in the temp buffer */
			check_and_init_map_value(map, value);
		}

		hlist_nulls_del_rcu(&l->hash_node);
		if (!is_lru_map)
			free_htab_elem(htab, l);
	}

	htab_unlock_bucket(htab, b, hash, bflags);

	if (is_lru_map && l)
		htab_lru_push_free(htab, l);

	return ret;
}

static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
					   void *value, u64 flags)
{
	return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
						 flags);
}

static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
						  void *key, void *value,
						  u64 flags)
{
	return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
						 flags);
}

static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
					       void *value, u64 flags)
{
	return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
						 flags);
}

static int
htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
					   void *key, void *value,
					   u64 flags)
{
	return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
						 flags);
}

static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
				   const union bpf_attr *attr,
				   union bpf_attr __user *uattr,
				   bool do_delete, bool is_lru_map,
				   bool is_percpu)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
	u32 batch, max_count, size, bucket_size, map_id;
	struct htab_elem *node_to_free = NULL;
	u64 elem_map_flags, map_flags;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags = 0;
	bool locked = false;
	struct htab_elem *l;
	struct bucket *b;
	int ret = 0;

	elem_map_flags = attr->batch.elem_flags;
	if ((elem_map_flags & ~BPF_F_LOCK) ||
	    ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
		return -EINVAL;

	map_flags = attr->batch.flags;
	if (map_flags)
		return -EINVAL;

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	if (put_user(0, &uattr->batch.count))
		return -EFAULT;

	batch = 0;
	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
		return -EFAULT;

	if (batch >= htab->n_buckets)
		return -ENOENT;

	key_size = htab->map.key_size;
	roundup_key_size = round_up(htab->map.key_size, 8);
	value_size = htab->map.value_size;
	size = round_up(value_size, 8);
	if (is_percpu)
		value_size = size * num_possible_cpus();
	total = 0;
	/* while experimenting with hash tables with sizes ranging from 10 to
	 * 1000, it was observed that a bucket can have up to 5 entries.
	 */
	bucket_size = 5;

alloc:
	/* We cannot do copy_from_user or copy_to_user inside
	 * the rcu_read_lock. Allocate enough space here.
	 */
	keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
	values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
	if (!keys || !values) {
		ret = -ENOMEM;
		goto after_loop;
	}

again:
	bpf_disable_instrumentation();
	rcu_read_lock();
again_nocopy:
	dst_key = keys;
	dst_val = values;
	b = &htab->buckets[batch];
	head = &b->head;
	/* do not grab the lock unless need it (bucket_cnt > 0). */
	if (locked) {
		ret = htab_lock_bucket(htab, b, batch, &flags);
		if (ret) {
			rcu_read_unlock();
			bpf_enable_instrumentation();
			goto after_loop;
		}
	}

	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		bucket_cnt++;

	if (bucket_cnt && !locked) {
		locked = true;
		goto again_nocopy;
	}

	if (bucket_cnt > (max_count - total)) {
		if (total == 0)
			ret = -ENOSPC;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		goto after_loop;
	}

	if (bucket_cnt > bucket_size) {
		bucket_size = bucket_cnt;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		kvfree(keys);
		kvfree(values);
		goto alloc;
	}

	/* Next block is only safe to run if you have grabbed the lock */
	if (!locked)
		goto next_batch;

	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
		memcpy(dst_key, l->key, key_size);

		if (is_percpu) {
			int off = 0, cpu;
			void __percpu *pptr;

			pptr = htab_elem_get_ptr(l, map->key_size);
			for_each_possible_cpu(cpu) {
				copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
				check_and_init_map_value(&htab->map, dst_val + off);
				off += size;
			}
		} else {
			value = l->key + roundup_key_size;
			if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
				struct bpf_map **inner_map = value;

				/* Actual value is the id of the inner map */
				map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
				value = &map_id;
			}

			if (elem_map_flags & BPF_F_LOCK)
				copy_map_value_locked(map, dst_val, value,
						      true);
			else
				copy_map_value(map, dst_val, value);
			/* Zeroing special fields in the temp buffer */
			check_and_init_map_value(map, dst_val);
		}
		if (do_delete) {
			hlist_nulls_del_rcu(&l->hash_node);

			/* bpf_lru_push_free() will acquire lru_lock, which
			 * may cause deadlock. See comments in function
			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
			 * after releasing the bucket lock.
			 */
			if (is_lru_map) {
				l->batch_flink = node_to_free;
				node_to_free = l;
			} else {
				free_htab_elem(htab, l);
			}
		}
		dst_key += key_size;
		dst_val += value_size;
	}

	htab_unlock_bucket(htab, b, batch, flags);
	locked = false;

	while (node_to_free) {
		l = node_to_free;
		node_to_free = node_to_free->batch_flink;
		htab_lru_push_free(htab, l);
	}

next_batch:
	/* If we are not copying data, we can go to next bucket and avoid
	 * unlocking the rcu.
	 */
	if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
		batch++;
		goto again_nocopy;
	}

	rcu_read_unlock();
	bpf_enable_instrumentation();
	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
	    key_size * bucket_cnt) ||
	    copy_to_user(uvalues + total * value_size, values,
	    value_size * bucket_cnt))) {
		ret = -EFAULT;
		goto after_loop;
	}

	total += bucket_cnt;
	batch++;
	if (batch >= htab->n_buckets) {
		ret = -ENOENT;
		goto after_loop;
	}
	goto again;

after_loop:
	if (ret == -EFAULT)
		goto out;

	/* copy # of entries and next batch */
	ubatch = u64_to_user_ptr(attr->batch.out_batch);
	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
	    put_user(total, &uattr->batch.count))
		ret = -EFAULT;

out:
	kvfree(keys);
	kvfree(values);
	return ret;
}

static int
htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, true);
}

static int
htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					const union bpf_attr *attr,
					union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, true);
}

static int
htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
		      union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, false);
}

static int
htab_map_lookup_and_delete_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, false);
}

static int
htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, true);
}

static int
htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					    const union bpf_attr *attr,
					    union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, true);
}

static int
htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, false);
}

static int
htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
				     const union bpf_attr *attr,
				     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, false);
}

struct bpf_iter_seq_hash_map_info {
	struct bpf_map *map;
	struct bpf_htab *htab;
	void *percpu_value_buf; // non-zero means percpu hash
	u32 bucket_id;
	u32 skip_elems;
};

static struct htab_elem *
bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
			   struct htab_elem *prev_elem)
{
	const struct bpf_htab *htab = info->htab;
	u32 skip_elems = info->skip_elems;
	u32 bucket_id = info->bucket_id;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	struct htab_elem *elem;
	struct bucket *b;
	u32 i, count;
1995 1996 if (bucket_id >= htab->n_buckets) 1997 return NULL; 1998 1999 /* try to find next elem in the same bucket */ 2000 if (prev_elem) { 2001 /* no update/deletion on this bucket, prev_elem should be still valid 2002 * and we won't skip elements. 2003 */ 2004 n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); 2005 elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); 2006 if (elem) 2007 return elem; 2008 2009 /* not found, unlock and go to the next bucket */ 2010 b = &htab->buckets[bucket_id++]; 2011 rcu_read_unlock(); 2012 skip_elems = 0; 2013 } 2014 2015 for (i = bucket_id; i < htab->n_buckets; i++) { 2016 b = &htab->buckets[i]; 2017 rcu_read_lock(); 2018 2019 count = 0; 2020 head = &b->head; 2021 hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { 2022 if (count >= skip_elems) { 2023 info->bucket_id = i; 2024 info->skip_elems = count; 2025 return elem; 2026 } 2027 count++; 2028 } 2029 2030 rcu_read_unlock(); 2031 skip_elems = 0; 2032 } 2033 2034 info->bucket_id = i; 2035 info->skip_elems = 0; 2036 return NULL; 2037 } 2038 2039 static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) 2040 { 2041 struct bpf_iter_seq_hash_map_info *info = seq->private; 2042 struct htab_elem *elem; 2043 2044 elem = bpf_hash_map_seq_find_next(info, NULL); 2045 if (!elem) 2046 return NULL; 2047 2048 if (*pos == 0) 2049 ++*pos; 2050 return elem; 2051 } 2052 2053 static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2054 { 2055 struct bpf_iter_seq_hash_map_info *info = seq->private; 2056 2057 ++*pos; 2058 ++info->skip_elems; 2059 return bpf_hash_map_seq_find_next(info, v); 2060 } 2061 2062 static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) 2063 { 2064 struct bpf_iter_seq_hash_map_info *info = seq->private; 2065 u32 roundup_key_size, roundup_value_size; 2066 struct bpf_iter__bpf_map_elem ctx = {}; 2067 struct bpf_map *map = info->map; 2068 struct bpf_iter_meta meta; 2069 int ret = 0, off = 0, cpu; 2070 struct bpf_prog *prog; 2071 void __percpu *pptr; 2072 2073 meta.seq = seq; 2074 prog = bpf_iter_get_info(&meta, elem == NULL); 2075 if (prog) { 2076 ctx.meta = &meta; 2077 ctx.map = info->map; 2078 if (elem) { 2079 roundup_key_size = round_up(map->key_size, 8); 2080 ctx.key = elem->key; 2081 if (!info->percpu_value_buf) { 2082 ctx.value = elem->key + roundup_key_size; 2083 } else { 2084 roundup_value_size = round_up(map->value_size, 8); 2085 pptr = htab_elem_get_ptr(elem, map->key_size); 2086 for_each_possible_cpu(cpu) { 2087 copy_map_value_long(map, info->percpu_value_buf + off, 2088 per_cpu_ptr(pptr, cpu)); 2089 check_and_init_map_value(map, info->percpu_value_buf + off); 2090 off += roundup_value_size; 2091 } 2092 ctx.value = info->percpu_value_buf; 2093 } 2094 } 2095 ret = bpf_iter_run_prog(prog, &ctx); 2096 } 2097 2098 return ret; 2099 } 2100 2101 static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) 2102 { 2103 return __bpf_hash_map_seq_show(seq, v); 2104 } 2105 2106 static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) 2107 { 2108 if (!v) 2109 (void)__bpf_hash_map_seq_show(seq, NULL); 2110 else 2111 rcu_read_unlock(); 2112 } 2113 2114 static int bpf_iter_init_hash_map(void *priv_data, 2115 struct bpf_iter_aux_info *aux) 2116 { 2117 struct bpf_iter_seq_hash_map_info *seq_info = priv_data; 2118 struct bpf_map *map = aux->map; 2119 void *value_buf; 2120 u32 buf_size; 2121 2122 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 2123 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { 2124 
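/*
 * Illustrative sketch (an assumption, not part of this file): the BPF side of
 * the bpf_iter machinery wired up above. A program attached to
 * "iter/bpf_map_elem" is invoked once per element by __bpf_hash_map_seq_show()
 * and once more with NULL key/value at the end of the walk. It assumes a
 * vmlinux.h generated from kernel BTF and, purely for illustration, a map
 * with u32 keys and u64 values; the concrete map fd is supplied when the
 * iterator link is created.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/bpf_map_elem")
int dump_hash_elem(struct bpf_iter__bpf_map_elem *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	__u32 *key = ctx->key;
	__u64 *val = ctx->value;

	/* Final call: ctx->key/ctx->value stay NULL when elem == NULL. */
	if (!key || !val)
		return 0;

	BPF_SEQ_PRINTF(seq, "%u: %llu\n", *key, *val);
	return 0;
}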
buf_size = round_up(map->value_size, 8) * num_possible_cpus(); 2125 value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); 2126 if (!value_buf) 2127 return -ENOMEM; 2128 2129 seq_info->percpu_value_buf = value_buf; 2130 } 2131 2132 bpf_map_inc_with_uref(map); 2133 seq_info->map = map; 2134 seq_info->htab = container_of(map, struct bpf_htab, map); 2135 return 0; 2136 } 2137 2138 static void bpf_iter_fini_hash_map(void *priv_data) 2139 { 2140 struct bpf_iter_seq_hash_map_info *seq_info = priv_data; 2141 2142 bpf_map_put_with_uref(seq_info->map); 2143 kfree(seq_info->percpu_value_buf); 2144 } 2145 2146 static const struct seq_operations bpf_hash_map_seq_ops = { 2147 .start = bpf_hash_map_seq_start, 2148 .next = bpf_hash_map_seq_next, 2149 .stop = bpf_hash_map_seq_stop, 2150 .show = bpf_hash_map_seq_show, 2151 }; 2152 2153 static const struct bpf_iter_seq_info iter_seq_info = { 2154 .seq_ops = &bpf_hash_map_seq_ops, 2155 .init_seq_private = bpf_iter_init_hash_map, 2156 .fini_seq_private = bpf_iter_fini_hash_map, 2157 .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), 2158 }; 2159 2160 static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, 2161 void *callback_ctx, u64 flags) 2162 { 2163 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 2164 struct hlist_nulls_head *head; 2165 struct hlist_nulls_node *n; 2166 struct htab_elem *elem; 2167 u32 roundup_key_size; 2168 int i, num_elems = 0; 2169 void __percpu *pptr; 2170 struct bucket *b; 2171 void *key, *val; 2172 bool is_percpu; 2173 u64 ret = 0; 2174 2175 if (flags != 0) 2176 return -EINVAL; 2177 2178 is_percpu = htab_is_percpu(htab); 2179 2180 roundup_key_size = round_up(map->key_size, 8); 2181 /* disable migration so percpu value prepared here will be the 2182 * same as the one seen by the bpf program with bpf_map_lookup_elem(). 
2183 */ 2184 if (is_percpu) 2185 migrate_disable(); 2186 for (i = 0; i < htab->n_buckets; i++) { 2187 b = &htab->buckets[i]; 2188 rcu_read_lock(); 2189 head = &b->head; 2190 hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { 2191 key = elem->key; 2192 if (is_percpu) { 2193 /* current cpu value for percpu map */ 2194 pptr = htab_elem_get_ptr(elem, map->key_size); 2195 val = this_cpu_ptr(pptr); 2196 } else { 2197 val = elem->key + roundup_key_size; 2198 } 2199 num_elems++; 2200 ret = callback_fn((u64)(long)map, (u64)(long)key, 2201 (u64)(long)val, (u64)(long)callback_ctx, 0); 2202 /* return value: 0 - continue, 1 - stop and return */ 2203 if (ret) { 2204 rcu_read_unlock(); 2205 goto out; 2206 } 2207 } 2208 rcu_read_unlock(); 2209 } 2210 out: 2211 if (is_percpu) 2212 migrate_enable(); 2213 return num_elems; 2214 } 2215 2216 static u64 htab_map_mem_usage(const struct bpf_map *map) 2217 { 2218 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 2219 u32 value_size = round_up(htab->map.value_size, 8); 2220 bool prealloc = htab_is_prealloc(htab); 2221 bool percpu = htab_is_percpu(htab); 2222 bool lru = htab_is_lru(htab); 2223 u64 num_entries; 2224 u64 usage = sizeof(struct bpf_htab); 2225 2226 usage += sizeof(struct bucket) * htab->n_buckets; 2227 usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT; 2228 if (prealloc) { 2229 num_entries = map->max_entries; 2230 if (htab_has_extra_elems(htab)) 2231 num_entries += num_possible_cpus(); 2232 2233 usage += htab->elem_size * num_entries; 2234 2235 if (percpu) 2236 usage += value_size * num_possible_cpus() * num_entries; 2237 else if (!lru) 2238 usage += sizeof(struct htab_elem *) * num_possible_cpus(); 2239 } else { 2240 #define LLIST_NODE_SZ sizeof(struct llist_node) 2241 2242 num_entries = htab->use_percpu_counter ? 
2243 percpu_counter_sum(&htab->pcount) : 2244 atomic_read(&htab->count); 2245 usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries; 2246 if (percpu) { 2247 usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries; 2248 usage += value_size * num_possible_cpus() * num_entries; 2249 } 2250 } 2251 return usage; 2252 } 2253 2254 BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) 2255 const struct bpf_map_ops htab_map_ops = { 2256 .map_meta_equal = bpf_map_meta_equal, 2257 .map_alloc_check = htab_map_alloc_check, 2258 .map_alloc = htab_map_alloc, 2259 .map_free = htab_map_free, 2260 .map_get_next_key = htab_map_get_next_key, 2261 .map_release_uref = htab_map_free_timers, 2262 .map_lookup_elem = htab_map_lookup_elem, 2263 .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, 2264 .map_update_elem = htab_map_update_elem, 2265 .map_delete_elem = htab_map_delete_elem, 2266 .map_gen_lookup = htab_map_gen_lookup, 2267 .map_seq_show_elem = htab_map_seq_show_elem, 2268 .map_set_for_each_callback_args = map_set_for_each_callback_args, 2269 .map_for_each_callback = bpf_for_each_hash_elem, 2270 .map_mem_usage = htab_map_mem_usage, 2271 BATCH_OPS(htab), 2272 .map_btf_id = &htab_map_btf_ids[0], 2273 .iter_seq_info = &iter_seq_info, 2274 }; 2275 2276 const struct bpf_map_ops htab_lru_map_ops = { 2277 .map_meta_equal = bpf_map_meta_equal, 2278 .map_alloc_check = htab_map_alloc_check, 2279 .map_alloc = htab_map_alloc, 2280 .map_free = htab_map_free, 2281 .map_get_next_key = htab_map_get_next_key, 2282 .map_release_uref = htab_map_free_timers, 2283 .map_lookup_elem = htab_lru_map_lookup_elem, 2284 .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, 2285 .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, 2286 .map_update_elem = htab_lru_map_update_elem, 2287 .map_delete_elem = htab_lru_map_delete_elem, 2288 .map_gen_lookup = htab_lru_map_gen_lookup, 2289 .map_seq_show_elem = htab_map_seq_show_elem, 2290 .map_set_for_each_callback_args = map_set_for_each_callback_args, 2291 .map_for_each_callback = bpf_for_each_hash_elem, 2292 .map_mem_usage = htab_map_mem_usage, 2293 BATCH_OPS(htab_lru), 2294 .map_btf_id = &htab_map_btf_ids[0], 2295 .iter_seq_info = &iter_seq_info, 2296 }; 2297 2298 /* Called from eBPF program */ 2299 static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) 2300 { 2301 struct htab_elem *l = __htab_map_lookup_elem(map, key); 2302 2303 if (l) 2304 return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); 2305 else 2306 return NULL; 2307 } 2308 2309 static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) 2310 { 2311 struct htab_elem *l; 2312 2313 if (cpu >= nr_cpu_ids) 2314 return NULL; 2315 2316 l = __htab_map_lookup_elem(map, key); 2317 if (l) 2318 return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); 2319 else 2320 return NULL; 2321 } 2322 2323 static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) 2324 { 2325 struct htab_elem *l = __htab_map_lookup_elem(map, key); 2326 2327 if (l) { 2328 bpf_lru_node_set_ref(&l->lru_node); 2329 return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); 2330 } 2331 2332 return NULL; 2333 } 2334 2335 static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) 2336 { 2337 struct htab_elem *l; 2338 2339 if (cpu >= nr_cpu_ids) 2340 return NULL; 2341 2342 l = __htab_map_lookup_elem(map, key); 2343 if (l) { 2344 bpf_lru_node_set_ref(&l->lru_node); 2345 return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); 2346 } 
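/*
 * Illustrative sketch (not part of this file): a BPF program exercising two
 * of the element-access paths defined above, bpf_for_each_map_elem(), which
 * lands in bpf_for_each_hash_elem(), and bpf_map_lookup_percpu_elem(), which
 * lands in htab_percpu_map_lookup_percpu_elem(). The map layout (u32 key,
 * u64 per-CPU counter), the section name and the program shape are
 * assumptions for illustration only.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
	__uint(max_entries, 1024);
	__type(key, __u32);
	__type(value, __u64);
} counters SEC(".maps");

struct walk_ctx {
	__u64 sum;
};

/* Callback for bpf_for_each_map_elem(); for a per-CPU hash 'value' is this
 * CPU's slot, exactly as bpf_for_each_hash_elem() prepares it. Returning 1
 * would stop the walk early.
 */
static long add_elem(struct bpf_map *map, const void *key, void *value, void *data)
{
	struct walk_ctx *w = data;

	w->sum += *(__u64 *)value;
	return 0;
}

SEC("tc")
int walk_counters(struct __sk_buff *skb)
{
	struct walk_ctx w = { .sum = 0 };
	__u32 key = 0;
	__u64 *cpu0_val;

	/* Walk every element, seeing the current CPU's value for each one. */
	bpf_for_each_map_elem(&counters, add_elem, &w, 0);

	/* Read one key's slot for a specific CPU (CPU 0 here). */
	cpu0_val = bpf_map_lookup_percpu_elem(&counters, &key, 0);
	if (cpu0_val)
		w.sum += *cpu0_val;

	return 0;
}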
2347 2348 return NULL; 2349 } 2350 2351 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) 2352 { 2353 struct htab_elem *l; 2354 void __percpu *pptr; 2355 int ret = -ENOENT; 2356 int cpu, off = 0; 2357 u32 size; 2358 2359 /* per_cpu areas are zero-filled and bpf programs can only 2360 * access 'value_size' of them, so copying rounded areas 2361 * will not leak any kernel data 2362 */ 2363 size = round_up(map->value_size, 8); 2364 rcu_read_lock(); 2365 l = __htab_map_lookup_elem(map, key); 2366 if (!l) 2367 goto out; 2368 /* We do not mark LRU map element here in order to not mess up 2369 * eviction heuristics when user space does a map walk. 2370 */ 2371 pptr = htab_elem_get_ptr(l, map->key_size); 2372 for_each_possible_cpu(cpu) { 2373 copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); 2374 check_and_init_map_value(map, value + off); 2375 off += size; 2376 } 2377 ret = 0; 2378 out: 2379 rcu_read_unlock(); 2380 return ret; 2381 } 2382 2383 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, 2384 u64 map_flags) 2385 { 2386 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 2387 int ret; 2388 2389 rcu_read_lock(); 2390 if (htab_is_lru(htab)) 2391 ret = __htab_lru_percpu_map_update_elem(map, key, value, 2392 map_flags, true); 2393 else 2394 ret = __htab_percpu_map_update_elem(map, key, value, map_flags, 2395 true); 2396 rcu_read_unlock(); 2397 2398 return ret; 2399 } 2400 2401 static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, 2402 struct seq_file *m) 2403 { 2404 struct htab_elem *l; 2405 void __percpu *pptr; 2406 int cpu; 2407 2408 rcu_read_lock(); 2409 2410 l = __htab_map_lookup_elem(map, key); 2411 if (!l) { 2412 rcu_read_unlock(); 2413 return; 2414 } 2415 2416 btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); 2417 seq_puts(m, ": {\n"); 2418 pptr = htab_elem_get_ptr(l, map->key_size); 2419 for_each_possible_cpu(cpu) { 2420 seq_printf(m, "\tcpu%d: ", cpu); 2421 btf_type_seq_show(map->btf, map->btf_value_type_id, 2422 per_cpu_ptr(pptr, cpu), m); 2423 seq_puts(m, "\n"); 2424 } 2425 seq_puts(m, "}\n"); 2426 2427 rcu_read_unlock(); 2428 } 2429 2430 const struct bpf_map_ops htab_percpu_map_ops = { 2431 .map_meta_equal = bpf_map_meta_equal, 2432 .map_alloc_check = htab_map_alloc_check, 2433 .map_alloc = htab_map_alloc, 2434 .map_free = htab_map_free, 2435 .map_get_next_key = htab_map_get_next_key, 2436 .map_lookup_elem = htab_percpu_map_lookup_elem, 2437 .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, 2438 .map_update_elem = htab_percpu_map_update_elem, 2439 .map_delete_elem = htab_map_delete_elem, 2440 .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, 2441 .map_seq_show_elem = htab_percpu_map_seq_show_elem, 2442 .map_set_for_each_callback_args = map_set_for_each_callback_args, 2443 .map_for_each_callback = bpf_for_each_hash_elem, 2444 .map_mem_usage = htab_map_mem_usage, 2445 BATCH_OPS(htab_percpu), 2446 .map_btf_id = &htab_map_btf_ids[0], 2447 .iter_seq_info = &iter_seq_info, 2448 }; 2449 2450 const struct bpf_map_ops htab_lru_percpu_map_ops = { 2451 .map_meta_equal = bpf_map_meta_equal, 2452 .map_alloc_check = htab_map_alloc_check, 2453 .map_alloc = htab_map_alloc, 2454 .map_free = htab_map_free, 2455 .map_get_next_key = htab_map_get_next_key, 2456 .map_lookup_elem = htab_lru_percpu_map_lookup_elem, 2457 .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, 2458 .map_update_elem = htab_lru_percpu_map_update_elem, 2459 .map_delete_elem = 
htab_lru_map_delete_elem, 2460 .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, 2461 .map_seq_show_elem = htab_percpu_map_seq_show_elem, 2462 .map_set_for_each_callback_args = map_set_for_each_callback_args, 2463 .map_for_each_callback = bpf_for_each_hash_elem, 2464 .map_mem_usage = htab_map_mem_usage, 2465 BATCH_OPS(htab_lru_percpu), 2466 .map_btf_id = &htab_map_btf_ids[0], 2467 .iter_seq_info = &iter_seq_info, 2468 }; 2469 2470 static int fd_htab_map_alloc_check(union bpf_attr *attr) 2471 { 2472 if (attr->value_size != sizeof(u32)) 2473 return -EINVAL; 2474 return htab_map_alloc_check(attr); 2475 } 2476 2477 static void fd_htab_map_free(struct bpf_map *map) 2478 { 2479 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 2480 struct hlist_nulls_node *n; 2481 struct hlist_nulls_head *head; 2482 struct htab_elem *l; 2483 int i; 2484 2485 for (i = 0; i < htab->n_buckets; i++) { 2486 head = select_bucket(htab, i); 2487 2488 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { 2489 void *ptr = fd_htab_map_get_ptr(map, l); 2490 2491 map->ops->map_fd_put_ptr(map, ptr, false); 2492 } 2493 } 2494 2495 htab_map_free(map); 2496 } 2497 2498 /* only called from syscall */ 2499 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) 2500 { 2501 void **ptr; 2502 int ret = 0; 2503 2504 if (!map->ops->map_fd_sys_lookup_elem) 2505 return -ENOTSUPP; 2506 2507 rcu_read_lock(); 2508 ptr = htab_map_lookup_elem(map, key); 2509 if (ptr) 2510 *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); 2511 else 2512 ret = -ENOENT; 2513 rcu_read_unlock(); 2514 2515 return ret; 2516 } 2517 2518 /* only called from syscall */ 2519 int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, 2520 void *key, void *value, u64 map_flags) 2521 { 2522 void *ptr; 2523 int ret; 2524 u32 ufd = *(u32 *)value; 2525 2526 ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); 2527 if (IS_ERR(ptr)) 2528 return PTR_ERR(ptr); 2529 2530 ret = htab_map_update_elem(map, key, &ptr, map_flags); 2531 if (ret) 2532 map->ops->map_fd_put_ptr(map, ptr, false); 2533 2534 return ret; 2535 } 2536 2537 static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) 2538 { 2539 struct bpf_map *map, *inner_map_meta; 2540 2541 inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); 2542 if (IS_ERR(inner_map_meta)) 2543 return inner_map_meta; 2544 2545 map = htab_map_alloc(attr); 2546 if (IS_ERR(map)) { 2547 bpf_map_meta_free(inner_map_meta); 2548 return map; 2549 } 2550 2551 map->inner_map_meta = inner_map_meta; 2552 2553 return map; 2554 } 2555 2556 static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) 2557 { 2558 struct bpf_map **inner_map = htab_map_lookup_elem(map, key); 2559 2560 if (!inner_map) 2561 return NULL; 2562 2563 return READ_ONCE(*inner_map); 2564 } 2565 2566 static int htab_of_map_gen_lookup(struct bpf_map *map, 2567 struct bpf_insn *insn_buf) 2568 { 2569 struct bpf_insn *insn = insn_buf; 2570 const int ret = BPF_REG_0; 2571 2572 BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, 2573 (void *(*)(struct bpf_map *map, void *key))NULL)); 2574 *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); 2575 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); 2576 *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, 2577 offsetof(struct htab_elem, key) + 2578 round_up(map->key_size, 8)); 2579 *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); 2580 2581 return insn - insn_buf; 2582 } 2583 2584 static void htab_of_map_free(struct bpf_map *map) 2585 { 2586 bpf_map_meta_free(map->inner_map_meta); 2587 
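/*
 * Illustrative sketch (not part of this file): building a
 * BPF_MAP_TYPE_HASH_OF_MAPS from user space with libbpf. Map names, sizes and
 * the bpf_map_create() API (libbpf >= 0.7) are assumptions of this sketch.
 * Note the outer value_size must be sizeof(__u32), as fd_htab_map_alloc_check()
 * enforces: updates pass an inner map fd, while a syscall lookup reports the
 * inner map id via map_fd_sys_lookup_elem().
 */
#include <errno.h>
#include <unistd.h>
#include <bpf/bpf.h>

static int make_hash_of_maps(void)
{
	__u32 key = 1, inner_id = 0;
	int inner_fd, outer_fd = -1, err;

	/* Template inner map; real inner maps must stay compatible with it. */
	inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "inner",
				  sizeof(__u32), sizeof(__u64), 4, NULL);
	if (inner_fd < 0)
		return inner_fd;

	LIBBPF_OPTS(bpf_map_create_opts, opts, .inner_map_fd = inner_fd);
	outer_fd = bpf_map_create(BPF_MAP_TYPE_HASH_OF_MAPS, "outer",
				  sizeof(__u32), sizeof(__u32), 8, &opts);
	if (outer_fd < 0) {
		err = outer_fd;
		goto out;
	}

	/* The update value is the inner map's fd (bpf_fd_htab_map_update_elem) */
	err = bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_ANY);
	if (err)
		goto out;

	/* The syscall lookup value is the inner map's id */
	err = bpf_map_lookup_elem(outer_fd, &key, &inner_id);
out:
	close(inner_fd);
	if (outer_fd >= 0)
		close(outer_fd);
	return err;
}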
fd_htab_map_free(map); 2588 } 2589 2590 const struct bpf_map_ops htab_of_maps_map_ops = { 2591 .map_alloc_check = fd_htab_map_alloc_check, 2592 .map_alloc = htab_of_map_alloc, 2593 .map_free = htab_of_map_free, 2594 .map_get_next_key = htab_map_get_next_key, 2595 .map_lookup_elem = htab_of_map_lookup_elem, 2596 .map_delete_elem = htab_map_delete_elem, 2597 .map_fd_get_ptr = bpf_map_fd_get_ptr, 2598 .map_fd_put_ptr = bpf_map_fd_put_ptr, 2599 .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, 2600 .map_gen_lookup = htab_of_map_gen_lookup, 2601 .map_check_btf = map_check_no_btf, 2602 .map_mem_usage = htab_map_mem_usage, 2603 BATCH_OPS(htab), 2604 .map_btf_id = &htab_map_btf_ids[0], 2605 }; 2606
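/*
 * Illustrative sketch (not part of this file): reading one key of a per-CPU
 * hash from user space. bpf_map_lookup_elem() on a BPF_MAP_TYPE_PERCPU_HASH
 * fills one slot per possible CPU, each slot rounded up to 8 bytes, mirroring
 * the copy loop in bpf_percpu_hash_copy(). The map fd, the value size and the
 * u64-counter interpretation are placeholders.
 */
#include <errno.h>
#include <stdlib.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static int sum_percpu_u64(int map_fd, const void *key, __u32 value_size, __u64 *sum)
{
	__u32 slot = (value_size + 7) & ~7u;	/* round_up(value_size, 8) */
	int cpu, ncpus = libbpf_num_possible_cpus();
	char *values;
	int err;

	if (ncpus < 0)
		return ncpus;

	values = calloc(ncpus, slot);
	if (!values)
		return -ENOMEM;

	/* The kernel writes one rounded slot per possible CPU. */
	err = bpf_map_lookup_elem(map_fd, key, values);
	if (!err) {
		*sum = 0;
		for (cpu = 0; cpu < ncpus; cpu++)
			*sum += *(__u64 *)(values + cpu * slot);
	}

	free(values);
	return err;
}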