// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
#include <linux/bpf_mem_alloc.h>

#define HTAB_CREATE_FLAG_MASK						\
	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)

#define BATCH_OPS(_name)			\
	.map_lookup_batch =			\
	_name##_map_lookup_batch,		\
	.map_lookup_and_delete_batch =		\
	_name##_map_lookup_and_delete_batch,	\
	.map_update_batch =			\
	generic_map_update_batch,		\
	.map_delete_batch =			\
	generic_map_delete_batch

/*
 * The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits on where perf, kprobes and
 * tracing can be invoked from, the lock operations need to be protected
 * against deadlocks. Deadlocks can be caused by recursion and by an
 * invocation in the lock held section when functions which acquire this
 * lock are invoked from sys_bpf(). BPF recursion is prevented by
 * incrementing the per CPU variable bpf_prog_active, which prevents BPF
 * programs attached to perf events, kprobes and tracing from being
 * invoked before the prior invocation from one of these contexts has
 * completed. sys_bpf() uses the same mechanism by pinning the task to the
 * current CPU and incrementing the recursion protection across the map
 * operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure deterministic behaviour, these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
 * it was only safe to use a raw spinlock for a preallocated hash map on an
 * RT kernel, because there is no memory allocation within the lock held
 * sections. However, now that the hash map has been fully converted to use
 * bpf_mem_alloc, memory allocation for a non-preallocated hash map is no
 * longer done synchronously within the lock held section, so it is safe to
 * always use a raw spinlock for the bucket lock.
 */
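/* Roughly, each bucket below is an hlist_nulls list protected by a raw
 * spinlock, and htab_lock_bucket() additionally keeps a per-CPU counter
 * (map_locked) per group of buckets, so that a re-entrant update/delete
 * on the same CPU is rejected with -EBUSY instead of deadlocking.
 */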
struct bucket {
	struct hlist_nulls_head head;
	raw_spinlock_t raw_lock;
};

#define HASHTAB_MAP_LOCK_COUNT 8
#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)

struct bpf_htab {
	struct bpf_map map;
	struct bpf_mem_alloc ma;
	struct bpf_mem_alloc pcpu_ma;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	struct htab_elem *__percpu *extra_elems;
	/* number of elements in non-preallocated hashtable are kept
	 * in either pcount or count
	 */
	struct percpu_counter pcount;
	atomic_t count;
	bool use_percpu_counter;
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
	u32 hashrnd;
	struct lock_class_key lockdep_key;
	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_nulls_node hash_node;
		struct {
			void *padding;
			union {
				struct pcpu_freelist_node fnode;
				struct htab_elem *batch_flink;
			};
		};
	};
	union {
		/* pointer to per-cpu pointer */
		void *ptr_to_pptr;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[] __aligned(8);
};

static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}

static void htab_init_buckets(struct bpf_htab *htab)
{
	unsigned int i;

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		raw_spin_lock_init(&htab->buckets[i].raw_lock);
		lockdep_set_class(&htab->buckets[i].raw_lock,
				  &htab->lockdep_key);
		cond_resched();
	}
}

static inline int htab_lock_bucket(const struct bpf_htab *htab,
				   struct bucket *b, u32 hash,
				   unsigned long *pflags)
{
	unsigned long flags;

	hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);

	preempt_disable();
	local_irq_save(flags);
	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
		__this_cpu_dec(*(htab->map_locked[hash]));
		local_irq_restore(flags);
		preempt_enable();
		return -EBUSY;
	}

	raw_spin_lock(&b->raw_lock);
	*pflags = flags;

	return 0;
}

static inline void htab_unlock_bucket(const struct bpf_htab *htab,
				      struct bucket *b, u32 hash,
				      unsigned long flags)
{
	hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
	raw_spin_unlock(&b->raw_lock);
	__this_cpu_dec(*(htab->map_locked[hash]));
	local_irq_restore(flags);
	preempt_enable();
}

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

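/* For fd-based hash maps (e.g. BPF_MAP_TYPE_HASH_OF_MAPS) the value slot
 * behind the key holds a kernel pointer to the referenced object rather
 * than plain map data; the helper below simply reads back that pointer.
 */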
static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
{
	return *(void **)(l->key + roundup(map->key_size, 8));
}

static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}

static bool htab_has_extra_elems(struct bpf_htab *htab)
{
	return !htab_is_percpu(htab) && !htab_is_lru(htab);
}

static void htab_free_prealloced_timers(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (!btf_record_has_field(htab->map.record, BPF_TIMER))
		return;
	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
		cond_resched();
	}
}

static void htab_free_prealloced_fields(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (IS_ERR_OR_NULL(htab->map.record))
		return;
	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();
	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		if (htab_is_percpu(htab)) {
			void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
			int cpu;

			for_each_possible_cpu(cpu) {
				bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
				cond_resched();
			}
		} else {
			bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
			cond_resched();
		}
		cond_resched();
	}
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
		cond_resched();
	}
free_elems:
	bpf_map_area_free(htab->elems);
}

/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
 * (bucket_lock). If both locks need to be acquired together, the lock
 * order is always lru_lock -> bucket_lock and this only happens in
 * bpf_lru_list.c logic. For example, certain code path of
 * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
 * will acquire lru_lock first followed by acquiring bucket_lock.
 *
 * In hashtab.c, to avoid deadlock, lock acquisition of
 * bucket_lock followed by lru_lock is not allowed. In such cases,
 * bucket_lock needs to be released first before acquiring lru_lock.
 */
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		bpf_map_inc_elem_count(&htab->map);
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}

static int prealloc_init(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int err = -ENOMEM, i;

	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
					 htab->map.numa_node);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < num_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
					    GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
		cond_resched();
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, num_entries);
	else
		pcpu_freelist_populate(&htab->freelist,
				       htab->elems + offsetof(struct htab_elem, fnode),
				       htab->elem_size, num_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
	struct htab_elem *__percpu *pptr, *l_new;
	struct pcpu_freelist_node *l;
	int cpu;

	pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
				    GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		l = pcpu_freelist_pop(&htab->freelist);
		/* pop will succeed, since prealloc_init()
		 * preallocated extra num_possible_cpus elements
		 */
		l_new = container_of(l, struct htab_elem, fnode);
		*per_cpu_ptr(pptr, cpu) = l_new;
	}
	htab->extra_elems = pptr;
	return 0;
}

/* Called from syscall */
static int htab_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
	int numa_node = bpf_map_attr_numa_node(attr);

	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (zero_seed && !capable(CAP_SYS_ADMIN))
		/* Guard against local DoS, and discourage production use. */
		return -EPERM;

	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags))
		return -EINVAL;

	if (!lru && percpu_lru)
		return -EINVAL;

	if (lru && !prealloc)
		return -ENOTSUPP;

	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
		return -EINVAL;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	if (attr->max_entries == 0 || attr->key_size == 0 ||
	    attr->value_size == 0)
		return -EINVAL;

	if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
	    sizeof(struct htab_elem))
		/* if key_size + value_size is bigger, the user space won't be
		 * able to access the elements via bpf syscall. This check
		 * also makes sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		return -E2BIG;

	return 0;
}

static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;

	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	lockdep_register_key(&htab->lockdep_key);

	bpf_map_init_from_attr(&htab->map, attr);

	if (percpu_lru) {
		/* ensure each CPU's lru list has >= 1 element.
		 * since we are at it, make each lru list have the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	err = -E2BIG;
	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	err = bpf_map_init_elem_count(&htab->map);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket),
					   htab->map.numa_node);
	if (!htab->buckets)
		goto free_elem_count;

	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
		htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
							   sizeof(int),
							   sizeof(int),
							   GFP_USER);
		if (!htab->map_locked[i])
			goto free_map_locked;
	}

	if (htab->map.map_flags & BPF_F_ZERO_SEED)
		htab->hashrnd = 0;
	else
		htab->hashrnd = get_random_u32();

	htab_init_buckets(htab);

	/* compute_batch_value() computes batch value as num_online_cpus() * 2
	 * and __percpu_counter_compare() needs
	 * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
	 * for percpu_counter to be faster than atomic_t. In practice the average bpf
	 * hash map size is 10k, which means that a system with 64 cpus will fill
	 * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
	 * define our own batch count as 32 then 10k hash map can be filled up to 80%:
	 * 10k - 8k > 32 _batch_ * 64 _cpus_
	 * and __percpu_counter_compare() will still be fast. At that point hash map
	 * collisions will dominate its performance anyway. Assume that hash map filled
	 * to 50+% isn't going to be O(1) and use the following formula to choose
	 * between percpu_counter and atomic_t.
	 */
#define PERCPU_COUNTER_BATCH 32
	if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
		htab->use_percpu_counter = true;

	if (htab->use_percpu_counter) {
		err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
		if (err)
			goto free_map_locked;
	}

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_map_locked;

		if (!percpu && !lru) {
			/* lru itself can remove the least used element, so
			 * there is no need for an extra elem during map_update.
			 */
			err = alloc_extra_elems(htab);
			if (err)
				goto free_prealloc;
		}
	} else {
		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
		if (err)
			goto free_map_locked;
		if (percpu) {
			err = bpf_mem_alloc_init(&htab->pcpu_ma,
						 round_up(htab->map.value_size, 8), true);
			if (err)
				goto free_map_locked;
		}
	}

	return &htab->map;

free_prealloc:
	prealloc_destroy(htab);
free_map_locked:
	if (htab->use_percpu_counter)
		percpu_counter_destroy(&htab->pcount);
	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
		free_percpu(htab->map_locked[i]);
	bpf_map_area_free(htab->buckets);
	bpf_mem_alloc_destroy(&htab->pcpu_ma);
	bpf_mem_alloc_destroy(&htab->ma);
free_elem_count:
	bpf_map_free_elem_count(&htab->map);
free_htab:
	lockdep_unregister_key(&htab->lockdep_key);
	bpf_map_area_free(htab);
	return ERR_PTR(err);
}

static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
	if (likely(key_len % 4 == 0))
		return jhash2(key, key_len / 4, hashrnd);
	return jhash(key, key_len, hashrnd);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

/* this lookup function can only be called with bucket lock taken */
static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* can be called without bucket lock. it will repeat the loop in
 * the unlikely event when elements moved from one bucket into another
 * while link list is being walked
 */
static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
					       u32 hash, void *key,
					       u32 key_size, u32 n_buckets)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

again:
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
		goto again;

	return NULL;
}

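/* The retry above is needed because preallocated elements are reused
 * without waiting for an RCU grace period: an element freed from one
 * bucket can reappear in another while a lockless walker traverses the
 * list. Each list is terminated with a nulls marker encoding its bucket
 * index (see htab_init_buckets()), so ending up on a different bucket's
 * marker is detectable and the walk is restarted.
 */
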
/* Called from syscall or from eBPF program directly, so
 * arguments have to match bpf_map_lookup_elem() exactly.
 * The return value is adjusted by BPF instructions
 * in htab_map_gen_lookup().
 */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

/* inline bpf_map_lookup_elem() call.
 * Instead of:
 * bpf_prog
 *   bpf_map_lookup_elem
 *     map->ops->map_lookup_elem
 *       htab_map_lookup_elem
 *         __htab_map_lookup_elem
 * do:
 * bpf_prog
 *   __htab_map_lookup_elem
 */
static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
							void *key, const bool mark)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		if (mark)
			bpf_lru_node_set_ref(&l->lru_node);
		return l->key + round_up(map->key_size, 8);
	}

	return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, true);
}

static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, false);
}

static int htab_lru_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int ref_reg = BPF_REG_1;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
			      offsetof(struct htab_elem, lru_node) +
			      offsetof(struct bpf_lru_node, ref));
	*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
	*insn++ = BPF_ST_MEM(BPF_B, ret,
			     offsetof(struct htab_elem, lru_node) +
			     offsetof(struct bpf_lru_node, ref),
			     1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

static void check_and_free_fields(struct bpf_htab *htab,
				  struct htab_elem *elem)
{
	if (htab_is_percpu(htab)) {
		void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
		int cpu;

		for_each_possible_cpu(cpu)
			bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
	} else {
		void *map_value = elem->key + round_up(htab->map.key_size, 8);

		bpf_obj_free_fields(htab->map.record, map_value);
	}
}

/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = arg;
	struct htab_elem *l = NULL, *tgt_l;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags;
	struct bucket *b;
	int ret;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
	if (ret)
		return false;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l == tgt_l) {
			hlist_nulls_del_rcu(&l->hash_node);
			check_and_free_fields(htab, l);
			bpf_map_dec_elem_count(&htab->map);
			break;
		}

	htab_unlock_bucket(htab, b, tgt_l->hash, flags);

	return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i = 0;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	if (!key)
		goto find_first_elem;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	if (!l)
		goto find_first_elem;

	/* key was found, get next key in the same bucket */
	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
					struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
						struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	check_and_free_fields(htab, l);
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
	bpf_mem_cache_free(&htab->ma, l);
}

static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
{
	struct bpf_map *map = &htab->map;
	void *ptr;

	if (map->ops->map_fd_put_ptr) {
		ptr = fd_htab_map_get_ptr(map, l);
		map->ops->map_fd_put_ptr(map, ptr, true);
	}
}

static bool is_map_full(struct bpf_htab *htab)
{
	if (htab->use_percpu_counter)
		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
						PERCPU_COUNTER_BATCH) >= 0;
	return atomic_read(&htab->count) >= htab->map.max_entries;
}

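/* Element accounting for non-preallocated maps: larger maps use a
 * percpu_counter with PERCPU_COUNTER_BATCH (see the comment above the
 * define in htab_map_alloc()), while smaller maps fall back to a plain
 * atomic_t, where a percpu_counter would not be any cheaper.
 */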
static void inc_elem_count(struct bpf_htab *htab)
{
	bpf_map_inc_elem_count(&htab->map);

	if (htab->use_percpu_counter)
		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
	else
		atomic_inc(&htab->count);
}

static void dec_elem_count(struct bpf_htab *htab)
{
	bpf_map_dec_elem_count(&htab->map);

	if (htab->use_percpu_counter)
		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
	else
		atomic_dec(&htab->count);
}


static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	htab_put_fd_value(htab, l);

	if (htab_is_prealloc(htab)) {
		bpf_map_dec_elem_count(&htab->map);
		check_and_free_fields(htab, l);
		__pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		dec_elem_count(htab);
		htab_elem_free(htab, l);
	}
}

static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
			off += size;
		}
	}
}

static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	/* When not setting the initial value on all cpus, zero-fill element
	 * values for other cpus. Otherwise, bpf program has no way to ensure
	 * known initial values for cpus other than current one
	 * (onallcpus=false always when coming from bpf prog).
	 */
	if (!onallcpus) {
		int current_cpu = raw_smp_processor_id();
		int cpu;

		for_each_possible_cpu(cpu) {
			if (cpu == current_cpu)
				copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
			else /* Since elem is preallocated, we cannot touch special fields */
				zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
		}
	} else {
		pcpu_copy_value(htab, pptr, value, onallcpus);
	}
}

static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
	       BITS_PER_LONG == 64;
}

static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 struct htab_elem *old_elem)
{
	u32 size = htab->map.value_size;
	bool prealloc = htab_is_prealloc(htab);
	struct htab_elem *l_new, **pl_new;
	void __percpu *pptr;

	if (prealloc) {
		if (old_elem) {
			/* if we're updating the existing element,
			 * use per-cpu extra elems to avoid freelist_pop/push
			 */
			pl_new = this_cpu_ptr(htab->extra_elems);
			l_new = *pl_new;
			htab_put_fd_value(htab, old_elem);
			*pl_new = old_elem;
		} else {
			struct pcpu_freelist_node *l;

			l = __pcpu_freelist_pop(&htab->freelist);
			if (!l)
				return ERR_PTR(-E2BIG);
			l_new = container_of(l, struct htab_elem, fnode);
			bpf_map_inc_elem_count(&htab->map);
		}
	} else {
		if (is_map_full(htab))
			if (!old_elem)
				/* when map is full and update() is replacing
				 * old element, it's ok to allocate, since
				 * old element will be freed immediately.
				 * Otherwise return an error
				 */
				return ERR_PTR(-E2BIG);
		inc_elem_count(htab);
		l_new = bpf_mem_cache_alloc(&htab->ma);
		if (!l_new) {
			l_new = ERR_PTR(-ENOMEM);
			goto dec_count;
		}
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
			if (!pptr) {
				bpf_mem_cache_free(&htab->ma, l_new);
				l_new = ERR_PTR(-ENOMEM);
				goto dec_count;
			}
			l_new->ptr_to_pptr = pptr;
			pptr = *(void **)pptr;
		}

		pcpu_init_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else if (fd_htab_map_needs_adjust(htab)) {
		size = round_up(size, 8);
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	} else {
		copy_map_value(&htab->map,
			       l_new->key + round_up(key_size, 8),
			       value);
	}

	l_new->hash = hash;
	return l_new;
dec_count:
	dec_elem_count(htab);
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}

/* Called from syscall or from eBPF program */
static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	if (unlikely(map_flags & BPF_F_LOCK)) {
		if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
			return -EINVAL;
		/* find an element without taking the bucket lock */
		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
					      htab->n_buckets);
		ret = check_flags(htab, l_old, map_flags);
		if (ret)
			return ret;
		if (l_old) {
			/* grab the element lock and update value in place */
			copy_map_value_locked(map,
					      l_old->key + round_up(key_size, 8),
					      value, false);
			return 0;
		}
		/* fall through, grab the bucket lock and lookup again.
		 * 99.9% chance that the element won't be found,
		 * but second lookup under lock has to be done.
		 */
	}

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
		/* first lookup without the bucket lock didn't find the element,
		 * but second lookup with the bucket lock found it.
		 * This case is highly unlikely, but has to be dealt with:
		 * grab the element lock in addition to the bucket lock
		 * and update element in place
		 */
		copy_map_value_locked(map,
				      l_old->key + round_up(key_size, 8),
				      value, false);
		ret = 0;
		goto err;
	}

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
				l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_nulls_del_rcu(&l_old->hash_node);
		if (!htab_is_prealloc(htab))
			free_htab_elem(htab, l_old);
		else
			check_and_free_fields(htab, l_old);
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
{
	check_and_free_fields(htab, elem);
	bpf_map_dec_elem_count(&htab->map);
	bpf_lru_push_free(&htab->lru, &elem->lru_node);
}

static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				     u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	copy_map_value(&htab->map,
		       l_new->key + round_up(map->key_size, 8), value);

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		goto err_lock_bucket;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_nulls_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	htab_unlock_bucket(htab, b, hash, flags);

err_lock_bucket:
	if (ret)
		htab_lru_push_free(htab, l_new);
	else if (l_old)
		htab_lru_push_free(htab, l_old);

	return ret;
}

static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					  void *value, u64 map_flags,
					  bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus, NULL);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					      void *value, u64 map_flags,
					      bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		goto err_lock_bucket;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
err_lock_bucket:
	if (l_new) {
		bpf_map_dec_elem_count(&htab->map);
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	}
	return ret;
}

static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					    void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}

/* Called from syscall or from eBPF program */
static long htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_nulls_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
	} else {
		ret = -ENOENT;
	}

	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l)
		hlist_nulls_del_rcu(&l->hash_node);
	else
		ret = -ENOENT;

	htab_unlock_bucket(htab, b, hash, flags);
	if (l)
		htab_lru_push_free(htab, l);
	return ret;
}

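/* Note that the LRU variant above only unlinks the element under the
 * bucket lock and calls htab_lru_push_free() after dropping it:
 * bpf_lru_push_free() takes lru_lock, and the lock order is always
 * lru_lock -> bucket_lock (see the comment above prealloc_lru_pop()),
 * so freeing to the LRU inside the locked section could deadlock.
 */
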
static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	/* It's called from a worker thread, so disable migration here,
	 * since bpf_mem_cache_free() relies on that.
	 */
	migrate_disable();
	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			hlist_nulls_del_rcu(&l->hash_node);
			htab_elem_free(htab, l);
		}
	}
	migrate_enable();
}

static void htab_free_malloced_timers(struct bpf_htab *htab)
{
	int i;

	rcu_read_lock();
	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry(l, n, head, hash_node) {
			/* We only free timer on uref dropping to zero */
			bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
		}
		cond_resched_rcu();
	}
	rcu_read_unlock();
}

static void htab_map_free_timers(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* We only free timer on uref dropping to zero */
	if (!btf_record_has_field(htab->map.record, BPF_TIMER))
		return;
	if (!htab_is_prealloc(htab))
		htab_free_malloced_timers(htab);
	else
		htab_free_prealloced_timers(htab);
}

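/* bpf_timer fields are torn down when the last user reference is dropped
 * (see the 'uref dropping to zero' comments above), while the remaining
 * special fields and the elements themselves are released from
 * htab_map_free() below once map->refcnt reaches zero.
 */
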
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int i;

	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
	 * There is no need to synchronize_rcu() here to protect map elements.
	 */

	/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
	 * underneath and is responsible for waiting for callbacks to finish
	 * during bpf_mem_alloc_destroy().
	 */
	if (!htab_is_prealloc(htab)) {
		delete_all_elements(htab);
	} else {
		htab_free_prealloced_fields(htab);
		prealloc_destroy(htab);
	}

	bpf_map_free_elem_count(map);
	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	bpf_mem_alloc_destroy(&htab->pcpu_ma);
	bpf_mem_alloc_destroy(&htab->ma);
	if (htab->use_percpu_counter)
		percpu_counter_destroy(&htab->pcount);
	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
		free_percpu(htab->map_locked[i]);
	lockdep_unregister_key(&htab->lockdep_key);
	bpf_map_area_free(htab);
}

static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
				   struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = htab_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": ");
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_puts(m, "\n");

	rcu_read_unlock();
}

static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
					     void *value, bool is_lru_map,
					     bool is_percpu, u64 flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	unsigned long bflags;
	struct htab_elem *l;
	u32 hash, key_size;
	struct bucket *b;
	int ret;

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &bflags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);
	if (!l) {
		ret = -ENOENT;
	} else {
		if (is_percpu) {
			u32 roundup_value_size = round_up(map->value_size, 8);
			void __percpu *pptr;
			int off = 0, cpu;

			pptr = htab_elem_get_ptr(l, key_size);
			for_each_possible_cpu(cpu) {
				copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
				check_and_init_map_value(&htab->map, value + off);
				off += roundup_value_size;
			}
		} else {
			u32 roundup_key_size = round_up(map->key_size, 8);

			if (flags & BPF_F_LOCK)
				copy_map_value_locked(map, value, l->key +
						      roundup_key_size,
						      true);
			else
				copy_map_value(map, value, l->key +
					       roundup_key_size);
			/* Zeroing special fields in the temp buffer */
			check_and_init_map_value(map, value);
		}

		hlist_nulls_del_rcu(&l->hash_node);
		if (!is_lru_map)
			free_htab_elem(htab, l);
	}

	htab_unlock_bucket(htab, b, hash, bflags);

	if (is_lru_map && l)
		htab_lru_push_free(htab, l);

	return ret;
}

static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
					   void *value, u64 flags)
{
	return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
						 flags);
}

static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
						  void *key, void *value,
						  u64 flags)
{
	return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
						 flags);
}

static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
					       void *value, u64 flags)
{
	return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
						 flags);
}
static int
htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
					   void *key, void *value,
					   u64 flags)
{
	return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
						 flags);
}

static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
				   const union bpf_attr *attr,
				   union bpf_attr __user *uattr,
				   bool do_delete, bool is_lru_map,
				   bool is_percpu)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
	u32 batch, max_count, size, bucket_size, map_id;
	struct htab_elem *node_to_free = NULL;
	u64 elem_map_flags, map_flags;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags = 0;
	bool locked = false;
	struct htab_elem *l;
	struct bucket *b;
	int ret = 0;

	elem_map_flags = attr->batch.elem_flags;
	if ((elem_map_flags & ~BPF_F_LOCK) ||
	    ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
		return -EINVAL;

	map_flags = attr->batch.flags;
	if (map_flags)
		return -EINVAL;

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	if (put_user(0, &uattr->batch.count))
		return -EFAULT;

	batch = 0;
	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
		return -EFAULT;

	if (batch >= htab->n_buckets)
		return -ENOENT;

	key_size = htab->map.key_size;
	roundup_key_size = round_up(htab->map.key_size, 8);
	value_size = htab->map.value_size;
	size = round_up(value_size, 8);
	if (is_percpu)
		value_size = size * num_possible_cpus();
	total = 0;
	/* while experimenting with hash tables with sizes ranging from 10 to
	 * 1000, it was observed that a bucket can have up to 5 entries.
	 */
	bucket_size = 5;

alloc:
	/* We cannot do copy_from_user or copy_to_user inside
	 * the rcu_read_lock. Allocate enough space here.
	 */
	keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
	values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
	if (!keys || !values) {
		ret = -ENOMEM;
		goto after_loop;
	}

again:
	bpf_disable_instrumentation();
	rcu_read_lock();
again_nocopy:
	dst_key = keys;
	dst_val = values;
	b = &htab->buckets[batch];
	head = &b->head;
	/* do not grab the lock unless need it (bucket_cnt > 0). */
	if (locked) {
		ret = htab_lock_bucket(htab, b, batch, &flags);
		if (ret) {
			rcu_read_unlock();
			bpf_enable_instrumentation();
			goto after_loop;
		}
	}

	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		bucket_cnt++;

	if (bucket_cnt && !locked) {
		locked = true;
		goto again_nocopy;
	}

	if (bucket_cnt > (max_count - total)) {
		if (total == 0)
			ret = -ENOSPC;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		goto after_loop;
	}

	if (bucket_cnt > bucket_size) {
		bucket_size = bucket_cnt;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		kvfree(keys);
		kvfree(values);
		goto alloc;
	}

	/* Next block is only safe to run if you have grabbed the lock */
	if (!locked)
		goto next_batch;

	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
		memcpy(dst_key, l->key, key_size);

		if (is_percpu) {
			int off = 0, cpu;
			void __percpu *pptr;

			pptr = htab_elem_get_ptr(l, map->key_size);
			for_each_possible_cpu(cpu) {
				copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
				check_and_init_map_value(&htab->map, dst_val + off);
				off += size;
			}
		} else {
			value = l->key + roundup_key_size;
			if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
				struct bpf_map **inner_map = value;

				/* Actual value is the id of the inner map */
				map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
				value = &map_id;
			}

			if (elem_map_flags & BPF_F_LOCK)
				copy_map_value_locked(map, dst_val, value,
						      true);
			else
				copy_map_value(map, dst_val, value);
			/* Zeroing special fields in the temp buffer */
			check_and_init_map_value(map, dst_val);
		}
		if (do_delete) {
			hlist_nulls_del_rcu(&l->hash_node);

			/* bpf_lru_push_free() will acquire lru_lock, which
			 * may cause deadlock. See comments in function
			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
			 * after releasing the bucket lock.
			 */
			if (is_lru_map) {
				l->batch_flink = node_to_free;
				node_to_free = l;
			} else {
				free_htab_elem(htab, l);
			}
		}
		dst_key += key_size;
		dst_val += value_size;
	}

	htab_unlock_bucket(htab, b, batch, flags);
	locked = false;

	while (node_to_free) {
		l = node_to_free;
		node_to_free = node_to_free->batch_flink;
		htab_lru_push_free(htab, l);
	}

next_batch:
	/* If we are not copying data, we can go to next bucket and avoid
	 * unlocking the rcu.
	 */
	if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
		batch++;
		goto again_nocopy;
	}

	rcu_read_unlock();
	bpf_enable_instrumentation();
	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
	    key_size * bucket_cnt) ||
	    copy_to_user(uvalues + total * value_size, values,
	    value_size * bucket_cnt))) {
		ret = -EFAULT;
		goto after_loop;
	}

	total += bucket_cnt;
	batch++;
	if (batch >= htab->n_buckets) {
		ret = -ENOENT;
		goto after_loop;
	}
	goto again;

after_loop:
	if (ret == -EFAULT)
		goto out;

	/* copy # of entries and next batch */
	ubatch = u64_to_user_ptr(attr->batch.out_batch);
	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
	    put_user(total, &uattr->batch.count))
		ret = -EFAULT;

out:
	kvfree(keys);
	kvfree(values);
	return ret;
}

static int
htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, true);
}

static int
htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					const union bpf_attr *attr,
					union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, true);
}

static int
htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
		      union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, false);
}

static int
htab_map_lookup_and_delete_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, false);
}

static int
htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, true);
}

static int
htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					    const union bpf_attr *attr,
					    union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, true);
}

static int
htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, false);
}

static int
htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
				     const union bpf_attr *attr,
				     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, false);
}

struct bpf_iter_seq_hash_map_info {
	struct bpf_map *map;
	struct bpf_htab *htab;
	void *percpu_value_buf; // non-zero means percpu hash
	u32 bucket_id;
	u32 skip_elems;
};

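/* Roughly, the iterator below resumes by (bucket_id, skip_elems) rather
 * than by element pointer: whenever the RCU read lock is dropped between
 * buckets, previously visited elements may be gone, so on re-entry the
 * walk skips the first skip_elems entries of the current bucket.
 */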
struct bpf_iter_seq_hash_map_info {
	struct bpf_map *map;
	struct bpf_htab *htab;
	void *percpu_value_buf; // non-zero means percpu hash
	u32 bucket_id;
	u32 skip_elems;
};

static struct htab_elem *
bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
			   struct htab_elem *prev_elem)
{
	const struct bpf_htab *htab = info->htab;
	u32 skip_elems = info->skip_elems;
	u32 bucket_id = info->bucket_id;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	struct htab_elem *elem;
	struct bucket *b;
	u32 i, count;

	if (bucket_id >= htab->n_buckets)
		return NULL;

	/* try to find next elem in the same bucket */
	if (prev_elem) {
		/* no update/deletion on this bucket, prev_elem should still be
		 * valid and we won't skip elements.
		 */
		n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
		elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
		if (elem)
			return elem;

		/* not found, unlock and go to the next bucket */
		b = &htab->buckets[bucket_id++];
		rcu_read_unlock();
		skip_elems = 0;
	}

	for (i = bucket_id; i < htab->n_buckets; i++) {
		b = &htab->buckets[i];
		rcu_read_lock();

		count = 0;
		head = &b->head;
		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
			if (count >= skip_elems) {
				info->bucket_id = i;
				info->skip_elems = count;
				return elem;
			}
			count++;
		}

		rcu_read_unlock();
		skip_elems = 0;
	}

	info->bucket_id = i;
	info->skip_elems = 0;
	return NULL;
}

static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;
	struct htab_elem *elem;

	elem = bpf_hash_map_seq_find_next(info, NULL);
	if (!elem)
		return NULL;

	if (*pos == 0)
		++*pos;
	return elem;
}

static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;

	++*pos;
	++info->skip_elems;
	return bpf_hash_map_seq_find_next(info, v);
}

static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;
	u32 roundup_key_size, roundup_value_size;
	struct bpf_iter__bpf_map_elem ctx = {};
	struct bpf_map *map = info->map;
	struct bpf_iter_meta meta;
	int ret = 0, off = 0, cpu;
	struct bpf_prog *prog;
	void __percpu *pptr;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, elem == NULL);
	if (prog) {
		ctx.meta = &meta;
		ctx.map = info->map;
		if (elem) {
			roundup_key_size = round_up(map->key_size, 8);
			ctx.key = elem->key;
			if (!info->percpu_value_buf) {
				ctx.value = elem->key + roundup_key_size;
			} else {
				roundup_value_size = round_up(map->value_size, 8);
				pptr = htab_elem_get_ptr(elem, map->key_size);
				for_each_possible_cpu(cpu) {
					copy_map_value_long(map, info->percpu_value_buf + off,
							    per_cpu_ptr(pptr, cpu));
					check_and_init_map_value(map, info->percpu_value_buf + off);
					off += roundup_value_size;
				}
				ctx.value = info->percpu_value_buf;
			}
		}
		ret = bpf_iter_run_prog(prog, &ctx);
	}

	return ret;
}

static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
{
	return __bpf_hash_map_seq_show(seq, v);
}

static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__bpf_hash_map_seq_show(seq, NULL);
	else
		rcu_read_unlock();
}
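
/*
 * Locking note: bpf_hash_map_seq_find_next() returns with rcu_read_lock()
 * held whenever it hands back an element, so the element stays valid while
 * ->show() runs. That read lock is dropped either when the walk moves past
 * the element's bucket on the next ->next() call, or in
 * bpf_hash_map_seq_stop() once iteration ends.
 */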
static int bpf_iter_init_hash_map(void *priv_data,
				  struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
	struct bpf_map *map = aux->map;
	void *value_buf;
	u32 buf_size;

	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		buf_size = round_up(map->value_size, 8) * num_possible_cpus();
		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
		if (!value_buf)
			return -ENOMEM;

		seq_info->percpu_value_buf = value_buf;
	}

	bpf_map_inc_with_uref(map);
	seq_info->map = map;
	seq_info->htab = container_of(map, struct bpf_htab, map);
	return 0;
}

static void bpf_iter_fini_hash_map(void *priv_data)
{
	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;

	bpf_map_put_with_uref(seq_info->map);
	kfree(seq_info->percpu_value_buf);
}

static const struct seq_operations bpf_hash_map_seq_ops = {
	.start = bpf_hash_map_seq_start,
	.next = bpf_hash_map_seq_next,
	.stop = bpf_hash_map_seq_stop,
	.show = bpf_hash_map_seq_show,
};

static const struct bpf_iter_seq_info iter_seq_info = {
	.seq_ops = &bpf_hash_map_seq_ops,
	.init_seq_private = bpf_iter_init_hash_map,
	.fini_seq_private = bpf_iter_fini_hash_map,
	.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};
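
/*
 * Illustrative sketch (not part of this file's build): a minimal BPF iterator
 * program that walks a hash map through the seq ops above. The u32 key /
 * u64 value layout and the program name are placeholders for the example.
 *
 *	SEC("iter/bpf_map_elem")
 *	int dump_htab(struct bpf_iter__bpf_map_elem *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		__u32 *key = ctx->key;
 *		__u64 *val = ctx->value;
 *
 *		if (!key || !val)	// final call: no element
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "%u: %llu\n", *key, *val);
 *		return 0;
 *	}
 *
 * User space attaches it to a specific map by passing the map fd in
 * bpf_iter_link_info, then reads the output via bpf_iter_create() or bpftool.
 */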
static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
				   void *callback_ctx, u64 flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	struct htab_elem *elem;
	u32 roundup_key_size;
	int i, num_elems = 0;
	void __percpu *pptr;
	struct bucket *b;
	void *key, *val;
	bool is_percpu;
	u64 ret = 0;

	if (flags != 0)
		return -EINVAL;

	is_percpu = htab_is_percpu(htab);

	roundup_key_size = round_up(map->key_size, 8);
	/* disable migration so percpu value prepared here will be the
	 * same as the one seen by the bpf program with bpf_map_lookup_elem().
	 */
	if (is_percpu)
		migrate_disable();
	for (i = 0; i < htab->n_buckets; i++) {
		b = &htab->buckets[i];
		rcu_read_lock();
		head = &b->head;
		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
			key = elem->key;
			if (is_percpu) {
				/* current cpu value for percpu map */
				pptr = htab_elem_get_ptr(elem, map->key_size);
				val = this_cpu_ptr(pptr);
			} else {
				val = elem->key + roundup_key_size;
			}
			num_elems++;
			ret = callback_fn((u64)(long)map, (u64)(long)key,
					  (u64)(long)val, (u64)(long)callback_ctx, 0);
			/* return value: 0 - continue, 1 - stop and return */
			if (ret) {
				rcu_read_unlock();
				goto out;
			}
		}
		rcu_read_unlock();
	}
out:
	if (is_percpu)
		migrate_enable();
	return num_elems;
}
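
/*
 * Illustrative sketch (not part of this file's build): what the callback fed
 * into bpf_for_each_hash_elem() looks like from the BPF side, via the
 * bpf_for_each_map_elem() helper. Map layout and names are placeholders.
 *
 *	static long count_big(struct bpf_map *map, __u32 *key, __u64 *val,
 *			      void *ctx)
 *	{
 *		if (*val > 1000)
 *			(*(__u64 *)ctx)++;
 *		return 0;	// 0 - continue, 1 - stop the walk
 *	}
 *
 *	// inside a BPF program:
 *	__u64 n = 0;
 *	bpf_for_each_map_elem(&my_hash, count_big, &n, 0);
 */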
static u64 htab_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	u32 value_size = round_up(htab->map.value_size, 8);
	bool prealloc = htab_is_prealloc(htab);
	bool percpu = htab_is_percpu(htab);
	bool lru = htab_is_lru(htab);
	u64 num_entries;
	u64 usage = sizeof(struct bpf_htab);

	usage += sizeof(struct bucket) * htab->n_buckets;
	usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
	if (prealloc) {
		num_entries = map->max_entries;
		if (htab_has_extra_elems(htab))
			num_entries += num_possible_cpus();

		usage += htab->elem_size * num_entries;

		if (percpu)
			usage += value_size * num_possible_cpus() * num_entries;
		else if (!lru)
			usage += sizeof(struct htab_elem *) * num_possible_cpus();
	} else {
#define LLIST_NODE_SZ sizeof(struct llist_node)

		num_entries = htab->use_percpu_counter ?
			      percpu_counter_sum(&htab->pcount) :
			      atomic_read(&htab->count);
		usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
		if (percpu) {
			usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
			usage += value_size * num_possible_cpus() * num_entries;
		}
	}
	return usage;
}

BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
const struct bpf_map_ops htab_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_release_uref = htab_map_free_timers,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_gen_lookup = htab_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

const struct bpf_map_ops htab_lru_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_release_uref = htab_map_free_timers,
	.map_lookup_elem = htab_lru_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
	.map_update_elem = htab_lru_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_gen_lookup = htab_lru_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_lru),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}

static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
	struct htab_elem *l;

	if (cpu >= nr_cpu_ids)
		return NULL;

	l = __htab_map_lookup_elem(map, key);
	if (l)
		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
	else
		return NULL;
}

static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	}

	return NULL;
}

static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
	struct htab_elem *l;

	if (cpu >= nr_cpu_ids)
		return NULL;

	l = __htab_map_lookup_elem(map, key);
	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
	}

	return NULL;
}
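
/*
 * Illustrative sketch (not part of this file's build): the two
 * *_lookup_percpu_elem() callbacks above are what the
 * bpf_map_lookup_percpu_elem() helper ends up calling, letting a BPF program
 * read another CPU's slot of a percpu hash. Map layout and names are
 * placeholders.
 *
 *	__u32 key = 0, cpu = 1;
 *	__u64 *val;
 *
 *	val = bpf_map_lookup_percpu_elem(&pcpu_hash, &key, cpu);
 *	if (val)
 *		bpf_printk("cpu1 sees %llu", *val);
 */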
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	/* We do not mark LRU map element here in order to not mess up
	 * eviction heuristics when user space does a map walk.
	 */
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, value + off);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
	rcu_read_unlock();

	return ret;
}
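
/*
 * Illustrative sketch (not part of this file's build): because of the
 * per-CPU copy above, a user-space lookup on a percpu hash must supply room
 * for round_up(value_size, 8) bytes per possible CPU. Assuming u64 values
 * and libbpf:
 *
 *	int ncpus = libbpf_num_possible_cpus();
 *	__u64 *vals = calloc(ncpus, sizeof(__u64));
 *
 *	if (vals && !bpf_map_lookup_elem(map_fd, &key, vals))
 *		for (int i = 0; i < ncpus; i++)
 *			printf("cpu%d: %llu\n", i, vals[i]);
 */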
static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
					  struct seq_file *m)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	l = __htab_map_lookup_elem(map, key);
	if (!l) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": {\n");
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}

const struct bpf_map_ops htab_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_percpu),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

const struct bpf_map_ops htab_lru_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_lru_percpu),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

static int fd_htab_map_alloc_check(union bpf_attr *attr)
{
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	return htab_map_alloc_check(attr);
}

static void fd_htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_node *n;
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			void *ptr = fd_htab_map_get_ptr(map, l);

			map->ops->map_fd_put_ptr(map, ptr, false);
		}
	}

	htab_map_free(map);
}

/* only called from syscall */
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	ptr = htab_map_lookup_elem(map, key);
	if (ptr)
		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

/* only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
				void *key, void *value, u64 map_flags)
{
	void *ptr;
	int ret;
	u32 ufd = *(u32 *)value;

	ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	ret = htab_map_update_elem(map, key, &ptr, map_flags);
	if (ret)
		map->ops->map_fd_put_ptr(map, ptr, false);

	return ret;
}
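
/*
 * Illustrative sketch (not part of this file's build): for
 * BPF_MAP_TYPE_HASH_OF_MAPS, user space stores an inner map by fd, while a
 * BPF program gets back a pointer to the inner map and queries it directly.
 * The names and fds below are placeholders.
 *
 * User space:
 *	__u32 key = 1;
 *	bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_ANY);
 *
 * BPF program:
 *	void *inner;
 *
 *	inner = bpf_map_lookup_elem(&outer_map, &key);
 *	if (inner)
 *		val = bpf_map_lookup_elem(inner, &inner_key);
 */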
static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = htab_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = htab_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

static int htab_of_map_gen_lookup(struct bpf_map *map,
				  struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);

	return insn - insn_buf;
}

static void htab_of_map_free(struct bpf_map *map)
{
	bpf_map_meta_free(map->inner_map_meta);
	fd_htab_map_free(map);
}

const struct bpf_map_ops htab_of_maps_map_ops = {
	.map_alloc_check = fd_htab_map_alloc_check,
	.map_alloc = htab_of_map_alloc,
	.map_free = htab_of_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_of_map_lookup_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = htab_of_map_gen_lookup,
	.map_check_btf = map_check_no_btf,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab),
	.map_btf_id = &htab_map_btf_ids[0],
};