/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"

struct bucket {
	struct hlist_head head;
	raw_spinlock_t lock;
};

struct bpf_htab {
	struct bpf_map map;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	void __percpu *extra_elems;
	atomic_t count;	/* number of elements in this hashtable */
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
};

enum extra_elem_state {
	HTAB_NOT_AN_EXTRA_ELEM = 0,
	HTAB_EXTRA_ELEM_FREE,
	HTAB_EXTRA_ELEM_USED
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_node hash_node;
		struct bpf_htab *htab;
		struct pcpu_freelist_node fnode;
	};
	union {
		struct rcu_head rcu;
		enum extra_elem_state state;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[0] __aligned(8);
};

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * htab->elem_size);
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
	}
free_elems:
	bpf_map_area_free(htab->elems);
}

static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}

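/* Pre-allocate storage for all max_entries elements in one area at map
 * creation time.  For per-cpu maps the value storage of each element is
 * allocated separately with __alloc_percpu_gfp() and a pointer to it is
 * stashed right after the key.  The elements are then handed either to the
 * LRU (for LRU map types) or to the per-cpu freelist, so that updates do
 * not have to allocate memory at runtime.
 */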
static int prealloc_init(struct bpf_htab *htab)
{
	int err = -ENOMEM, i;

	htab->elems = bpf_map_area_alloc(htab->elem_size *
					 htab->map.max_entries);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, htab->map.max_entries);
	else
		pcpu_freelist_populate(&htab->freelist, htab->elems,
				       htab->elem_size, htab->map.max_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
	void __percpu *pptr;
	int cpu;

	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
			HTAB_EXTRA_ELEM_FREE;
	}
	htab->extra_elems = pptr;
	return 0;
}

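/* htab_map_alloc() below is reached via the BPF_MAP_CREATE command of the
 * bpf(2) syscall.  A minimal sketch of the user-space side that produces the
 * attributes consumed here (illustrative only, not part of this file):
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_HASH,
 *		.key_size    = sizeof(__u32),
 *		.value_size  = sizeof(__u64),
 *		.max_entries = 1024,
 *		.map_flags   = 0,
 *	};
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 */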
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;
	u64 cost;

	if (lru && !capable(CAP_SYS_ADMIN))
		/* The LRU implementation is much more complicated than the
		 * other maps.  Hence, limit it to CAP_SYS_ADMIN for now.
		 */
		return ERR_PTR(-EPERM);

	if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
		/* reserved bits should not be used */
		return ERR_PTR(-EINVAL);

	if (!lru && percpu_lru)
		return ERR_PTR(-EINVAL);

	if (lru && !prealloc)
		return ERR_PTR(-ENOTSUPP);

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	htab->map.map_type = attr->map_type;
	htab->map.key_size = attr->key_size;
	htab->map.value_size = attr->value_size;
	htab->map.max_entries = attr->max_entries;
	htab->map.map_flags = attr->map_flags;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	err = -EINVAL;
	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
	    htab->map.value_size == 0)
		goto free_htab;

	if (percpu_lru) {
		/* ensure each CPU's lru list has >=1 elements.
		 * since we are at it, make each lru list have the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	err = -E2BIG;
	if (htab->map.key_size > MAX_BPF_STACK)
		/* eBPF programs initialize keys on stack, so they cannot be
		 * larger than max stack size
		 */
		goto free_htab;

	if (htab->map.value_size >= KMALLOC_MAX_SIZE -
	    MAX_BPF_STACK - sizeof(struct htab_elem))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements via bpf syscall. This check also makes
		 * sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		goto free_htab;

	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
		/* make sure the size for pcpu_alloc() is reasonable */
		goto free_htab;

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;

	if (percpu)
		cost += (u64) round_up(htab->map.value_size, 8) *
			num_possible_cpus() * htab->map.max_entries;
	else
		cost += (u64) htab->elem_size * num_possible_cpus();

	if (cost >= U32_MAX - PAGE_SIZE)
		/* make sure page count doesn't overflow */
		goto free_htab;

	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(htab->map.pages);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket));
	if (!htab->buckets)
		goto free_htab;

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_HEAD(&htab->buckets[i].head);
		raw_spin_lock_init(&htab->buckets[i].lock);
	}

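	/* A non-percpu, non-LRU map keeps one extra element per cpu so that
	 * an update of an already existing key can still succeed when all
	 * max_entries elements are in use; see alloc_htab_elem().
	 */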
	if (!percpu && !lru) {
		/* lru itself can remove the least used element, so
		 * there is no need for an extra elem during map_update.
		 */
		err = alloc_extra_elems(htab);
		if (err)
			goto free_buckets;
	}

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_extra_elems;
	}

	return &htab->map;

free_extra_elems:
	free_percpu(htab->extra_elems);
free_buckets:
	bpf_map_area_free(htab->buckets);
free_htab:
	kfree(htab);
	return ERR_PTR(err);
}

static inline u32 htab_map_hash(const void *key, u32 key_len)
{
	return jhash(key, key_len, 0);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct htab_elem *l;

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* Called from syscall or from eBPF program */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	/* Must be called with rcu_read_lock. */
	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	l = lookup_elem_raw(head, hash, key, key_size);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

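/* Same as above, but also mark the element as recently referenced so that
 * the LRU logic in bpf_lru_list avoids evicting it while it is being
 * actively looked up.
 */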
static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return l->key + round_up(map->key_size, 8);
	}

	return NULL;
}

/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = (struct bpf_htab *)arg;
	struct htab_elem *l, *tgt_l;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l == tgt_l) {
			hlist_del_rcu(&l->hash_node);
			break;
		}

	raw_spin_unlock_irqrestore(&b->lock, flags);

	return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_elem_raw(head, hash, key, key_size);

	if (!l) {
		i = 0;
		goto find_first_elem;
	}

	/* key was found, get next key in the same bucket */
	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
				  struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
					  struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
	kfree(l);
}

static void htab_elem_free_rcu(struct rcu_head *head)
{
	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
	struct bpf_htab *htab = l->htab;

	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
	 * we're calling kfree, otherwise deadlock is possible if kprobes
	 * are placed somewhere inside of slub
	 */
	preempt_disable();
	__this_cpu_inc(bpf_prog_active);
	htab_elem_free(htab, l);
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	if (l->state == HTAB_EXTRA_ELEM_USED) {
		l->state = HTAB_EXTRA_ELEM_FREE;
		return;
	}

	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
		pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		atomic_dec(&htab->count);
		l->htab = htab;
		call_rcu(&l->rcu, htab_elem_free_rcu);
	}
}

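/* Copy a value into a per-cpu element.  When onallcpus is false (the eBPF
 * program path), only the current cpu's copy is written and exactly
 * value_size bytes are copied.  When onallcpus is true (the syscall path),
 * the source buffer holds one round_up(value_size, 8) sized copy per
 * possible cpu and each copy is written to the corresponding cpu's area.
 */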
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
					value + off, size);
			off += size;
		}
	}
}

static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 bool old_elem_exists)
{
	u32 size = htab->map.value_size;
	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
	struct htab_elem *l_new;
	void __percpu *pptr;
	int err = 0;

	if (prealloc) {
		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
		if (!l_new)
			err = -E2BIG;
	} else {
		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
			atomic_dec(&htab->count);
			err = -E2BIG;
		} else {
			l_new = kmalloc(htab->elem_size,
					GFP_ATOMIC | __GFP_NOWARN);
			if (!l_new)
				return ERR_PTR(-ENOMEM);
		}
	}

	if (err) {
		if (!old_elem_exists)
			return ERR_PTR(err);

		/* if we're updating the existing element and the hash table
		 * is full, use per-cpu extra elems
		 */
		l_new = this_cpu_ptr(htab->extra_elems);
		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
			return ERR_PTR(-E2BIG);
		l_new->state = HTAB_EXTRA_ELEM_USED;
	} else {
		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		/* round up value_size to 8 bytes */
		size = round_up(size, 8);

		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = __alloc_percpu_gfp(size, 8,
						  GFP_ATOMIC | __GFP_NOWARN);
			if (!pptr) {
				kfree(l_new);
				return ERR_PTR(-ENOMEM);
			}
		}

		pcpu_copy_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else {
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	}

	l_new->hash = hash;
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && map_flags == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && map_flags == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}

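/* map_flags for the update paths below: BPF_ANY creates the element or
 * replaces an existing one, BPF_NOEXIST only creates, BPF_EXIST only
 * replaces.  check_flags() above rejects the combinations that conflict
 * with whether the key is currently present.
 */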
/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
				!!l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_del_rcu(&l_old->hash_node);
		free_htab_elem(htab, l_old);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				    u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	raw_spin_unlock_irqrestore(&b->lock, flags);

	if (ret)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	else if (l_old)
		bpf_lru_push_free(&htab->lru, &l_old->lru_node);

	return ret;
}

static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					 void *value, u64 map_flags,
					 bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus, false);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

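/* Same as above for the LRU per-cpu map.  The free element is popped from
 * the LRU before the bucket lock is taken (eviction may need that lock),
 * but only when map_flags != BPF_EXIST: a BPF_EXIST update can only hit an
 * existing element, which is updated in place, so popping (and possibly
 * evicting) a node would be wasted work.
 */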
static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					     void *value, u64 map_flags,
					     bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l_new)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	return ret;
}

static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

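/* LRU flavour of the delete above: instead of free_htab_elem(), the element
 * is handed back to the LRU free list with bpf_lru_push_free() after the
 * bucket lock has been dropped.
 */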
static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l)
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_head *head = select_bucket(htab, i);
		struct hlist_node *n;
		struct htab_elem *l;

		hlist_for_each_entry_safe(l, n, head, hash_node) {
			hlist_del_rcu(&l->hash_node);
			if (l->state != HTAB_EXTRA_ELEM_USED)
				htab_elem_free(htab, l);
		}
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	/* some of free_htab_elem() callbacks for elements of this map may
	 * not have executed. Wait for them.
	 */
	rcu_barrier();
	if (htab->map.map_flags & BPF_F_NO_PREALLOC)
		delete_all_elements(htab);
	else
		prealloc_destroy(htab);

	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	kfree(htab);
}

static const struct bpf_map_ops htab_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_type __ro_after_init = {
	.ops = &htab_ops,
	.type = BPF_MAP_TYPE_HASH,
};

static const struct bpf_map_ops htab_lru_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_map_lookup_elem,
	.map_update_elem = htab_lru_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
};

static struct bpf_map_type_list htab_lru_type __ro_after_init = {
	.ops = &htab_lru_ops,
	.type = BPF_MAP_TYPE_LRU_HASH,
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}

static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	}

	return NULL;
}

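/* Called from syscall: gather the value of every possible cpu into one flat
 * buffer of num_possible_cpus() * round_up(value_size, 8) bytes.
 */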
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	if (htab_is_lru(htab))
		bpf_lru_node_set_ref(&l->lru_node);
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
				per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
	rcu_read_unlock();

	return ret;
}

static const struct bpf_map_ops htab_percpu_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_percpu_type __ro_after_init = {
	.ops = &htab_percpu_ops,
	.type = BPF_MAP_TYPE_PERCPU_HASH,
};

static const struct bpf_map_ops htab_lru_percpu_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
};

static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = {
	.ops = &htab_lru_percpu_ops,
	.type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
};

static int __init register_htab_map(void)
{
	bpf_register_map_type(&htab_type);
	bpf_register_map_type(&htab_percpu_type);
	bpf_register_map_type(&htab_lru_type);
	bpf_register_map_type(&htab_lru_percpu_type);
	return 0;
}
late_initcall(register_htab_map);