/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"

struct bucket {
	struct hlist_head head;
	raw_spinlock_t lock;
};

struct bpf_htab {
	struct bpf_map map;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	void __percpu *extra_elems;
	atomic_t count;	/* number of elements in this hashtable */
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
};

enum extra_elem_state {
	HTAB_NOT_AN_EXTRA_ELEM = 0,
	HTAB_EXTRA_ELEM_FREE,
	HTAB_EXTRA_ELEM_USED
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_node hash_node;
		struct bpf_htab *htab;
		struct pcpu_freelist_node fnode;
	};
	union {
		struct rcu_head rcu;
		enum extra_elem_state state;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[0] __aligned(8);
};

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
	       htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	       htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

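/* Element memory layout:
 *
 *   struct htab_elem | key (round_up(key_size, 8)) | value area
 *
 * For regular hash maps the value area holds round_up(value_size, 8)
 * bytes inline; the lookup helpers return a pointer to it.  For per-cpu
 * maps it holds a single pointer to a per-cpu allocation of
 * round_up(value_size, 8) bytes, accessed with htab_elem_set_ptr() and
 * htab_elem_get_ptr() above.
 */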
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * htab->elem_size);
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
	}
free_elems:
	vfree(htab->elems);
}

static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}

static int prealloc_init(struct bpf_htab *htab)
{
	int err = -ENOMEM, i;

	htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, htab->map.max_entries);
	else
		pcpu_freelist_populate(&htab->freelist, htab->elems,
				       htab->elem_size, htab->map.max_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

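/* Non-percpu, non-LRU maps keep one spare element per possible CPU in
 * extra_elems.  When an update replaces an existing key while the map
 * is already at max_entries, alloc_htab_elem() falls back to this CPU's
 * spare element instead of failing; free_htab_elem() later hands it
 * back by marking it HTAB_EXTRA_ELEM_FREE again.
 */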
static int alloc_extra_elems(struct bpf_htab *htab)
{
	void __percpu *pptr;
	int cpu;

	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
			HTAB_EXTRA_ELEM_FREE;
	}
	htab->extra_elems = pptr;
	return 0;
}

/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;
	u64 cost;

	if (lru && !capable(CAP_SYS_ADMIN))
		/* The LRU implementation is much more complicated than
		 * the other maps.  Hence, limit it to CAP_SYS_ADMIN for now.
		 */
		return ERR_PTR(-EPERM);

	if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
		/* reserved bits should not be used */
		return ERR_PTR(-EINVAL);

	if (!lru && percpu_lru)
		return ERR_PTR(-EINVAL);

	if (lru && !prealloc)
		return ERR_PTR(-ENOTSUPP);

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	htab->map.map_type = attr->map_type;
	htab->map.key_size = attr->key_size;
	htab->map.value_size = attr->value_size;
	htab->map.max_entries = attr->max_entries;
	htab->map.map_flags = attr->map_flags;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	err = -EINVAL;
	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
	    htab->map.value_size == 0)
		goto free_htab;

	if (percpu_lru) {
		/* Ensure each CPU's lru list has at least one element.
		 * While we are at it, make each lru list have the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	err = -E2BIG;
	if (htab->map.key_size > MAX_BPF_STACK)
		/* eBPF programs initialize keys on stack, so they cannot be
		 * larger than max stack size
		 */
		goto free_htab;

	if (htab->map.value_size >= KMALLOC_MAX_SIZE -
	    MAX_BPF_STACK - sizeof(struct htab_elem))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements via bpf syscall. This check also makes
		 * sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		goto free_htab;

	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
		/* make sure the size for pcpu_alloc() is reasonable */
		goto free_htab;

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;

	if (percpu)
		cost += (u64) round_up(htab->map.value_size, 8) *
			num_possible_cpus() * htab->map.max_entries;
	else
		cost += (u64) htab->elem_size * num_possible_cpus();

	if (cost >= U32_MAX - PAGE_SIZE)
		/* make sure page count doesn't overflow */
		goto free_htab;

	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(htab->map.pages);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
				      GFP_USER | __GFP_NOWARN);

	if (!htab->buckets) {
		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
		if (!htab->buckets)
			goto free_htab;
	}

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_HEAD(&htab->buckets[i].head);
		raw_spin_lock_init(&htab->buckets[i].lock);
	}

	if (!percpu && !lru) {
		/* lru itself can remove the least used element, so
		 * there is no need for an extra elem during map_update.
		 */
		err = alloc_extra_elems(htab);
		if (err)
			goto free_buckets;
	}

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_extra_elems;
	}

	return &htab->map;

free_extra_elems:
	free_percpu(htab->extra_elems);
free_buckets:
	kvfree(htab->buckets);
free_htab:
	kfree(htab);
	return ERR_PTR(err);
}

static inline u32 htab_map_hash(const void *key, u32 key_len)
{
	return jhash(key, key_len, 0);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct htab_elem *l;

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* Called from syscall or from eBPF program */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	/* Must be called with rcu_read_lock. */
	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	l = lookup_elem_raw(head, hash, key, key_size);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return l->key + round_up(map->key_size, 8);
	}

	return NULL;
}

/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = (struct bpf_htab *)arg;
	struct htab_elem *l, *tgt_l;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l == tgt_l) {
			hlist_del_rcu(&l->hash_node);
			break;
		}

	raw_spin_unlock_irqrestore(&b->lock, flags);

	return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_elem_raw(head, hash, key, key_size);

	if (!l) {
		i = 0;
		goto find_first_elem;
	}

	/* key was found, get next key in the same bucket */
	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
				  struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
					  struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
	kfree(l);
}

static void htab_elem_free_rcu(struct rcu_head *head)
{
	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
	struct bpf_htab *htab = l->htab;

	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
	 * we're calling kfree, otherwise deadlock is possible if kprobes
	 * are placed somewhere inside of slub
	 */
	preempt_disable();
	__this_cpu_inc(bpf_prog_active);
	htab_elem_free(htab, l);
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	if (l->state == HTAB_EXTRA_ELEM_USED) {
		l->state = HTAB_EXTRA_ELEM_FREE;
		return;
	}

	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
		pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		atomic_dec(&htab->count);
		l->htab = htab;
		call_rcu(&l->rcu, htab_elem_free_rcu);
	}
}

static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
					value + off, size);
			off += size;
		}
	}
}

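/* A new element comes from one of three places: the pre-allocated
 * freelist (the default), a GFP_ATOMIC kmalloc when BPF_F_NO_PREALLOC
 * is set, or, when the table is full but the update replaces an
 * existing key, the per-cpu extra element set up by alloc_extra_elems().
 */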
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 bool old_elem_exists)
{
	u32 size = htab->map.value_size;
	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
	struct htab_elem *l_new;
	void __percpu *pptr;
	int err = 0;

	if (prealloc) {
		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
		if (!l_new)
			err = -E2BIG;
	} else {
		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
			atomic_dec(&htab->count);
			err = -E2BIG;
		} else {
			l_new = kmalloc(htab->elem_size,
					GFP_ATOMIC | __GFP_NOWARN);
			if (!l_new)
				return ERR_PTR(-ENOMEM);
		}
	}

	if (err) {
		if (!old_elem_exists)
			return ERR_PTR(err);

		/* if we're updating the existing element and the hash table
		 * is full, use per-cpu extra elems
		 */
		l_new = this_cpu_ptr(htab->extra_elems);
		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
			return ERR_PTR(-E2BIG);
		l_new->state = HTAB_EXTRA_ELEM_USED;
	} else {
		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		/* round up value_size to 8 bytes */
		size = round_up(size, 8);

		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = __alloc_percpu_gfp(size, 8,
						  GFP_ATOMIC | __GFP_NOWARN);
			if (!pptr) {
				kfree(l_new);
				return ERR_PTR(-ENOMEM);
			}
		}

		pcpu_copy_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else {
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	}

	l_new->hash = hash;
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && map_flags == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && map_flags == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}

/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
				!!l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_del_rcu(&l_old->hash_node);
		free_htab_elem(htab, l_old);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

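/* LRU map update: a node is always taken from the LRU free list before
 * the bucket lock is held (see the comment below).  On success the old
 * node, if one was replaced, is pushed back to the LRU; on failure the
 * unused new node is pushed back instead.
 */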
static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				    u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	raw_spin_unlock_irqrestore(&b->lock, flags);

	if (ret)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	else if (l_old)
		bpf_lru_push_free(&htab->lru, &l_old->lru_node);

	return ret;
}

static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					 void *value, u64 map_flags,
					 bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus, false);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

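/* LRU per-cpu variant: a new node is popped from the LRU only when the
 * update may insert (map_flags != BPF_EXIST); a BPF_EXIST update always
 * hits an existing element and copies the value in place.
 */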
static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					     void *value, u64 map_flags,
					     bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l_new)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	return ret;
}

static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l)
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_head *head = select_bucket(htab, i);
		struct hlist_node *n;
		struct htab_elem *l;

		hlist_for_each_entry_safe(l, n, head, hash_node) {
			hlist_del_rcu(&l->hash_node);
			if (l->state != HTAB_EXTRA_ELEM_USED)
				htab_elem_free(htab, l);
		}
	}
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	/* some of free_htab_elem() callbacks for elements of this map may
	 * not have executed. Wait for them.
	 */
	rcu_barrier();
	if (htab->map.map_flags & BPF_F_NO_PREALLOC)
		delete_all_elements(htab);
	else
		prealloc_destroy(htab);

	free_percpu(htab->extra_elems);
	kvfree(htab->buckets);
	kfree(htab);
}

static const struct bpf_map_ops htab_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_type __read_mostly = {
	.ops = &htab_ops,
	.type = BPF_MAP_TYPE_HASH,
};

static const struct bpf_map_ops htab_lru_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_map_lookup_elem,
	.map_update_elem = htab_lru_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
};

static struct bpf_map_type_list htab_lru_type __read_mostly = {
	.ops = &htab_lru_ops,
	.type = BPF_MAP_TYPE_LRU_HASH,
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}

static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	}

	return NULL;
}

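/* Called from syscall: copy the value seen by every possible CPU into
 * one flat buffer of num_possible_cpus() * round_up(value_size, 8)
 * bytes.  bpf_percpu_hash_update() below accepts a buffer with the same
 * layout when updating all CPUs at once.
 */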
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	if (htab_is_lru(htab))
		bpf_lru_node_set_ref(&l->lru_node);
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
				per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
	rcu_read_unlock();

	return ret;
}

static const struct bpf_map_ops htab_percpu_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_percpu_type __read_mostly = {
	.ops = &htab_percpu_ops,
	.type = BPF_MAP_TYPE_PERCPU_HASH,
};

static const struct bpf_map_ops htab_lru_percpu_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
};

static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = {
	.ops = &htab_lru_percpu_ops,
	.type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
};

static int __init register_htab_map(void)
{
	bpf_register_map_type(&htab_type);
	bpf_register_map_type(&htab_percpu_type);
	bpf_register_map_type(&htab_lru_type);
	bpf_register_map_type(&htab_lru_percpu_type);
	return 0;
}
late_initcall(register_htab_map);