// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <linux/rculist.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/bpf.h>
#include <net/bpf_sk_storage.h>
#include <net/sock.h>
#include <uapi/linux/btf.h>

static atomic_t cache_idx;

#define SK_STORAGE_CREATE_FLAG_MASK \
        (BPF_F_NO_PREALLOC | BPF_F_CLONE)

struct bucket {
        struct hlist_head list;
        raw_spinlock_t lock;
};

/* The map is not the primary owner of a bpf_sk_storage_elem.
 * Instead, the sk->sk_bpf_storage is.
 *
 * The map (bpf_sk_storage_map) serves two purposes:
 * 1. Define the size of the "sk local storage".  It is
 *    the map's value_size.
 *
 * 2. Maintain a list to keep track of all elems such
 *    that they can be cleaned up during the map destruction.
 *
 * When a bpf local storage is being looked up for a
 * particular sk, the "bpf_map" pointer is actually used
 * as the "key" to search in the list of elem in
 * sk->sk_bpf_storage.
 *
 * Hence, consider sk->sk_bpf_storage as the mini-map
 * with the "bpf_map" pointer as the searching key.
 */
struct bpf_sk_storage_map {
        struct bpf_map map;
        /* Lookup elem does not require accessing the map.
         *
         * Updating/Deleting requires a bucket lock to
         * link/unlink the elem from the map.  Multiple
         * buckets are used to reduce lock contention.
         */
        struct bucket *buckets;
        u32 bucket_log;
        u16 elem_size;
        u16 cache_idx;
};

struct bpf_sk_storage_data {
        /* smap is used as the searching key when looking up
         * from sk->sk_bpf_storage.
         *
         * Put it in the same cacheline as the data to minimize
         * the number of cachelines accessed during the cache hit case.
         */
        struct bpf_sk_storage_map __rcu *smap;
        u8 data[0] __aligned(8);
};

/* Linked to bpf_sk_storage and bpf_sk_storage_map */
struct bpf_sk_storage_elem {
        struct hlist_node map_node;     /* Linked to bpf_sk_storage_map */
        struct hlist_node snode;        /* Linked to bpf_sk_storage */
        struct bpf_sk_storage __rcu *sk_storage;
        struct rcu_head rcu;
        /* 8 bytes hole */
        /* The data is stored in another cacheline to minimize
         * the number of cachelines accessed during a cache hit.
         */
        struct bpf_sk_storage_data sdata ____cacheline_aligned;
};

#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata)
#define SDATA(_SELEM) (&(_SELEM)->sdata)
#define BPF_SK_STORAGE_CACHE_SIZE       16

struct bpf_sk_storage {
        struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE];
        struct hlist_head list; /* List of bpf_sk_storage_elem */
        struct sock *sk;        /* The sk that owns the above "list" of
                                 * bpf_sk_storage_elem.
                                 */
        struct rcu_head rcu;
        raw_spinlock_t lock;    /* Protect adding/removing from the "list" */
};
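
/* Editorial sketch (not part of the original source) of how the pieces
 * above relate:
 *
 *   sk->sk_bpf_storage --> bpf_sk_storage
 *                            .cache[16] --(smap->cache_idx)--> sdata  (fast path)
 *                            .list -----> selem -> selem -> ...       (slow path)
 *
 * Each selem is additionally linked, via its map_node, into a bucket of
 * the bpf_sk_storage_map it belongs to, and SDATA(selem)->smap records
 * that owning map so a lookup can match the "bpf_map" pointer against it.
 */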

static struct bucket *select_bucket(struct bpf_sk_storage_map *smap,
                                    struct bpf_sk_storage_elem *selem)
{
        return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
}

static int omem_charge(struct sock *sk, unsigned int size)
{
        /* same check as in sock_kmalloc() */
        if (size <= sysctl_optmem_max &&
            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
                atomic_add(size, &sk->sk_omem_alloc);
                return 0;
        }

        return -ENOMEM;
}

static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem)
{
        return !hlist_unhashed(&selem->snode);
}

static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem)
{
        return !hlist_unhashed(&selem->map_node);
}

static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap,
                                               struct sock *sk, void *value,
                                               bool charge_omem)
{
        struct bpf_sk_storage_elem *selem;

        if (charge_omem && omem_charge(sk, smap->elem_size))
                return NULL;

        selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN);
        if (selem) {
                if (value)
                        memcpy(SDATA(selem)->data, value, smap->map.value_size);
                return selem;
        }

        if (charge_omem)
                atomic_sub(smap->elem_size, &sk->sk_omem_alloc);

        return NULL;
}

/* sk_storage->lock must be held and selem->sk_storage == sk_storage.
 * The caller must ensure selem->smap is still valid to be
 * dereferenced for its smap->elem_size and smap->cache_idx.
 */
static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage,
                              struct bpf_sk_storage_elem *selem,
                              bool uncharge_omem)
{
        struct bpf_sk_storage_map *smap;
        bool free_sk_storage;
        struct sock *sk;

        smap = rcu_dereference(SDATA(selem)->smap);
        sk = sk_storage->sk;

        /* All uncharging on sk->sk_omem_alloc must be done first.
         * sk may be freed once the last selem is unlinked from sk_storage.
         */
        if (uncharge_omem)
                atomic_sub(smap->elem_size, &sk->sk_omem_alloc);

        free_sk_storage = hlist_is_singular_node(&selem->snode,
                                                 &sk_storage->list);
        if (free_sk_storage) {
                atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc);
                sk_storage->sk = NULL;
                /* After this RCU_INIT, sk may be freed and cannot be used */
                RCU_INIT_POINTER(sk->sk_bpf_storage, NULL);

                /* sk_storage is not freed now.  sk_storage->lock is
                 * still held and raw_spin_unlock_bh(&sk_storage->lock)
                 * will be done by the caller.
                 *
                 * Although the unlock will be done under
                 * rcu_read_lock(), it is more intuitive to
                 * read if kfree_rcu(sk_storage, rcu) is done
                 * after the raw_spin_unlock_bh(&sk_storage->lock).
                 *
                 * Hence, a "bool free_sk_storage" is returned
                 * to the caller which then calls the kfree_rcu()
                 * after unlock.
                 */
        }
        hlist_del_init_rcu(&selem->snode);
        if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) ==
            SDATA(selem))
                RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL);

        kfree_rcu(selem, rcu);

        return free_sk_storage;
}

static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
{
        struct bpf_sk_storage *sk_storage;
        bool free_sk_storage = false;

        if (unlikely(!selem_linked_to_sk(selem)))
                /* selem has already been unlinked from sk */
                return;

        sk_storage = rcu_dereference(selem->sk_storage);
        raw_spin_lock_bh(&sk_storage->lock);
        if (likely(selem_linked_to_sk(selem)))
                free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
        raw_spin_unlock_bh(&sk_storage->lock);

        if (free_sk_storage)
                kfree_rcu(sk_storage, rcu);
}

static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
                            struct bpf_sk_storage_elem *selem)
{
        RCU_INIT_POINTER(selem->sk_storage, sk_storage);
        hlist_add_head(&selem->snode, &sk_storage->list);
}

static void selem_unlink_map(struct bpf_sk_storage_elem *selem)
{
        struct bpf_sk_storage_map *smap;
        struct bucket *b;

        if (unlikely(!selem_linked_to_map(selem)))
                /* selem has already been unlinked from smap */
                return;

        smap = rcu_dereference(SDATA(selem)->smap);
        b = select_bucket(smap, selem);
        raw_spin_lock_bh(&b->lock);
        if (likely(selem_linked_to_map(selem)))
                hlist_del_init_rcu(&selem->map_node);
        raw_spin_unlock_bh(&b->lock);
}

static void selem_link_map(struct bpf_sk_storage_map *smap,
                           struct bpf_sk_storage_elem *selem)
{
        struct bucket *b = select_bucket(smap, selem);

        raw_spin_lock_bh(&b->lock);
        RCU_INIT_POINTER(SDATA(selem)->smap, smap);
        hlist_add_head_rcu(&selem->map_node, &b->list);
        raw_spin_unlock_bh(&b->lock);
}

static void selem_unlink(struct bpf_sk_storage_elem *selem)
{
        /* Always unlink from map before unlinking from sk_storage
         * because selem will be freed after successfully unlinked from
         * the sk_storage.
         */
        selem_unlink_map(selem);
        selem_unlink_sk(selem);
}

static struct bpf_sk_storage_data *
__sk_storage_lookup(struct bpf_sk_storage *sk_storage,
                    struct bpf_sk_storage_map *smap,
                    bool cacheit_lockit)
{
        struct bpf_sk_storage_data *sdata;
        struct bpf_sk_storage_elem *selem;

        /* Fast path (cache hit) */
        sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]);
        if (sdata && rcu_access_pointer(sdata->smap) == smap)
                return sdata;

        /* Slow path (cache miss) */
        hlist_for_each_entry_rcu(selem, &sk_storage->list, snode)
                if (rcu_access_pointer(SDATA(selem)->smap) == smap)
                        break;

        if (!selem)
                return NULL;

        sdata = SDATA(selem);
        if (cacheit_lockit) {
                /* spinlock is needed to avoid racing with the
                 * parallel delete.  Otherwise, publishing an already
                 * deleted sdata to the cache will become a use-after-free
                 * problem in the next __sk_storage_lookup().
                 */
                raw_spin_lock_bh(&sk_storage->lock);
                if (selem_linked_to_sk(selem))
                        rcu_assign_pointer(sk_storage->cache[smap->cache_idx],
                                           sdata);
                raw_spin_unlock_bh(&sk_storage->lock);
        }

        return sdata;
}

static struct bpf_sk_storage_data *
sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
{
        struct bpf_sk_storage *sk_storage;
        struct bpf_sk_storage_map *smap;

        sk_storage = rcu_dereference(sk->sk_bpf_storage);
        if (!sk_storage)
                return NULL;

        smap = (struct bpf_sk_storage_map *)map;
        return __sk_storage_lookup(sk_storage, smap, cacheit_lockit);
}

static int check_flags(const struct bpf_sk_storage_data *old_sdata,
                       u64 map_flags)
{
        if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
                /* elem already exists */
                return -EEXIST;

        if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
                /* elem doesn't exist, cannot update it */
                return -ENOENT;

        return 0;
}
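
/* Editorial summary (not part of the original source) of check_flags(),
 * with BPF_F_LOCK masked out of map_flags:
 *
 *   old_sdata   flags          result
 *   ---------   -----------    --------
 *   NULL        BPF_ANY        0
 *   NULL        BPF_NOEXIST    0
 *   NULL        BPF_EXIST      -ENOENT
 *   exists      BPF_ANY        0
 *   exists      BPF_NOEXIST    -EEXIST
 *   exists      BPF_EXIST      0
 */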

static int sk_storage_alloc(struct sock *sk,
                            struct bpf_sk_storage_map *smap,
                            struct bpf_sk_storage_elem *first_selem)
{
        struct bpf_sk_storage *prev_sk_storage, *sk_storage;
        int err;

        err = omem_charge(sk, sizeof(*sk_storage));
        if (err)
                return err;

        sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN);
        if (!sk_storage) {
                err = -ENOMEM;
                goto uncharge;
        }
        INIT_HLIST_HEAD(&sk_storage->list);
        raw_spin_lock_init(&sk_storage->lock);
        sk_storage->sk = sk;

        __selem_link_sk(sk_storage, first_selem);
        selem_link_map(smap, first_selem);
        /* Publish sk_storage to sk.  sk->sk_lock cannot be acquired.
         * Hence, an atomic op is used to set sk->sk_bpf_storage
         * from NULL to the newly allocated sk_storage ptr.
         *
         * From now on, the sk->sk_bpf_storage pointer is protected
         * by the sk_storage->lock.  Hence, when freeing
         * the sk->sk_bpf_storage, the sk_storage->lock must
         * be held before setting sk->sk_bpf_storage to NULL.
         */
        prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage,
                                  NULL, sk_storage);
        if (unlikely(prev_sk_storage)) {
                selem_unlink_map(first_selem);
                err = -EAGAIN;
                goto uncharge;

                /* Note that even though first_selem was linked to smap's
                 * bucket->list, first_selem can be freed immediately
                 * (instead of kfree_rcu) because
                 * bpf_sk_storage_map_free() does a
                 * synchronize_rcu() before walking the bucket->list.
                 * Hence, no one is accessing selem from the
                 * bucket->list under rcu_read_lock().
                 */
        }

        return 0;

uncharge:
        kfree(sk_storage);
        atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc);
        return err;
}

/* sk cannot be going away because it is linking new elem
 * to sk->sk_bpf_storage (i.e. sk->sk_refcnt cannot be 0).
 * Otherwise, it will become a leak (and cause other memory issues
 * during map destruction).
 */
static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk,
                                                     struct bpf_map *map,
                                                     void *value,
                                                     u64 map_flags)
{
        struct bpf_sk_storage_data *old_sdata = NULL;
        struct bpf_sk_storage_elem *selem;
        struct bpf_sk_storage *sk_storage;
        struct bpf_sk_storage_map *smap;
        int err;

        /* BPF_EXIST and BPF_NOEXIST cannot be both set */
        if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
            /* BPF_F_LOCK can only be used in a value with spin_lock */
            unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
                return ERR_PTR(-EINVAL);

        smap = (struct bpf_sk_storage_map *)map;
        sk_storage = rcu_dereference(sk->sk_bpf_storage);
        if (!sk_storage || hlist_empty(&sk_storage->list)) {
                /* Very first elem for this sk */
                err = check_flags(NULL, map_flags);
                if (err)
                        return ERR_PTR(err);

                selem = selem_alloc(smap, sk, value, true);
                if (!selem)
                        return ERR_PTR(-ENOMEM);

                err = sk_storage_alloc(sk, smap, selem);
                if (err) {
                        kfree(selem);
                        atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
                        return ERR_PTR(err);
                }

                return SDATA(selem);
        }

        if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
                /* Hoping to find an old_sdata to do inline update
                 * such that it can avoid taking the sk_storage->lock
                 * and changing the lists.
                 */
                old_sdata = __sk_storage_lookup(sk_storage, smap, false);
                err = check_flags(old_sdata, map_flags);
                if (err)
                        return ERR_PTR(err);
                if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) {
                        copy_map_value_locked(map, old_sdata->data,
                                              value, false);
                        return old_sdata;
                }
        }

        raw_spin_lock_bh(&sk_storage->lock);

        /* Recheck sk_storage->list under sk_storage->lock */
        if (unlikely(hlist_empty(&sk_storage->list))) {
                /* A parallel del is happening and sk_storage is going
                 * away.  It has just been checked before, so it is very
                 * unlikely.  Return instead of retrying to keep things
                 * simple.
                 */
                err = -EAGAIN;
                goto unlock_err;
        }

        old_sdata = __sk_storage_lookup(sk_storage, smap, false);
        err = check_flags(old_sdata, map_flags);
        if (err)
                goto unlock_err;

        if (old_sdata && (map_flags & BPF_F_LOCK)) {
                copy_map_value_locked(map, old_sdata->data, value, false);
                selem = SELEM(old_sdata);
                goto unlock;
        }

        /* sk_storage->lock is held.  Hence, we are sure
         * we can unlink and uncharge the old_sdata successfully
         * later.  Hence, instead of charging the new selem now
         * and then uncharging the old selem later (which may cause
         * a potential but unnecessary charge failure), avoid taking
         * a charge at all here (the "!old_sdata" check) and the
         * old_sdata will not be uncharged later during __selem_unlink_sk().
         */
        selem = selem_alloc(smap, sk, value, !old_sdata);
        if (!selem) {
                err = -ENOMEM;
                goto unlock_err;
        }

        /* First, link the new selem to the map */
        selem_link_map(smap, selem);

        /* Second, link (and publish) the new selem to sk_storage */
        __selem_link_sk(sk_storage, selem);

        /* Third, remove old selem, SELEM(old_sdata) */
        if (old_sdata) {
                selem_unlink_map(SELEM(old_sdata));
                __selem_unlink_sk(sk_storage, SELEM(old_sdata), false);
        }

unlock:
        raw_spin_unlock_bh(&sk_storage->lock);
        return SDATA(selem);

unlock_err:
        raw_spin_unlock_bh(&sk_storage->lock);
        return ERR_PTR(err);
}

static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
{
        struct bpf_sk_storage_data *sdata;

        sdata = sk_storage_lookup(sk, map, false);
        if (!sdata)
                return -ENOENT;

        selem_unlink(SELEM(sdata));

        return 0;
}

/* Called by __sk_destruct() & bpf_sk_storage_clone() */
void bpf_sk_storage_free(struct sock *sk)
{
        struct bpf_sk_storage_elem *selem;
        struct bpf_sk_storage *sk_storage;
        bool free_sk_storage = false;
        struct hlist_node *n;

        rcu_read_lock();
        sk_storage = rcu_dereference(sk->sk_bpf_storage);
        if (!sk_storage) {
                rcu_read_unlock();
                return;
        }

        /* Neither the bpf_prog nor the bpf-map's syscall
         * could be modifying the sk_storage->list now.
         * Thus, no elem can be added to or deleted from the
         * sk_storage->list by the bpf_prog or by the bpf-map's syscall.
         *
         * It is racing with bpf_sk_storage_map_free() alone
         * when unlinking elem from the sk_storage->list and
         * the map's bucket->list.
         */
        raw_spin_lock_bh(&sk_storage->lock);
        hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) {
                /* Always unlink from map before unlinking from
                 * sk_storage.
                 */
                selem_unlink_map(selem);
                free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
        }
        raw_spin_unlock_bh(&sk_storage->lock);
        rcu_read_unlock();

        if (free_sk_storage)
                kfree_rcu(sk_storage, rcu);
}

static void bpf_sk_storage_map_free(struct bpf_map *map)
{
        struct bpf_sk_storage_elem *selem;
        struct bpf_sk_storage_map *smap;
        struct bucket *b;
        unsigned int i;

        smap = (struct bpf_sk_storage_map *)map;

        /* Note that this map might be concurrently cloned from
         * bpf_sk_storage_clone.  Wait for any existing bpf_sk_storage_clone
         * RCU read section to finish before proceeding.  New RCU
         * read sections should be prevented via bpf_map_inc_not_zero.
         */
        synchronize_rcu();

        /* bpf prog and the userspace can no longer access this map
         * now.  No new selem (of this map) can be added
         * to the sk->sk_bpf_storage or to the map bucket's list.
         *
         * The elem of this map can be cleaned up here
         * or
         * by bpf_sk_storage_free() during __sk_destruct().
         */
        for (i = 0; i < (1U << smap->bucket_log); i++) {
                b = &smap->buckets[i];

                rcu_read_lock();
                /* No one is adding to b->list now */
                while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)),
                                                 struct bpf_sk_storage_elem,
                                                 map_node))) {
                        selem_unlink(selem);
                        cond_resched_rcu();
                }
                rcu_read_unlock();
        }

        /* bpf_sk_storage_free() may still need to access the map.
         * e.g. bpf_sk_storage_free() has unlinked selem from the map
         * which then made the above while ((selem = ...)) loop
         * exit immediately.
         *
         * However, the bpf_sk_storage_free() still needs to access
         * the smap->elem_size to do the uncharging in
         * __selem_unlink_sk().
         *
         * Hence, wait another rcu grace period for the
         * bpf_sk_storage_free() to finish.
         */
        synchronize_rcu();

        kvfree(smap->buckets);
        kfree(map);
}

static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
{
        if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
            !(attr->map_flags & BPF_F_NO_PREALLOC) ||
            attr->max_entries ||
            attr->key_size != sizeof(int) || !attr->value_size ||
            /* Enforce BTF for userspace sk dumping */
            !attr->btf_key_type_id || !attr->btf_value_type_id)
                return -EINVAL;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (attr->value_size >= KMALLOC_MAX_SIZE -
            MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) ||
            /* U16_MAX is much more than enough for sk local storage
             * considering a tcp_sock is ~2k.
             */
            attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem))
                return -E2BIG;

        return 0;
}

static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
{
        struct bpf_sk_storage_map *smap;
        unsigned int i;
        u32 nbuckets;
        u64 cost;
        int ret;

        smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
        if (!smap)
                return ERR_PTR(-ENOMEM);
        bpf_map_init_from_attr(&smap->map, attr);

        /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
        smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus())));
        nbuckets = 1U << smap->bucket_log;
        cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);

        ret = bpf_map_charge_init(&smap->map.memory, cost);
        if (ret < 0) {
                kfree(smap);
                return ERR_PTR(ret);
        }

        smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
                                 GFP_USER | __GFP_NOWARN);
        if (!smap->buckets) {
                bpf_map_charge_finish(&smap->map.memory);
                kfree(smap);
                return ERR_PTR(-ENOMEM);
        }

        for (i = 0; i < nbuckets; i++) {
                INIT_HLIST_HEAD(&smap->buckets[i].list);
                raw_spin_lock_init(&smap->buckets[i].lock);
        }

        smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
        smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) %
                BPF_SK_STORAGE_CACHE_SIZE;

        return &smap->map;
}

static int notsupp_get_next_key(struct bpf_map *map, void *key,
                                void *next_key)
{
        return -ENOTSUPP;
}

static int bpf_sk_storage_map_check_btf(const struct bpf_map *map,
                                        const struct btf *btf,
                                        const struct btf_type *key_type,
                                        const struct btf_type *value_type)
{
        u32 int_data;

        if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
                return -EINVAL;

        int_data = *(u32 *)(key_type + 1);
        if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
                return -EINVAL;

        return 0;
}

static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
{
        struct bpf_sk_storage_data *sdata;
        struct socket *sock;
        int fd, err;

        fd = *(int *)key;
        sock = sockfd_lookup(fd, &err);
        if (sock) {
                sdata = sk_storage_lookup(sock->sk, map, true);
                sockfd_put(sock);
                return sdata ? sdata->data : NULL;
        }

        return ERR_PTR(err);
}

static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
                                         void *value, u64 map_flags)
{
        struct bpf_sk_storage_data *sdata;
        struct socket *sock;
        int fd, err;

        fd = *(int *)key;
        sock = sockfd_lookup(fd, &err);
        if (sock) {
                sdata = sk_storage_update(sock->sk, map, value, map_flags);
                sockfd_put(sock);
                return PTR_ERR_OR_ZERO(sdata);
        }

        return err;
}

static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
{
        struct socket *sock;
        int fd, err;

        fd = *(int *)key;
        sock = sockfd_lookup(fd, &err);
        if (sock) {
                err = sk_storage_delete(sock->sk, map);
                sockfd_put(sock);
                return err;
        }

        return err;
}
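
/* Editorial usage sketch (not part of the original source): from
 * userspace, the syscall ops above are reached with a socket fd as the
 * map key.  Assuming an already created BPF_MAP_TYPE_SK_STORAGE map fd
 * "map_fd", a socket fd "sock_fd" and a hypothetical value type
 * matching the map's BTF value type, a libbpf-based caller might do:
 *
 *      struct my_val { __u32 cnt; };           // hypothetical value type
 *      struct my_val val = { .cnt = 1 }, out;
 *
 *      // creates or updates the storage attached to sock_fd
 *      bpf_map_update_elem(map_fd, &sock_fd, &val, BPF_ANY);
 *
 *      // reads it back; fails (ENOENT) if nothing is attached
 *      bpf_map_lookup_elem(map_fd, &sock_fd, &out);
 *
 *      // drops this map's storage from the socket
 *      bpf_map_delete_elem(map_fd, &sock_fd);
 */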

static struct bpf_sk_storage_elem *
bpf_sk_storage_clone_elem(struct sock *newsk,
                          struct bpf_sk_storage_map *smap,
                          struct bpf_sk_storage_elem *selem)
{
        struct bpf_sk_storage_elem *copy_selem;

        copy_selem = selem_alloc(smap, newsk, NULL, true);
        if (!copy_selem)
                return NULL;

        if (map_value_has_spin_lock(&smap->map))
                copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
                                      SDATA(selem)->data, true);
        else
                copy_map_value(&smap->map, SDATA(copy_selem)->data,
                               SDATA(selem)->data);

        return copy_selem;
}

int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
{
        struct bpf_sk_storage *new_sk_storage = NULL;
        struct bpf_sk_storage *sk_storage;
        struct bpf_sk_storage_elem *selem;
        int ret = 0;

        RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);

        rcu_read_lock();
        sk_storage = rcu_dereference(sk->sk_bpf_storage);

        if (!sk_storage || hlist_empty(&sk_storage->list))
                goto out;

        hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
                struct bpf_sk_storage_elem *copy_selem;
                struct bpf_sk_storage_map *smap;
                struct bpf_map *map;

                smap = rcu_dereference(SDATA(selem)->smap);
                if (!(smap->map.map_flags & BPF_F_CLONE))
                        continue;

                /* Note that for lockless listeners adding a new element
                 * here can race with cleanup in bpf_sk_storage_map_free.
                 * Try to grab the map refcnt to make sure that it's still
                 * alive and prevent concurrent removal.
                 */
                map = bpf_map_inc_not_zero(&smap->map);
                if (IS_ERR(map))
                        continue;

                copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
                if (!copy_selem) {
                        ret = -ENOMEM;
                        bpf_map_put(map);
                        goto out;
                }

                if (new_sk_storage) {
                        selem_link_map(smap, copy_selem);
                        __selem_link_sk(new_sk_storage, copy_selem);
                } else {
                        ret = sk_storage_alloc(newsk, smap, copy_selem);
                        if (ret) {
                                kfree(copy_selem);
                                atomic_sub(smap->elem_size,
                                           &newsk->sk_omem_alloc);
                                bpf_map_put(map);
                                goto out;
                        }

                        new_sk_storage = rcu_dereference(copy_selem->sk_storage);
                }
                bpf_map_put(map);
        }

out:
        rcu_read_unlock();

        /* In case of an error, don't free anything explicitly here, the
         * caller is responsible for calling bpf_sk_storage_free.
         */

        return ret;
}

BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
           void *, value, u64, flags)
{
        struct bpf_sk_storage_data *sdata;

        if (flags > BPF_SK_STORAGE_GET_F_CREATE)
                return (unsigned long)NULL;

        sdata = sk_storage_lookup(sk, map, true);
        if (sdata)
                return (unsigned long)sdata->data;

        if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
            /* Cannot add a new elem to a going-away sk.
             * Otherwise, the new elem may become a leak
             * (and also cause other memory issues during map
             * destruction).
             */
            refcount_inc_not_zero(&sk->sk_refcnt)) {
                sdata = sk_storage_update(sk, map, value, BPF_NOEXIST);
                /* sk must be a fullsock (guaranteed by verifier),
                 * so sock_gen_put() is unnecessary.
                 */
                sock_put(sk);
                return IS_ERR(sdata) ?
                        (unsigned long)NULL : (unsigned long)sdata->data;
        }

        return (unsigned long)NULL;
}

BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
{
        if (refcount_inc_not_zero(&sk->sk_refcnt)) {
                int err;

                err = sk_storage_delete(sk, map);
                sock_put(sk);
                return err;
        }

        return -ENOENT;
}

const struct bpf_map_ops sk_storage_map_ops = {
        .map_alloc_check = bpf_sk_storage_map_alloc_check,
        .map_alloc = bpf_sk_storage_map_alloc,
        .map_free = bpf_sk_storage_map_free,
        .map_get_next_key = notsupp_get_next_key,
        .map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
        .map_update_elem = bpf_fd_sk_storage_update_elem,
        .map_delete_elem = bpf_fd_sk_storage_delete_elem,
        .map_check_btf = bpf_sk_storage_map_check_btf,
};

const struct bpf_func_proto bpf_sk_storage_get_proto = {
        .func           = bpf_sk_storage_get,
        .gpl_only       = false,
        .ret_type       = RET_PTR_TO_MAP_VALUE_OR_NULL,
        .arg1_type      = ARG_CONST_MAP_PTR,
        .arg2_type      = ARG_PTR_TO_SOCKET,
        .arg3_type      = ARG_PTR_TO_MAP_VALUE_OR_NULL,
        .arg4_type      = ARG_ANYTHING,
};

const struct bpf_func_proto bpf_sk_storage_delete_proto = {
        .func           = bpf_sk_storage_delete,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_CONST_MAP_PTR,
        .arg2_type      = ARG_PTR_TO_SOCKET,
};
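
/* Editorial usage sketch (not part of the original source): from a BPF
 * program, the two helpers above are typically used with a BTF-defined
 * BPF_MAP_TYPE_SK_STORAGE map.  The map name, value type and program
 * section below are hypothetical, and the sketch assumes a program type
 * whose context provides a full struct bpf_sock pointer (e.g. sockops
 * on kernels where ctx->sk is exposed):
 *
 *      struct my_val {
 *              __u64 events;
 *      };
 *
 *      struct {
 *              __uint(type, BPF_MAP_TYPE_SK_STORAGE);
 *              __uint(map_flags, BPF_F_NO_PREALLOC);
 *              __type(key, int);
 *              __type(value, struct my_val);
 *      } sk_stg SEC(".maps");
 *
 *      SEC("sockops")
 *      int count_events(struct bpf_sock_ops *ctx)
 *      {
 *              struct bpf_sock *sk = ctx->sk;
 *              struct my_val *val;
 *
 *              if (!sk)
 *                      return 1;
 *              // creates the per-socket storage on first use
 *              val = bpf_sk_storage_get(&sk_stg, sk, NULL,
 *                                       BPF_SK_STORAGE_GET_F_CREATE);
 *              if (val)
 *                      __sync_fetch_and_add(&val->events, 1);
 *              return 1;
 *      }
 */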