// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 */

/* A devmap's primary use is as a backend map for the XDP BPF helper call
 * bpf_redirect_map(). Because XDP is mostly concerned with performance we
 * spent some effort to ensure the datapath with redirect maps does not use
 * any locking. This is a quick note on the details.
 *
 * We have three possible paths to get into the devmap control plane: bpf
 * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
 * will invoke an update, delete, or lookup operation. To ensure updates and
 * deletes appear atomic from the datapath side, xchg() is used to modify the
 * netdev_map array. Then because the datapath does a lookup into the netdev_map
 * array (read-only) from an RCU critical section we use call_rcu() to wait for
 * an rcu grace period before free'ing the old data structures. This ensures the
 * datapath always has a valid copy. However, the datapath does a "flush"
 * operation that pushes any pending packets in the driver outside the RCU
 * critical section. Each bpf_dtab_netdev tracks these pending operations using
 * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
 * this list is empty, indicating outstanding flush operations have completed.
 *
 * BPF syscalls may race with BPF program calls on any of the update, delete
 * or lookup operations. As noted above the xchg() operation also keeps the
 * netdev_map consistent in this case. From the devmap side BPF programs
 * calling into these operations are the same as multiple user space threads
 * making system calls.
 *
 * Finally, any of the above may race with a netdev_unregister notifier. The
 * unregister notifier must search the map structure for entries that contain
 * a reference to the net device being removed and delete them. This is a two
 * step process: (a) dereference the bpf_dtab_netdev object in netdev_map and
 * (b) check to see if the ifindex is the same as the net_device being removed.
 * When removing the dev a cmpxchg() is used to ensure the correct dev is
 * removed; in the case of a concurrent update or delete operation it is
 * possible that the initially referenced dev is no longer in the map. As the
 * notifier hook walks the map we know that new dev references can not be
 * added by the user because core infrastructure ensures dev_get_by_index()
 * calls will fail at this point.
 *
 * The devmap_hash type is a map type which interprets keys as ifindexes and
 * indexes these using a hashmap. This allows maps that use ifindex as key to be
 * densely packed instead of having holes in the lookup array for unused
 * ifindexes. The setup and packet enqueue/send code is shared between the two
 * types of devmap; only the lookup and insertion are different.
 */
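
/* Example (illustrative sketch, not part of this file): a minimal XDP program
 * that uses a devmap as the target of the bpf_redirect_map() helper mentioned
 * above. The map name "tx_ports", its size and the chosen key are assumptions
 * made up for the sketch; the map type and helper are the ones this file
 * implements. Written against the legacy bpf_map_def definition style.
 *
 *	struct bpf_map_def SEC("maps") tx_ports = {
 *		.type		= BPF_MAP_TYPE_DEVMAP,
 *		.key_size	= sizeof(__u32),
 *		.value_size	= sizeof(__u32),
 *		.max_entries	= 64,
 *	};
 *
 *	SEC("xdp")
 *	int xdp_devmap_redirect(struct xdp_md *ctx)
 *	{
 *		__u32 key = 0;
 *
 *		return bpf_redirect_map(&tx_ports, key, 0);
 *	}
 */
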
#include <linux/bpf.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>

#define DEV_CREATE_FLAG_MASK \
        (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

#define DEV_MAP_BULK_SIZE 16
struct bpf_dtab_netdev;

struct xdp_bulk_queue {
        struct xdp_frame *q[DEV_MAP_BULK_SIZE];
        struct list_head flush_node;
        struct net_device *dev_rx;
        struct bpf_dtab_netdev *obj;
        unsigned int count;
};

struct bpf_dtab_netdev {
        struct net_device *dev; /* must be first member, due to tracepoint */
        struct hlist_node index_hlist;
        struct bpf_dtab *dtab;
        struct xdp_bulk_queue __percpu *bulkq;
        struct rcu_head rcu;
        unsigned int idx; /* keep track of map index for tracepoint */
};

struct bpf_dtab {
        struct bpf_map map;
        struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
        struct list_head __percpu *flush_list;
        struct list_head list;

        /* these are only used for DEVMAP_HASH type maps */
        struct hlist_head *dev_index_head;
        spinlock_t index_lock;
        unsigned int items;
        u32 n_buckets;
};

static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);

static struct hlist_head *dev_map_create_hash(unsigned int entries)
{
        int i;
        struct hlist_head *hash;

        hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL);
        if (hash != NULL)
                for (i = 0; i < entries; i++)
                        INIT_HLIST_HEAD(&hash[i]);

        return hash;
}

static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
                                                    int idx)
{
        return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
}

static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
        int err, cpu;
        u64 cost;

        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
            attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
                return -EINVAL;

        /* Lookup returns a pointer straight to dev->ifindex, so make sure the
         * verifier prevents writes from the BPF side
         */
        attr->map_flags |= BPF_F_RDONLY_PROG;

        bpf_map_init_from_attr(&dtab->map, attr);

        /* make sure page count doesn't overflow */
        cost = (u64) sizeof(struct list_head) * num_possible_cpus();

        if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
                dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);

                if (!dtab->n_buckets) /* Overflow check */
                        return -EINVAL;
                cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets;
        } else {
                cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
        }

        /* if map size is larger than memlock limit, reject it */
        err = bpf_map_charge_init(&dtab->map.memory, cost);
        if (err)
                return -EINVAL;

        dtab->flush_list = alloc_percpu(struct list_head);
        if (!dtab->flush_list)
                goto free_charge;

        for_each_possible_cpu(cpu)
                INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));

        if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
                dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
                if (!dtab->dev_index_head)
                        goto free_percpu;

                spin_lock_init(&dtab->index_lock);
        } else {
                dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
                                                      sizeof(struct bpf_dtab_netdev *),
                                                      dtab->map.numa_node);
                if (!dtab->netdev_map)
                        goto free_percpu;
        }

        return 0;

free_percpu:
        free_percpu(dtab->flush_list);
free_charge:
        bpf_map_charge_finish(&dtab->map.memory);
        return -ENOMEM;
}

static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
        struct bpf_dtab *dtab;
        int err;

        if (!capable(CAP_NET_ADMIN))
                return ERR_PTR(-EPERM);

        dtab = kzalloc(sizeof(*dtab), GFP_USER);
        if (!dtab)
                return ERR_PTR(-ENOMEM);

        err = dev_map_init_map(dtab, attr);
        if (err) {
                kfree(dtab);
                return ERR_PTR(err);
        }

        spin_lock(&dev_map_lock);
        list_add_tail_rcu(&dtab->list, &dev_map_list);
        spin_unlock(&dev_map_lock);

        return &dtab->map;
}
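
/* Example (user-space sketch, not part of this file): creating a devmap with
 * the attribute layout dev_map_init_map() validates above: key_size and
 * value_size must both be 4. The sys_bpf() wrapper is an assumption standing
 * in for a raw bpf(2) syscall; CAP_NET_ADMIN is required, as checked in
 * dev_map_alloc().
 *
 *	union bpf_attr attr;
 *	int map_fd;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.map_type    = BPF_MAP_TYPE_DEVMAP;
 *	attr.key_size    = 4;
 *	attr.value_size  = 4;
 *	attr.max_entries = 64;
 *
 *	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
 */
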
static void dev_map_free(struct bpf_map *map)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        int i, cpu;

        /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
         * so the programs (there can be more than one that used this map)
         * were disconnected from events. Wait for outstanding critical
         * sections in these programs to complete. The rcu critical section
         * only guarantees no further reads against netdev_map. It does
         * __not__ ensure pending flush operations (if any) are complete.
         */

        spin_lock(&dev_map_lock);
        list_del_rcu(&dtab->list);
        spin_unlock(&dev_map_lock);

        bpf_clear_redirect_map(map);
        synchronize_rcu();

        /* Make sure prior __dev_map_entry_free() calls have completed. */
        rcu_barrier();

        /* To ensure all pending flush operations have completed, wait for the
         * flush list to empty on _all_ cpus.
         * Because the above synchronize_rcu() ensures the map is disconnected
         * from the program we can assume no new items will be added.
         */
        for_each_online_cpu(cpu) {
                struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);

                while (!list_empty(flush_list))
                        cond_resched();
        }

        if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
                for (i = 0; i < dtab->n_buckets; i++) {
                        struct bpf_dtab_netdev *dev;
                        struct hlist_head *head;
                        struct hlist_node *next;

                        head = dev_map_index_hash(dtab, i);

                        hlist_for_each_entry_safe(dev, next, head, index_hlist) {
                                hlist_del_rcu(&dev->index_hlist);
                                free_percpu(dev->bulkq);
                                dev_put(dev->dev);
                                kfree(dev);
                        }
                }

                kfree(dtab->dev_index_head);
        } else {
                for (i = 0; i < dtab->map.max_entries; i++) {
                        struct bpf_dtab_netdev *dev;

                        dev = dtab->netdev_map[i];
                        if (!dev)
                                continue;

                        free_percpu(dev->bulkq);
                        dev_put(dev->dev);
                        kfree(dev);
                }

                bpf_map_area_free(dtab->netdev_map);
        }

        free_percpu(dtab->flush_list);
        kfree(dtab);
}

static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        u32 index = key ? *(u32 *)key : U32_MAX;
        u32 *next = next_key;

        if (index >= dtab->map.max_entries) {
                *next = 0;
                return 0;
        }

        if (index == dtab->map.max_entries - 1)
                return -ENOENT;
        *next = index + 1;
        return 0;
}
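
/* Example (user-space sketch): walking the array flavour with the
 * get_next_key semantics implemented above: an out-of-range start key yields
 * slot 0, and -ENOENT marks the end of the array. bpf_map_get_next_key() and
 * bpf_map_lookup_elem() are the libbpf syscall wrappers; map_fd is assumed to
 * be a BPF_MAP_TYPE_DEVMAP fd. Empty slots simply fail the per-slot lookup.
 *
 *	__u32 key = (__u32)-1, next_key, ifindex;
 *
 *	while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) {
 *		if (bpf_map_lookup_elem(map_fd, &next_key, &ifindex) == 0)
 *			printf("slot %u -> ifindex %u\n", next_key, ifindex);
 *		key = next_key;
 *	}
 */
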
struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct hlist_head *head = dev_map_index_hash(dtab, key);
        struct bpf_dtab_netdev *dev;

        hlist_for_each_entry_rcu(dev, head, index_hlist)
                if (dev->idx == key)
                        return dev;

        return NULL;
}

static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
                                     void *next_key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        u32 idx, *next = next_key;
        struct bpf_dtab_netdev *dev, *next_dev;
        struct hlist_head *head;
        int i = 0;

        if (!key)
                goto find_first;

        idx = *(u32 *)key;

        dev = __dev_map_hash_lookup_elem(map, idx);
        if (!dev)
                goto find_first;

        next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
                                    struct bpf_dtab_netdev, index_hlist);

        if (next_dev) {
                *next = next_dev->idx;
                return 0;
        }

        i = idx & (dtab->n_buckets - 1);
        i++;

find_first:
        for (; i < dtab->n_buckets; i++) {
                head = dev_map_index_hash(dtab, i);

                next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
                                            struct bpf_dtab_netdev,
                                            index_hlist);
                if (next_dev) {
                        *next = next_dev->idx;
                        return 0;
                }
        }

        return -ENOENT;
}

static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
                       bool in_napi_ctx)
{
        struct bpf_dtab_netdev *obj = bq->obj;
        struct net_device *dev = obj->dev;
        int sent = 0, drops = 0, err = 0;
        int i;

        if (unlikely(!bq->count))
                return 0;

        for (i = 0; i < bq->count; i++) {
                struct xdp_frame *xdpf = bq->q[i];

                prefetch(xdpf);
        }

        sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
        if (sent < 0) {
                err = sent;
                sent = 0;
                goto error;
        }
        drops = bq->count - sent;
out:
        bq->count = 0;

        trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx,
                              sent, drops, bq->dev_rx, dev, err);
        bq->dev_rx = NULL;
        __list_del_clearprev(&bq->flush_node);
        return 0;
error:
        /* If ndo_xdp_xmit fails with an errno, no frames have been
         * xmit'ed and it's our responsibility to free them all.
         */
        for (i = 0; i < bq->count; i++) {
                struct xdp_frame *xdpf = bq->q[i];

                /* RX path under NAPI protection, can return frames faster */
                if (likely(in_napi_ctx))
                        xdp_return_frame_rx_napi(xdpf);
                else
                        xdp_return_frame(xdpf);
                drops++;
        }
        goto out;
}
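
/* Illustrative driver-side skeleton (a hypothetical driver; the example_*
 * helpers are assumptions for this sketch) of the ndo_xdp_xmit() contract
 * bq_xmit_all() relies on above: a negative errno means nothing was sent and
 * the caller frees every frame, while a non-negative return is the number of
 * frames sent, with frames the driver could not queue freed by the driver
 * itself and accounted as drops by the caller.
 *
 *	static int example_xdp_xmit(struct net_device *dev, int n,
 *				    struct xdp_frame **frames, u32 flags)
 *	{
 *		int i, drops = 0;
 *
 *		if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 *			return -EINVAL;
 *
 *		for (i = 0; i < n; i++) {
 *			if (!example_tx_one_frame(dev, frames[i])) {
 *				xdp_return_frame_rx_napi(frames[i]);
 *				drops++;
 *			}
 *		}
 *
 *		if (flags & XDP_XMIT_FLUSH)
 *			example_tx_doorbell(dev);
 *
 *		return n - drops;
 *	}
 */
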
/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
 * from the driver before returning from its napi->poll() routine. The poll()
 * routine is called either from busy_poll context or net_rx_action signaled
 * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
 * net device can be torn down. On devmap tear down we ensure the flush list
 * is empty before completing to ensure all flush operations have completed.
 */
void __dev_map_flush(struct bpf_map *map)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
        struct xdp_bulk_queue *bq, *tmp;

        rcu_read_lock();
        list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
                bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
        rcu_read_unlock();
}

/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
 * update happens in parallel here a dev_put won't happen until after reading
 * the ifindex.
 */
struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *obj;

        if (key >= map->max_entries)
                return NULL;

        obj = READ_ONCE(dtab->netdev_map[key]);
        return obj;
}

/* Runs under RCU-read-side, plus in softirq under NAPI protection.
 * Thus, safe percpu variable access.
 */
static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
                      struct net_device *dev_rx)
{
        struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
        struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);

        if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
                bq_xmit_all(bq, 0, true);

        /* Ingress dev_rx will be the same for all xdp_frame's in
         * bulk_queue, because bq is stored per-CPU and must be flushed
         * from the net_device driver's NAPI func end.
         */
        if (!bq->dev_rx)
                bq->dev_rx = dev_rx;

        bq->q[bq->count++] = xdpf;

        if (!bq->flush_node.prev)
                list_add(&bq->flush_node, flush_list);

        return 0;
}

int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
                    struct net_device *dev_rx)
{
        struct net_device *dev = dst->dev;
        struct xdp_frame *xdpf;
        int err;

        if (!dev->netdev_ops->ndo_xdp_xmit)
                return -EOPNOTSUPP;

        err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
        if (unlikely(err))
                return err;

        xdpf = convert_to_xdp_frame(xdp);
        if (unlikely(!xdpf))
                return -EOVERFLOW;

        return bq_enqueue(dst, xdpf, dev_rx);
}
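
/* How the enqueue above is driven in practice (sketch of a driver RX loop;
 * everything except the core helpers bpf_prog_run_xdp(), xdp_do_redirect()
 * and xdp_do_flush_map() is an assumption for the sketch): when the XDP
 * program returns XDP_REDIRECT, xdp_do_redirect() ends up in
 * dev_map_enqueue(), and the driver must call xdp_do_flush_map() before
 * leaving napi->poll(), which runs __dev_map_flush() for this CPU.
 *
 *	u32 act = bpf_prog_run_xdp(xdp_prog, &xdp);
 *
 *	switch (act) {
 *	case XDP_REDIRECT:
 *		if (!xdp_do_redirect(rx_netdev, &xdp, xdp_prog))
 *			need_flush = true;
 *		break;
 *	default:
 *		break;
 *	}
 *
 *	... at the end of the poll loop, before returning:
 *	if (need_flush)
 *		xdp_do_flush_map();
 */
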
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
                             struct bpf_prog *xdp_prog)
{
        int err;

        err = xdp_ok_fwd_dev(dst->dev, skb->len);
        if (unlikely(err))
                return err;
        skb->dev = dst->dev;
        generic_xdp_tx(skb, xdp_prog);

        return 0;
}

static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
        struct net_device *dev = obj ? obj->dev : NULL;

        return dev ? &dev->ifindex : NULL;
}

static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
                                                                 *(u32 *)key);
        struct net_device *dev = obj ? obj->dev : NULL;

        return dev ? &dev->ifindex : NULL;
}

static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
        if (dev->dev->netdev_ops->ndo_xdp_xmit) {
                struct xdp_bulk_queue *bq;
                int cpu;

                rcu_read_lock();
                for_each_online_cpu(cpu) {
                        bq = per_cpu_ptr(dev->bulkq, cpu);
                        bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
                }
                rcu_read_unlock();
        }
}

static void __dev_map_entry_free(struct rcu_head *rcu)
{
        struct bpf_dtab_netdev *dev;

        dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
        dev_map_flush_old(dev);
        free_percpu(dev->bulkq);
        dev_put(dev->dev);
        kfree(dev);
}

static int dev_map_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *old_dev;
        int k = *(u32 *)key;

        if (k >= map->max_entries)
                return -EINVAL;

        /* Use call_rcu() here to ensure any rcu critical sections have
         * completed, but this does not guarantee a flush has happened
         * yet, because driver side rcu_read_lock/unlock only protects the
         * running XDP program. However, for pending flush operations the
         * dev and ctx are stored in another per cpu map. And additionally,
         * the driver tear down ensures all soft irqs are complete before
         * removing the net device in the case of dev_put equals zero.
         */
        old_dev = xchg(&dtab->netdev_map[k], NULL);
        if (old_dev)
                call_rcu(&old_dev->rcu, __dev_map_entry_free);
        return 0;
}

static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *old_dev;
        int k = *(u32 *)key;
        unsigned long flags;
        int ret = -ENOENT;

        spin_lock_irqsave(&dtab->index_lock, flags);

        old_dev = __dev_map_hash_lookup_elem(map, k);
        if (old_dev) {
                dtab->items--;
                hlist_del_init_rcu(&old_dev->index_hlist);
                call_rcu(&old_dev->rcu, __dev_map_entry_free);
                ret = 0;
        }
        spin_unlock_irqrestore(&dtab->index_lock, flags);

        return ret;
}

static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
                                                    struct bpf_dtab *dtab,
                                                    u32 ifindex,
                                                    unsigned int idx)
{
        gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
        struct bpf_dtab_netdev *dev;
        struct xdp_bulk_queue *bq;
        int cpu;

        dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
                                        sizeof(void *), gfp);
        if (!dev->bulkq) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
        }

        for_each_possible_cpu(cpu) {
                bq = per_cpu_ptr(dev->bulkq, cpu);
                bq->obj = dev;
        }

        dev->dev = dev_get_by_index(net, ifindex);
        if (!dev->dev) {
                free_percpu(dev->bulkq);
                kfree(dev);
                return ERR_PTR(-EINVAL);
        }

        dev->idx = idx;
        dev->dtab = dtab;

        return dev;
}

static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
                                 void *key, void *value, u64 map_flags)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *dev, *old_dev;
        u32 ifindex = *(u32 *)value;
        u32 i = *(u32 *)key;

        if (unlikely(map_flags > BPF_EXIST))
                return -EINVAL;
        if (unlikely(i >= dtab->map.max_entries))
                return -E2BIG;
        if (unlikely(map_flags == BPF_NOEXIST))
                return -EEXIST;

        if (!ifindex) {
                dev = NULL;
        } else {
                dev = __dev_map_alloc_node(net, dtab, ifindex, i);
                if (IS_ERR(dev))
                        return PTR_ERR(dev);
        }

        /* Use call_rcu() here to ensure rcu critical sections have completed,
         * remembering the driver side flush operation will happen before the
         * net device is removed.
         */
        old_dev = xchg(&dtab->netdev_map[i], dev);
        if (old_dev)
                call_rcu(&old_dev->rcu, __dev_map_entry_free);

        return 0;
}

static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
                               u64 map_flags)
{
        return __dev_map_update_elem(current->nsproxy->net_ns,
                                     map, key, value, map_flags);
}

static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
                                      void *key, void *value, u64 map_flags)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *dev, *old_dev;
        u32 ifindex = *(u32 *)value;
        u32 idx = *(u32 *)key;
        unsigned long flags;
        int err = -EEXIST;

        if (unlikely(map_flags > BPF_EXIST || !ifindex))
                return -EINVAL;

        spin_lock_irqsave(&dtab->index_lock, flags);

        old_dev = __dev_map_hash_lookup_elem(map, idx);
        if (old_dev && (map_flags & BPF_NOEXIST))
                goto out_err;

        dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
        if (IS_ERR(dev)) {
                err = PTR_ERR(dev);
                goto out_err;
        }

        if (old_dev) {
                hlist_del_rcu(&old_dev->index_hlist);
        } else {
                if (dtab->items >= dtab->map.max_entries) {
                        spin_unlock_irqrestore(&dtab->index_lock, flags);
                        call_rcu(&dev->rcu, __dev_map_entry_free);
                        return -E2BIG;
                }
                dtab->items++;
        }

        hlist_add_head_rcu(&dev->index_hlist,
                           dev_map_index_hash(dtab, idx));
        spin_unlock_irqrestore(&dtab->index_lock, flags);

        if (old_dev)
                call_rcu(&old_dev->rcu, __dev_map_entry_free);

        return 0;

out_err:
        spin_unlock_irqrestore(&dtab->index_lock, flags);
        return err;
}

static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
                                    u64 map_flags)
{
        return __dev_map_hash_update_elem(current->nsproxy->net_ns,
                                          map, key, value, map_flags);
}
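
/* Example (user-space sketch): populating the two flavours via the update
 * paths above. For BPF_MAP_TYPE_DEVMAP the key is an array index and the
 * value an ifindex; for BPF_MAP_TYPE_DEVMAP_HASH the key is typically the
 * ifindex itself, as described in the header comment. devmap_fd and
 * devmap_hash_fd are assumed fds of maps of those types, and
 * bpf_map_update_elem() is the libbpf syscall wrapper.
 *
 *	__u32 slot = 0, ifindex = 4;
 *
 *	bpf_map_update_elem(devmap_fd, &slot, &ifindex, BPF_ANY);
 *	bpf_map_update_elem(devmap_hash_fd, &ifindex, &ifindex, BPF_ANY);
 */
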
const struct bpf_map_ops dev_map_ops = {
        .map_alloc = dev_map_alloc,
        .map_free = dev_map_free,
        .map_get_next_key = dev_map_get_next_key,
        .map_lookup_elem = dev_map_lookup_elem,
        .map_update_elem = dev_map_update_elem,
        .map_delete_elem = dev_map_delete_elem,
        .map_check_btf = map_check_no_btf,
};

const struct bpf_map_ops dev_map_hash_ops = {
        .map_alloc = dev_map_alloc,
        .map_free = dev_map_free,
        .map_get_next_key = dev_map_hash_get_next_key,
        .map_lookup_elem = dev_map_hash_lookup_elem,
        .map_update_elem = dev_map_hash_update_elem,
        .map_delete_elem = dev_map_hash_delete_elem,
        .map_check_btf = map_check_no_btf,
};

static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
                                       struct net_device *netdev)
{
        unsigned long flags;
        u32 i;

        spin_lock_irqsave(&dtab->index_lock, flags);
        for (i = 0; i < dtab->n_buckets; i++) {
                struct bpf_dtab_netdev *dev;
                struct hlist_head *head;
                struct hlist_node *next;

                head = dev_map_index_hash(dtab, i);

                hlist_for_each_entry_safe(dev, next, head, index_hlist) {
                        if (netdev != dev->dev)
                                continue;

                        dtab->items--;
                        hlist_del_rcu(&dev->index_hlist);
                        call_rcu(&dev->rcu, __dev_map_entry_free);
                }
        }
        spin_unlock_irqrestore(&dtab->index_lock, flags);
}

static int dev_map_notification(struct notifier_block *notifier,
                                ulong event, void *ptr)
{
        struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
        struct bpf_dtab *dtab;
        int i;

        switch (event) {
        case NETDEV_UNREGISTER:
                /* This rcu_read_lock/unlock pair is needed because
                 * dev_map_list is an RCU list AND to ensure a delete
                 * operation does not free a netdev_map entry while we
                 * are comparing it against the netdev being unregistered.
                 */
                rcu_read_lock();
                list_for_each_entry_rcu(dtab, &dev_map_list, list) {
                        if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
                                dev_map_hash_remove_netdev(dtab, netdev);
                                continue;
                        }

                        for (i = 0; i < dtab->map.max_entries; i++) {
                                struct bpf_dtab_netdev *dev, *odev;

                                dev = READ_ONCE(dtab->netdev_map[i]);
                                if (!dev || netdev != dev->dev)
                                        continue;
                                odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
                                if (dev == odev)
                                        call_rcu(&dev->rcu,
                                                 __dev_map_entry_free);
                        }
                }
                rcu_read_unlock();
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block dev_map_notifier = {
        .notifier_call = dev_map_notification,
};

static int __init dev_map_init(void)
{
        /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
        BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
                     offsetof(struct _bpf_dtab_netdev, dev));
        register_netdevice_notifier(&dev_map_notifier);
        return 0;
}

subsys_initcall(dev_map_init);