// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allows a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock_drv.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 16

static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
		READ_ONCE(xs->umem->fq);
}

void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
	if (umem->need_wakeup & XDP_WAKEUP_RX)
		return;

	umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
	umem->need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);

void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	if (umem->need_wakeup & XDP_WAKEUP_TX)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	umem->need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);

void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
{
	if (!(umem->need_wakeup & XDP_WAKEUP_RX))
		return;

	umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	umem->need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);

void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	if (!(umem->need_wakeup & XDP_WAKEUP_TX))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	umem->need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);

bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
{
	return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
}
EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);

void xp_release(struct xdp_buff_xsk *xskb)
{
	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
}

static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
{
	u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;

	offset += xskb->pool->headroom;
	if (!xskb->pool->unaligned)
		return xskb->orig_addr + offset;
	return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
}

static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
	u64 addr;
	int err;

	addr = xp_get_handle(xskb);
	err = xskq_prod_reserve_desc(xs->rx, addr, len);
	if (err) {
		xs->rx_dropped++;
		return err;
	}

	xp_release(xskb);
	return 0;
}

static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
{
	void *from_buf, *to_buf;
	u32 metalen;

	if (unlikely(xdp_data_meta_unsupported(from))) {
		from_buf = from->data;
		to_buf = to->data;
		metalen = 0;
	} else {
		from_buf = from->data_meta;
		metalen = from->data - from->data_meta;
		to_buf = to->data - metalen;
	}

	memcpy(to_buf, from_buf, len + metalen);
}

static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
		     bool explicit_free)
{
	struct xdp_buff *xsk_xdp;
	int err;

	if (len > xsk_umem_get_rx_frame_size(xs->umem)) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_xdp = xsk_buff_alloc(xs->umem);
	if (!xsk_xdp) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_copy_xdp(xsk_xdp, xdp, len);
	err = __xsk_rcv_zc(xs, xsk_xdp, len);
	if (err) {
		xsk_buff_free(xsk_xdp);
		return err;
	}
	if (explicit_free)
		xdp_return_buff(xdp);
	return 0;
}

static bool xsk_is_bound(struct xdp_sock *xs)
{
	if (READ_ONCE(xs->state) == XSK_BOUND) {
		/* Matches smp_wmb() in bind(). */
		smp_rmb();
		return true;
	}
	return false;
}

static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
		   bool explicit_free)
{
	u32 len;

	if (!xsk_is_bound(xs))
		return -EINVAL;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	len = xdp->data_end - xdp->data;

	return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
		__xsk_rcv_zc(xs, xdp, len) :
		__xsk_rcv(xs, xdp, len, explicit_free);
}

static void xsk_flush(struct xdp_sock *xs)
{
	xskq_prod_submit(xs->rx);
	__xskq_cons_release(xs->umem->fq);
	sock_def_readable(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	spin_lock_bh(&xs->rx_lock);
	err = xsk_rcv(xs, xdp, false);
	xsk_flush(xs);
	spin_unlock_bh(&xs->rx_lock);
	return err;
}

int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	int err;

	err = xsk_rcv(xs, xdp, true);
	if (err)
		return err;

	if (!xs->flush_node.prev)
		list_add(&xs->flush_node, flush_list);

	return 0;
}

void __xsk_map_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	struct xdp_sock *xs, *tmp;

	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
		xsk_flush(xs);
		__list_del_clearprev(&xs->flush_node);
	}
}

void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
	xskq_prod_submit_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
		__xskq_cons_release(xs->tx);
		xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);

bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
		if (!xskq_cons_peek_desc(xs->tx, desc, umem))
			continue;

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (xskq_prod_reserve_addr(umem->cq, desc->addr))
			goto out;

		xskq_cons_release(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);

static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
	struct net_device *dev = xs->dev;
	int err;

	rcu_read_lock();
	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
	rcu_read_unlock();

	return err;
}

static int xsk_zc_xmit(struct xdp_sock *xs)
{
	return xsk_wakeup(xs, XDP_WAKEUP_TX);
}

static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->tx_completion_lock, flags);
	xskq_prod_submit_addr(xs->umem->cq, addr);
	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

	sock_wfree(skb);
}

static int xsk_generic_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	u32 max_batch = TX_BATCH_SIZE;
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	mutex_lock(&xs->mutex);

	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) {
		char *buffer;
		u64 addr;
		u32 len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		len = desc.len;
		skb = sock_alloc_send_skb(sk, len, 1, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		addr = desc.addr;
		buffer = xsk_buff_raw_get_data(xs->umem, addr);
		err = skb_store_bits(skb, 0, buffer, len);
		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		xskq_cons_release(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int __xsk_sendmsg(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;

	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	return __xsk_sendmsg(sk);
}

static __poll_t xsk_poll(struct file *file, struct socket *sock,
			 struct poll_table_struct *wait)
{
	__poll_t mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xdp_umem *umem;

	if (unlikely(!xsk_is_bound(xs)))
		return mask;

	umem = xs->umem;

	if (umem->need_wakeup) {
		if (xs->zc)
			xsk_wakeup(xs, umem->need_wakeup);
		else
			/* Poll needs to drive Tx also in copy mode */
			__xsk_sendmsg(sk);
	}

	if (xs->rx && !xskq_prod_is_empty(xs->rx))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (xs->tx && !xskq_cons_is_full(xs->tx))
		mask |= EPOLLOUT | EPOLLWRNORM;

	return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	WRITE_ONCE(*queue, q);
	return 0;
}

static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (xs->state != XSK_BOUND)
		return;
	WRITE_ONCE(xs->state, XSK_UNBOUND);

	/* Wait for driver to stop using the xdp socket. */
	xdp_del_sk_umem(xs->umem, xs);
	xs->dev = NULL;
	synchronize_net();
	dev_put(dev);
}

static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
					      struct xdp_sock ***map_entry)
{
	struct xsk_map *map = NULL;
	struct xsk_map_node *node;

	*map_entry = NULL;

	spin_lock_bh(&xs->map_list_lock);
	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
					node);
	if (node) {
		WARN_ON(xsk_map_inc(node->map));
		map = node->map;
		*map_entry = node->map_entry;
	}
	spin_unlock_bh(&xs->map_list_lock);
	return map;
}

static void xsk_delete_from_maps(struct xdp_sock *xs)
{
	/* This function removes the current XDP socket from all the
	 * maps it resides in. We need to take extra care here, due to
	 * the two locks involved. Each map has a lock synchronizing
	 * updates to the entries, and each socket has a lock that
	 * synchronizes access to the list of maps (map_list). For
	 * deadlock avoidance the locks need to be taken in the order
	 * "map lock"->"socket map list lock". We start off by
	 * accessing the socket map list, and take a reference to the
	 * map to guarantee existence between the
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
	 * calls. Then we ask the map to remove the socket, which
	 * tries to remove the socket from the map. Note that there
	 * might be updates to the map between
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
	 */
	struct xdp_sock **map_entry = NULL;
	struct xsk_map *map;

	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
		xsk_map_try_sock_delete(map, xs, map_entry);
		xsk_map_put(map);
	}
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	xsk_delete_from_maps(xs);
	mutex_lock(&xs->mutex);
	xsk_unbind_dev(xs);
	mutex_unlock(&xs->mutex);

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
		      XDP_USE_NEED_WAKEUP))
		return -EINVAL;

	rtnl_lock();
	mutex_lock(&xs->mutex);
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
		    (flags & XDP_USE_NEED_WAKEUP)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We have already our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!xsk_is_bound(umem_xs)) {
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		}
		if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		WRITE_ONCE(xs->umem, umem_xs->umem);
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
		if (err)
			goto out_unlock;
	}

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xdp_add_sk_umem(xs->umem, xs);

out_unlock:
	if (err) {
		dev_put(dev);
	} else {
		/* Matches smp_rmb() in bind() for shared umem
		 * sockets, and xsk_is_bound().
		 */
		smp_wmb();
		WRITE_ONCE(xs->state, XSK_BOUND);
	}
out_release:
	mutex_unlock(&xs->mutex);
	rtnl_unlock();
	return err;
}

struct xdp_umem_reg_v1 {
	__u64 addr; /* Start of packet data area */
	__u64 len; /* Length of packet data area */
	__u32 chunk_size;
	__u32 headroom;
};

static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		if (!err && optname == XDP_TX_RING)
			/* Tx needs to be explicitly woken up the first time */
			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		size_t mr_size = sizeof(struct xdp_umem_reg);
		struct xdp_umem_reg mr = {};
		struct xdp_umem *umem;

		if (optlen < sizeof(struct xdp_umem_reg_v1))
			return -EINVAL;
		else if (optlen < sizeof(mr))
			mr_size = sizeof(struct xdp_umem_reg_v1);

		if (copy_from_user(&mr, optval, mr_size))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY || xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		WRITE_ONCE(xs->umem, umem);
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		if (!xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EINVAL;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		if (optname == XDP_UMEM_FILL_RING)
			xp_set_fq(xs->umem->pool, *q);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}

static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_umem_ring, desc);
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;
		struct xdp_mmap_offsets_v1 off_v1;
		bool flags_supported = true;
		void *to_copy;

		if (len < sizeof(off_v1))
			return -EINVAL;
		else if (len < sizeof(off))
			flags_supported = false;

		if (flags_supported) {
			/* xdp_ring_offset is identical to xdp_ring_offset_v1
			 * except for the flags field added to the end.
			 */
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.rx);
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.tx);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.fr);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.cr);
			off.rx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.tx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.fr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);
			off.cr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);

			len = sizeof(off);
			to_copy = &off;
		} else {
			xsk_enter_rxtx_offsets(&off_v1.rx);
			xsk_enter_rxtx_offsets(&off_v1.tx);
			xsk_enter_umem_offsets(&off_v1.fr);
			xsk_enter_umem_offsets(&off_v1.cr);

			len = sizeof(off_v1);
			to_copy = &off_v1;
		}

		if (copy_to_user(optval, to_copy, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	struct xdp_umem *umem;
	unsigned long pfn;
	struct page *qpg;

	if (READ_ONCE(xs->state) != XSK_READY)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		umem = READ_ONCE(xs->umem);
		if (!umem)
			return -EINVAL;

		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(umem->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(umem->cq);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > page_size(qpg))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk->sk_error_report(sk);

				xsk_unbind_dev(xs);

				/* Clear device references in umem. */
				xdp_umem_clear_dev(xs->umem);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	xs->state = XSK_READY;
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->rx_lock);
	spin_lock_init(&xs->tx_completion_lock);

	INIT_LIST_HEAD(&xs->map_list);
	spin_lock_init(&xs->map_list_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
	.notifier_call	= xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
	mutex_init(&net->xdp.lock);
	INIT_HLIST_HEAD(&net->xdp.list);
	return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
	int err, cpu;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
	return 0;

out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);