1 // SPDX-License-Identifier: GPL-2.0 2 /* XDP sockets 3 * 4 * AF_XDP sockets allows a channel between XDP programs and userspace 5 * applications. 6 * Copyright(c) 2018 Intel Corporation. 7 * 8 * Author(s): Björn Töpel <bjorn.topel@intel.com> 9 * Magnus Karlsson <magnus.karlsson@intel.com> 10 */ 11 12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ 13 14 #include <linux/if_xdp.h> 15 #include <linux/init.h> 16 #include <linux/sched/mm.h> 17 #include <linux/sched/signal.h> 18 #include <linux/sched/task.h> 19 #include <linux/socket.h> 20 #include <linux/file.h> 21 #include <linux/uaccess.h> 22 #include <linux/net.h> 23 #include <linux/netdevice.h> 24 #include <linux/rculist.h> 25 #include <net/xdp_sock.h> 26 #include <net/xdp.h> 27 28 #include "xsk_queue.h" 29 #include "xdp_umem.h" 30 #include "xsk.h" 31 32 #define TX_BATCH_SIZE 16 33 34 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) 35 { 36 return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && 37 READ_ONCE(xs->umem->fq); 38 } 39 40 bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) 41 { 42 return xskq_has_addrs(umem->fq, cnt); 43 } 44 EXPORT_SYMBOL(xsk_umem_has_addrs); 45 46 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) 47 { 48 return xskq_peek_addr(umem->fq, addr, umem); 49 } 50 EXPORT_SYMBOL(xsk_umem_peek_addr); 51 52 void xsk_umem_discard_addr(struct xdp_umem *umem) 53 { 54 xskq_discard_addr(umem->fq); 55 } 56 EXPORT_SYMBOL(xsk_umem_discard_addr); 57 58 void xsk_set_rx_need_wakeup(struct xdp_umem *umem) 59 { 60 if (umem->need_wakeup & XDP_WAKEUP_RX) 61 return; 62 63 umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP; 64 umem->need_wakeup |= XDP_WAKEUP_RX; 65 } 66 EXPORT_SYMBOL(xsk_set_rx_need_wakeup); 67 68 void xsk_set_tx_need_wakeup(struct xdp_umem *umem) 69 { 70 struct xdp_sock *xs; 71 72 if (umem->need_wakeup & XDP_WAKEUP_TX) 73 return; 74 75 rcu_read_lock(); 76 list_for_each_entry_rcu(xs, &umem->xsk_list, list) { 77 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; 78 } 79 rcu_read_unlock(); 80 81 umem->need_wakeup |= XDP_WAKEUP_TX; 82 } 83 EXPORT_SYMBOL(xsk_set_tx_need_wakeup); 84 85 void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) 86 { 87 if (!(umem->need_wakeup & XDP_WAKEUP_RX)) 88 return; 89 90 umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; 91 umem->need_wakeup &= ~XDP_WAKEUP_RX; 92 } 93 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup); 94 95 void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) 96 { 97 struct xdp_sock *xs; 98 99 if (!(umem->need_wakeup & XDP_WAKEUP_TX)) 100 return; 101 102 rcu_read_lock(); 103 list_for_each_entry_rcu(xs, &umem->xsk_list, list) { 104 xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; 105 } 106 rcu_read_unlock(); 107 108 umem->need_wakeup &= ~XDP_WAKEUP_TX; 109 } 110 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup); 111 112 bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) 113 { 114 return umem->flags & XDP_UMEM_USES_NEED_WAKEUP; 115 } 116 EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); 117 118 /* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for 119 * each page. This is only required in copy mode. 120 */ 121 static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, 122 u32 len, u32 metalen) 123 { 124 void *to_buf = xdp_umem_get_data(umem, addr); 125 126 addr = xsk_umem_add_offset_to_addr(addr); 127 if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) { 128 void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; 129 u64 page_start = addr & ~(PAGE_SIZE - 1); 130 u64 first_len = PAGE_SIZE - (addr - page_start); 131 132 memcpy(to_buf, from_buf, first_len + metalen); 133 memcpy(next_pg_addr, from_buf + first_len, len - first_len); 134 135 return; 136 } 137 138 memcpy(to_buf, from_buf, len + metalen); 139 } 140 141 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 142 { 143 u64 offset = xs->umem->headroom; 144 u64 addr, memcpy_addr; 145 void *from_buf; 146 u32 metalen; 147 int err; 148 149 if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || 150 len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { 151 xs->rx_dropped++; 152 return -ENOSPC; 153 } 154 155 if (unlikely(xdp_data_meta_unsupported(xdp))) { 156 from_buf = xdp->data; 157 metalen = 0; 158 } else { 159 from_buf = xdp->data_meta; 160 metalen = xdp->data - xdp->data_meta; 161 } 162 163 memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset); 164 __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen); 165 166 offset += metalen; 167 addr = xsk_umem_adjust_offset(xs->umem, addr, offset); 168 err = xskq_produce_batch_desc(xs->rx, addr, len); 169 if (!err) { 170 xskq_discard_addr(xs->umem->fq); 171 xdp_return_buff(xdp); 172 return 0; 173 } 174 175 xs->rx_dropped++; 176 return err; 177 } 178 179 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 180 { 181 int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); 182 183 if (err) 184 xs->rx_dropped++; 185 186 return err; 187 } 188 189 static bool xsk_is_bound(struct xdp_sock *xs) 190 { 191 if (READ_ONCE(xs->state) == XSK_BOUND) { 192 /* Matches smp_wmb() in bind(). */ 193 smp_rmb(); 194 return true; 195 } 196 return false; 197 } 198 199 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) 200 { 201 u32 len; 202 203 if (!xsk_is_bound(xs)) 204 return -EINVAL; 205 206 if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) 207 return -EINVAL; 208 209 len = xdp->data_end - xdp->data; 210 211 return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? 212 __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); 213 } 214 215 static void xsk_flush(struct xdp_sock *xs) 216 { 217 xskq_produce_flush_desc(xs->rx); 218 xs->sk.sk_data_ready(&xs->sk); 219 } 220 221 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) 222 { 223 u32 metalen = xdp->data - xdp->data_meta; 224 u32 len = xdp->data_end - xdp->data; 225 u64 offset = xs->umem->headroom; 226 void *buffer; 227 u64 addr; 228 int err; 229 230 spin_lock_bh(&xs->rx_lock); 231 232 if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) { 233 err = -EINVAL; 234 goto out_unlock; 235 } 236 237 if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || 238 len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { 239 err = -ENOSPC; 240 goto out_drop; 241 } 242 243 addr = xsk_umem_adjust_offset(xs->umem, addr, offset); 244 buffer = xdp_umem_get_data(xs->umem, addr); 245 memcpy(buffer, xdp->data_meta, len + metalen); 246 247 addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); 248 err = xskq_produce_batch_desc(xs->rx, addr, len); 249 if (err) 250 goto out_drop; 251 252 xskq_discard_addr(xs->umem->fq); 253 xskq_produce_flush_desc(xs->rx); 254 255 spin_unlock_bh(&xs->rx_lock); 256 257 xs->sk.sk_data_ready(&xs->sk); 258 return 0; 259 260 out_drop: 261 xs->rx_dropped++; 262 out_unlock: 263 spin_unlock_bh(&xs->rx_lock); 264 return err; 265 } 266 267 int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, 268 struct xdp_sock *xs) 269 { 270 struct xsk_map *m = container_of(map, struct xsk_map, map); 271 struct list_head *flush_list = this_cpu_ptr(m->flush_list); 272 int err; 273 274 err = xsk_rcv(xs, xdp); 275 if (err) 276 return err; 277 278 if (!xs->flush_node.prev) 279 list_add(&xs->flush_node, flush_list); 280 281 return 0; 282 } 283 284 void __xsk_map_flush(struct bpf_map *map) 285 { 286 struct xsk_map *m = container_of(map, struct xsk_map, map); 287 struct list_head *flush_list = this_cpu_ptr(m->flush_list); 288 struct xdp_sock *xs, *tmp; 289 290 list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { 291 xsk_flush(xs); 292 __list_del_clearprev(&xs->flush_node); 293 } 294 } 295 296 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) 297 { 298 xskq_produce_flush_addr_n(umem->cq, nb_entries); 299 } 300 EXPORT_SYMBOL(xsk_umem_complete_tx); 301 302 void xsk_umem_consume_tx_done(struct xdp_umem *umem) 303 { 304 struct xdp_sock *xs; 305 306 rcu_read_lock(); 307 list_for_each_entry_rcu(xs, &umem->xsk_list, list) { 308 xs->sk.sk_write_space(&xs->sk); 309 } 310 rcu_read_unlock(); 311 } 312 EXPORT_SYMBOL(xsk_umem_consume_tx_done); 313 314 bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) 315 { 316 struct xdp_sock *xs; 317 318 rcu_read_lock(); 319 list_for_each_entry_rcu(xs, &umem->xsk_list, list) { 320 if (!xskq_peek_desc(xs->tx, desc, umem)) 321 continue; 322 323 if (xskq_produce_addr_lazy(umem->cq, desc->addr)) 324 goto out; 325 326 xskq_discard_desc(xs->tx); 327 rcu_read_unlock(); 328 return true; 329 } 330 331 out: 332 rcu_read_unlock(); 333 return false; 334 } 335 EXPORT_SYMBOL(xsk_umem_consume_tx); 336 337 static int xsk_zc_xmit(struct xdp_sock *xs) 338 { 339 struct net_device *dev = xs->dev; 340 341 return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, 342 XDP_WAKEUP_TX); 343 } 344 345 static void xsk_destruct_skb(struct sk_buff *skb) 346 { 347 u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; 348 struct xdp_sock *xs = xdp_sk(skb->sk); 349 unsigned long flags; 350 351 spin_lock_irqsave(&xs->tx_completion_lock, flags); 352 WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); 353 spin_unlock_irqrestore(&xs->tx_completion_lock, flags); 354 355 sock_wfree(skb); 356 } 357 358 static int xsk_generic_xmit(struct sock *sk) 359 { 360 struct xdp_sock *xs = xdp_sk(sk); 361 u32 max_batch = TX_BATCH_SIZE; 362 bool sent_frame = false; 363 struct xdp_desc desc; 364 struct sk_buff *skb; 365 int err = 0; 366 367 mutex_lock(&xs->mutex); 368 369 if (xs->queue_id >= xs->dev->real_num_tx_queues) 370 goto out; 371 372 while (xskq_peek_desc(xs->tx, &desc, xs->umem)) { 373 char *buffer; 374 u64 addr; 375 u32 len; 376 377 if (max_batch-- == 0) { 378 err = -EAGAIN; 379 goto out; 380 } 381 382 len = desc.len; 383 skb = sock_alloc_send_skb(sk, len, 1, &err); 384 if (unlikely(!skb)) { 385 err = -EAGAIN; 386 goto out; 387 } 388 389 skb_put(skb, len); 390 addr = desc.addr; 391 buffer = xdp_umem_get_data(xs->umem, addr); 392 err = skb_store_bits(skb, 0, buffer, len); 393 if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) { 394 kfree_skb(skb); 395 goto out; 396 } 397 398 skb->dev = xs->dev; 399 skb->priority = sk->sk_priority; 400 skb->mark = sk->sk_mark; 401 skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr; 402 skb->destructor = xsk_destruct_skb; 403 404 err = dev_direct_xmit(skb, xs->queue_id); 405 xskq_discard_desc(xs->tx); 406 /* Ignore NET_XMIT_CN as packet might have been sent */ 407 if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { 408 /* SKB completed but not sent */ 409 err = -EBUSY; 410 goto out; 411 } 412 413 sent_frame = true; 414 } 415 416 out: 417 if (sent_frame) 418 sk->sk_write_space(sk); 419 420 mutex_unlock(&xs->mutex); 421 return err; 422 } 423 424 static int __xsk_sendmsg(struct sock *sk) 425 { 426 struct xdp_sock *xs = xdp_sk(sk); 427 428 if (unlikely(!(xs->dev->flags & IFF_UP))) 429 return -ENETDOWN; 430 if (unlikely(!xs->tx)) 431 return -ENOBUFS; 432 433 return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk); 434 } 435 436 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) 437 { 438 bool need_wait = !(m->msg_flags & MSG_DONTWAIT); 439 struct sock *sk = sock->sk; 440 struct xdp_sock *xs = xdp_sk(sk); 441 442 if (unlikely(!xsk_is_bound(xs))) 443 return -ENXIO; 444 if (unlikely(need_wait)) 445 return -EOPNOTSUPP; 446 447 return __xsk_sendmsg(sk); 448 } 449 450 static __poll_t xsk_poll(struct file *file, struct socket *sock, 451 struct poll_table_struct *wait) 452 { 453 __poll_t mask = datagram_poll(file, sock, wait); 454 struct sock *sk = sock->sk; 455 struct xdp_sock *xs = xdp_sk(sk); 456 struct net_device *dev; 457 struct xdp_umem *umem; 458 459 if (unlikely(!xsk_is_bound(xs))) 460 return mask; 461 462 dev = xs->dev; 463 umem = xs->umem; 464 465 if (umem->need_wakeup) { 466 if (dev->netdev_ops->ndo_xsk_wakeup) 467 dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, 468 umem->need_wakeup); 469 else 470 /* Poll needs to drive Tx also in copy mode */ 471 __xsk_sendmsg(sk); 472 } 473 474 if (xs->rx && !xskq_empty_desc(xs->rx)) 475 mask |= EPOLLIN | EPOLLRDNORM; 476 if (xs->tx && !xskq_full_desc(xs->tx)) 477 mask |= EPOLLOUT | EPOLLWRNORM; 478 479 return mask; 480 } 481 482 static int xsk_init_queue(u32 entries, struct xsk_queue **queue, 483 bool umem_queue) 484 { 485 struct xsk_queue *q; 486 487 if (entries == 0 || *queue || !is_power_of_2(entries)) 488 return -EINVAL; 489 490 q = xskq_create(entries, umem_queue); 491 if (!q) 492 return -ENOMEM; 493 494 /* Make sure queue is ready before it can be seen by others */ 495 smp_wmb(); 496 WRITE_ONCE(*queue, q); 497 return 0; 498 } 499 500 static void xsk_unbind_dev(struct xdp_sock *xs) 501 { 502 struct net_device *dev = xs->dev; 503 504 if (xs->state != XSK_BOUND) 505 return; 506 WRITE_ONCE(xs->state, XSK_UNBOUND); 507 508 /* Wait for driver to stop using the xdp socket. */ 509 xdp_del_sk_umem(xs->umem, xs); 510 xs->dev = NULL; 511 synchronize_net(); 512 dev_put(dev); 513 } 514 515 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, 516 struct xdp_sock ***map_entry) 517 { 518 struct xsk_map *map = NULL; 519 struct xsk_map_node *node; 520 521 *map_entry = NULL; 522 523 spin_lock_bh(&xs->map_list_lock); 524 node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, 525 node); 526 if (node) { 527 WARN_ON(xsk_map_inc(node->map)); 528 map = node->map; 529 *map_entry = node->map_entry; 530 } 531 spin_unlock_bh(&xs->map_list_lock); 532 return map; 533 } 534 535 static void xsk_delete_from_maps(struct xdp_sock *xs) 536 { 537 /* This function removes the current XDP socket from all the 538 * maps it resides in. We need to take extra care here, due to 539 * the two locks involved. Each map has a lock synchronizing 540 * updates to the entries, and each socket has a lock that 541 * synchronizes access to the list of maps (map_list). For 542 * deadlock avoidance the locks need to be taken in the order 543 * "map lock"->"socket map list lock". We start off by 544 * accessing the socket map list, and take a reference to the 545 * map to guarantee existence between the 546 * xsk_get_map_list_entry() and xsk_map_try_sock_delete() 547 * calls. Then we ask the map to remove the socket, which 548 * tries to remove the socket from the map. Note that there 549 * might be updates to the map between 550 * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). 551 */ 552 struct xdp_sock **map_entry = NULL; 553 struct xsk_map *map; 554 555 while ((map = xsk_get_map_list_entry(xs, &map_entry))) { 556 xsk_map_try_sock_delete(map, xs, map_entry); 557 xsk_map_put(map); 558 } 559 } 560 561 static int xsk_release(struct socket *sock) 562 { 563 struct sock *sk = sock->sk; 564 struct xdp_sock *xs = xdp_sk(sk); 565 struct net *net; 566 567 if (!sk) 568 return 0; 569 570 net = sock_net(sk); 571 572 mutex_lock(&net->xdp.lock); 573 sk_del_node_init_rcu(sk); 574 mutex_unlock(&net->xdp.lock); 575 576 local_bh_disable(); 577 sock_prot_inuse_add(net, sk->sk_prot, -1); 578 local_bh_enable(); 579 580 xsk_delete_from_maps(xs); 581 mutex_lock(&xs->mutex); 582 xsk_unbind_dev(xs); 583 mutex_unlock(&xs->mutex); 584 585 xskq_destroy(xs->rx); 586 xskq_destroy(xs->tx); 587 588 sock_orphan(sk); 589 sock->sk = NULL; 590 591 sk_refcnt_debug_release(sk); 592 sock_put(sk); 593 594 return 0; 595 } 596 597 static struct socket *xsk_lookup_xsk_from_fd(int fd) 598 { 599 struct socket *sock; 600 int err; 601 602 sock = sockfd_lookup(fd, &err); 603 if (!sock) 604 return ERR_PTR(-ENOTSOCK); 605 606 if (sock->sk->sk_family != PF_XDP) { 607 sockfd_put(sock); 608 return ERR_PTR(-ENOPROTOOPT); 609 } 610 611 return sock; 612 } 613 614 /* Check if umem pages are contiguous. 615 * If zero-copy mode, use the DMA address to do the page contiguity check 616 * For all other modes we use addr (kernel virtual address) 617 * Store the result in the low bits of addr. 618 */ 619 static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags) 620 { 621 struct xdp_umem_page *pgs = umem->pages; 622 int i, is_contig; 623 624 for (i = 0; i < umem->npgs - 1; i++) { 625 is_contig = (flags & XDP_ZEROCOPY) ? 626 (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) : 627 (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr); 628 pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT; 629 } 630 } 631 632 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) 633 { 634 struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; 635 struct sock *sk = sock->sk; 636 struct xdp_sock *xs = xdp_sk(sk); 637 struct net_device *dev; 638 u32 flags, qid; 639 int err = 0; 640 641 if (addr_len < sizeof(struct sockaddr_xdp)) 642 return -EINVAL; 643 if (sxdp->sxdp_family != AF_XDP) 644 return -EINVAL; 645 646 flags = sxdp->sxdp_flags; 647 if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | 648 XDP_USE_NEED_WAKEUP)) 649 return -EINVAL; 650 651 rtnl_lock(); 652 mutex_lock(&xs->mutex); 653 if (xs->state != XSK_READY) { 654 err = -EBUSY; 655 goto out_release; 656 } 657 658 dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); 659 if (!dev) { 660 err = -ENODEV; 661 goto out_release; 662 } 663 664 if (!xs->rx && !xs->tx) { 665 err = -EINVAL; 666 goto out_unlock; 667 } 668 669 qid = sxdp->sxdp_queue_id; 670 671 if (flags & XDP_SHARED_UMEM) { 672 struct xdp_sock *umem_xs; 673 struct socket *sock; 674 675 if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || 676 (flags & XDP_USE_NEED_WAKEUP)) { 677 /* Cannot specify flags for shared sockets. */ 678 err = -EINVAL; 679 goto out_unlock; 680 } 681 682 if (xs->umem) { 683 /* We have already our own. */ 684 err = -EINVAL; 685 goto out_unlock; 686 } 687 688 sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); 689 if (IS_ERR(sock)) { 690 err = PTR_ERR(sock); 691 goto out_unlock; 692 } 693 694 umem_xs = xdp_sk(sock->sk); 695 if (!xsk_is_bound(umem_xs)) { 696 err = -EBADF; 697 sockfd_put(sock); 698 goto out_unlock; 699 } 700 if (umem_xs->dev != dev || umem_xs->queue_id != qid) { 701 err = -EINVAL; 702 sockfd_put(sock); 703 goto out_unlock; 704 } 705 706 xdp_get_umem(umem_xs->umem); 707 WRITE_ONCE(xs->umem, umem_xs->umem); 708 sockfd_put(sock); 709 } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { 710 err = -EINVAL; 711 goto out_unlock; 712 } else { 713 /* This xsk has its own umem. */ 714 xskq_set_umem(xs->umem->fq, xs->umem->size, 715 xs->umem->chunk_mask); 716 xskq_set_umem(xs->umem->cq, xs->umem->size, 717 xs->umem->chunk_mask); 718 719 err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); 720 if (err) 721 goto out_unlock; 722 723 xsk_check_page_contiguity(xs->umem, flags); 724 } 725 726 xs->dev = dev; 727 xs->zc = xs->umem->zc; 728 xs->queue_id = qid; 729 xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask); 730 xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask); 731 xdp_add_sk_umem(xs->umem, xs); 732 733 out_unlock: 734 if (err) { 735 dev_put(dev); 736 } else { 737 /* Matches smp_rmb() in bind() for shared umem 738 * sockets, and xsk_is_bound(). 739 */ 740 smp_wmb(); 741 WRITE_ONCE(xs->state, XSK_BOUND); 742 } 743 out_release: 744 mutex_unlock(&xs->mutex); 745 rtnl_unlock(); 746 return err; 747 } 748 749 struct xdp_umem_reg_v1 { 750 __u64 addr; /* Start of packet data area */ 751 __u64 len; /* Length of packet data area */ 752 __u32 chunk_size; 753 __u32 headroom; 754 }; 755 756 static int xsk_setsockopt(struct socket *sock, int level, int optname, 757 char __user *optval, unsigned int optlen) 758 { 759 struct sock *sk = sock->sk; 760 struct xdp_sock *xs = xdp_sk(sk); 761 int err; 762 763 if (level != SOL_XDP) 764 return -ENOPROTOOPT; 765 766 switch (optname) { 767 case XDP_RX_RING: 768 case XDP_TX_RING: 769 { 770 struct xsk_queue **q; 771 int entries; 772 773 if (optlen < sizeof(entries)) 774 return -EINVAL; 775 if (copy_from_user(&entries, optval, sizeof(entries))) 776 return -EFAULT; 777 778 mutex_lock(&xs->mutex); 779 if (xs->state != XSK_READY) { 780 mutex_unlock(&xs->mutex); 781 return -EBUSY; 782 } 783 q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; 784 err = xsk_init_queue(entries, q, false); 785 if (!err && optname == XDP_TX_RING) 786 /* Tx needs to be explicitly woken up the first time */ 787 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; 788 mutex_unlock(&xs->mutex); 789 return err; 790 } 791 case XDP_UMEM_REG: 792 { 793 size_t mr_size = sizeof(struct xdp_umem_reg); 794 struct xdp_umem_reg mr = {}; 795 struct xdp_umem *umem; 796 797 if (optlen < sizeof(struct xdp_umem_reg_v1)) 798 return -EINVAL; 799 else if (optlen < sizeof(mr)) 800 mr_size = sizeof(struct xdp_umem_reg_v1); 801 802 if (copy_from_user(&mr, optval, mr_size)) 803 return -EFAULT; 804 805 mutex_lock(&xs->mutex); 806 if (xs->state != XSK_READY || xs->umem) { 807 mutex_unlock(&xs->mutex); 808 return -EBUSY; 809 } 810 811 umem = xdp_umem_create(&mr); 812 if (IS_ERR(umem)) { 813 mutex_unlock(&xs->mutex); 814 return PTR_ERR(umem); 815 } 816 817 /* Make sure umem is ready before it can be seen by others */ 818 smp_wmb(); 819 WRITE_ONCE(xs->umem, umem); 820 mutex_unlock(&xs->mutex); 821 return 0; 822 } 823 case XDP_UMEM_FILL_RING: 824 case XDP_UMEM_COMPLETION_RING: 825 { 826 struct xsk_queue **q; 827 int entries; 828 829 if (copy_from_user(&entries, optval, sizeof(entries))) 830 return -EFAULT; 831 832 mutex_lock(&xs->mutex); 833 if (xs->state != XSK_READY) { 834 mutex_unlock(&xs->mutex); 835 return -EBUSY; 836 } 837 if (!xs->umem) { 838 mutex_unlock(&xs->mutex); 839 return -EINVAL; 840 } 841 842 q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : 843 &xs->umem->cq; 844 err = xsk_init_queue(entries, q, true); 845 mutex_unlock(&xs->mutex); 846 return err; 847 } 848 default: 849 break; 850 } 851 852 return -ENOPROTOOPT; 853 } 854 855 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring) 856 { 857 ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); 858 ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); 859 ring->desc = offsetof(struct xdp_rxtx_ring, desc); 860 } 861 862 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) 863 { 864 ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer); 865 ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); 866 ring->desc = offsetof(struct xdp_umem_ring, desc); 867 } 868 869 static int xsk_getsockopt(struct socket *sock, int level, int optname, 870 char __user *optval, int __user *optlen) 871 { 872 struct sock *sk = sock->sk; 873 struct xdp_sock *xs = xdp_sk(sk); 874 int len; 875 876 if (level != SOL_XDP) 877 return -ENOPROTOOPT; 878 879 if (get_user(len, optlen)) 880 return -EFAULT; 881 if (len < 0) 882 return -EINVAL; 883 884 switch (optname) { 885 case XDP_STATISTICS: 886 { 887 struct xdp_statistics stats; 888 889 if (len < sizeof(stats)) 890 return -EINVAL; 891 892 mutex_lock(&xs->mutex); 893 stats.rx_dropped = xs->rx_dropped; 894 stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); 895 stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); 896 mutex_unlock(&xs->mutex); 897 898 if (copy_to_user(optval, &stats, sizeof(stats))) 899 return -EFAULT; 900 if (put_user(sizeof(stats), optlen)) 901 return -EFAULT; 902 903 return 0; 904 } 905 case XDP_MMAP_OFFSETS: 906 { 907 struct xdp_mmap_offsets off; 908 struct xdp_mmap_offsets_v1 off_v1; 909 bool flags_supported = true; 910 void *to_copy; 911 912 if (len < sizeof(off_v1)) 913 return -EINVAL; 914 else if (len < sizeof(off)) 915 flags_supported = false; 916 917 if (flags_supported) { 918 /* xdp_ring_offset is identical to xdp_ring_offset_v1 919 * except for the flags field added to the end. 920 */ 921 xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) 922 &off.rx); 923 xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) 924 &off.tx); 925 xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) 926 &off.fr); 927 xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) 928 &off.cr); 929 off.rx.flags = offsetof(struct xdp_rxtx_ring, 930 ptrs.flags); 931 off.tx.flags = offsetof(struct xdp_rxtx_ring, 932 ptrs.flags); 933 off.fr.flags = offsetof(struct xdp_umem_ring, 934 ptrs.flags); 935 off.cr.flags = offsetof(struct xdp_umem_ring, 936 ptrs.flags); 937 938 len = sizeof(off); 939 to_copy = &off; 940 } else { 941 xsk_enter_rxtx_offsets(&off_v1.rx); 942 xsk_enter_rxtx_offsets(&off_v1.tx); 943 xsk_enter_umem_offsets(&off_v1.fr); 944 xsk_enter_umem_offsets(&off_v1.cr); 945 946 len = sizeof(off_v1); 947 to_copy = &off_v1; 948 } 949 950 if (copy_to_user(optval, to_copy, len)) 951 return -EFAULT; 952 if (put_user(len, optlen)) 953 return -EFAULT; 954 955 return 0; 956 } 957 case XDP_OPTIONS: 958 { 959 struct xdp_options opts = {}; 960 961 if (len < sizeof(opts)) 962 return -EINVAL; 963 964 mutex_lock(&xs->mutex); 965 if (xs->zc) 966 opts.flags |= XDP_OPTIONS_ZEROCOPY; 967 mutex_unlock(&xs->mutex); 968 969 len = sizeof(opts); 970 if (copy_to_user(optval, &opts, len)) 971 return -EFAULT; 972 if (put_user(len, optlen)) 973 return -EFAULT; 974 975 return 0; 976 } 977 default: 978 break; 979 } 980 981 return -EOPNOTSUPP; 982 } 983 984 static int xsk_mmap(struct file *file, struct socket *sock, 985 struct vm_area_struct *vma) 986 { 987 loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; 988 unsigned long size = vma->vm_end - vma->vm_start; 989 struct xdp_sock *xs = xdp_sk(sock->sk); 990 struct xsk_queue *q = NULL; 991 struct xdp_umem *umem; 992 unsigned long pfn; 993 struct page *qpg; 994 995 if (READ_ONCE(xs->state) != XSK_READY) 996 return -EBUSY; 997 998 if (offset == XDP_PGOFF_RX_RING) { 999 q = READ_ONCE(xs->rx); 1000 } else if (offset == XDP_PGOFF_TX_RING) { 1001 q = READ_ONCE(xs->tx); 1002 } else { 1003 umem = READ_ONCE(xs->umem); 1004 if (!umem) 1005 return -EINVAL; 1006 1007 /* Matches the smp_wmb() in XDP_UMEM_REG */ 1008 smp_rmb(); 1009 if (offset == XDP_UMEM_PGOFF_FILL_RING) 1010 q = READ_ONCE(umem->fq); 1011 else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) 1012 q = READ_ONCE(umem->cq); 1013 } 1014 1015 if (!q) 1016 return -EINVAL; 1017 1018 /* Matches the smp_wmb() in xsk_init_queue */ 1019 smp_rmb(); 1020 qpg = virt_to_head_page(q->ring); 1021 if (size > page_size(qpg)) 1022 return -EINVAL; 1023 1024 pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; 1025 return remap_pfn_range(vma, vma->vm_start, pfn, 1026 size, vma->vm_page_prot); 1027 } 1028 1029 static int xsk_notifier(struct notifier_block *this, 1030 unsigned long msg, void *ptr) 1031 { 1032 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 1033 struct net *net = dev_net(dev); 1034 struct sock *sk; 1035 1036 switch (msg) { 1037 case NETDEV_UNREGISTER: 1038 mutex_lock(&net->xdp.lock); 1039 sk_for_each(sk, &net->xdp.list) { 1040 struct xdp_sock *xs = xdp_sk(sk); 1041 1042 mutex_lock(&xs->mutex); 1043 if (xs->dev == dev) { 1044 sk->sk_err = ENETDOWN; 1045 if (!sock_flag(sk, SOCK_DEAD)) 1046 sk->sk_error_report(sk); 1047 1048 xsk_unbind_dev(xs); 1049 1050 /* Clear device references in umem. */ 1051 xdp_umem_clear_dev(xs->umem); 1052 } 1053 mutex_unlock(&xs->mutex); 1054 } 1055 mutex_unlock(&net->xdp.lock); 1056 break; 1057 } 1058 return NOTIFY_DONE; 1059 } 1060 1061 static struct proto xsk_proto = { 1062 .name = "XDP", 1063 .owner = THIS_MODULE, 1064 .obj_size = sizeof(struct xdp_sock), 1065 }; 1066 1067 static const struct proto_ops xsk_proto_ops = { 1068 .family = PF_XDP, 1069 .owner = THIS_MODULE, 1070 .release = xsk_release, 1071 .bind = xsk_bind, 1072 .connect = sock_no_connect, 1073 .socketpair = sock_no_socketpair, 1074 .accept = sock_no_accept, 1075 .getname = sock_no_getname, 1076 .poll = xsk_poll, 1077 .ioctl = sock_no_ioctl, 1078 .listen = sock_no_listen, 1079 .shutdown = sock_no_shutdown, 1080 .setsockopt = xsk_setsockopt, 1081 .getsockopt = xsk_getsockopt, 1082 .sendmsg = xsk_sendmsg, 1083 .recvmsg = sock_no_recvmsg, 1084 .mmap = xsk_mmap, 1085 .sendpage = sock_no_sendpage, 1086 }; 1087 1088 static void xsk_destruct(struct sock *sk) 1089 { 1090 struct xdp_sock *xs = xdp_sk(sk); 1091 1092 if (!sock_flag(sk, SOCK_DEAD)) 1093 return; 1094 1095 xdp_put_umem(xs->umem); 1096 1097 sk_refcnt_debug_dec(sk); 1098 } 1099 1100 static int xsk_create(struct net *net, struct socket *sock, int protocol, 1101 int kern) 1102 { 1103 struct sock *sk; 1104 struct xdp_sock *xs; 1105 1106 if (!ns_capable(net->user_ns, CAP_NET_RAW)) 1107 return -EPERM; 1108 if (sock->type != SOCK_RAW) 1109 return -ESOCKTNOSUPPORT; 1110 1111 if (protocol) 1112 return -EPROTONOSUPPORT; 1113 1114 sock->state = SS_UNCONNECTED; 1115 1116 sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); 1117 if (!sk) 1118 return -ENOBUFS; 1119 1120 sock->ops = &xsk_proto_ops; 1121 1122 sock_init_data(sock, sk); 1123 1124 sk->sk_family = PF_XDP; 1125 1126 sk->sk_destruct = xsk_destruct; 1127 sk_refcnt_debug_inc(sk); 1128 1129 sock_set_flag(sk, SOCK_RCU_FREE); 1130 1131 xs = xdp_sk(sk); 1132 xs->state = XSK_READY; 1133 mutex_init(&xs->mutex); 1134 spin_lock_init(&xs->rx_lock); 1135 spin_lock_init(&xs->tx_completion_lock); 1136 1137 INIT_LIST_HEAD(&xs->map_list); 1138 spin_lock_init(&xs->map_list_lock); 1139 1140 mutex_lock(&net->xdp.lock); 1141 sk_add_node_rcu(sk, &net->xdp.list); 1142 mutex_unlock(&net->xdp.lock); 1143 1144 local_bh_disable(); 1145 sock_prot_inuse_add(net, &xsk_proto, 1); 1146 local_bh_enable(); 1147 1148 return 0; 1149 } 1150 1151 static const struct net_proto_family xsk_family_ops = { 1152 .family = PF_XDP, 1153 .create = xsk_create, 1154 .owner = THIS_MODULE, 1155 }; 1156 1157 static struct notifier_block xsk_netdev_notifier = { 1158 .notifier_call = xsk_notifier, 1159 }; 1160 1161 static int __net_init xsk_net_init(struct net *net) 1162 { 1163 mutex_init(&net->xdp.lock); 1164 INIT_HLIST_HEAD(&net->xdp.list); 1165 return 0; 1166 } 1167 1168 static void __net_exit xsk_net_exit(struct net *net) 1169 { 1170 WARN_ON_ONCE(!hlist_empty(&net->xdp.list)); 1171 } 1172 1173 static struct pernet_operations xsk_net_ops = { 1174 .init = xsk_net_init, 1175 .exit = xsk_net_exit, 1176 }; 1177 1178 static int __init xsk_init(void) 1179 { 1180 int err; 1181 1182 err = proto_register(&xsk_proto, 0 /* no slab */); 1183 if (err) 1184 goto out; 1185 1186 err = sock_register(&xsk_family_ops); 1187 if (err) 1188 goto out_proto; 1189 1190 err = register_pernet_subsys(&xsk_net_ops); 1191 if (err) 1192 goto out_sk; 1193 1194 err = register_netdevice_notifier(&xsk_netdev_notifier); 1195 if (err) 1196 goto out_pernet; 1197 1198 return 0; 1199 1200 out_pernet: 1201 unregister_pernet_subsys(&xsk_net_ops); 1202 out_sk: 1203 sock_unregister(PF_XDP); 1204 out_proto: 1205 proto_unregister(&xsk_proto); 1206 out: 1207 return err; 1208 } 1209 1210 fs_initcall(xsk_init); 1211