// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *            Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 16

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
        return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
                READ_ONCE(xs->umem->fq);
}

bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
        return xskq_has_addrs(umem->fq, cnt);
}
EXPORT_SYMBOL(xsk_umem_has_addrs);

u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
        return xskq_peek_addr(umem->fq, addr, umem);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);

void xsk_umem_discard_addr(struct xdp_umem *umem)
{
        xskq_discard_addr(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_discard_addr);

void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
        if (umem->need_wakeup & XDP_WAKEUP_RX)
                return;

        umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
        umem->need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);

void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
{
        struct xdp_sock *xs;

        if (umem->need_wakeup & XDP_WAKEUP_TX)
                return;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
                xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
        }
        rcu_read_unlock();

        umem->need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);

void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
{
        if (!(umem->need_wakeup & XDP_WAKEUP_RX))
                return;

        umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
        umem->need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);

void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
{
        struct xdp_sock *xs;

        if (!(umem->need_wakeup & XDP_WAKEUP_TX))
                return;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
                xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
        }
        rcu_read_unlock();

        umem->need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);

bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
{
        return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
}
EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
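/* RX ingest paths: __xsk_rcv() services copy mode by taking a chunk off the
 * umem fill ring, copying the frame (and its metadata) into it and posting a
 * descriptor to the RX ring, while __xsk_rcv_zc() services zero-copy mode by
 * only posting the buffer handle that the driver already placed in umem.
 */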
/* If a buffer crosses a page boundary, we need to do two memcpys, one for
 * each page. This is only required in copy mode.
 */
static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
                             u32 len, u32 metalen)
{
        void *to_buf = xdp_umem_get_data(umem, addr);

        addr = xsk_umem_add_offset_to_addr(addr);
        if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
                void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
                u64 page_start = addr & ~(PAGE_SIZE - 1);
                u64 first_len = PAGE_SIZE - (addr - page_start);

                memcpy(to_buf, from_buf, first_len + metalen);
                memcpy(next_pg_addr, from_buf + first_len, len - first_len);

                return;
        }

        memcpy(to_buf, from_buf, len + metalen);
}

static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
        u64 offset = xs->umem->headroom;
        u64 addr, memcpy_addr;
        void *from_buf;
        u32 metalen;
        int err;

        if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
            len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
                xs->rx_dropped++;
                return -ENOSPC;
        }

        if (unlikely(xdp_data_meta_unsupported(xdp))) {
                from_buf = xdp->data;
                metalen = 0;
        } else {
                from_buf = xdp->data_meta;
                metalen = xdp->data - xdp->data_meta;
        }

        memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
        __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);

        offset += metalen;
        addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
        err = xskq_produce_batch_desc(xs->rx, addr, len);
        if (!err) {
                xskq_discard_addr(xs->umem->fq);
                xdp_return_buff(xdp);
                return 0;
        }

        xs->rx_dropped++;
        return err;
}

static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
        int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);

        if (err)
                xs->rx_dropped++;

        return err;
}

static bool xsk_is_bound(struct xdp_sock *xs)
{
        if (READ_ONCE(xs->state) == XSK_BOUND) {
                /* Matches smp_wmb() in bind(). */
                smp_rmb();
                return true;
        }
        return false;
}

static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        u32 len;

        if (!xsk_is_bound(xs))
                return -EINVAL;

        if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
                return -EINVAL;

        len = xdp->data_end - xdp->data;

        return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
                __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}
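/* Publish the RX descriptors produced so far and wake up any process
 * waiting on the socket, e.g. in poll().
 */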
static void xsk_flush(struct xdp_sock *xs)
{
        xskq_produce_flush_desc(xs->rx);
        xs->sk.sk_data_ready(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        u32 metalen = xdp->data - xdp->data_meta;
        u32 len = xdp->data_end - xdp->data;
        u64 offset = xs->umem->headroom;
        void *buffer;
        u64 addr;
        int err;

        spin_lock_bh(&xs->rx_lock);

        if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
                err = -EINVAL;
                goto out_unlock;
        }

        if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
            len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
                err = -ENOSPC;
                goto out_drop;
        }

        addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
        buffer = xdp_umem_get_data(xs->umem, addr);
        memcpy(buffer, xdp->data_meta, len + metalen);

        addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
        err = xskq_produce_batch_desc(xs->rx, addr, len);
        if (err)
                goto out_drop;

        xskq_discard_addr(xs->umem->fq);
        xskq_produce_flush_desc(xs->rx);

        spin_unlock_bh(&xs->rx_lock);

        xs->sk.sk_data_ready(&xs->sk);
        return 0;

out_drop:
        xs->rx_dropped++;
out_unlock:
        spin_unlock_bh(&xs->rx_lock);
        return err;
}

int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
                       struct xdp_sock *xs)
{
        struct xsk_map *m = container_of(map, struct xsk_map, map);
        struct list_head *flush_list = this_cpu_ptr(m->flush_list);
        int err;

        err = xsk_rcv(xs, xdp);
        if (err)
                return err;

        if (!xs->flush_node.prev)
                list_add(&xs->flush_node, flush_list);

        return 0;
}

void __xsk_map_flush(struct bpf_map *map)
{
        struct xsk_map *m = container_of(map, struct xsk_map, map);
        struct list_head *flush_list = this_cpu_ptr(m->flush_list);
        struct xdp_sock *xs, *tmp;

        list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
                xsk_flush(xs);
                __list_del_clearprev(&xs->flush_node);
        }
}

void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
        xskq_produce_flush_addr_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
        struct xdp_sock *xs;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
                xs->sk.sk_write_space(&xs->sk);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);

bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
        struct xdp_sock *xs;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
                if (!xskq_peek_desc(xs->tx, desc, umem))
                        continue;

                if (xskq_produce_addr_lazy(umem->cq, desc->addr))
                        goto out;

                xskq_discard_desc(xs->tx);
                rcu_read_unlock();
                return true;
        }

out:
        rcu_read_unlock();
        return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);

static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
        struct net_device *dev = xs->dev;
        int err;

        rcu_read_lock();
        err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
        rcu_read_unlock();

        return err;
}

static int xsk_zc_xmit(struct xdp_sock *xs)
{
        return xsk_wakeup(xs, XDP_WAKEUP_TX);
}
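/* skb destructor used by the copy-mode TX path: when the skb is freed after
 * transmission, post the originating descriptor's address to the completion
 * ring so that userspace can reuse the buffer.
 */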
static void xsk_destruct_skb(struct sk_buff *skb)
{
        u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
        struct xdp_sock *xs = xdp_sk(skb->sk);
        unsigned long flags;

        spin_lock_irqsave(&xs->tx_completion_lock, flags);
        WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
        spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

        sock_wfree(skb);
}

static int xsk_generic_xmit(struct sock *sk)
{
        struct xdp_sock *xs = xdp_sk(sk);
        u32 max_batch = TX_BATCH_SIZE;
        bool sent_frame = false;
        struct xdp_desc desc;
        struct sk_buff *skb;
        int err = 0;

        mutex_lock(&xs->mutex);

        if (xs->queue_id >= xs->dev->real_num_tx_queues)
                goto out;

        while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
                char *buffer;
                u64 addr;
                u32 len;

                if (max_batch-- == 0) {
                        err = -EAGAIN;
                        goto out;
                }

                len = desc.len;
                skb = sock_alloc_send_skb(sk, len, 1, &err);
                if (unlikely(!skb)) {
                        err = -EAGAIN;
                        goto out;
                }

                skb_put(skb, len);
                addr = desc.addr;
                buffer = xdp_umem_get_data(xs->umem, addr);
                err = skb_store_bits(skb, 0, buffer, len);
                if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
                        kfree_skb(skb);
                        goto out;
                }

                skb->dev = xs->dev;
                skb->priority = sk->sk_priority;
                skb->mark = sk->sk_mark;
                skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
                skb->destructor = xsk_destruct_skb;

                err = dev_direct_xmit(skb, xs->queue_id);
                xskq_discard_desc(xs->tx);
                /* Ignore NET_XMIT_CN as packet might have been sent */
                if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
                        /* SKB completed but not sent */
                        err = -EBUSY;
                        goto out;
                }

                sent_frame = true;
        }

out:
        if (sent_frame)
                sk->sk_write_space(sk);

        mutex_unlock(&xs->mutex);
        return err;
}

static int __xsk_sendmsg(struct sock *sk)
{
        struct xdp_sock *xs = xdp_sk(sk);

        if (unlikely(!(xs->dev->flags & IFF_UP)))
                return -ENETDOWN;
        if (unlikely(!xs->tx))
                return -ENOBUFS;

        return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}
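/* sendmsg() is only used as a TX kick; no data is ever carried in the
 * message itself. Blocking operation is not supported, so callers are
 * expected to pass MSG_DONTWAIT, typically via something like
 * sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0) from userspace.
 */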
static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
        bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);

        if (unlikely(!xsk_is_bound(xs)))
                return -ENXIO;
        if (unlikely(need_wait))
                return -EOPNOTSUPP;

        return __xsk_sendmsg(sk);
}

static __poll_t xsk_poll(struct file *file, struct socket *sock,
                         struct poll_table_struct *wait)
{
        __poll_t mask = datagram_poll(file, sock, wait);
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct xdp_umem *umem;

        if (unlikely(!xsk_is_bound(xs)))
                return mask;

        umem = xs->umem;

        if (umem->need_wakeup) {
                if (xs->zc)
                        xsk_wakeup(xs, umem->need_wakeup);
                else
                        /* Poll needs to drive Tx also in copy mode */
                        __xsk_sendmsg(sk);
        }

        if (xs->rx && !xskq_empty_desc(xs->rx))
                mask |= EPOLLIN | EPOLLRDNORM;
        if (xs->tx && !xskq_full_desc(xs->tx))
                mask |= EPOLLOUT | EPOLLWRNORM;

        return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
                          bool umem_queue)
{
        struct xsk_queue *q;

        if (entries == 0 || *queue || !is_power_of_2(entries))
                return -EINVAL;

        q = xskq_create(entries, umem_queue);
        if (!q)
                return -ENOMEM;

        /* Make sure queue is ready before it can be seen by others */
        smp_wmb();
        WRITE_ONCE(*queue, q);
        return 0;
}

static void xsk_unbind_dev(struct xdp_sock *xs)
{
        struct net_device *dev = xs->dev;

        if (xs->state != XSK_BOUND)
                return;
        WRITE_ONCE(xs->state, XSK_UNBOUND);

        /* Wait for driver to stop using the xdp socket. */
        xdp_del_sk_umem(xs->umem, xs);
        xs->dev = NULL;
        synchronize_net();
        dev_put(dev);
}

static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
                                              struct xdp_sock ***map_entry)
{
        struct xsk_map *map = NULL;
        struct xsk_map_node *node;

        *map_entry = NULL;

        spin_lock_bh(&xs->map_list_lock);
        node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
                                        node);
        if (node) {
                WARN_ON(xsk_map_inc(node->map));
                map = node->map;
                *map_entry = node->map_entry;
        }
        spin_unlock_bh(&xs->map_list_lock);
        return map;
}

static void xsk_delete_from_maps(struct xdp_sock *xs)
{
        /* This function removes the current XDP socket from all the
         * maps it resides in. We need to take extra care here, due to
         * the two locks involved. Each map has a lock synchronizing
         * updates to the entries, and each socket has a lock that
         * synchronizes access to the list of maps (map_list). For
         * deadlock avoidance the locks need to be taken in the order
         * "map lock"->"socket map list lock". We start off by
         * accessing the socket map list, and take a reference to the
         * map to guarantee existence between the
         * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
         * calls. Then we ask the map to remove the socket, which
         * tries to remove the socket from the map. Note that there
         * might be updates to the map between
         * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
         */
        struct xdp_sock **map_entry = NULL;
        struct xsk_map *map;

        while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
                xsk_map_try_sock_delete(map, xs, map_entry);
                xsk_map_put(map);
        }
}

static int xsk_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net *net;

        if (!sk)
                return 0;

        net = sock_net(sk);

        mutex_lock(&net->xdp.lock);
        sk_del_node_init_rcu(sk);
        mutex_unlock(&net->xdp.lock);

        local_bh_disable();
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        local_bh_enable();

        xsk_delete_from_maps(xs);
        mutex_lock(&xs->mutex);
        xsk_unbind_dev(xs);
        mutex_unlock(&xs->mutex);

        xskq_destroy(xs->rx);
        xskq_destroy(xs->tx);

        sock_orphan(sk);
        sock->sk = NULL;

        sk_refcnt_debug_release(sk);
        sock_put(sk);

        return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
        struct socket *sock;
        int err;

        sock = sockfd_lookup(fd, &err);
        if (!sock)
                return ERR_PTR(-ENOTSOCK);

        if (sock->sk->sk_family != PF_XDP) {
                sockfd_put(sock);
                return ERR_PTR(-ENOPROTOOPT);
        }

        return sock;
}

/* Check if umem pages are contiguous.
 * In zero-copy mode, use the DMA address to do the page contiguity check.
 * For all other modes we use addr (the kernel virtual address).
 * Store the result in the XSK_NEXT_PG_CONTIG_SHIFT bit of addr.
 */
static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
{
        struct xdp_umem_page *pgs = umem->pages;
        int i, is_contig;

        for (i = 0; i < umem->npgs - 1; i++) {
                is_contig = (flags & XDP_ZEROCOPY) ?
                        (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
                        (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
                pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
        }
}
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
        struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net_device *dev;
        u32 flags, qid;
        int err = 0;

        if (addr_len < sizeof(struct sockaddr_xdp))
                return -EINVAL;
        if (sxdp->sxdp_family != AF_XDP)
                return -EINVAL;

        flags = sxdp->sxdp_flags;
        if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
                      XDP_USE_NEED_WAKEUP))
                return -EINVAL;

        rtnl_lock();
        mutex_lock(&xs->mutex);
        if (xs->state != XSK_READY) {
                err = -EBUSY;
                goto out_release;
        }

        dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
        if (!dev) {
                err = -ENODEV;
                goto out_release;
        }

        if (!xs->rx && !xs->tx) {
                err = -EINVAL;
                goto out_unlock;
        }

        qid = sxdp->sxdp_queue_id;

        if (flags & XDP_SHARED_UMEM) {
                struct xdp_sock *umem_xs;
                struct socket *sock;

                if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
                    (flags & XDP_USE_NEED_WAKEUP)) {
                        /* Cannot specify flags for shared sockets. */
                        err = -EINVAL;
                        goto out_unlock;
                }

                if (xs->umem) {
                        /* We already have our own. */
                        err = -EINVAL;
                        goto out_unlock;
                }

                sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
                if (IS_ERR(sock)) {
                        err = PTR_ERR(sock);
                        goto out_unlock;
                }

                umem_xs = xdp_sk(sock->sk);
                if (!xsk_is_bound(umem_xs)) {
                        err = -EBADF;
                        sockfd_put(sock);
                        goto out_unlock;
                }
                if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
                        err = -EINVAL;
                        sockfd_put(sock);
                        goto out_unlock;
                }

                xdp_get_umem(umem_xs->umem);
                WRITE_ONCE(xs->umem, umem_xs->umem);
                sockfd_put(sock);
        } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
                err = -EINVAL;
                goto out_unlock;
        } else {
                /* This xsk has its own umem. */
                xskq_set_umem(xs->umem->fq, xs->umem->size,
                              xs->umem->chunk_mask);
                xskq_set_umem(xs->umem->cq, xs->umem->size,
                              xs->umem->chunk_mask);

                err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
                if (err)
                        goto out_unlock;

                xsk_check_page_contiguity(xs->umem, flags);
        }

        xs->dev = dev;
        xs->zc = xs->umem->zc;
        xs->queue_id = qid;
        xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
        xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
        xdp_add_sk_umem(xs->umem, xs);

out_unlock:
        if (err) {
                dev_put(dev);
        } else {
                /* Matches smp_rmb() in bind() for shared umem
                 * sockets, and xsk_is_bound().
                 */
                smp_wmb();
                WRITE_ONCE(xs->state, XSK_BOUND);
        }
out_release:
        mutex_unlock(&xs->mutex);
        rtnl_unlock();
        return err;
}

struct xdp_umem_reg_v1 {
        __u64 addr; /* Start of packet data area */
        __u64 len; /* Length of packet data area */
        __u32 chunk_size;
        __u32 headroom;
};
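/* struct xdp_umem_reg_v1 above is the layout used before the flags member
 * was added to struct xdp_umem_reg; XDP_UMEM_REG accepts both sizes (see the
 * mr_size handling below) so that older applications keep working.
 */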
static int xsk_setsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        int err;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        switch (optname) {
        case XDP_RX_RING:
        case XDP_TX_RING:
        {
                struct xsk_queue **q;
                int entries;

                if (optlen < sizeof(entries))
                        return -EINVAL;
                if (copy_from_user(&entries, optval, sizeof(entries)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (xs->state != XSK_READY) {
                        mutex_unlock(&xs->mutex);
                        return -EBUSY;
                }
                q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
                err = xsk_init_queue(entries, q, false);
                if (!err && optname == XDP_TX_RING)
                        /* Tx needs to be explicitly woken up the first time */
                        xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
                mutex_unlock(&xs->mutex);
                return err;
        }
        case XDP_UMEM_REG:
        {
                size_t mr_size = sizeof(struct xdp_umem_reg);
                struct xdp_umem_reg mr = {};
                struct xdp_umem *umem;

                if (optlen < sizeof(struct xdp_umem_reg_v1))
                        return -EINVAL;
                else if (optlen < sizeof(mr))
                        mr_size = sizeof(struct xdp_umem_reg_v1);

                if (copy_from_user(&mr, optval, mr_size))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (xs->state != XSK_READY || xs->umem) {
                        mutex_unlock(&xs->mutex);
                        return -EBUSY;
                }

                umem = xdp_umem_create(&mr);
                if (IS_ERR(umem)) {
                        mutex_unlock(&xs->mutex);
                        return PTR_ERR(umem);
                }

                /* Make sure umem is ready before it can be seen by others */
                smp_wmb();
                WRITE_ONCE(xs->umem, umem);
                mutex_unlock(&xs->mutex);
                return 0;
        }
        case XDP_UMEM_FILL_RING:
        case XDP_UMEM_COMPLETION_RING:
        {
                struct xsk_queue **q;
                int entries;

                if (copy_from_user(&entries, optval, sizeof(entries)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (xs->state != XSK_READY) {
                        mutex_unlock(&xs->mutex);
                        return -EBUSY;
                }
                if (!xs->umem) {
                        mutex_unlock(&xs->mutex);
                        return -EINVAL;
                }

                q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
                        &xs->umem->cq;
                err = xsk_init_queue(entries, q, true);
                mutex_unlock(&xs->mutex);
                return err;
        }
        default:
                break;
        }

        return -ENOPROTOOPT;
}

static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
        ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
        ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
        ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}

static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
        ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
        ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
        ring->desc = offsetof(struct xdp_umem_ring, desc);
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        int len;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case XDP_STATISTICS:
        {
                struct xdp_statistics stats;

                if (len < sizeof(stats))
                        return -EINVAL;

                mutex_lock(&xs->mutex);
                stats.rx_dropped = xs->rx_dropped;
                stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
                stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
                mutex_unlock(&xs->mutex);

                if (copy_to_user(optval, &stats, sizeof(stats)))
                        return -EFAULT;
                if (put_user(sizeof(stats), optlen))
                        return -EFAULT;

                return 0;
        }
        case XDP_MMAP_OFFSETS:
        {
                struct xdp_mmap_offsets off;
                struct xdp_mmap_offsets_v1 off_v1;
                bool flags_supported = true;
                void *to_copy;

                if (len < sizeof(off_v1))
                        return -EINVAL;
                else if (len < sizeof(off))
                        flags_supported = false;

                if (flags_supported) {
                        /* xdp_ring_offset is identical to xdp_ring_offset_v1
                         * except for the flags field added to the end.
                         */
                        xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
                                               &off.rx);
                        xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
                                               &off.tx);
                        xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
                                               &off.fr);
                        xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
                                               &off.cr);
                        off.rx.flags = offsetof(struct xdp_rxtx_ring,
                                                ptrs.flags);
                        off.tx.flags = offsetof(struct xdp_rxtx_ring,
                                                ptrs.flags);
                        off.fr.flags = offsetof(struct xdp_umem_ring,
                                                ptrs.flags);
                        off.cr.flags = offsetof(struct xdp_umem_ring,
                                                ptrs.flags);

                        len = sizeof(off);
                        to_copy = &off;
                } else {
                        xsk_enter_rxtx_offsets(&off_v1.rx);
                        xsk_enter_rxtx_offsets(&off_v1.tx);
                        xsk_enter_umem_offsets(&off_v1.fr);
                        xsk_enter_umem_offsets(&off_v1.cr);

                        len = sizeof(off_v1);
                        to_copy = &off_v1;
                }

                if (copy_to_user(optval, to_copy, len))
                        return -EFAULT;
                if (put_user(len, optlen))
                        return -EFAULT;

                return 0;
        }
        case XDP_OPTIONS:
        {
                struct xdp_options opts = {};

                if (len < sizeof(opts))
                        return -EINVAL;

                mutex_lock(&xs->mutex);
                if (xs->zc)
                        opts.flags |= XDP_OPTIONS_ZEROCOPY;
                mutex_unlock(&xs->mutex);

                len = sizeof(opts);
                if (copy_to_user(optval, &opts, len))
                        return -EFAULT;
                if (put_user(len, optlen))
                        return -EFAULT;

                return 0;
        }
        default:
                break;
        }

        return -EOPNOTSUPP;
}

static int xsk_mmap(struct file *file, struct socket *sock,
                    struct vm_area_struct *vma)
{
        loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
        unsigned long size = vma->vm_end - vma->vm_start;
        struct xdp_sock *xs = xdp_sk(sock->sk);
        struct xsk_queue *q = NULL;
        struct xdp_umem *umem;
        unsigned long pfn;
        struct page *qpg;

        if (READ_ONCE(xs->state) != XSK_READY)
                return -EBUSY;

        if (offset == XDP_PGOFF_RX_RING) {
                q = READ_ONCE(xs->rx);
        } else if (offset == XDP_PGOFF_TX_RING) {
                q = READ_ONCE(xs->tx);
        } else {
                umem = READ_ONCE(xs->umem);
                if (!umem)
                        return -EINVAL;

                /* Matches the smp_wmb() in XDP_UMEM_REG */
                smp_rmb();
                if (offset == XDP_UMEM_PGOFF_FILL_RING)
                        q = READ_ONCE(umem->fq);
                else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
                        q = READ_ONCE(umem->cq);
        }

        if (!q)
                return -EINVAL;

        /* Matches the smp_wmb() in xsk_init_queue */
        smp_rmb();
        qpg = virt_to_head_page(q->ring);
        if (size > page_size(qpg))
                return -EINVAL;

        pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
        return remap_pfn_range(vma, vma->vm_start, pfn,
                               size, vma->vm_page_prot);
}
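/* Netdevice notifier: on NETDEV_UNREGISTER, report ENETDOWN on every socket
 * bound to the disappearing device, unbind it and drop the device references
 * held by its umem.
 */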
static int xsk_notifier(struct notifier_block *this,
                        unsigned long msg, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);
        struct sock *sk;

        switch (msg) {
        case NETDEV_UNREGISTER:
                mutex_lock(&net->xdp.lock);
                sk_for_each(sk, &net->xdp.list) {
                        struct xdp_sock *xs = xdp_sk(sk);

                        mutex_lock(&xs->mutex);
                        if (xs->dev == dev) {
                                sk->sk_err = ENETDOWN;
                                if (!sock_flag(sk, SOCK_DEAD))
                                        sk->sk_error_report(sk);

                                xsk_unbind_dev(xs);

                                /* Clear device references in umem. */
                                xdp_umem_clear_dev(xs->umem);
                        }
                        mutex_unlock(&xs->mutex);
                }
                mutex_unlock(&net->xdp.lock);
                break;
        }
        return NOTIFY_DONE;
}

static struct proto xsk_proto = {
        .name = "XDP",
        .owner = THIS_MODULE,
        .obj_size = sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
        .family = PF_XDP,
        .owner = THIS_MODULE,
        .release = xsk_release,
        .bind = xsk_bind,
        .connect = sock_no_connect,
        .socketpair = sock_no_socketpair,
        .accept = sock_no_accept,
        .getname = sock_no_getname,
        .poll = xsk_poll,
        .ioctl = sock_no_ioctl,
        .listen = sock_no_listen,
        .shutdown = sock_no_shutdown,
        .setsockopt = xsk_setsockopt,
        .getsockopt = xsk_getsockopt,
        .sendmsg = xsk_sendmsg,
        .recvmsg = sock_no_recvmsg,
        .mmap = xsk_mmap,
        .sendpage = sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
        struct xdp_sock *xs = xdp_sk(sk);

        if (!sock_flag(sk, SOCK_DEAD))
                return;

        xdp_put_umem(xs->umem);

        sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
{
        struct sock *sk;
        struct xdp_sock *xs;

        if (!ns_capable(net->user_ns, CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_RAW)
                return -ESOCKTNOSUPPORT;

        if (protocol)
                return -EPROTONOSUPPORT;

        sock->state = SS_UNCONNECTED;

        sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
        if (!sk)
                return -ENOBUFS;

        sock->ops = &xsk_proto_ops;

        sock_init_data(sock, sk);

        sk->sk_family = PF_XDP;

        sk->sk_destruct = xsk_destruct;
        sk_refcnt_debug_inc(sk);

        sock_set_flag(sk, SOCK_RCU_FREE);

        xs = xdp_sk(sk);
        xs->state = XSK_READY;
        mutex_init(&xs->mutex);
        spin_lock_init(&xs->rx_lock);
        spin_lock_init(&xs->tx_completion_lock);

        INIT_LIST_HEAD(&xs->map_list);
        spin_lock_init(&xs->map_list_lock);

        mutex_lock(&net->xdp.lock);
        sk_add_node_rcu(sk, &net->xdp.list);
        mutex_unlock(&net->xdp.lock);

        local_bh_disable();
        sock_prot_inuse_add(net, &xsk_proto, 1);
        local_bh_enable();

        return 0;
}

static const struct net_proto_family xsk_family_ops = {
        .family = PF_XDP,
        .create = xsk_create,
        .owner = THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
        .notifier_call = xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
        mutex_init(&net->xdp.lock);
        INIT_HLIST_HEAD(&net->xdp.list);
        return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
        WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
        .init = xsk_net_init,
        .exit = xsk_net_exit,
};
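/* Register the protocol, the PF_XDP socket family, the pernet subsystem and
 * the netdevice notifier; on failure, earlier registrations are unwound in
 * reverse order.
 */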
static int __init xsk_init(void)
{
        int err;

        err = proto_register(&xsk_proto, 0 /* no slab */);
        if (err)
                goto out;

        err = sock_register(&xsk_family_ops);
        if (err)
                goto out_proto;

        err = register_pernet_subsys(&xsk_net_ops);
        if (err)
                goto out_sk;

        err = register_netdevice_notifier(&xsk_netdev_notifier);
        if (err)
                goto out_pernet;

        return 0;

out_pernet:
        unregister_pernet_subsys(&xsk_net_ops);
out_sk:
        sock_unregister(PF_XDP);
out_proto:
        proto_unregister(&xsk_proto);
out:
        return err;
}

fs_initcall(xsk_init);