// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"

#define TX_BATCH_SIZE 16

static struct xdp_sock *xdp_sk(struct sock *sk)
{
	return (struct xdp_sock *)sk;
}

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return !!xs->rx;
}

/* Rx copy path: grab a frame id from the fill ring, copy the packet into
 * that umem frame and post a descriptor on the Rx ring. The fill ring
 * entry is only consumed once the Rx descriptor has been produced.
 */
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 *id, len = xdp->data_end - xdp->data;
	void *buffer;
	int err = 0;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	id = xskq_peek_id(xs->umem->fq);
	if (!id)
		return -ENOSPC;

	buffer = xdp_umem_get_data_with_headroom(xs->umem, *id);
	memcpy(buffer, xdp->data, len);
	err = xskq_produce_batch_desc(xs->rx, *id, len,
				      xs->umem->frame_headroom);
	if (!err)
		xskq_discard_id(xs->umem->fq);

	return err;
}

int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	err = __xsk_rcv(xs, xdp);
	if (likely(!err))
		xdp_return_buff(xdp);
	else
		xs->rx_dropped++;

	return err;
}

void xsk_flush(struct xdp_sock *xs)
{
	xskq_produce_flush_desc(xs->rx);
	xs->sk.sk_data_ready(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	err = __xsk_rcv(xs, xdp);
	if (!err)
		xsk_flush(xs);
	else
		xs->rx_dropped++;

	return err;
}

static void xsk_destruct_skb(struct sk_buff *skb)
{
	u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);

	WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id));

	sock_wfree(skb);
}

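/* Copy-mode Tx: for each descriptor on the Tx ring, reserve a slot in
 * the completion ring, copy the frame from the umem into a freshly
 * allocated skb and push it out on the bound queue with
 * dev_direct_xmit(). The completion entry is produced from the skb
 * destructor once the frame has been consumed.
 */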
static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
			    size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	u32 max_batch = TX_BATCH_SIZE;
	struct xdp_sock *xs = xdp_sk(sk);
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	if (unlikely(!xs->tx))
		return -ENOBUFS;
	if (need_wait)
		return -EOPNOTSUPP;

	mutex_lock(&xs->mutex);

	while (xskq_peek_desc(xs->tx, &desc)) {
		char *buffer;
		u32 id, len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		if (xskq_reserve_id(xs->umem->cq)) {
			err = -EAGAIN;
			goto out;
		}

		len = desc.len;
		if (unlikely(len > xs->dev->mtu)) {
			err = -EMSGSIZE;
			goto out;
		}

		if (xs->queue_id >= xs->dev->real_num_tx_queues) {
			err = -ENXIO;
			goto out;
		}

		skb = sock_alloc_send_skb(sk, len, !need_wait, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		id = desc.idx;
		buffer = xdp_umem_get_data(xs->umem, id) + desc.offset;
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)id;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			err = -EAGAIN;
			/* SKB consumed by dev_direct_xmit() */
			goto out;
		}

		sent_frame = true;
		xskq_discard_desc(xs->tx);
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xs->dev))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;

	return xsk_generic_xmit(sk, m, total_len);
}

static unsigned int xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (xs->rx && !xskq_empty_desc(xs->rx))
		mask |= POLLIN | POLLRDNORM;
	if (xs->tx && !xskq_full_desc(xs->tx))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	*queue = q;
	return 0;
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	if (xs->dev) {
		/* Wait for driver to stop using the xdp socket. */
		synchronize_net();
		dev_put(xs->dev);
		xs->dev = NULL;
	}

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

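/* Bind the socket to a <netdevice, queue_id> pair. At least one of the
 * Rx and Tx rings must already be set up, and the queue id must be
 * valid for that direction. With XDP_SHARED_UMEM the umem is inherited
 * from an already bound socket on the same device and queue; otherwise
 * the socket's own registered umem is used.
 */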
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	mutex_lock(&xs->mutex);
	if (xs->dev) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) ||
	    (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (sxdp->sxdp_flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if (xs->umem) {
			/* We already have our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!umem_xs->umem) {
			/* No umem to inherit. */
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		} else if (umem_xs->dev != dev ||
			   umem_xs->queue_id != sxdp->sxdp_queue_id) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		xs->umem = umem_xs->umem;
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xskq_set_umem(xs->umem->fq, &xs->umem->props);
		xskq_set_umem(xs->umem->cq, &xs->umem->props);
	}

	xs->dev = dev;
	xs->queue_id = sxdp->sxdp_queue_id;

	xskq_set_umem(xs->rx, &xs->umem->props);
	xskq_set_umem(xs->tx, &xs->umem->props);

out_unlock:
	if (err)
		dev_put(dev);
out_release:
	mutex_unlock(&xs->mutex);
	return err;
}

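/* Ring and umem configuration is done through setsockopt() at SOL_XDP
 * level: XDP_RX_RING and XDP_TX_RING size the descriptor rings,
 * XDP_UMEM_REG registers the user memory area, and XDP_UMEM_FILL_RING /
 * XDP_UMEM_COMPLETION_RING size the umem rings. Every ring size must be
 * a non-zero power of two, and each ring can only be created once.
 */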
static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		struct xdp_umem_reg mr;
		struct xdp_umem *umem;

		if (optlen < sizeof(mr))
			return -EINVAL;
		if (copy_from_user(&mr, optval, sizeof(mr)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		xs->umem = umem;
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (!xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EINVAL;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;

		if (len < sizeof(off))
			return -EINVAL;

		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.rx.desc = offsetof(struct xdp_rxtx_ring, desc);
		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.tx.desc = offsetof(struct xdp_rxtx_ring, desc);

		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.fr.desc = offsetof(struct xdp_umem_ring, desc);
		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.cr.desc = offsetof(struct xdp_umem_ring, desc);

		len = sizeof(off);
		if (copy_to_user(optval, &off, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

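/* mmap() exposes one ring per call to userspace. The page offset selects
 * the ring (XDP_PGOFF_RX_RING, XDP_PGOFF_TX_RING, XDP_UMEM_PGOFF_FILL_RING
 * or XDP_UMEM_PGOFF_COMPLETION_RING), and the requested mapping may not be
 * larger than the ring's allocation.
 */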
static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	struct xdp_umem *umem;
	unsigned long pfn;
	struct page *qpg;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		umem = READ_ONCE(xs->umem);
		if (!umem)
			return -EINVAL;

		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(umem->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(umem->cq);
	}

	if (!q)
		return -EINVAL;

	qpg = virt_to_head_page(q->ring);
	if (size > (PAGE_SIZE << compound_order(qpg)))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

static struct proto xsk_proto = {
	.name = "XDP",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family = PF_XDP,
	.owner = THIS_MODULE,
	.release = xsk_release,
	.bind = xsk_bind,
	.connect = sock_no_connect,
	.socketpair = sock_no_socketpair,
	.accept = sock_no_accept,
	.getname = sock_no_getname,
	.poll = xsk_poll,
	.ioctl = sock_no_ioctl,
	.listen = sock_no_listen,
	.shutdown = sock_no_shutdown,
	.setsockopt = xsk_setsockopt,
	.getsockopt = xsk_getsockopt,
	.sendmsg = xsk_sendmsg,
	.recvmsg = sock_no_recvmsg,
	.mmap = xsk_mmap,
	.sendpage = sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);
	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	xs = xdp_sk(sk);
	mutex_init(&xs->mutex);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner = THIS_MODULE,
};

static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	return 0;

out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);