// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"

#define TX_BATCH_SIZE 16

static struct xdp_sock *xdp_sk(struct sock *sk)
{
	return (struct xdp_sock *)sk;
}

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return !!xs->rx;
}

static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 *id, len = xdp->data_end - xdp->data;
	void *buffer;
	int err = 0;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	id = xskq_peek_id(xs->umem->fq);
	if (!id)
		return -ENOSPC;

	buffer = xdp_umem_get_data_with_headroom(xs->umem, *id);
	memcpy(buffer, xdp->data, len);
	err = xskq_produce_batch_desc(xs->rx, *id, len,
				      xs->umem->frame_headroom);
	if (!err)
		xskq_discard_id(xs->umem->fq);

	return err;
}

int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	err = __xsk_rcv(xs, xdp);
	if (likely(!err))
		xdp_return_buff(xdp);
	else
		xs->rx_dropped++;

	return err;
}

void xsk_flush(struct xdp_sock *xs)
{
	xskq_produce_flush_desc(xs->rx);
	xs->sk.sk_data_ready(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	err = __xsk_rcv(xs, xdp);
	if (!err)
		xsk_flush(xs);
	else
		xs->rx_dropped++;

	return err;
}

static void xsk_destruct_skb(struct sk_buff *skb)
{
	u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);

	WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id));

	sock_wfree(skb);
}

static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
			    size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	u32 max_batch = TX_BATCH_SIZE;
	struct xdp_sock *xs = xdp_sk(sk);
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	if (unlikely(!xs->tx))
		return -ENOBUFS;
	if (need_wait)
		return -EOPNOTSUPP;

	mutex_lock(&xs->mutex);

	while (xskq_peek_desc(xs->tx, &desc)) {
		char *buffer;
		u32 id, len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		if (xskq_reserve_id(xs->umem->cq)) {
			err = -EAGAIN;
			goto out;
		}

		len = desc.len;
		if (unlikely(len > xs->dev->mtu)) {
			err = -EMSGSIZE;
			goto out;
		}

		skb = sock_alloc_send_skb(sk, len, !need_wait, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		id = desc.idx;
		buffer = xdp_umem_get_data(xs->umem, id) + desc.offset;
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)id;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			err = -EAGAIN;
			/* SKB consumed by dev_direct_xmit() */
			goto out;
		}

		sent_frame = true;
		xskq_discard_desc(xs->tx);
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xs->dev))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;

	return xsk_generic_xmit(sk, m, total_len);
}

static unsigned int xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (xs->rx && !xskq_empty_desc(xs->rx))
		mask |= POLLIN | POLLRDNORM;
	if (xs->tx && !xskq_full_desc(xs->tx))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	*queue = q;
	return 0;
}

static void __xsk_release(struct xdp_sock *xs)
{
	/* Wait for driver to stop using the xdp socket. */
	synchronize_net();

	dev_put(xs->dev);
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	if (xs->dev) {
		__xsk_release(xs);
		xs->dev = NULL;
	}

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct net_device *dev, *dev_curr;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xdp_umem *old_umem = NULL;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	mutex_lock(&xs->mutex);
	dev_curr = xs->dev;
	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (sxdp->sxdp_queue_id >= dev->num_rx_queues) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (sxdp->sxdp_flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if (xs->umem) {
			/* We already have our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!umem_xs->umem) {
			/* No umem to inherit. */
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		} else if (umem_xs->dev != dev ||
			   umem_xs->queue_id != sxdp->sxdp_queue_id) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		old_umem = xs->umem;
		xs->umem = umem_xs->umem;
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xskq_set_umem(xs->umem->fq, &xs->umem->props);
		xskq_set_umem(xs->umem->cq, &xs->umem->props);
	}

	/* Rebind? */
	if (dev_curr && (dev_curr != dev ||
			 xs->queue_id != sxdp->sxdp_queue_id)) {
		__xsk_release(xs);
		if (old_umem)
			xdp_put_umem(old_umem);
	}

	xs->dev = dev;
	xs->queue_id = sxdp->sxdp_queue_id;

	xskq_set_umem(xs->rx, &xs->umem->props);
	xskq_set_umem(xs->tx, &xs->umem->props);

out_unlock:
	if (err)
		dev_put(dev);
out_release:
	mutex_unlock(&xs->mutex);
	return err;
}

static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		struct xdp_umem_reg mr;
		struct xdp_umem *umem;

		if (xs->umem)
			return -EBUSY;

		if (copy_from_user(&mr, optval, sizeof(mr)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		err = xdp_umem_create(&umem);
		if (err) {
			mutex_unlock(&xs->mutex);
			return err;
		}

		err = xdp_umem_reg(umem, &mr);
		if (err) {
			kfree(umem);
			mutex_unlock(&xs->mutex);
			return err;
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();

		xs->umem = umem;
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (!xs->umem)
			return -EINVAL;

		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	unsigned long pfn;
	struct page *qpg;

	if (offset == XDP_PGOFF_RX_RING) {
		q = xs->rx;
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = xs->tx;
	} else {
		if (!xs->umem)
			return -EINVAL;

		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = xs->umem->fq;
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = xs->umem->cq;
	}

	if (!q)
		return -EINVAL;

	qpg = virt_to_head_page(q->ring);
	if (size > (PAGE_SIZE << compound_order(qpg)))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

static struct proto xsk_proto = {
	.name = "XDP",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family = PF_XDP,
	.owner = THIS_MODULE,
	.release = xsk_release,
	.bind = xsk_bind,
	.connect = sock_no_connect,
	.socketpair = sock_no_socketpair,
	.accept = sock_no_accept,
	.getname = sock_no_getname,
	.poll = xsk_poll,
	.ioctl = sock_no_ioctl,
	.listen = sock_no_listen,
	.shutdown = sock_no_shutdown,
	.setsockopt = xsk_setsockopt,
	.getsockopt = xsk_getsockopt,
	.sendmsg = xsk_sendmsg,
	.recvmsg = sock_no_recvmsg,
	.mmap = xsk_mmap,
	.sendpage = sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);
	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	xs = xdp_sk(sk);
	mutex_init(&xs->mutex);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner = THIS_MODULE,
};

static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	return 0;

out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);
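
/* Usage sketch (illustrative only, not part of the kernel build): the
 * setsockopt/mmap/bind handlers above are typically driven from userspace
 * in roughly the following order. The constants and struct layouts are the
 * uapi definitions from <linux/if_xdp.h>; bufs, NUM_FRAMES, FRAME_SIZE,
 * ring_size, ifindex and queue_id are placeholders, and error handling is
 * omitted.
 *
 *	int fd = socket(AF_XDP, SOCK_RAW, 0);
 *
 *	// Register a buffer area as the umem (handled by XDP_UMEM_REG above).
 *	struct xdp_umem_reg mr = {
 *		.addr = (__u64)(unsigned long)bufs,	// page-aligned area
 *		.len = NUM_FRAMES * FRAME_SIZE,
 *		.frame_size = FRAME_SIZE,
 *		.frame_headroom = 0,
 *	};
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *
 *	// Create the rings; sizes must be powers of two, see xsk_init_queue().
 *	int ndescs = 1024;
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &ndescs, sizeof(ndescs));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ndescs, sizeof(ndescs));
 *	setsockopt(fd, SOL_XDP, XDP_RX_RING, &ndescs, sizeof(ndescs));
 *	setsockopt(fd, SOL_XDP, XDP_TX_RING, &ndescs, sizeof(ndescs));
 *
 *	// Map the Rx descriptor ring; xsk_mmap() resolves the page offset.
 *	void *rx_ring = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED | MAP_POPULATE, fd, XDP_PGOFF_RX_RING);
 *
 *	// Bind to one queue of one netdev; xsk_bind() validates the rings.
 *	struct sockaddr_xdp sxdp = {
 *		.sxdp_family = AF_XDP,
 *		.sxdp_ifindex = ifindex,
 *		.sxdp_queue_id = queue_id,
 *	};
 *	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 */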