// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allows a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 16

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
		READ_ONCE(xs->umem->fq);
}

bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
	return xskq_has_addrs(umem->fq, cnt);
}
EXPORT_SYMBOL(xsk_umem_has_addrs);

u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
	return xskq_peek_addr(umem->fq, addr);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);

void xsk_umem_discard_addr(struct xdp_umem *umem)
{
	xskq_discard_addr(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_discard_addr);

/* Copy-mode RX: take a chunk address off the fill ring, copy the frame
 * (including any XDP metadata) into the umem and post a descriptor on
 * the RX ring.
 */
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	void *to_buf, *from_buf;
	u32 metalen;
	u64 addr;
	int err;

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	addr += xs->umem->headroom;

	if (unlikely(xdp_data_meta_unsupported(xdp))) {
		from_buf = xdp->data;
		metalen = 0;
	} else {
		from_buf = xdp->data_meta;
		metalen = xdp->data - xdp->data_meta;
	}

	to_buf = xdp_umem_get_data(xs->umem, addr);
	memcpy(to_buf, from_buf, len + metalen);
	addr += metalen;
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (!err) {
		xskq_discard_addr(xs->umem->fq);
		xdp_return_buff(xdp);
		return 0;
	}

	xs->rx_dropped++;
	return err;
}

/* Zero-copy RX: the frame already lives in the umem, so only a
 * descriptor needs to be posted on the RX ring.
 */
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);

	if (err)
		xs->rx_dropped++;

	return err;
}

int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	len = xdp->data_end - xdp->data;

	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}

void xsk_flush(struct xdp_sock *xs)
{
	xskq_produce_flush_desc(xs->rx);
	xs->sk.sk_data_ready(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 metalen = xdp->data - xdp->data_meta;
	u32 len = xdp->data_end - xdp->data;
	void *buffer;
	u64 addr;
	int err;

	spin_lock_bh(&xs->rx_lock);

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		err = -ENOSPC;
		goto out_drop;
	}

	addr += xs->umem->headroom;

	buffer = xdp_umem_get_data(xs->umem, addr);
	memcpy(buffer, xdp->data_meta, len + metalen);
	addr += metalen;
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (err)
		goto out_drop;

	xskq_discard_addr(xs->umem->fq);
	xskq_produce_flush_desc(xs->rx);

	spin_unlock_bh(&xs->rx_lock);

	xs->sk.sk_data_ready(&xs->sk);
	return 0;

out_drop:
	xs->rx_dropped++;
out_unlock:
	spin_unlock_bh(&xs->rx_lock);
	return err;
}

void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
	xskq_produce_flush_addr_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);

bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		if (!xskq_peek_desc(xs->tx, desc))
			continue;

		if (xskq_produce_addr_lazy(umem->cq, desc->addr))
			goto out;

		xskq_discard_desc(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);

static int xsk_zc_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev = xs->dev;

	return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
}

/* skb destructor for copy-mode TX: report completion to userspace by
 * putting the buffer address back on the completion ring.
 */
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->tx_completion_lock, flags);
	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

	sock_wfree(skb);
}

/* Copy-mode TX: build one skb per TX descriptor and send it directly on
 * the bound queue.
 */
static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
			    size_t total_len)
{
	u32 max_batch = TX_BATCH_SIZE;
	struct xdp_sock *xs = xdp_sk(sk);
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	mutex_lock(&xs->mutex);

	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	while (xskq_peek_desc(xs->tx, &desc)) {
		char *buffer;
		u64 addr;
		u32 len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		len = desc.len;
		skb = sock_alloc_send_skb(sk, len, 1, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		addr = desc.addr;
		buffer = xdp_umem_get_data(xs->umem, addr);
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		xskq_discard_desc(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xs->dev))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;
	if (need_wait)
		return -EOPNOTSUPP;

	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
}

static unsigned int xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (xs->rx && !xskq_empty_desc(xs->rx))
		mask |= POLLIN | POLLRDNORM;
	if (xs->tx && !xskq_full_desc(xs->tx))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	*queue = q;
	return 0;
}

static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (!dev || xs->state != XSK_BOUND)
		return;

	xs->state = XSK_UNBOUND;

	/* Wait for driver to stop using the xdp socket. */
	xdp_del_sk_umem(xs->umem, xs);
	xs->dev = NULL;
	synchronize_net();
	dev_put(dev);
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	xsk_unbind_dev(xs);

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY))
		return -EINVAL;

	rtnl_lock();
	mutex_lock(&xs->mutex);
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We have already our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!umem_xs->umem) {
			/* No umem to inherit. */
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		xs->umem = umem_xs->umem;
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xskq_set_umem(xs->umem->fq, xs->umem->size,
			      xs->umem->chunk_mask);
		xskq_set_umem(xs->umem->cq, xs->umem->size,
			      xs->umem->chunk_mask);

		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
		if (err)
			goto out_unlock;
	}

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
	xdp_add_sk_umem(xs->umem, xs);

out_unlock:
	if (err)
		dev_put(dev);
	else
		xs->state = XSK_BOUND;
out_release:
	mutex_unlock(&xs->mutex);
	rtnl_unlock();
	return err;
}

static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		struct xdp_umem_reg mr;
		struct xdp_umem *umem;

		if (copy_from_user(&mr, optval, sizeof(mr)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY || xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		xs->umem = umem;
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		if (!xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EINVAL;
		}

		q = (optname == XDP_UMEM_FILL_RING) ?
			&xs->umem->fq : &xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;

		if (len < sizeof(off))
			return -EINVAL;

		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.rx.desc = offsetof(struct xdp_rxtx_ring, desc);
		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.tx.desc = offsetof(struct xdp_rxtx_ring, desc);

		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.fr.desc = offsetof(struct xdp_umem_ring, desc);
		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.cr.desc = offsetof(struct xdp_umem_ring, desc);

		len = sizeof(off);
		if (copy_to_user(optval, &off, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	struct xdp_umem *umem;
	unsigned long pfn;
	struct page *qpg;

	if (xs->state != XSK_READY)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		umem = READ_ONCE(xs->umem);
		if (!umem)
			return -EINVAL;

		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(umem->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(umem->cq);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > (PAGE_SIZE << compound_order(qpg)))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk->sk_error_report(sk);

				xsk_unbind_dev(xs);

				/* Clear device references in umem. */
				xdp_umem_clear_dev(xs->umem);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	xs->state = XSK_READY;
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->rx_lock);
	spin_lock_init(&xs->tx_completion_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
	.notifier_call	= xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
	mutex_init(&net->xdp.lock);
	INIT_HLIST_HEAD(&net->xdp.list);
	return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	return 0;

out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);
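
/*
 * Illustrative userspace sketch (not part of this file): a minimal
 * control-path sequence against the socket layer implemented above. It
 * assumes XDP program attachment, error handling and the actual
 * fill/RX/TX descriptor processing are done elsewhere; bufs, ifindex
 * and the sizes below are hypothetical placeholders.
 *
 *	int fd = socket(AF_XDP, SOCK_RAW, 0);
 *
 *	struct xdp_umem_reg mr = {
 *		.addr = (__u64)(unsigned long)bufs,	// page-aligned area
 *		.len = 4096 * 2048,
 *		.chunk_size = 2048,
 *		.headroom = 0,
 *	};
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *
 *	int entries = 2048;		// must be a power of two, see xsk_init_queue()
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_TX_RING, &entries, sizeof(entries));
 *
 *	struct xdp_mmap_offsets off;
 *	socklen_t optlen = sizeof(off);
 *	getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
 *	void *rx_ring = mmap(NULL, off.rx.desc + entries * sizeof(struct xdp_desc),
 *			     PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			     fd, XDP_PGOFF_RX_RING);	// served by xsk_mmap()
 *
 *	struct sockaddr_xdp sxdp = {
 *		.sxdp_family = AF_XDP,
 *		.sxdp_ifindex = ifindex,
 *		.sxdp_queue_id = 0,
 *		.sxdp_flags = XDP_COPY,			// force copy mode
 *	};
 *	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));	// handled by xsk_bind()
 */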