/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>
#include <linux/filter.h>
#include <linux/pci.h>
#include <net/route.h>
#include <net/xdp.h>
#include <net/net_failover.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true, napi_tx;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);
module_param(napi_tx, bool, 0644);

/* FIXME: MTU in config. */
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128

#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256

/* Separating two types of XDP xmit */
#define VIRTIO_XDP_TX		BIT(0)
#define VIRTIO_XDP_REDIR	BIT(1)

/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
 */
DECLARE_EWMA(pkt_len, 0, 64)
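/* For example, with the declaration above (precision 0, weight 1/64) each
 * sample moves the average by roughly 1/64 of the difference, so a one-off
 * 64KB GSO frame shifts a ~1500 byte average by only about 1KB.
 */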

#define VIRTNET_DRIVER_VERSION "1.0.0"

static const unsigned long guest_offloads[] = {
	VIRTIO_NET_F_GUEST_TSO4,
	VIRTIO_NET_F_GUEST_TSO6,
	VIRTIO_NET_F_GUEST_ECN,
	VIRTIO_NET_F_GUEST_UFO
};

struct virtnet_stat_desc {
	char desc[ETH_GSTRING_LEN];
	size_t offset;
};

struct virtnet_sq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
};

struct virtnet_rq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
};

#define VIRTNET_SQ_STAT(m)	offsetof(struct virtnet_sq_stats, m)
#define VIRTNET_RQ_STAT(m)	offsetof(struct virtnet_rq_stats, m)

static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
	{ "packets", VIRTNET_SQ_STAT(packets) },
	{ "bytes",   VIRTNET_SQ_STAT(bytes) },
};

static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
	{ "packets", VIRTNET_RQ_STAT(packets) },
	{ "bytes",   VIRTNET_RQ_STAT(bytes) },
};

#define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
#define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send_queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];

	struct virtnet_sq_stats stats;

	struct napi_struct napi;
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	struct bpf_prog __rcu *xdp_prog;

	struct virtnet_rq_stats stats;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* Average packet length for mergeable receive buffers. */
	struct ewma_pkt_len mrg_avg_pkt_len;

	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Min single buffer size for mergeable buffers case. */
	unsigned int min_buf_len;

	/* Name of this receive queue: input.$index */
	char name[40];

	struct xdp_rxq_info xdp_rxq;
};

/* Control VQ buffers: protected by the rtnl lock */
struct control_buf {
	struct virtio_net_ctrl_hdr hdr;
	virtio_net_ctrl_ack status;
	struct virtio_net_ctrl_mq mq;
	u8 promisc;
	u8 allmulti;
	__virtio16 vid;
	__virtio64 offloads;
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

	/* Packet virtio header size */
	u8 hdr_len;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Is the affinity hint set for virtqueues? */
	bool affinity_hint_set;

	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;

	struct control_buf *ctrl;

	/* Ethtool settings */
	u8 duplex;
	u32 speed;

	unsigned long guest_offloads;

	/* failover when STANDBY feature enabled */
	struct failover *failover;
};

struct padded_vnet_hdr {
	struct virtio_net_hdr_mrg_rxbuf hdr;
	/*
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
	 */
	char padding[4];
};

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
}

/*
 * private is used to chain pages for big packets, put the whole
 * most recent used list in the beginning for reuse
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

static void virtqueue_napi_schedule(struct napi_struct *napi,
				    struct virtqueue *vq)
{
	if (napi_schedule_prep(napi)) {
		virtqueue_disable_cb(vq);
		__napi_schedule(napi);
	}
}

static void virtqueue_napi_complete(struct napi_struct *napi,
				    struct virtqueue *vq, int processed)
{
	int opaque;

	opaque = virtqueue_enable_cb_prepare(vq);
	if (napi_complete_done(napi, processed)) {
		if (unlikely(virtqueue_poll(vq, opaque)))
			virtqueue_napi_schedule(napi, vq);
	} else {
		virtqueue_disable_cb(vq);
	}
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;
	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	if (napi->weight)
		virtqueue_napi_schedule(napi, vq);
	else
		/* We were probably waiting for more output buffers. */
		netif_wake_subqueue(vi->dev, vq2txq(vq));
}
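/* A quick worked example of the context packing used below: truesize is
 * stored in the low MRG_CTX_HEADER_SHIFT (22) bits and headroom in the bits
 * above it, so truesize 1536 with headroom 256 becomes (256 << 22) | 1536,
 * and the two helpers that follow simply undo that shift/mask.
 */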
#define MRG_CTX_HEADER_SHIFT 22
static void *mergeable_len_to_ctx(unsigned int truesize,
				  unsigned int headroom)
{
	return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
}

static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
}

static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
}

/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize)
{
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int copy, hdr_len, hdr_padded_len;
	char *p;

	p = page_address(page) + offset;

	/* copy small packet so we can reuse these pages for small data */
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

	hdr = skb_vnet_hdr(skb);

	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof(*hdr);
	else
		hdr_padded_len = sizeof(struct padded_vnet_hdr);

	memcpy(hdr, p, hdr_len);

	len -= hdr_len;
	offset += hdr_padded_len;
	p += hdr_padded_len;

	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
	skb_put_data(skb, p, copy);

	len -= copy;
	offset += copy;

	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}
	BUG_ON(offset >= PAGE_SIZE);
	while (len) {
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
		page = (struct page *)page->private;
		offset = 0;
	}

	if (page)
		give_pages(rq, page);

	return skb;
}

static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
				  struct send_queue *sq,
				  struct xdp_frame *xdpf)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	int err;

	/* The virtqueue wants to use the data area in front of the packet */
	if (unlikely(xdpf->metasize > 0))
		return -EOPNOTSUPP;

	if (unlikely(xdpf->headroom < vi->hdr_len))
		return -EOVERFLOW;

	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
	xdpf->data -= vi->hdr_len;
	/* Zero header and leave csum up to XDP layers */
	hdr = xdpf->data;
	memset(hdr, 0, vi->hdr_len);
	xdpf->len += vi->hdr_len;

	sg_init_one(sq->sg, xdpf->data, xdpf->len);

	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdpf, GFP_ATOMIC);
	if (unlikely(err))
		return -ENOSPC; /* Caller handles free/refcnt */

	return 0;
}

static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
				 struct xdp_frame *xdpf)
{
	struct xdp_frame *xdpf_sent;
	struct send_queue *sq;
	unsigned int len;
	unsigned int qp;

	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
	sq = &vi->sq[qp];

	/* Free up any pending old buffers before queueing new ones. */
	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
		xdp_return_frame(xdpf_sent);

	return __virtnet_xdp_xmit_one(vi, sq, xdpf);
}

static int virtnet_xdp_xmit(struct net_device *dev,
			    int n, struct xdp_frame **frames, u32 flags)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct receive_queue *rq = vi->rq;
	struct xdp_frame *xdpf_sent;
	struct bpf_prog *xdp_prog;
	struct send_queue *sq;
	unsigned int len;
	unsigned int qp;
	int drops = 0;
	int err;
	int i;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
	sq = &vi->sq[qp];

	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
	 * indicates XDP resources have been successfully allocated.
	 */
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (!xdp_prog)
		return -ENXIO;

	/* Free up any pending old buffers before queueing new ones. */
	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
		xdp_return_frame(xdpf_sent);

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];

		err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
		if (err) {
			xdp_return_frame_rx_napi(xdpf);
			drops++;
		}
	}

	if (flags & XDP_XMIT_FLUSH)
		virtqueue_kick(sq->vq);

	return n - drops;
}

static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
}

/* We copy the packet for XDP in the following cases:
 *
 * 1) Packet is scattered across multiple rx buffers.
 * 2) Headroom space is insufficient.
 *
 * This is inefficient but it's a temporary condition that
 * we hit right after XDP is enabled and until queue is refilled
 * with large buffers with sufficient headroom - so it should affect
 * at most queue size packets.
 * Afterwards, the conditions to enable
 * XDP should preclude the underlying device from sending packets
 * across multiple buffers (num_buf > 1), and we make sure buffers
 * have enough headroom.
 */
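/* xdp_linearize_page() copies the current buffer, plus any remaining buffers
 * of the same packet still sitting in the receive virtqueue, into one freshly
 * allocated page starting at page_off, so that the XDP program can run on a
 * single linear buffer with headroom in front.
 */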
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       int page_off,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
		int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		unsigned int buflen;
		void *buf;
		int off;

		buf = virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!buf))
			goto err_buf;

		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packets larger than the MTU.
		 */
		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
				     void *buf, void *ctx,
				     unsigned int len,
				     unsigned int *xdp_xmit,
				     unsigned int *rbytes)
{
	struct sk_buff *skb;
	struct bpf_prog *xdp_prog;
	unsigned int xdp_headroom = (unsigned long)ctx;
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	struct page *page = virt_to_head_page(buf);
	unsigned int delta = 0;
	struct page *xdp_page;
	int err;

	len -= vi->hdr_len;
	*rbytes += len;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
		struct xdp_frame *xdpf;
		struct xdp_buff xdp;
		void *orig_data;
		u32 act;

		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
			int offset = buf - page_address(page) + header_offset;
			unsigned int tlen = len + vi->hdr_len;
			u16 num_buf = 1;

			xdp_headroom = virtnet_get_headroom(vi);
			header_offset = VIRTNET_RX_PAD + xdp_headroom;
			headroom = vi->hdr_len + header_offset;
			buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
			xdp_page = xdp_linearize_page(rq, &num_buf, page,
						      offset, header_offset,
						      &tlen);
			if (!xdp_page)
				goto err_xdp;

			buf = page_address(xdp_page);
			put_page(page);
			page = xdp_page;
		}

		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
		xdp.data = xdp.data_hard_start + xdp_headroom;
		xdp_set_data_meta_invalid(&xdp);
		xdp.data_end = xdp.data + len;
		xdp.rxq = &rq->xdp_rxq;
		orig_data = xdp.data;
		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			/* Recalculate length in case bpf program changed it */
			delta = orig_data - xdp.data;
			len = xdp.data_end - xdp.data;
			break;
		case XDP_TX:
			xdpf = convert_to_xdp_frame(&xdp);
			if (unlikely(!xdpf))
				goto err_xdp;
			err = __virtnet_xdp_tx_xmit(vi, xdpf);
			if (unlikely(err)) {
				trace_xdp_exception(vi->dev, xdp_prog, act);
				goto err_xdp;
			}
			*xdp_xmit |= VIRTIO_XDP_TX;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
			if (err)
				goto err_xdp;
			*xdp_xmit |= VIRTIO_XDP_REDIR;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	skb = build_skb(buf, buflen);
	if (!skb) {
		put_page(page);
		goto err;
	}
	skb_reserve(skb, headroom - delta);
	skb_put(skb, len);
	if (!delta) {
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
	} /* keep zeroed vnet hdr since packet was changed by bpf */

err:
	return skb;

err_xdp:
	rcu_read_unlock();
	dev->stats.rx_dropped++;
	put_page(page);
xdp_xmit:
	return NULL;
}

static struct sk_buff *receive_big(struct net_device *dev,
				   struct virtnet_info *vi,
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len,
				   unsigned int *rbytes)
{
	struct page *page = buf;
	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);

	*rbytes += len - vi->hdr_len;
	if (unlikely(!skb))
		goto err;

	return skb;

err:
	dev->stats.rx_dropped++;
	give_pages(rq, page);
	return NULL;
}

static struct sk_buff *receive_mergeable(struct net_device *dev,
					 struct virtnet_info *vi,
					 struct receive_queue *rq,
					 void *buf,
					 void *ctx,
					 unsigned int len,
					 unsigned int *xdp_xmit,
					 unsigned int *rbytes)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize;
	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
	int err;

	head_skb = NULL;
	*rbytes += len - vi->hdr_len;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct xdp_frame *xdpf;
		struct page *xdp_page;
		struct xdp_buff xdp;
		void *data;
		u32 act;

		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

		/* This happens when the rx buffer size is underestimated
		 * or the headroom is insufficient because the buffer was
		 * refilled before XDP was set. This should only happen
		 * for the first several packets, so we don't care much
		 * about its performance.
		 */
		if (unlikely(num_buf > 1 ||
			     headroom < virtnet_get_headroom(vi))) {
			/* linearize data for XDP */
			xdp_page = xdp_linearize_page(rq, &num_buf,
						      page, offset,
						      VIRTIO_XDP_HEADROOM,
						      &len);
			if (!xdp_page)
				goto err_xdp;
			offset = VIRTIO_XDP_HEADROOM;
		} else {
			xdp_page = page;
		}

		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
		data = page_address(xdp_page) + offset;
		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
		xdp.data = data + vi->hdr_len;
		xdp_set_data_meta_invalid(&xdp);
		xdp.data_end = xdp.data + (len - vi->hdr_len);
		xdp.rxq = &rq->xdp_rxq;

		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			/* recalculate offset to account for any header
			 * adjustments. Note other cases do not build an
			 * skb and avoid using offset
			 */
			offset = xdp.data -
					page_address(xdp_page) - vi->hdr_len;

			/* recalculate len if xdp.data or xdp.data_end were
			 * adjusted
			 */
			len = xdp.data_end - xdp.data + vi->hdr_len;
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page,
						       offset, len, PAGE_SIZE);
				return head_skb;
			}
			break;
		case XDP_TX:
			xdpf = convert_to_xdp_frame(&xdp);
			if (unlikely(!xdpf))
				goto err_xdp;
			err = __virtnet_xdp_tx_xmit(vi, xdpf);
			if (unlikely(err)) {
				trace_xdp_exception(vi->dev, xdp_prog, act);
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
			*xdp_xmit |= VIRTIO_XDP_TX;
			if (unlikely(xdp_page != page))
				put_page(page);
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
			if (err) {
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
			*xdp_xmit |= VIRTIO_XDP_REDIR;
			if (unlikely(xdp_page != page))
				put_page(page);
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	truesize = mergeable_ctx_to_truesize(ctx);
	if (unlikely(len > truesize)) {
		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
			 dev->name, len, (unsigned long)ctx);
		dev->stats.rx_length_errors++;
		goto err_skb;
	}

	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
	curr_skb = head_skb;

	if (unlikely(!curr_skb))
		goto err_skb;
	while (--num_buf) {
		int num_skb_frags;

		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
		if (unlikely(!buf)) {
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
				 dev->name, num_buf,
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
			dev->stats.rx_length_errors++;
			goto err_buf;
		}

		*rbytes += len;
		page = virt_to_head_page(buf);

		truesize = mergeable_ctx_to_truesize(ctx);
		if (unlikely(len > truesize)) {
			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
				 dev->name, len, (unsigned long)ctx);
			dev->stats.rx_length_errors++;
			goto err_skb;
		}

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);

			if (unlikely(!nskb))
				goto err_skb;
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
			head_skb->truesize += truesize;
		}
		offset = buf - page_address(page);
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
					     len, truesize);
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
					offset, len, truesize);
		}
	}

	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
	return head_skb;

err_xdp:
	rcu_read_unlock();
err_skb:
	put_page(page);
	while (num_buf-- > 1) {
		buf = virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!buf)) {
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
		*rbytes += len;
		page = virt_to_head_page(buf);
		put_page(page);
	}
err_buf:
	dev->stats.rx_dropped++;
	dev_kfree_skb(head_skb);
xdp_xmit:
	return NULL;
}

static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
			void *buf, unsigned int len, void **ctx,
			unsigned int *xdp_xmit, unsigned int *rbytes)
{
	struct net_device *dev = vi->dev;
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;

	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
		if (vi->mergeable_rx_bufs) {
			put_page(virt_to_head_page(buf));
		} else if (vi->big_packets) {
			give_pages(rq, buf);
		} else {
			put_page(virt_to_head_page(buf));
		}
		return;
	}

	if (vi->mergeable_rx_bufs)
		skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
					rbytes);
	else if (vi->big_packets)
		skb = receive_big(dev, vi, rq, buf, len, rbytes);
	else
		skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, rbytes);

	if (unlikely(!skb))
		return;

	hdr = skb_vnet_hdr(skb);

	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
	}

	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

	napi_gro_receive(&rq->napi, skb);
	return;

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

/* Unlike mergeable buffers, all buffers are allocated to the
 * same size, except for the headroom. For this reason we do
 * not need to use mergeable_len_to_ctx here - it is enough
 * to store the headroom as the context ignoring the truesize.
 */
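/* The small-buffer layout set up by add_recvbuf_small() is, from the start of
 * the page fragment: VIRTNET_RX_PAD, then xdp_headroom, then the virtio-net
 * header, then up to GOOD_PACKET_LEN bytes of packet data, followed by room
 * for struct skb_shared_info so the buffer can later be turned into an skb
 * with build_skb().
 */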
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
	void *ctx = (void *)(unsigned long)xdp_headroom;
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
	int err;

	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));
	return err;
}

static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
		first = get_a_page(rq, gfp);
		if (!first) {
			if (list)
				give_pages(rq, list);
			return -ENOMEM;
		}
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);

		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}

	first = get_a_page(rq, gfp);
	if (!first) {
		give_pages(rq, list);
		return -ENOMEM;
	}
	p = page_address(first);

	/* rq->sg[0], rq->sg[1] share the same page */
	/* a separate rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);

	/* rq->sg[1] for data packet, from offset */
	offset = sizeof(struct padded_vnet_hdr);
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);

	/* chain first in list head */
	first->private = (unsigned long)list;
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
	if (err < 0)
		give_pages(rq, first);

	return err;
}

static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
					  struct ewma_pkt_len *avg_pkt_len,
					  unsigned int room)
{
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int len;

	if (room)
		return PAGE_SIZE - room;

	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
				rq->min_buf_len, PAGE_SIZE - hdr_len);

	return ALIGN(len, L1_CACHE_BYTES);
}

static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	unsigned int headroom = virtnet_get_headroom(vi);
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
	char *buf;
	void *ctx;
	int err;
	unsigned int len, hole;

	/* Extra tailroom is needed to satisfy XDP's assumption. This
	 * means rx frags coalescing won't work, but since we've
	 * disabled GSO for XDP, it won't be a big issue.
	 */
	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	buf += headroom; /* advance address leaving hole at front of pkt */
	get_page(alloc_frag->page);
	alloc_frag->offset += len + room;
	hole = alloc_frag->size - alloc_frag->offset;
	if (hole < len + room) {
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer.
		 */
		len += hole;
		alloc_frag->offset += hole;
	}

	sg_init_one(rq->sg, buf, len);
	ctx = mergeable_len_to_ctx(len, headroom);
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));

	return err;
}

/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
{
	int err;
	bool oom;

	do {
		if (vi->mergeable_rx_bufs)
			err = add_recvbuf_mergeable(vi, rq, gfp);
		else if (vi->big_packets)
			err = add_recvbuf_big(vi, rq, gfp);
		else
			err = add_recvbuf_small(vi, rq, gfp);

		oom = err == -ENOMEM;
		if (err)
			break;
	} while (rq->vq->num_free);
	virtqueue_kick(rq->vq);
	return !oom;
}

static void skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *vi = rvq->vdev->priv;
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];

	virtqueue_napi_schedule(&rq->napi, rvq);
}

static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
{
	napi_enable(napi);

	/* If all buffers were filled by other side before we napi_enabled, we
	 * won't get another interrupt, so process any outstanding packets now.
	 * Call local_bh_enable after to trigger softIRQ processing.
	 */
	local_bh_disable();
	virtqueue_napi_schedule(napi, vq);
	local_bh_enable();
}

static void virtnet_napi_tx_enable(struct virtnet_info *vi,
				   struct virtqueue *vq,
				   struct napi_struct *napi)
{
	if (!napi->weight)
		return;

	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
	 * enable the feature if this is likely affine with the transmit path.
	 */
	if (!vi->affinity_hint_set) {
		napi->weight = 0;
		return;
	}

	return virtnet_napi_enable(vq, napi);
}

static void virtnet_napi_tx_disable(struct napi_struct *napi)
{
	if (napi->weight)
		napi_disable(napi);
}

static void refill_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
	bool still_empty;
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		napi_disable(&rq->napi);
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
		virtnet_napi_enable(rq->vq, &rq->napi);

		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
}

static int virtnet_receive(struct receive_queue *rq, int budget,
			   unsigned int *xdp_xmit)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int len, received = 0, bytes = 0;
	void *buf;

	if (!vi->big_packets || vi->mergeable_rx_bufs) {
		void *ctx;

		while (received < budget &&
		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &bytes);
			received++;
		}
	} else {
		while (received < budget &&
		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &bytes);
			received++;
		}
	}

	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
			schedule_delayed_work(&vi->refill, 0);
	}

	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.bytes += bytes;
	rq->stats.packets += received;
	u64_stats_update_end(&rq->stats.syncp);

	return received;
}

static void free_old_xmit_skbs(struct send_queue *sq)
{
	struct sk_buff *skb;
	unsigned int len;
	unsigned int packets = 0;
	unsigned int bytes = 0;

	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		pr_debug("Sent skb %p\n", skb);

		bytes += skb->len;
		packets++;

		dev_consume_skb_any(skb);
	}

	/* Avoid the overhead when no packets have been processed; this
	 * happens when called speculatively from start_xmit.
	 */
	if (!packets)
		return;

	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
	u64_stats_update_end(&sq->stats.syncp);
}

static void virtnet_poll_cleantx(struct receive_queue *rq)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int index = vq2rxq(rq->vq);
	struct send_queue *sq = &vi->sq[index];
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);

	if (!sq->napi.weight)
		return;

	if (__netif_tx_trylock(txq)) {
		free_old_xmit_skbs(sq);
		__netif_tx_unlock(txq);
	}

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct send_queue *sq;
	unsigned int received, qp;
	unsigned int xdp_xmit = 0;

	virtnet_poll_cleantx(rq);

	received = virtnet_receive(rq, budget, &xdp_xmit);

	/* Out of packets? */
	if (received < budget)
		virtqueue_napi_complete(napi, rq->vq, received);

	if (xdp_xmit & VIRTIO_XDP_REDIR)
		xdp_do_flush_map();

	if (xdp_xmit & VIRTIO_XDP_TX) {
		qp = vi->curr_queue_pairs - vi->xdp_queue_pairs +
		     smp_processor_id();
		sq = &vi->sq[qp];
		virtqueue_kick(sq->vq);
	}

	return received;
}

static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i, err;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i);
		if (err < 0)
			return err;

		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);
		if (err < 0) {
			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
			return err;
		}

		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
	}

	return 0;
}

static int virtnet_poll_tx(struct napi_struct *napi, int budget)
{
	struct send_queue *sq = container_of(napi, struct send_queue, napi);
	struct virtnet_info *vi = sq->vq->vdev->priv;
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));

	__netif_tx_lock(txq, raw_smp_processor_id());
	free_old_xmit_skbs(sq);
	__netif_tx_unlock(txq);

	virtqueue_napi_complete(napi, sq->vq, 0);

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);

	return 0;
}

static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	int num_sg;
	unsigned hdr_len = vi->hdr_len;
	bool can_push;

	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);

	can_push = vi->any_header_sg &&
		   !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		   !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
	else
		hdr = skb_vnet_hdr(skb);

	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
				    virtio_is_little_endian(vi->vdev), false,
				    0))
		BUG();

	if (vi->mergeable_rx_bufs)
		hdr->num_buffers = 0;

	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		num_sg++;
	}
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
}

static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
	int err;
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !skb->xmit_more;
	bool use_napi = sq->napi.weight;

	/* Free up any pending old buffers before queueing new ones. */
	free_old_xmit_skbs(sq);

	if (use_napi && kick)
		virtqueue_enable_cb_delayed(sq->vq);

	/* timestamp packet in software */
	skb_tx_timestamp(skb);

	/* Try to transmit */
	err = xmit_skb(sq, skb);

	/* This should not happen! */
	if (unlikely(err)) {
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
				 "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
		dev->stats.tx_dropped++;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

	/* Don't wait up for transmitted skbs to be freed. */
	if (!use_napi) {
		skb_orphan(skb);
		nf_reset(skb);
	}

	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
	 */
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
		netif_stop_subqueue(dev, qnum);
		if (!use_napi &&
		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
			/* More just got used, free them then recheck. */
			free_old_xmit_skbs(sq);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
				netif_start_subqueue(dev, qnum);
				virtqueue_disable_cb(sq->vq);
			}
		}
	}

	if (kick || netif_xmit_stopped(txq))
		virtqueue_kick(sq->vq);

	return NETDEV_TX_OK;
}

/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
 * never fail unless improperly formatted.
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
				 struct scatterlist *out)
{
	struct scatterlist *sgs[4], hdr, stat;
	unsigned out_num = 0, tmp;

	/* Caller should know better */
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));

	vi->ctrl->status = ~0;
	vi->ctrl->hdr.class = class;
	vi->ctrl->hdr.cmd = cmd;
	/* Add header */
	sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
	sgs[out_num++] = &hdr;

	if (out)
		sgs[out_num++] = out;

	/* Add return status. */
	sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
	sgs[out_num] = &stat;

	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);

	if (unlikely(!virtqueue_kick(vi->cvq)))
		return vi->ctrl->status == VIRTIO_NET_OK;

	/* Spin for a response; the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
		cpu_relax();

	return vi->ctrl->status == VIRTIO_NET_OK;
}

static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
	int ret;
	struct sockaddr *addr;
	struct scatterlist sg;

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
	if (ret)
		goto out;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
			ret = -EINVAL;
			goto out;
		}
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
	}

	eth_commit_mac_addr_change(dev, p);
	ret = 0;

out:
	kfree(addr);
	return ret;
}

static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int start;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		u64 tpackets, tbytes, rpackets, rbytes;
		struct receive_queue *rq = &vi->rq[i];
		struct send_queue *sq = &vi->sq[i];

		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			tpackets = sq->stats.packets;
			tbytes = sq->stats.bytes;
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));

		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			rpackets = rq->stats.packets;
			rbytes = rq->stats.bytes;
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->rx_dropped = dev->stats.rx_dropped;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void virtnet_netpoll(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++)
		napi_schedule(&vi->rq[i].napi);
}
#endif

static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

	vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
	} else {
		vi->curr_queue_pairs = queue_pairs;
		/* virtnet_open() will refill when the device goes up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
	}

	return 0;
}

static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
		napi_disable(&vi->rq[i].napi);
		virtnet_napi_tx_disable(&vi->sq[i].napi);
	}

	return 0;
}

static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
	struct netdev_hw_addr *ha;
	int uc_count;
	int mc_count;
	void *buf;
	int i;

	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

	vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);

	sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
			 vi->ctrl->promisc ? "en" : "dis");

	sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
			 vi->ctrl->allmulti ? "en" : "dis");
"en" : "dis"); 1752 1753 uc_count = netdev_uc_count(dev); 1754 mc_count = netdev_mc_count(dev); 1755 /* MAC filter - use one buffer for both lists */ 1756 buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) + 1757 (2 * sizeof(mac_data->entries)), GFP_ATOMIC); 1758 mac_data = buf; 1759 if (!buf) 1760 return; 1761 1762 sg_init_table(sg, 2); 1763 1764 /* Store the unicast list and count in the front of the buffer */ 1765 mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count); 1766 i = 0; 1767 netdev_for_each_uc_addr(ha, dev) 1768 memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); 1769 1770 sg_set_buf(&sg[0], mac_data, 1771 sizeof(mac_data->entries) + (uc_count * ETH_ALEN)); 1772 1773 /* multicast list and count fill the end */ 1774 mac_data = (void *)&mac_data->macs[uc_count][0]; 1775 1776 mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count); 1777 i = 0; 1778 netdev_for_each_mc_addr(ha, dev) 1779 memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); 1780 1781 sg_set_buf(&sg[1], mac_data, 1782 sizeof(mac_data->entries) + (mc_count * ETH_ALEN)); 1783 1784 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, 1785 VIRTIO_NET_CTRL_MAC_TABLE_SET, sg)) 1786 dev_warn(&dev->dev, "Failed to set MAC filter table.\n"); 1787 1788 kfree(buf); 1789 } 1790 1791 static int virtnet_vlan_rx_add_vid(struct net_device *dev, 1792 __be16 proto, u16 vid) 1793 { 1794 struct virtnet_info *vi = netdev_priv(dev); 1795 struct scatterlist sg; 1796 1797 vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid); 1798 sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid)); 1799 1800 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, 1801 VIRTIO_NET_CTRL_VLAN_ADD, &sg)) 1802 dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid); 1803 return 0; 1804 } 1805 1806 static int virtnet_vlan_rx_kill_vid(struct net_device *dev, 1807 __be16 proto, u16 vid) 1808 { 1809 struct virtnet_info *vi = netdev_priv(dev); 1810 struct scatterlist sg; 1811 1812 vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid); 1813 sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid)); 1814 1815 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, 1816 VIRTIO_NET_CTRL_VLAN_DEL, &sg)) 1817 dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); 1818 return 0; 1819 } 1820 1821 static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu) 1822 { 1823 int i; 1824 1825 if (vi->affinity_hint_set) { 1826 for (i = 0; i < vi->max_queue_pairs; i++) { 1827 virtqueue_set_affinity(vi->rq[i].vq, -1); 1828 virtqueue_set_affinity(vi->sq[i].vq, -1); 1829 } 1830 1831 vi->affinity_hint_set = false; 1832 } 1833 } 1834 1835 static void virtnet_set_affinity(struct virtnet_info *vi) 1836 { 1837 int i; 1838 int cpu; 1839 1840 /* In multiqueue mode, when the number of cpu is equal to the number of 1841 * queue pairs, we let the queue pairs to be private to one cpu by 1842 * setting the affinity hint to eliminate the contention. 
	 */
	if (vi->curr_queue_pairs == 1 ||
	    vi->max_queue_pairs != num_online_cpus()) {
		virtnet_clean_affinity(vi, -1);
		return;
	}

	i = 0;
	for_each_online_cpu(cpu) {
		virtqueue_set_affinity(vi->rq[i].vq, cpu);
		virtqueue_set_affinity(vi->sq[i].vq, cpu);
		netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
		i++;
	}

	vi->affinity_hint_set = true;
}

static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi, cpu);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
}

static void virtnet_get_ringparam(struct net_device *dev,
				  struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}

static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
}

/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
		return -EINVAL;

	/* For now we don't support modifying channels while XDP is loaded.
	 * Also, when XDP is loaded all RX queues have XDP programs, so we
	 * only need to check a single RX queue.
1954 */ 1955 if (vi->rq[0].xdp_prog) 1956 return -EINVAL; 1957 1958 get_online_cpus(); 1959 err = _virtnet_set_queues(vi, queue_pairs); 1960 if (!err) { 1961 netif_set_real_num_tx_queues(dev, queue_pairs); 1962 netif_set_real_num_rx_queues(dev, queue_pairs); 1963 1964 virtnet_set_affinity(vi); 1965 } 1966 put_online_cpus(); 1967 1968 return err; 1969 } 1970 1971 static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) 1972 { 1973 struct virtnet_info *vi = netdev_priv(dev); 1974 char *p = (char *)data; 1975 unsigned int i, j; 1976 1977 switch (stringset) { 1978 case ETH_SS_STATS: 1979 for (i = 0; i < vi->curr_queue_pairs; i++) { 1980 for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) { 1981 snprintf(p, ETH_GSTRING_LEN, "rx_queue_%u_%s", 1982 i, virtnet_rq_stats_desc[j].desc); 1983 p += ETH_GSTRING_LEN; 1984 } 1985 } 1986 1987 for (i = 0; i < vi->curr_queue_pairs; i++) { 1988 for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) { 1989 snprintf(p, ETH_GSTRING_LEN, "tx_queue_%u_%s", 1990 i, virtnet_sq_stats_desc[j].desc); 1991 p += ETH_GSTRING_LEN; 1992 } 1993 } 1994 break; 1995 } 1996 } 1997 1998 static int virtnet_get_sset_count(struct net_device *dev, int sset) 1999 { 2000 struct virtnet_info *vi = netdev_priv(dev); 2001 2002 switch (sset) { 2003 case ETH_SS_STATS: 2004 return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + 2005 VIRTNET_SQ_STATS_LEN); 2006 default: 2007 return -EOPNOTSUPP; 2008 } 2009 } 2010 2011 static void virtnet_get_ethtool_stats(struct net_device *dev, 2012 struct ethtool_stats *stats, u64 *data) 2013 { 2014 struct virtnet_info *vi = netdev_priv(dev); 2015 unsigned int idx = 0, start, i, j; 2016 const u8 *stats_base; 2017 size_t offset; 2018 2019 for (i = 0; i < vi->curr_queue_pairs; i++) { 2020 struct receive_queue *rq = &vi->rq[i]; 2021 2022 stats_base = (u8 *)&rq->stats; 2023 do { 2024 start = u64_stats_fetch_begin_irq(&rq->stats.syncp); 2025 for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) { 2026 offset = virtnet_rq_stats_desc[j].offset; 2027 data[idx + j] = *(u64 *)(stats_base + offset); 2028 } 2029 } while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start)); 2030 idx += VIRTNET_RQ_STATS_LEN; 2031 } 2032 2033 for (i = 0; i < vi->curr_queue_pairs; i++) { 2034 struct send_queue *sq = &vi->sq[i]; 2035 2036 stats_base = (u8 *)&sq->stats; 2037 do { 2038 start = u64_stats_fetch_begin_irq(&sq->stats.syncp); 2039 for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) { 2040 offset = virtnet_sq_stats_desc[j].offset; 2041 data[idx + j] = *(u64 *)(stats_base + offset); 2042 } 2043 } while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start)); 2044 idx += VIRTNET_SQ_STATS_LEN; 2045 } 2046 } 2047 2048 static void virtnet_get_channels(struct net_device *dev, 2049 struct ethtool_channels *channels) 2050 { 2051 struct virtnet_info *vi = netdev_priv(dev); 2052 2053 channels->combined_count = vi->curr_queue_pairs; 2054 channels->max_combined = vi->max_queue_pairs; 2055 channels->max_other = 0; 2056 channels->rx_count = 0; 2057 channels->tx_count = 0; 2058 channels->other_count = 0; 2059 } 2060 2061 /* Check if the user is trying to change anything besides speed/duplex */ 2062 static bool 2063 virtnet_validate_ethtool_cmd(const struct ethtool_link_ksettings *cmd) 2064 { 2065 struct ethtool_link_ksettings diff1 = *cmd; 2066 struct ethtool_link_ksettings diff2 = {}; 2067 2068 /* cmd is always set so we need to clear it, validate the port type 2069 * and also without autonegotiation we can ignore advertising 2070 */ 2071 diff1.base.speed = 0; 2072 diff2.base.port = PORT_OTHER; 2073 
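        /* The fields a caller may legitimately set (speed, duplex, the
         * advertised modes and the request bookkeeping) are zeroed in diff1;
         * whatever is left must then match an otherwise-empty request whose
         * port is PORT_OTHER.
         */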
ethtool_link_ksettings_zero_link_mode(&diff1, advertising); 2074 diff1.base.duplex = 0; 2075 diff1.base.cmd = 0; 2076 diff1.base.link_mode_masks_nwords = 0; 2077 2078 return !memcmp(&diff1.base, &diff2.base, sizeof(diff1.base)) && 2079 bitmap_empty(diff1.link_modes.supported, 2080 __ETHTOOL_LINK_MODE_MASK_NBITS) && 2081 bitmap_empty(diff1.link_modes.advertising, 2082 __ETHTOOL_LINK_MODE_MASK_NBITS) && 2083 bitmap_empty(diff1.link_modes.lp_advertising, 2084 __ETHTOOL_LINK_MODE_MASK_NBITS); 2085 } 2086 2087 static int virtnet_set_link_ksettings(struct net_device *dev, 2088 const struct ethtool_link_ksettings *cmd) 2089 { 2090 struct virtnet_info *vi = netdev_priv(dev); 2091 u32 speed; 2092 2093 speed = cmd->base.speed; 2094 /* don't allow custom speed and duplex */ 2095 if (!ethtool_validate_speed(speed) || 2096 !ethtool_validate_duplex(cmd->base.duplex) || 2097 !virtnet_validate_ethtool_cmd(cmd)) 2098 return -EINVAL; 2099 vi->speed = speed; 2100 vi->duplex = cmd->base.duplex; 2101 2102 return 0; 2103 } 2104 2105 static int virtnet_get_link_ksettings(struct net_device *dev, 2106 struct ethtool_link_ksettings *cmd) 2107 { 2108 struct virtnet_info *vi = netdev_priv(dev); 2109 2110 cmd->base.speed = vi->speed; 2111 cmd->base.duplex = vi->duplex; 2112 cmd->base.port = PORT_OTHER; 2113 2114 return 0; 2115 } 2116 2117 static void virtnet_init_settings(struct net_device *dev) 2118 { 2119 struct virtnet_info *vi = netdev_priv(dev); 2120 2121 vi->speed = SPEED_UNKNOWN; 2122 vi->duplex = DUPLEX_UNKNOWN; 2123 } 2124 2125 static void virtnet_update_settings(struct virtnet_info *vi) 2126 { 2127 u32 speed; 2128 u8 duplex; 2129 2130 if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX)) 2131 return; 2132 2133 speed = virtio_cread32(vi->vdev, offsetof(struct virtio_net_config, 2134 speed)); 2135 if (ethtool_validate_speed(speed)) 2136 vi->speed = speed; 2137 duplex = virtio_cread8(vi->vdev, offsetof(struct virtio_net_config, 2138 duplex)); 2139 if (ethtool_validate_duplex(duplex)) 2140 vi->duplex = duplex; 2141 } 2142 2143 static const struct ethtool_ops virtnet_ethtool_ops = { 2144 .get_drvinfo = virtnet_get_drvinfo, 2145 .get_link = ethtool_op_get_link, 2146 .get_ringparam = virtnet_get_ringparam, 2147 .get_strings = virtnet_get_strings, 2148 .get_sset_count = virtnet_get_sset_count, 2149 .get_ethtool_stats = virtnet_get_ethtool_stats, 2150 .set_channels = virtnet_set_channels, 2151 .get_channels = virtnet_get_channels, 2152 .get_ts_info = ethtool_op_get_ts_info, 2153 .get_link_ksettings = virtnet_get_link_ksettings, 2154 .set_link_ksettings = virtnet_set_link_ksettings, 2155 }; 2156 2157 static void virtnet_freeze_down(struct virtio_device *vdev) 2158 { 2159 struct virtnet_info *vi = vdev->priv; 2160 int i; 2161 2162 /* Make sure no work handler is accessing the device */ 2163 flush_work(&vi->config_work); 2164 2165 netif_device_detach(vi->dev); 2166 netif_tx_disable(vi->dev); 2167 cancel_delayed_work_sync(&vi->refill); 2168 2169 if (netif_running(vi->dev)) { 2170 for (i = 0; i < vi->max_queue_pairs; i++) { 2171 napi_disable(&vi->rq[i].napi); 2172 virtnet_napi_tx_disable(&vi->sq[i].napi); 2173 } 2174 } 2175 } 2176 2177 static int init_vqs(struct virtnet_info *vi); 2178 2179 static int virtnet_restore_up(struct virtio_device *vdev) 2180 { 2181 struct virtnet_info *vi = vdev->priv; 2182 int err, i; 2183 2184 err = init_vqs(vi); 2185 if (err) 2186 return err; 2187 2188 virtio_device_ready(vdev); 2189 2190 if (netif_running(vi->dev)) { 2191 for (i = 0; i < vi->curr_queue_pairs; i++) 2192 if 
(!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
                                schedule_delayed_work(&vi->refill, 0);

                for (i = 0; i < vi->max_queue_pairs; i++) {
                        virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
                        virtnet_napi_tx_enable(vi, vi->sq[i].vq,
                                               &vi->sq[i].napi);
                }
        }

        netif_device_attach(vi->dev);
        return err;
}

static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
{
        struct scatterlist sg;
        vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);

        sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));

        if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
                                  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
                dev_warn(&vi->dev->dev, "Failed to set guest offloads.\n");
                return -EINVAL;
        }

        return 0;
}

static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
{
        u64 offloads = 0;

        if (!vi->guest_offloads)
                return 0;

        if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))
                offloads = 1ULL << VIRTIO_NET_F_GUEST_CSUM;

        return virtnet_set_guest_offloads(vi, offloads);
}

static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
{
        u64 offloads = vi->guest_offloads;

        if (!vi->guest_offloads)
                return 0;
        if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))
                offloads |= 1ULL << VIRTIO_NET_F_GUEST_CSUM;

        return virtnet_set_guest_offloads(vi, offloads);
}

static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
                           struct netlink_ext_ack *extack)
{
        unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
        struct virtnet_info *vi = netdev_priv(dev);
        struct bpf_prog *old_prog;
        u16 xdp_qp = 0, curr_qp;
        int i, err;

        if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
            && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO))) {
                NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO, disable LRO first");
                return -EOPNOTSUPP;
        }

        if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
                NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
                return -EINVAL;
        }

        if (dev->mtu > max_sz) {
                NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
                netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
                return -EINVAL;
        }

        curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
        if (prog)
                xdp_qp = nr_cpu_ids;

        /* XDP requires extra queues for XDP_TX */
        if (curr_qp + xdp_qp > vi->max_queue_pairs) {
                NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available");
                netdev_warn(dev, "request %i queues but max is %i\n",
                            curr_qp + xdp_qp, vi->max_queue_pairs);
                return -ENOMEM;
        }

        if (prog) {
                prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
                if (IS_ERR(prog))
                        return PTR_ERR(prog);
        }

        /* Make sure NAPI is not using any XDP TX queues for RX.
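         * Disabling RX NAPI on every queue also quiesces the receive path
         * while the per-queue xdp_prog pointers and the queue count are
         * updated below.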
*/ 2295 if (netif_running(dev)) 2296 for (i = 0; i < vi->max_queue_pairs; i++) 2297 napi_disable(&vi->rq[i].napi); 2298 2299 netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp); 2300 err = _virtnet_set_queues(vi, curr_qp + xdp_qp); 2301 if (err) 2302 goto err; 2303 vi->xdp_queue_pairs = xdp_qp; 2304 2305 for (i = 0; i < vi->max_queue_pairs; i++) { 2306 old_prog = rtnl_dereference(vi->rq[i].xdp_prog); 2307 rcu_assign_pointer(vi->rq[i].xdp_prog, prog); 2308 if (i == 0) { 2309 if (!old_prog) 2310 virtnet_clear_guest_offloads(vi); 2311 if (!prog) 2312 virtnet_restore_guest_offloads(vi); 2313 } 2314 if (old_prog) 2315 bpf_prog_put(old_prog); 2316 if (netif_running(dev)) 2317 virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); 2318 } 2319 2320 return 0; 2321 2322 err: 2323 for (i = 0; i < vi->max_queue_pairs; i++) 2324 virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); 2325 if (prog) 2326 bpf_prog_sub(prog, vi->max_queue_pairs - 1); 2327 return err; 2328 } 2329 2330 static u32 virtnet_xdp_query(struct net_device *dev) 2331 { 2332 struct virtnet_info *vi = netdev_priv(dev); 2333 const struct bpf_prog *xdp_prog; 2334 int i; 2335 2336 for (i = 0; i < vi->max_queue_pairs; i++) { 2337 xdp_prog = rtnl_dereference(vi->rq[i].xdp_prog); 2338 if (xdp_prog) 2339 return xdp_prog->aux->id; 2340 } 2341 return 0; 2342 } 2343 2344 static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) 2345 { 2346 switch (xdp->command) { 2347 case XDP_SETUP_PROG: 2348 return virtnet_xdp_set(dev, xdp->prog, xdp->extack); 2349 case XDP_QUERY_PROG: 2350 xdp->prog_id = virtnet_xdp_query(dev); 2351 xdp->prog_attached = !!xdp->prog_id; 2352 return 0; 2353 default: 2354 return -EINVAL; 2355 } 2356 } 2357 2358 static int virtnet_get_phys_port_name(struct net_device *dev, char *buf, 2359 size_t len) 2360 { 2361 struct virtnet_info *vi = netdev_priv(dev); 2362 int ret; 2363 2364 if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY)) 2365 return -EOPNOTSUPP; 2366 2367 ret = snprintf(buf, len, "sby"); 2368 if (ret >= len) 2369 return -EOPNOTSUPP; 2370 2371 return 0; 2372 } 2373 2374 static const struct net_device_ops virtnet_netdev = { 2375 .ndo_open = virtnet_open, 2376 .ndo_stop = virtnet_close, 2377 .ndo_start_xmit = start_xmit, 2378 .ndo_validate_addr = eth_validate_addr, 2379 .ndo_set_mac_address = virtnet_set_mac_address, 2380 .ndo_set_rx_mode = virtnet_set_rx_mode, 2381 .ndo_get_stats64 = virtnet_stats, 2382 .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid, 2383 .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid, 2384 #ifdef CONFIG_NET_POLL_CONTROLLER 2385 .ndo_poll_controller = virtnet_netpoll, 2386 #endif 2387 .ndo_bpf = virtnet_xdp, 2388 .ndo_xdp_xmit = virtnet_xdp_xmit, 2389 .ndo_features_check = passthru_features_check, 2390 .ndo_get_phys_port_name = virtnet_get_phys_port_name, 2391 }; 2392 2393 static void virtnet_config_changed_work(struct work_struct *work) 2394 { 2395 struct virtnet_info *vi = 2396 container_of(work, struct virtnet_info, config_work); 2397 u16 v; 2398 2399 if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS, 2400 struct virtio_net_config, status, &v) < 0) 2401 return; 2402 2403 if (v & VIRTIO_NET_S_ANNOUNCE) { 2404 netdev_notify_peers(vi->dev); 2405 virtnet_ack_link_announce(vi); 2406 } 2407 2408 /* Ignore unknown (future) status bits */ 2409 v &= VIRTIO_NET_S_LINK_UP; 2410 2411 if (vi->status == v) 2412 return; 2413 2414 vi->status = v; 2415 2416 if (vi->status & VIRTIO_NET_S_LINK_UP) { 2417 virtnet_update_settings(vi); 2418 netif_carrier_on(vi->dev); 2419 
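                /* Link is back up: restart the TX queues that were stopped
                 * when the carrier last went down.
                 */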
netif_tx_wake_all_queues(vi->dev); 2420 } else { 2421 netif_carrier_off(vi->dev); 2422 netif_tx_stop_all_queues(vi->dev); 2423 } 2424 } 2425 2426 static void virtnet_config_changed(struct virtio_device *vdev) 2427 { 2428 struct virtnet_info *vi = vdev->priv; 2429 2430 schedule_work(&vi->config_work); 2431 } 2432 2433 static void virtnet_free_queues(struct virtnet_info *vi) 2434 { 2435 int i; 2436 2437 for (i = 0; i < vi->max_queue_pairs; i++) { 2438 napi_hash_del(&vi->rq[i].napi); 2439 netif_napi_del(&vi->rq[i].napi); 2440 netif_napi_del(&vi->sq[i].napi); 2441 } 2442 2443 /* We called napi_hash_del() before netif_napi_del(), 2444 * we need to respect an RCU grace period before freeing vi->rq 2445 */ 2446 synchronize_net(); 2447 2448 kfree(vi->rq); 2449 kfree(vi->sq); 2450 kfree(vi->ctrl); 2451 } 2452 2453 static void _free_receive_bufs(struct virtnet_info *vi) 2454 { 2455 struct bpf_prog *old_prog; 2456 int i; 2457 2458 for (i = 0; i < vi->max_queue_pairs; i++) { 2459 while (vi->rq[i].pages) 2460 __free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0); 2461 2462 old_prog = rtnl_dereference(vi->rq[i].xdp_prog); 2463 RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL); 2464 if (old_prog) 2465 bpf_prog_put(old_prog); 2466 } 2467 } 2468 2469 static void free_receive_bufs(struct virtnet_info *vi) 2470 { 2471 rtnl_lock(); 2472 _free_receive_bufs(vi); 2473 rtnl_unlock(); 2474 } 2475 2476 static void free_receive_page_frags(struct virtnet_info *vi) 2477 { 2478 int i; 2479 for (i = 0; i < vi->max_queue_pairs; i++) 2480 if (vi->rq[i].alloc_frag.page) 2481 put_page(vi->rq[i].alloc_frag.page); 2482 } 2483 2484 static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q) 2485 { 2486 if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs)) 2487 return false; 2488 else if (q < vi->curr_queue_pairs) 2489 return true; 2490 else 2491 return false; 2492 } 2493 2494 static void free_unused_bufs(struct virtnet_info *vi) 2495 { 2496 void *buf; 2497 int i; 2498 2499 for (i = 0; i < vi->max_queue_pairs; i++) { 2500 struct virtqueue *vq = vi->sq[i].vq; 2501 while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) { 2502 if (!is_xdp_raw_buffer_queue(vi, i)) 2503 dev_kfree_skb(buf); 2504 else 2505 put_page(virt_to_head_page(buf)); 2506 } 2507 } 2508 2509 for (i = 0; i < vi->max_queue_pairs; i++) { 2510 struct virtqueue *vq = vi->rq[i].vq; 2511 2512 while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) { 2513 if (vi->mergeable_rx_bufs) { 2514 put_page(virt_to_head_page(buf)); 2515 } else if (vi->big_packets) { 2516 give_pages(&vi->rq[i], buf); 2517 } else { 2518 put_page(virt_to_head_page(buf)); 2519 } 2520 } 2521 } 2522 } 2523 2524 static void virtnet_del_vqs(struct virtnet_info *vi) 2525 { 2526 struct virtio_device *vdev = vi->vdev; 2527 2528 virtnet_clean_affinity(vi, -1); 2529 2530 vdev->config->del_vqs(vdev); 2531 2532 virtnet_free_queues(vi); 2533 } 2534 2535 /* How large should a single buffer be so a queue full of these can fit at 2536 * least one full packet? 2537 * Logic below assumes the mergeable buffer header is used. 2538 */ 2539 static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq) 2540 { 2541 const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf); 2542 unsigned int rq_size = virtqueue_get_vring_size(vq); 2543 unsigned int packet_len = vi->big_packets ? 
IP_MAX_MTU : vi->dev->max_mtu; 2544 unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len; 2545 unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size); 2546 2547 return max(max(min_buf_len, hdr_len) - hdr_len, 2548 (unsigned int)GOOD_PACKET_LEN); 2549 } 2550 2551 static int virtnet_find_vqs(struct virtnet_info *vi) 2552 { 2553 vq_callback_t **callbacks; 2554 struct virtqueue **vqs; 2555 int ret = -ENOMEM; 2556 int i, total_vqs; 2557 const char **names; 2558 bool *ctx; 2559 2560 /* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by 2561 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by 2562 * possible control vq. 2563 */ 2564 total_vqs = vi->max_queue_pairs * 2 + 2565 virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ); 2566 2567 /* Allocate space for find_vqs parameters */ 2568 vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL); 2569 if (!vqs) 2570 goto err_vq; 2571 callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL); 2572 if (!callbacks) 2573 goto err_callback; 2574 names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL); 2575 if (!names) 2576 goto err_names; 2577 if (!vi->big_packets || vi->mergeable_rx_bufs) { 2578 ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL); 2579 if (!ctx) 2580 goto err_ctx; 2581 } else { 2582 ctx = NULL; 2583 } 2584 2585 /* Parameters for control virtqueue, if any */ 2586 if (vi->has_cvq) { 2587 callbacks[total_vqs - 1] = NULL; 2588 names[total_vqs - 1] = "control"; 2589 } 2590 2591 /* Allocate/initialize parameters for send/receive virtqueues */ 2592 for (i = 0; i < vi->max_queue_pairs; i++) { 2593 callbacks[rxq2vq(i)] = skb_recv_done; 2594 callbacks[txq2vq(i)] = skb_xmit_done; 2595 sprintf(vi->rq[i].name, "input.%d", i); 2596 sprintf(vi->sq[i].name, "output.%d", i); 2597 names[rxq2vq(i)] = vi->rq[i].name; 2598 names[txq2vq(i)] = vi->sq[i].name; 2599 if (ctx) 2600 ctx[rxq2vq(i)] = true; 2601 } 2602 2603 ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks, 2604 names, ctx, NULL); 2605 if (ret) 2606 goto err_find; 2607 2608 if (vi->has_cvq) { 2609 vi->cvq = vqs[total_vqs - 1]; 2610 if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN)) 2611 vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; 2612 } 2613 2614 for (i = 0; i < vi->max_queue_pairs; i++) { 2615 vi->rq[i].vq = vqs[rxq2vq(i)]; 2616 vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); 2617 vi->sq[i].vq = vqs[txq2vq(i)]; 2618 } 2619 2620 /* run here: ret == 0. */ 2621 2622 2623 err_find: 2624 kfree(ctx); 2625 err_ctx: 2626 kfree(names); 2627 err_names: 2628 kfree(callbacks); 2629 err_callback: 2630 kfree(vqs); 2631 err_vq: 2632 return ret; 2633 } 2634 2635 static int virtnet_alloc_queues(struct virtnet_info *vi) 2636 { 2637 int i; 2638 2639 vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL); 2640 if (!vi->ctrl) 2641 goto err_ctrl; 2642 vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL); 2643 if (!vi->sq) 2644 goto err_sq; 2645 vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL); 2646 if (!vi->rq) 2647 goto err_rq; 2648 2649 INIT_DELAYED_WORK(&vi->refill, refill_work); 2650 for (i = 0; i < vi->max_queue_pairs; i++) { 2651 vi->rq[i].pages = NULL; 2652 netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll, 2653 napi_weight); 2654 netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx, 2655 napi_tx ? 
napi_weight : 0); 2656 2657 sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg)); 2658 ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len); 2659 sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg)); 2660 2661 u64_stats_init(&vi->rq[i].stats.syncp); 2662 u64_stats_init(&vi->sq[i].stats.syncp); 2663 } 2664 2665 return 0; 2666 2667 err_rq: 2668 kfree(vi->sq); 2669 err_sq: 2670 kfree(vi->ctrl); 2671 err_ctrl: 2672 return -ENOMEM; 2673 } 2674 2675 static int init_vqs(struct virtnet_info *vi) 2676 { 2677 int ret; 2678 2679 /* Allocate send & receive queues */ 2680 ret = virtnet_alloc_queues(vi); 2681 if (ret) 2682 goto err; 2683 2684 ret = virtnet_find_vqs(vi); 2685 if (ret) 2686 goto err_free; 2687 2688 get_online_cpus(); 2689 virtnet_set_affinity(vi); 2690 put_online_cpus(); 2691 2692 return 0; 2693 2694 err_free: 2695 virtnet_free_queues(vi); 2696 err: 2697 return ret; 2698 } 2699 2700 #ifdef CONFIG_SYSFS 2701 static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue, 2702 char *buf) 2703 { 2704 struct virtnet_info *vi = netdev_priv(queue->dev); 2705 unsigned int queue_index = get_netdev_rx_queue_index(queue); 2706 unsigned int headroom = virtnet_get_headroom(vi); 2707 unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0; 2708 struct ewma_pkt_len *avg; 2709 2710 BUG_ON(queue_index >= vi->max_queue_pairs); 2711 avg = &vi->rq[queue_index].mrg_avg_pkt_len; 2712 return sprintf(buf, "%u\n", 2713 get_mergeable_buf_len(&vi->rq[queue_index], avg, 2714 SKB_DATA_ALIGN(headroom + tailroom))); 2715 } 2716 2717 static struct rx_queue_attribute mergeable_rx_buffer_size_attribute = 2718 __ATTR_RO(mergeable_rx_buffer_size); 2719 2720 static struct attribute *virtio_net_mrg_rx_attrs[] = { 2721 &mergeable_rx_buffer_size_attribute.attr, 2722 NULL 2723 }; 2724 2725 static const struct attribute_group virtio_net_mrg_rx_group = { 2726 .name = "virtio_net", 2727 .attrs = virtio_net_mrg_rx_attrs 2728 }; 2729 #endif 2730 2731 static bool virtnet_fail_on_feature(struct virtio_device *vdev, 2732 unsigned int fbit, 2733 const char *fname, const char *dname) 2734 { 2735 if (!virtio_has_feature(vdev, fbit)) 2736 return false; 2737 2738 dev_err(&vdev->dev, "device advertises feature %s but not %s", 2739 fname, dname); 2740 2741 return true; 2742 } 2743 2744 #define VIRTNET_FAIL_ON(vdev, fbit, dbit) \ 2745 virtnet_fail_on_feature(vdev, fbit, #fbit, dbit) 2746 2747 static bool virtnet_validate_features(struct virtio_device *vdev) 2748 { 2749 if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) && 2750 (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX, 2751 "VIRTIO_NET_F_CTRL_VQ") || 2752 VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN, 2753 "VIRTIO_NET_F_CTRL_VQ") || 2754 VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE, 2755 "VIRTIO_NET_F_CTRL_VQ") || 2756 VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") || 2757 VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR, 2758 "VIRTIO_NET_F_CTRL_VQ"))) { 2759 return false; 2760 } 2761 2762 return true; 2763 } 2764 2765 #define MIN_MTU ETH_MIN_MTU 2766 #define MAX_MTU ETH_MAX_MTU 2767 2768 static int virtnet_validate(struct virtio_device *vdev) 2769 { 2770 if (!vdev->config->get) { 2771 dev_err(&vdev->dev, "%s failure: config access disabled\n", 2772 __func__); 2773 return -EINVAL; 2774 } 2775 2776 if (!virtnet_validate_features(vdev)) 2777 return -EINVAL; 2778 2779 if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) { 2780 int mtu = virtio_cread16(vdev, 2781 offsetof(struct virtio_net_config, 2782 mtu)); 2783 if (mtu < MIN_MTU) 2784 
__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
        }

        return 0;
}

static int virtnet_probe(struct virtio_device *vdev)
{
        int i, err = -ENOMEM;
        struct net_device *dev;
        struct virtnet_info *vi;
        u16 max_queue_pairs;
        int mtu;

        /* Find if host supports multiqueue virtio_net device */
        err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
                                   struct virtio_net_config,
                                   max_virtqueue_pairs, &max_queue_pairs);

        /* We need at least 2 queues */
        if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
            max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
            !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
                max_queue_pairs = 1;

        /* Allocate ourselves a network device with room for our info */
        dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
        if (!dev)
                return -ENOMEM;

        /* Set up network device as normal. */
        dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
        dev->netdev_ops = &virtnet_netdev;
        dev->features = NETIF_F_HIGHDMA;

        dev->ethtool_ops = &virtnet_ethtool_ops;
        SET_NETDEV_DEV(dev, &vdev->dev);

        /* Do we support "hardware" checksums? */
        if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
                /* This opens up the world of extra features. */
                dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
                if (csum)
                        dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

                if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
                        dev->hw_features |= NETIF_F_TSO
                                | NETIF_F_TSO_ECN | NETIF_F_TSO6;
                }
                /* Individual feature bits: what can host handle? */
                if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
                        dev->hw_features |= NETIF_F_TSO;
                if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
                        dev->hw_features |= NETIF_F_TSO6;
                if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
                        dev->hw_features |= NETIF_F_TSO_ECN;

                dev->features |= NETIF_F_GSO_ROBUST;

                if (gso)
                        dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
                /* (!csum && gso) case will be fixed by register_netdev() */
        }
        if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
                dev->features |= NETIF_F_RXCSUM;

        dev->vlan_features = dev->features;

        /* MTU range: 68 - 65535 */
        dev->min_mtu = MIN_MTU;
        dev->max_mtu = MAX_MTU;

        /* Configuration may specify what MAC to use. Otherwise random. */
        if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
                virtio_cread_bytes(vdev,
                                   offsetof(struct virtio_net_config, mac),
                                   dev->dev_addr, dev->addr_len);
        else
                eth_hw_addr_random(dev);

        /* Set up our device-specific information */
        vi = netdev_priv(dev);
        vi->dev = dev;
        vi->vdev = vdev;
        vdev->priv = vi;

        INIT_WORK(&vi->config_work, virtnet_config_changed_work);

        /* If we can receive ANY GSO packets, we must allocate large ones.
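         * (big_packets switches the receive path to chains of pages sized for
         * a maximal GSO frame rather than MTU-sized buffers.)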
*/ 2873 if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || 2874 virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) || 2875 virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) || 2876 virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO)) 2877 vi->big_packets = true; 2878 2879 if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) 2880 vi->mergeable_rx_bufs = true; 2881 2882 if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) || 2883 virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) 2884 vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf); 2885 else 2886 vi->hdr_len = sizeof(struct virtio_net_hdr); 2887 2888 if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) || 2889 virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) 2890 vi->any_header_sg = true; 2891 2892 if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) 2893 vi->has_cvq = true; 2894 2895 if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) { 2896 mtu = virtio_cread16(vdev, 2897 offsetof(struct virtio_net_config, 2898 mtu)); 2899 if (mtu < dev->min_mtu) { 2900 /* Should never trigger: MTU was previously validated 2901 * in virtnet_validate. 2902 */ 2903 dev_err(&vdev->dev, "device MTU appears to have changed " 2904 "it is now %d < %d", mtu, dev->min_mtu); 2905 goto free; 2906 } 2907 2908 dev->mtu = mtu; 2909 dev->max_mtu = mtu; 2910 2911 /* TODO: size buffers correctly in this case. */ 2912 if (dev->mtu > ETH_DATA_LEN) 2913 vi->big_packets = true; 2914 } 2915 2916 if (vi->any_header_sg) 2917 dev->needed_headroom = vi->hdr_len; 2918 2919 /* Enable multiqueue by default */ 2920 if (num_online_cpus() >= max_queue_pairs) 2921 vi->curr_queue_pairs = max_queue_pairs; 2922 else 2923 vi->curr_queue_pairs = num_online_cpus(); 2924 vi->max_queue_pairs = max_queue_pairs; 2925 2926 /* Allocate/initialize the rx/tx queues, and invoke find_vqs */ 2927 err = init_vqs(vi); 2928 if (err) 2929 goto free; 2930 2931 #ifdef CONFIG_SYSFS 2932 if (vi->mergeable_rx_bufs) 2933 dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group; 2934 #endif 2935 netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs); 2936 netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs); 2937 2938 virtnet_init_settings(dev); 2939 2940 if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) { 2941 vi->failover = net_failover_create(vi->dev); 2942 if (IS_ERR(vi->failover)) { 2943 err = PTR_ERR(vi->failover); 2944 goto free_vqs; 2945 } 2946 } 2947 2948 err = register_netdev(dev); 2949 if (err) { 2950 pr_debug("virtio_net: registering device failed\n"); 2951 goto free_failover; 2952 } 2953 2954 virtio_device_ready(vdev); 2955 2956 err = virtnet_cpu_notif_add(vi); 2957 if (err) { 2958 pr_debug("virtio_net: registering cpu notifier failed\n"); 2959 goto free_unregister_netdev; 2960 } 2961 2962 virtnet_set_queues(vi, vi->curr_queue_pairs); 2963 2964 /* Assume link up if device can't report link status, 2965 otherwise get link status from config. 
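           The carrier is left off here; for a VIRTIO_NET_F_STATUS device it
           is only turned on once the config-change worker has read the
           status field.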
*/ 2966 netif_carrier_off(dev); 2967 if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) { 2968 schedule_work(&vi->config_work); 2969 } else { 2970 vi->status = VIRTIO_NET_S_LINK_UP; 2971 virtnet_update_settings(vi); 2972 netif_carrier_on(dev); 2973 } 2974 2975 for (i = 0; i < ARRAY_SIZE(guest_offloads); i++) 2976 if (virtio_has_feature(vi->vdev, guest_offloads[i])) 2977 set_bit(guest_offloads[i], &vi->guest_offloads); 2978 2979 pr_debug("virtnet: registered device %s with %d RX and TX vq's\n", 2980 dev->name, max_queue_pairs); 2981 2982 return 0; 2983 2984 free_unregister_netdev: 2985 vi->vdev->config->reset(vdev); 2986 2987 unregister_netdev(dev); 2988 free_failover: 2989 net_failover_destroy(vi->failover); 2990 free_vqs: 2991 cancel_delayed_work_sync(&vi->refill); 2992 free_receive_page_frags(vi); 2993 virtnet_del_vqs(vi); 2994 free: 2995 free_netdev(dev); 2996 return err; 2997 } 2998 2999 static void remove_vq_common(struct virtnet_info *vi) 3000 { 3001 vi->vdev->config->reset(vi->vdev); 3002 3003 /* Free unused buffers in both send and recv, if any. */ 3004 free_unused_bufs(vi); 3005 3006 free_receive_bufs(vi); 3007 3008 free_receive_page_frags(vi); 3009 3010 virtnet_del_vqs(vi); 3011 } 3012 3013 static void virtnet_remove(struct virtio_device *vdev) 3014 { 3015 struct virtnet_info *vi = vdev->priv; 3016 3017 virtnet_cpu_notif_remove(vi); 3018 3019 /* Make sure no work handler is accessing the device. */ 3020 flush_work(&vi->config_work); 3021 3022 unregister_netdev(vi->dev); 3023 3024 net_failover_destroy(vi->failover); 3025 3026 remove_vq_common(vi); 3027 3028 free_netdev(vi->dev); 3029 } 3030 3031 static __maybe_unused int virtnet_freeze(struct virtio_device *vdev) 3032 { 3033 struct virtnet_info *vi = vdev->priv; 3034 3035 virtnet_cpu_notif_remove(vi); 3036 virtnet_freeze_down(vdev); 3037 remove_vq_common(vi); 3038 3039 return 0; 3040 } 3041 3042 static __maybe_unused int virtnet_restore(struct virtio_device *vdev) 3043 { 3044 struct virtnet_info *vi = vdev->priv; 3045 int err; 3046 3047 err = virtnet_restore_up(vdev); 3048 if (err) 3049 return err; 3050 virtnet_set_queues(vi, vi->curr_queue_pairs); 3051 3052 err = virtnet_cpu_notif_add(vi); 3053 if (err) 3054 return err; 3055 3056 return 0; 3057 } 3058 3059 static struct virtio_device_id id_table[] = { 3060 { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, 3061 { 0 }, 3062 }; 3063 3064 #define VIRTNET_FEATURES \ 3065 VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \ 3066 VIRTIO_NET_F_MAC, \ 3067 VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \ 3068 VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \ 3069 VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \ 3070 VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \ 3071 VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \ 3072 VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \ 3073 VIRTIO_NET_F_CTRL_MAC_ADDR, \ 3074 VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \ 3075 VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY 3076 3077 static unsigned int features[] = { 3078 VIRTNET_FEATURES, 3079 }; 3080 3081 static unsigned int features_legacy[] = { 3082 VIRTNET_FEATURES, 3083 VIRTIO_NET_F_GSO, 3084 VIRTIO_F_ANY_LAYOUT, 3085 }; 3086 3087 static struct virtio_driver virtio_net_driver = { 3088 .feature_table = features, 3089 .feature_table_size = ARRAY_SIZE(features), 3090 .feature_table_legacy = features_legacy, 3091 .feature_table_size_legacy = ARRAY_SIZE(features_legacy), 3092 .driver.name = KBUILD_MODNAME, 3093 .driver.owner = THIS_MODULE, 3094 
.id_table = id_table, 3095 .validate = virtnet_validate, 3096 .probe = virtnet_probe, 3097 .remove = virtnet_remove, 3098 .config_changed = virtnet_config_changed, 3099 #ifdef CONFIG_PM_SLEEP 3100 .freeze = virtnet_freeze, 3101 .restore = virtnet_restore, 3102 #endif 3103 }; 3104 3105 static __init int virtio_net_driver_init(void) 3106 { 3107 int ret; 3108 3109 ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online", 3110 virtnet_cpu_online, 3111 virtnet_cpu_down_prep); 3112 if (ret < 0) 3113 goto out; 3114 virtionet_online = ret; 3115 ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead", 3116 NULL, virtnet_cpu_dead); 3117 if (ret) 3118 goto err_dead; 3119 3120 ret = register_virtio_driver(&virtio_net_driver); 3121 if (ret) 3122 goto err_virtio; 3123 return 0; 3124 err_virtio: 3125 cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); 3126 err_dead: 3127 cpuhp_remove_multi_state(virtionet_online); 3128 out: 3129 return ret; 3130 } 3131 module_init(virtio_net_driver_init); 3132 3133 static __exit void virtio_net_driver_exit(void) 3134 { 3135 unregister_virtio_driver(&virtio_net_driver); 3136 cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); 3137 cpuhp_remove_multi_state(virtionet_online); 3138 } 3139 module_exit(virtio_net_driver_exit); 3140 3141 MODULE_DEVICE_TABLE(virtio, id_table); 3142 MODULE_DESCRIPTION("Virtio network driver"); 3143 MODULE_LICENSE("GPL"); 3144