/* Copyright (C) 2009 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 *
 * virtio-net server in host kernel.
 */

#include <linux/compat.h>
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>

#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
#include <linux/if_macvlan.h>
#include <linux/if_tap.h>
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/skbuff.h>

#include <net/sock.h>
#include <net/xdp.h>

#include "vhost.h"

static int experimental_zcopytx = 1;
module_param(experimental_zcopytx, int, 0444);
MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
		 " 1 - Enable; 0 - Disable");

/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000

/* Max number of packets transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others with small
 * pkts.
 */
#define VHOST_NET_PKT_WEIGHT 256

/* MAX number of TX used buffers for outstanding zerocopy */
#define VHOST_MAX_PEND 128
#define VHOST_GOODCOPY_LEN 256

/*
 * For transmit, used buffer len is unused; we override it to track buffer
 * status internally; used for zerocopy tx only.
 */
/* Lower device DMA failed */
#define VHOST_DMA_FAILED_LEN	((__force __virtio32)3)
/* Lower device DMA done */
#define VHOST_DMA_DONE_LEN	((__force __virtio32)2)
/* Lower device DMA in progress */
#define VHOST_DMA_IN_PROGRESS	((__force __virtio32)1)
/* Buffer unused */
#define VHOST_DMA_CLEAR_LEN	((__force __virtio32)0)

#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)

enum {
	VHOST_NET_FEATURES = VHOST_FEATURES |
			     (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
			     (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
			     (1ULL << VIRTIO_F_IOMMU_PLATFORM)
};

enum {
	VHOST_NET_VQ_RX = 0,
	VHOST_NET_VQ_TX = 1,
	VHOST_NET_VQ_MAX = 2,
};

struct vhost_net_ubuf_ref {
	/* refcount follows semantics similar to kref:
	 *  0: object is released
	 *  1: no outstanding ubufs
	 * >1: outstanding ubufs
	 */
	atomic_t refcount;
	wait_queue_head_t wait;
	struct vhost_virtqueue *vq;
};

#define VHOST_RX_BATCH 64
struct vhost_net_buf {
	void **queue;
	int tail;
	int head;
};

struct vhost_net_virtqueue {
	struct vhost_virtqueue vq;
	size_t vhost_hlen;
	size_t sock_hlen;
	/* vhost zerocopy support fields below: */
	/* last used idx for outstanding DMA zerocopy buffers */
	int upend_idx;
	/* For TX, first used idx for DMA done zerocopy buffers
	 * For RX, number of batched heads
	 */
	int done_idx;
	/* an array of userspace buffers info */
	struct ubuf_info *ubuf_info;
	/* Reference counting for outstanding ubufs.
	 * Protected by vq mutex. Writers must also take device mutex. */
	struct vhost_net_ubuf_ref *ubufs;
	struct ptr_ring *rx_ring;
	struct vhost_net_buf rxq;
};

struct vhost_net {
	struct vhost_dev dev;
	struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	/* Number of TX recently submitted.
	 * Protected by tx vq lock. */
	unsigned tx_packets;
	/* Number of times zerocopy TX recently failed.
	 * Protected by tx vq lock. */
	unsigned tx_zcopy_err;
	/* Flush in progress. Protected by tx vq lock. */
	bool tx_flush;
};

static unsigned vhost_net_zcopy_mask __read_mostly;

static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
{
	if (rxq->tail != rxq->head)
		return rxq->queue[rxq->head];
	else
		return NULL;
}

static int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
{
	return rxq->tail - rxq->head;
}

static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
{
	return rxq->tail == rxq->head;
}

static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
{
	void *ret = vhost_net_buf_get_ptr(rxq);
	++rxq->head;
	return ret;
}

static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	rxq->head = 0;
	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
					     VHOST_RX_BATCH);
	return rxq->tail;
}

static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
		ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
				   vhost_net_buf_get_size(rxq),
				   tun_ptr_free);
		rxq->head = rxq->tail = 0;
	}
}

static int vhost_net_buf_peek_len(void *ptr)
{
	if (tun_is_xdp_frame(ptr)) {
		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

		return xdpf->len;
	}

	return __skb_array_len_with_tag(ptr);
}

static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	if (!vhost_net_buf_is_empty(rxq))
		goto out;

	if (!vhost_net_buf_produce(nvq))
		return 0;

out:
	return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq));
}

static void vhost_net_buf_init(struct vhost_net_buf *rxq)
{
	rxq->head = rxq->tail = 0;
}

static void vhost_net_enable_zcopy(int vq)
{
	vhost_net_zcopy_mask |= 0x1 << vq;
}

static struct vhost_net_ubuf_ref *
vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
{
	struct vhost_net_ubuf_ref *ubufs;
	/* No zero copy backend? Nothing to count. */
	if (!zcopy)
		return NULL;
	ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
	if (!ubufs)
		return ERR_PTR(-ENOMEM);
	atomic_set(&ubufs->refcount, 1);
	init_waitqueue_head(&ubufs->wait);
	ubufs->vq = vq;
	return ubufs;
}

static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
{
	int r = atomic_sub_return(1, &ubufs->refcount);
	if (unlikely(!r))
		wake_up(&ubufs->wait);
	return r;
}

static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
{
	vhost_net_ubuf_put(ubufs);
	wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
}

static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
{
	vhost_net_ubuf_put_and_wait(ubufs);
	kfree(ubufs);
}

static void vhost_net_clear_ubuf_info(struct vhost_net *n)
{
	int i;

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		kfree(n->vqs[i].ubuf_info);
		n->vqs[i].ubuf_info = NULL;
	}
}

static int vhost_net_set_ubuf_info(struct vhost_net *n)
{
	bool zcopy;
	int i;

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		zcopy = vhost_net_zcopy_mask & (0x1 << i);
		if (!zcopy)
			continue;
		n->vqs[i].ubuf_info = kmalloc(sizeof(*n->vqs[i].ubuf_info) *
					      UIO_MAXIOV, GFP_KERNEL);
		if (!n->vqs[i].ubuf_info)
			goto err;
	}
	return 0;

err:
	vhost_net_clear_ubuf_info(n);
	return -ENOMEM;
}

static void vhost_net_vq_reset(struct vhost_net *n)
{
	int i;

	vhost_net_clear_ubuf_info(n);

	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
		n->vqs[i].done_idx = 0;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].ubufs = NULL;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
		vhost_net_buf_init(&n->vqs[i].rxq);
	}

}

static void vhost_net_tx_packet(struct vhost_net *net)
{
	++net->tx_packets;
	if (net->tx_packets < 1024)
		return;
	net->tx_packets = 0;
	net->tx_zcopy_err = 0;
}

static void vhost_net_tx_err(struct vhost_net *net)
{
	++net->tx_zcopy_err;
}

static bool vhost_net_tx_select_zcopy(struct vhost_net *net)
{
	/* TX flush waits for outstanding DMAs to be done.
	 * Don't start new DMAs.
	 */
	return !net->tx_flush &&
		net->tx_packets / 64 >= net->tx_zcopy_err;
}

static bool vhost_sock_zcopy(struct socket *sock)
{
	return unlikely(experimental_zcopytx) &&
		sock_flag(sock->sk, SOCK_ZEROCOPY);
}

/* The lower device may complete DMAs out of order. upend_idx tracks the
 * tail of the used idx, done_idx tracks the head. Once the lower device
 * has completed DMAs contiguously, we signal the used idx to the guest.
 */
static void vhost_zerocopy_signal_used(struct vhost_net *net,
				       struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	int i, add;
	int j = 0;

	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
			vhost_net_tx_err(net);
		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
			++j;
		} else
			break;
	}
	while (j) {
		add = min(UIO_MAXIOV - nvq->done_idx, j);
		vhost_add_used_and_signal_n(vq->dev, vq,
					    &vq->heads[nvq->done_idx], add);
		nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
		j -= add;
	}
}

static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
{
	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
	struct vhost_virtqueue *vq = ubufs->vq;
	int cnt;

	rcu_read_lock_bh();

	/* set len to mark this descriptor's buffers as DMA done */
	vq->heads[ubuf->desc].len = success ?
		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
	cnt = vhost_net_ubuf_put(ubufs);

	/*
	 * Trigger polling thread if guest stopped submitting new buffers:
	 * in this case, the refcount after decrement will eventually reach 1.
	 * We also trigger polling periodically after each 16 packets
	 * (the value 16 here is more or less arbitrary, it's tuned to trigger
	 * less than 10% of times).
	 */
	if (cnt <= 1 || !(cnt % 16))
		vhost_poll_queue(&vq->poll);

	rcu_read_unlock_bh();
}

static inline unsigned long busy_clock(void)
{
	return local_clock() >> 10;
}

static bool vhost_can_busy_poll(struct vhost_dev *dev,
				unsigned long endtime)
{
	return likely(!need_resched()) &&
	       likely(!time_after(busy_clock(), endtime)) &&
	       likely(!signal_pending(current)) &&
	       !vhost_has_work(dev);
}

static void vhost_net_disable_vq(struct vhost_net *n,
				 struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	if (!vq->private_data)
		return;
	vhost_poll_stop(poll);
}

static int vhost_net_enable_vq(struct vhost_net *n,
			       struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	struct socket *sock;

	sock = vq->private_data;
	if (!sock)
		return 0;

	return vhost_poll_start(poll, sock->file);
}

static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
				    struct vhost_virtqueue *vq,
				    struct iovec iov[], unsigned int iov_size,
				    unsigned int *out_num, unsigned int *in_num)
{
	unsigned long uninitialized_var(endtime);
	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
				  out_num, in_num, NULL, NULL);

	if (r == vq->num && vq->busyloop_timeout) {
		preempt_disable();
		endtime = busy_clock() + vq->busyloop_timeout;
		while (vhost_can_busy_poll(vq->dev, endtime) &&
		       vhost_vq_avail_empty(vq->dev, vq))
			cpu_relax();
		preempt_enable();
		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
				      out_num, in_num, NULL, NULL);
	}

	return r;
}

static bool vhost_exceeds_maxpend(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
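	/* Zerocopy buffers between done_idx and upend_idx (modulo UIO_MAXIOV)
	 * are still in flight; cap them at min(VHOST_MAX_PEND, vq->num / 4).
	 */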

	return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV >
	       min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
}

/* Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned out, in;
	int head;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len, total_len = 0;
	int err;
	size_t hdr_size;
	struct socket *sock;
	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
	bool zcopy, zcopy_used;
	int sent_pkts = 0;

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	if (!sock)
		goto out;

	if (!vq_iotlb_prefetch(vq))
		goto out;

	vhost_disable_notify(&net->dev, vq);
	vhost_net_disable_vq(net, vq);

	hdr_size = nvq->vhost_hlen;
	zcopy = nvq->ubufs;

	for (;;) {
		/* Release DMAs done buffers first */
		if (zcopy)
			vhost_zerocopy_signal_used(net, vq);


		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
						ARRAY_SIZE(vq->iov),
						&out, &in);
		/* On error, stop handling until the next kick. */
		if (unlikely(head < 0))
			break;
		/* Nothing new? Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			break;
		}
		if (in) {
			vq_err(vq, "Unexpected descriptor format for TX: "
			       "out %d, in %d\n", out, in);
			break;
		}
		/* Skip header. TODO: support TSO. */
		len = iov_length(vq->iov, out);
		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
		iov_iter_advance(&msg.msg_iter, hdr_size);
		/* Sanity check */
		if (!msg_data_left(&msg)) {
			vq_err(vq, "Unexpected header len for TX: "
			       "%zd expected %zd\n",
			       len, hdr_size);
			break;
		}
		len = msg_data_left(&msg);

		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
			     && !vhost_exceeds_maxpend(net)
			     && vhost_net_tx_select_zcopy(net);

		/* use msg_control to pass vhost zerocopy ubuf info to skb */
		if (zcopy_used) {
			struct ubuf_info *ubuf;
			ubuf = nvq->ubuf_info + nvq->upend_idx;

			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
			ubuf->callback = vhost_zerocopy_callback;
			ubuf->ctx = nvq->ubufs;
			ubuf->desc = nvq->upend_idx;
			refcount_set(&ubuf->refcnt, 1);
			msg.msg_control = ubuf;
			msg.msg_controllen = sizeof(ubuf);
			ubufs = nvq->ubufs;
			atomic_inc(&ubufs->refcount);
			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
		} else {
			msg.msg_control = NULL;
			ubufs = NULL;
		}

		total_len += len;
		if (total_len < VHOST_NET_WEIGHT &&
		    !vhost_vq_avail_empty(&net->dev, vq) &&
		    likely(!vhost_exceeds_maxpend(net))) {
			msg.msg_flags |= MSG_MORE;
		} else {
			msg.msg_flags &= ~MSG_MORE;
		}

		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(sock, &msg, len);
		if (unlikely(err < 0)) {
			if (zcopy_used) {
				vhost_net_ubuf_put(ubufs);
				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
					% UIO_MAXIOV;
			}
			vhost_discard_vq_desc(vq, 1);
			vhost_net_enable_vq(net, vq);
			break;
		}
		if (err != len)
			pr_debug("Truncated TX packet: "
				 " len %d != %zd\n", err, len);
		if (!zcopy_used)
			vhost_add_used_and_signal(&net->dev, vq, head, 0);
		else
			vhost_zerocopy_signal_used(net, vq);
		vhost_net_tx_packet(net);
		if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
		    unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}
out:
	mutex_unlock(&vq->mutex);
}

static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
{
	struct sk_buff *head;
	int len = 0;
	unsigned long flags;

	if (rvq->rx_ring)
		return vhost_net_buf_peek(rvq);

	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
	head = skb_peek(&sk->sk_receive_queue);
	if (likely(head)) {
		len = head->len;
		if (skb_vlan_tag_present(head))
			len += VLAN_HLEN;
	}

	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
	return len;
}

static int sk_has_rx_data(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (sock->ops->peek_len)
		return sock->ops->peek_len(sock);

	return skb_queue_empty(&sk->sk_receive_queue);
}

static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	struct vhost_dev *dev = vq->dev;

	if (!nvq->done_idx)
		return;

	vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
	nvq->done_idx = 0;
}

static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
{
	struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned long uninitialized_var(endtime);
	int len = peek_head_len(rvq, sk);

	if (!len && vq->busyloop_timeout) {
		/* Flush batched heads first */
		vhost_rx_signal_used(rvq);
		/* Both tx vq and rx socket were polled here */
		mutex_lock_nested(&vq->mutex, 1);
		vhost_disable_notify(&net->dev, vq);

		preempt_disable();
		endtime = busy_clock() + vq->busyloop_timeout;

		while (vhost_can_busy_poll(&net->dev, endtime) &&
		       !sk_has_rx_data(sk) &&
		       vhost_vq_avail_empty(&net->dev, vq))
			cpu_relax();

		preempt_enable();

		if (!vhost_vq_avail_empty(&net->dev, vq))
			vhost_poll_queue(&vq->poll);
		else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
			vhost_disable_notify(&net->dev, vq);
			vhost_poll_queue(&vq->poll);
		}

		mutex_unlock(&vq->mutex);

		len = peek_head_len(rvq, sk);
	}

	return len;
}

/* This is a multi-buffer version of vhost_get_desc, that works if
 *	vq has read descriptors only.
 * @vq		- the relevant virtqueue
 * @datalen	- data length we'll be reading
 * @iovcount	- returned count of io vectors we fill
 * @log		- vhost log
 * @log_num	- log offset
 * @quota	- headcount quota, 1 for big buffer
 *	returns number of buffer heads allocated, negative on error
 */
static int get_rx_bufs(struct vhost_virtqueue *vq,
		       struct vring_used_elem *heads,
		       int datalen,
		       unsigned *iovcount,
		       struct vhost_log *log,
		       unsigned *log_num,
		       unsigned int quota)
{
	unsigned int out, in;
	int seg = 0;
	int headcount = 0;
	unsigned d;
	int r, nlogs = 0;
	/* len is always initialized before use since we are always called with
	 * datalen > 0.
	 */
	u32 uninitialized_var(len);

	while (datalen > 0 && headcount < quota) {
		if (unlikely(seg >= UIO_MAXIOV)) {
			r = -ENOBUFS;
			goto err;
		}
		r = vhost_get_vq_desc(vq, vq->iov + seg,
				      ARRAY_SIZE(vq->iov) - seg, &out,
				      &in, log, log_num);
		if (unlikely(r < 0))
			goto err;

		d = r;
		if (d == vq->num) {
			r = 0;
			goto err;
		}
		if (unlikely(out || in <= 0)) {
			vq_err(vq, "unexpected descriptor format for RX: "
			       "out %d, in %d\n", out, in);
			r = -EINVAL;
			goto err;
		}
		if (unlikely(log)) {
			nlogs += *log_num;
			log += *log_num;
		}
		heads[headcount].id = cpu_to_vhost32(vq, d);
		len = iov_length(vq->iov + seg, in);
		heads[headcount].len = cpu_to_vhost32(vq, len);
		datalen -= len;
		++headcount;
		seg += in;
	}
	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
	*iovcount = seg;
	if (unlikely(log))
		*log_num = nlogs;

	/* Detect overrun */
	if (unlikely(datalen > 0)) {
		r = UIO_MAXIOV + 1;
		goto err;
	}
	return headcount;
err:
	vhost_discard_vq_desc(vq, headcount);
	return r;
}

/* Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned uninitialized_var(in), log;
	struct vhost_log *vq_log;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	struct virtio_net_hdr hdr = {
		.flags = 0,
		.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};
	size_t total_len = 0;
	int err, mergeable;
	s16 headcount;
	size_t vhost_hlen, sock_hlen;
	size_t vhost_len, sock_len;
	struct socket *sock;
	struct iov_iter fixup;
	__virtio16 num_buffers;
	int recv_pkts = 0;

	mutex_lock_nested(&vq->mutex, 0);
	sock = vq->private_data;
	if (!sock)
		goto out;

	if (!vq_iotlb_prefetch(vq))
		goto out;

	vhost_disable_notify(&net->dev, vq);
	vhost_net_disable_vq(net, vq);

	vhost_hlen = nvq->vhost_hlen;
	sock_hlen = nvq->sock_hlen;

	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
		vq->log : NULL;
	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);

	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
		sock_len += sock_hlen;
		vhost_len = sock_len + vhost_hlen;
		headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
					vhost_len, &in, vq_log, &log,
					likely(mergeable) ? UIO_MAXIOV : 1);
		/* On error, stop handling until the next kick. */
		if (unlikely(headcount < 0))
			goto out;
		/* OK, now we need to know about added descriptors. */
		if (!headcount) {
			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				/* They have slipped one in as we were
				 * doing that: check again. */
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			/* Nothing new? Wait for eventfd to tell us
			 * they refilled. */
			goto out;
		}
		if (nvq->rx_ring)
			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
		/* On overrun, truncate and discard */
		if (unlikely(headcount > UIO_MAXIOV)) {
			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
			err = sock->ops->recvmsg(sock, &msg,
						 1, MSG_DONTWAIT | MSG_TRUNC);
			pr_debug("Discarded rx packet: len %zd\n", sock_len);
			continue;
		}
		/* We don't need to be notified again. */
		iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
		fixup = msg.msg_iter;
		if (unlikely((vhost_hlen))) {
			/* We will supply the header ourselves
			 * TODO: support TSO.
			 */
			iov_iter_advance(&msg.msg_iter, vhost_hlen);
		}
		err = sock->ops->recvmsg(sock, &msg,
					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
		/* Userspace might have consumed the packet meanwhile:
		 * it's not supposed to do this usually, but might be hard
		 * to prevent. Discard data we got (if any) and keep going. */
		if (unlikely(err != sock_len)) {
			pr_debug("Discarded rx packet: "
				 " len %d, expected %zd\n", err, sock_len);
			vhost_discard_vq_desc(vq, headcount);
			continue;
		}
		/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
		if (unlikely(vhost_hlen)) {
			if (copy_to_iter(&hdr, sizeof(hdr),
					 &fixup) != sizeof(hdr)) {
				vq_err(vq, "Unable to write vnet_hdr "
				       "at addr %p\n", vq->iov->iov_base);
				goto out;
			}
		} else {
			/* Header came from socket; we'll need to patch
			 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
			 */
			iov_iter_advance(&fixup, sizeof(hdr));
		}
		/* TODO: Should check and handle checksum. */

		num_buffers = cpu_to_vhost16(vq, headcount);
		if (likely(mergeable) &&
		    copy_to_iter(&num_buffers, sizeof num_buffers,
				 &fixup) != sizeof num_buffers) {
			vq_err(vq, "Failed num_buffers write");
			vhost_discard_vq_desc(vq, headcount);
			goto out;
		}
		nvq->done_idx += headcount;
		if (nvq->done_idx > VHOST_RX_BATCH)
			vhost_rx_signal_used(nvq);
		if (unlikely(vq_log))
			vhost_log_write(vq, vq_log, log, vhost_len);
		total_len += vhost_len;
		if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
		    unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			goto out;
		}
	}
	vhost_net_enable_vq(net, vq);
out:
	vhost_rx_signal_used(nvq);
	mutex_unlock(&vq->mutex);
}

static void handle_tx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_tx(net);
}

static void handle_rx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_rx(net);
}

static void handle_tx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_TX].work);
	handle_tx(net);
}

static void handle_rx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_RX].work);
	handle_rx(net);
}

static int vhost_net_open(struct inode *inode, struct file *f)
{
	struct vhost_net *n;
	struct vhost_dev *dev;
	struct vhost_virtqueue **vqs;
	void **queue;
	int i;

	n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!n)
		return -ENOMEM;
	vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL);
	if (!vqs) {
		kvfree(n);
		return -ENOMEM;
	}

	queue = kmalloc_array(VHOST_RX_BATCH, sizeof(void *),
			      GFP_KERNEL);
	if (!queue) {
		kfree(vqs);
		kvfree(n);
		return -ENOMEM;
	}
	n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue;

	dev = &n->dev;
	vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
	vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
	n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
	n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
		n->vqs[i].ubufs = NULL;
		n->vqs[i].ubuf_info = NULL;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].done_idx = 0;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
		n->vqs[i].rx_ring = NULL;
		vhost_net_buf_init(&n->vqs[i].rxq);
	}
	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);

	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);

	f->private_data = n;

	return 0;
}

static struct socket *vhost_net_stop_vq(struct vhost_net *n,
					struct vhost_virtqueue *vq)
{
	struct socket *sock;
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	vhost_net_disable_vq(n, vq);
	vq->private_data = NULL;
	vhost_net_buf_unproduce(nvq);
	nvq->rx_ring = NULL;
	mutex_unlock(&vq->mutex);
	return sock;
}

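/* Detach the backend sockets from both virtqueues and hand them back to the
 * caller, which is responsible for flushing the device before releasing them.
 */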
static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
			   struct socket **rx_sock)
{
	*tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq);
	*rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
}

static void vhost_net_flush_vq(struct vhost_net *n, int index)
{
	vhost_poll_flush(n->poll + index);
	vhost_poll_flush(&n->vqs[index].vq.poll);
}

static void vhost_net_flush(struct vhost_net *n)
{
	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
	if (n->vqs[VHOST_NET_VQ_TX].ubufs) {
		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
		n->tx_flush = true;
		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
		/* Wait for all lower device DMAs done. */
		vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
		n->tx_flush = false;
		atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
	}
}

static int vhost_net_release(struct inode *inode, struct file *f)
{
	struct vhost_net *n = f->private_data;
	struct socket *tx_sock;
	struct socket *rx_sock;

	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	vhost_dev_stop(&n->dev);
	vhost_dev_cleanup(&n->dev);
	vhost_net_vq_reset(n);
	if (tx_sock)
		sockfd_put(tx_sock);
	if (rx_sock)
		sockfd_put(rx_sock);
	/* Make sure no callbacks are outstanding */
	synchronize_rcu_bh();
	/* We do an extra flush before freeing memory,
	 * since jobs can re-queue themselves. */
	vhost_net_flush(n);
	kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
	kfree(n->dev.vqs);
	kvfree(n);
	return 0;
}

static struct socket *get_raw_socket(int fd)
{
	struct {
		struct sockaddr_ll sa;
		char buf[MAX_ADDR_LEN];
	} uaddr;
	int r;
	struct socket *sock = sockfd_lookup(fd, &r);

	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	/* Parameter checking */
	if (sock->sk->sk_type != SOCK_RAW) {
		r = -ESOCKTNOSUPPORT;
		goto err;
	}

	r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, 0);
	if (r < 0)
		goto err;

	if (uaddr.sa.sll_family != AF_PACKET) {
		r = -EPFNOSUPPORT;
		goto err;
	}
	return sock;
err:
	sockfd_put(sock);
	return ERR_PTR(r);
}

static struct ptr_ring *get_tap_ptr_ring(int fd)
{
	struct ptr_ring *ring;
	struct file *file = fget(fd);

	if (!file)
		return NULL;
	ring = tun_get_tx_ring(file);
	if (!IS_ERR(ring))
		goto out;
	ring = tap_get_ptr_ring(file);
	if (!IS_ERR(ring))
		goto out;
	ring = NULL;
out:
	fput(file);
	return ring;
}

static struct socket *get_tap_socket(int fd)
{
	struct file *file = fget(fd);
	struct socket *sock;

	if (!file)
		return ERR_PTR(-EBADF);
	sock = tun_get_socket(file);
	if (!IS_ERR(sock))
		return sock;
	sock = tap_get_socket(file);
	if (IS_ERR(sock))
		fput(file);
	return sock;
}

static struct socket *get_socket(int fd)
{
	struct socket *sock;

	/* special case to disable backend */
	if (fd == -1)
		return NULL;
	sock = get_raw_socket(fd);
	if (!IS_ERR(sock))
		return sock;
	sock = get_tap_socket(fd);
	if (!IS_ERR(sock))
		return sock;
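	/* Neither a raw packet socket nor a tun/tap backend. */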
	return ERR_PTR(-ENOTSOCK);
}

static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
	struct socket *sock, *oldsock;
	struct vhost_virtqueue *vq;
	struct vhost_net_virtqueue *nvq;
	struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL;
	int r;

	mutex_lock(&n->dev.mutex);
	r = vhost_dev_check_owner(&n->dev);
	if (r)
		goto err;

	if (index >= VHOST_NET_VQ_MAX) {
		r = -ENOBUFS;
		goto err;
	}
	vq = &n->vqs[index].vq;
	nvq = &n->vqs[index];
	mutex_lock(&vq->mutex);

	/* Verify that ring has been setup correctly. */
	if (!vhost_vq_access_ok(vq)) {
		r = -EFAULT;
		goto err_vq;
	}
	sock = get_socket(fd);
	if (IS_ERR(sock)) {
		r = PTR_ERR(sock);
		goto err_vq;
	}

	/* start polling new socket */
	oldsock = vq->private_data;
	if (sock != oldsock) {
		ubufs = vhost_net_ubuf_alloc(vq,
					     sock && vhost_sock_zcopy(sock));
		if (IS_ERR(ubufs)) {
			r = PTR_ERR(ubufs);
			goto err_ubufs;
		}

		vhost_net_disable_vq(n, vq);
		vq->private_data = sock;
		vhost_net_buf_unproduce(nvq);
		r = vhost_vq_init_access(vq);
		if (r)
			goto err_used;
		r = vhost_net_enable_vq(n, vq);
		if (r)
			goto err_used;
		if (index == VHOST_NET_VQ_RX)
			nvq->rx_ring = get_tap_ptr_ring(fd);

		oldubufs = nvq->ubufs;
		nvq->ubufs = ubufs;

		n->tx_packets = 0;
		n->tx_zcopy_err = 0;
		n->tx_flush = false;
	}

	mutex_unlock(&vq->mutex);

	if (oldubufs) {
		vhost_net_ubuf_put_wait_and_free(oldubufs);
		mutex_lock(&vq->mutex);
		vhost_zerocopy_signal_used(n, vq);
		mutex_unlock(&vq->mutex);
	}

	if (oldsock) {
		vhost_net_flush_vq(n, index);
		sockfd_put(oldsock);
	}

	mutex_unlock(&n->dev.mutex);
	return 0;

err_used:
	vq->private_data = oldsock;
	vhost_net_enable_vq(n, vq);
	if (ubufs)
		vhost_net_ubuf_put_wait_and_free(ubufs);
err_ubufs:
	sockfd_put(sock);
err_vq:
	mutex_unlock(&vq->mutex);
err:
	mutex_unlock(&n->dev.mutex);
	return r;
}

static long vhost_net_reset_owner(struct vhost_net *n)
{
	struct socket *tx_sock = NULL;
	struct socket *rx_sock = NULL;
	long err;
	struct vhost_umem *umem;

	mutex_lock(&n->dev.mutex);
	err = vhost_dev_check_owner(&n->dev);
	if (err)
		goto done;
	umem = vhost_dev_reset_owner_prepare();
	if (!umem) {
		err = -ENOMEM;
		goto done;
	}
	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	vhost_dev_stop(&n->dev);
	vhost_dev_reset_owner(&n->dev, umem);
	vhost_net_vq_reset(n);
done:
	mutex_unlock(&n->dev.mutex);
	if (tx_sock)
		sockfd_put(tx_sock);
	if (rx_sock)
		sockfd_put(rx_sock);
	return err;
}

static int vhost_net_set_features(struct vhost_net *n, u64 features)
{
	size_t vhost_hlen, sock_hlen, hdr_len;
	int i;

	hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
			       (1ULL << VIRTIO_F_VERSION_1))) ?
			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
			sizeof(struct virtio_net_hdr);
	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
		/* vhost provides vnet_hdr */
		vhost_hlen = hdr_len;
		sock_hlen = 0;
	} else {
		/* socket provides vnet_hdr */
		vhost_hlen = 0;
		sock_hlen = hdr_len;
	}
	mutex_lock(&n->dev.mutex);
	if ((features & (1 << VHOST_F_LOG_ALL)) &&
	    !vhost_log_access_ok(&n->dev))
		goto out_unlock;

	if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
		if (vhost_init_device_iotlb(&n->dev, true))
			goto out_unlock;
	}

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		mutex_lock(&n->vqs[i].vq.mutex);
		n->vqs[i].vq.acked_features = features;
		n->vqs[i].vhost_hlen = vhost_hlen;
		n->vqs[i].sock_hlen = sock_hlen;
		mutex_unlock(&n->vqs[i].vq.mutex);
	}
	mutex_unlock(&n->dev.mutex);
	return 0;

out_unlock:
	mutex_unlock(&n->dev.mutex);
	return -EFAULT;
}

static long vhost_net_set_owner(struct vhost_net *n)
{
	int r;

	mutex_lock(&n->dev.mutex);
	if (vhost_dev_has_owner(&n->dev)) {
		r = -EBUSY;
		goto out;
	}
	r = vhost_net_set_ubuf_info(n);
	if (r)
		goto out;
	r = vhost_dev_set_owner(&n->dev);
	if (r)
		vhost_net_clear_ubuf_info(n);
	vhost_net_flush(n);
out:
	mutex_unlock(&n->dev.mutex);
	return r;
}

static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
			    unsigned long arg)
{
	struct vhost_net *n = f->private_data;
	void __user *argp = (void __user *)arg;
	u64 __user *featurep = argp;
	struct vhost_vring_file backend;
	u64 features;
	int r;

	switch (ioctl) {
	case VHOST_NET_SET_BACKEND:
		if (copy_from_user(&backend, argp, sizeof backend))
			return -EFAULT;
		return vhost_net_set_backend(n, backend.index, backend.fd);
	case VHOST_GET_FEATURES:
		features = VHOST_NET_FEATURES;
		if (copy_to_user(featurep, &features, sizeof features))
			return -EFAULT;
		return 0;
	case VHOST_SET_FEATURES:
		if (copy_from_user(&features, featurep, sizeof features))
			return -EFAULT;
		if (features & ~VHOST_NET_FEATURES)
			return -EOPNOTSUPP;
		return vhost_net_set_features(n, features);
	case VHOST_RESET_OWNER:
		return vhost_net_reset_owner(n);
	case VHOST_SET_OWNER:
		return vhost_net_set_owner(n);
	default:
		mutex_lock(&n->dev.mutex);
		r = vhost_dev_ioctl(&n->dev, ioctl, argp);
		if (r == -ENOIOCTLCMD)
			r = vhost_vring_ioctl(&n->dev, ioctl, argp);
		else
			vhost_net_flush(n);
		mutex_unlock(&n->dev.mutex);
		return r;
	}
}

#ifdef CONFIG_COMPAT
static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
				   unsigned long arg)
{
	return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
}
#endif

static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct vhost_net *n = file->private_data;
	struct vhost_dev *dev = &n->dev;
	int noblock = file->f_flags & O_NONBLOCK;

	return vhost_chr_read_iter(dev, to, noblock);
}

static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb,
					struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct vhost_net *n = file->private_data;
	struct vhost_dev *dev = &n->dev;

	return vhost_chr_write_iter(dev, from);
}

static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait)
{
	struct vhost_net *n = file->private_data;
	struct vhost_dev *dev = &n->dev;

	return vhost_chr_poll(file, dev, wait);
}

static const struct file_operations vhost_net_fops = {
	.owner          = THIS_MODULE,
	.release        = vhost_net_release,
	.read_iter      = vhost_net_chr_read_iter,
	.write_iter     = vhost_net_chr_write_iter,
	.poll           = vhost_net_chr_poll,
	.unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = vhost_net_compat_ioctl,
#endif
	.open           = vhost_net_open,
	.llseek		= noop_llseek,
};

static struct miscdevice vhost_net_misc = {
	.minor = VHOST_NET_MINOR,
	.name = "vhost-net",
	.fops = &vhost_net_fops,
};

static int vhost_net_init(void)
{
	if (experimental_zcopytx)
		vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
	return misc_register(&vhost_net_misc);
}
module_init(vhost_net_init);

static void vhost_net_exit(void)
{
	misc_deregister(&vhost_net_misc);
}
module_exit(vhost_net_exit);

MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
MODULE_ALIAS("devname:vhost-net");