#include <linux/etherdevice.h>
#include <linux/if_macvlan.h>
#include <linux/if_vlan.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/idr.h>
#include <linux/fs.h>

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <linux/virtio_net.h>

/*
 * A macvtap queue is the central object of this driver; it connects
 * an open character device to a macvlan interface. There can be
 * multiple queues on one interface, which map back to queues
 * implemented in hardware on the underlying device.
 *
 * macvtap_proto is used to allocate queues through the sock allocation
 * mechanism.
 */
struct macvtap_queue {
	struct sock sk;
	struct socket sock;
	struct socket_wq wq;
	int vnet_hdr_sz;
	struct macvlan_dev __rcu *vlan;
	struct file *file;
	unsigned int flags;
	u16 queue_index;
	bool enabled;
	struct list_head next;
};

static struct proto macvtap_proto = {
	.name = "macvtap",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct macvtap_queue),
};

/*
 * Variables for dealing with macvtap device numbers.
 */
static dev_t macvtap_major;
#define MACVTAP_NUM_DEVS (1U << MINORBITS)
static DEFINE_MUTEX(minor_lock);
static DEFINE_IDR(minor_idr);

#define GOODCOPY_LEN 128
static struct class *macvtap_class;
static struct cdev macvtap_cdev;

static const struct proto_ops macvtap_socket_ops;

#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
		      NETIF_F_TSO6 | NETIF_F_UFO)
#define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)

/*
 * RCU usage:
 * The macvtap_queue and the macvlan_dev are loosely coupled; the
 * pointers from one to the other can only be read while rcu_read_lock
 * or rtnl is held.
 *
 * Both the file and the macvlan_dev hold a reference on the macvtap_queue
 * through sock_hold(&q->sk). When the macvlan_dev goes away first,
 * q->vlan becomes inaccessible. When the file gets closed,
 * macvtap_get_queue() fails.
 *
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev. The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
 */
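/*
 * Re-insert a previously disabled queue at the tail of the packed
 * active region of vlan->taps[]. Only reached via TUNSETQUEUE with
 * IFF_ATTACH_QUEUE, under the rtnl lock, so vlan->numvtaps cannot
 * change underneath us.
 */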
static int macvtap_enable_queue(struct net_device *dev, struct file *file,
				struct macvtap_queue *q)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	int err = -EINVAL;

	ASSERT_RTNL();

	if (q->enabled)
		goto out;

	err = 0;
	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
	q->queue_index = vlan->numvtaps;
	q->enabled = true;

	vlan->numvtaps++;
out:
	return err;
}

static int macvtap_set_queue(struct net_device *dev, struct file *file,
			     struct macvtap_queue *q)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	int err = -EBUSY;

	rtnl_lock();
	if (vlan->numqueues == MAX_MACVTAP_QUEUES)
		goto out;

	err = 0;
	rcu_assign_pointer(q->vlan, vlan);
	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
	sock_hold(&q->sk);

	q->file = file;
	q->queue_index = vlan->numvtaps;
	q->enabled = true;
	file->private_data = q;
	list_add_tail(&q->next, &vlan->queue_list);

	vlan->numvtaps++;
	vlan->numqueues++;

out:
	rtnl_unlock();
	return err;
}

static int macvtap_disable_queue(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;
	struct macvtap_queue *nq;

	ASSERT_RTNL();
	if (!q->enabled)
		return -EINVAL;

	vlan = rtnl_dereference(q->vlan);

	if (vlan) {
		int index = q->queue_index;
		BUG_ON(index >= vlan->numvtaps);
		nq = rtnl_dereference(vlan->taps[vlan->numvtaps - 1]);
		nq->queue_index = index;

		rcu_assign_pointer(vlan->taps[index], nq);
		RCU_INIT_POINTER(vlan->taps[vlan->numvtaps - 1], NULL);
		q->enabled = false;

		vlan->numvtaps--;
	}

	return 0;
}

/*
 * The file owning the queue got closed, give up both
 * the reference that the file holds as well as the
 * one from the macvlan_dev if that still exists.
 *
 * Holding the rtnl lock makes sure that we don't get
 * to the queue again after destroying it.
 */
static void macvtap_put_queue(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;

	rtnl_lock();
	vlan = rtnl_dereference(q->vlan);

	if (vlan) {
		if (q->enabled)
			BUG_ON(macvtap_disable_queue(q));

		vlan->numqueues--;
		RCU_INIT_POINTER(q->vlan, NULL);
		sock_put(&q->sk);
		list_del_init(&q->next);
	}

	rtnl_unlock();

	synchronize_rcu();
	sock_put(&q->sk);
}

/*
 * Select a queue for the incoming packet: prefer a flow-hash based
 * lookup, fall back to the rx queue recorded in the skb, and finally
 * to the first available queue.
 * Cache vlan->numvtaps since it can become zero during the execution
 * of this function.
 */
static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
					       struct sk_buff *skb)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *tap = NULL;
	/* Access to taps array is protected by rcu, but access to numvtaps
	 * isn't. Below we use it to lookup a queue, but treat it as a hint
	 * and validate that the result isn't NULL - in case we are
	 * racing against queue removal.
	 */
	int numvtaps = ACCESS_ONCE(vlan->numvtaps);
	__u32 rxq;

	if (!numvtaps)
		goto out;

	/* Check if we can use flow to select a queue */
	rxq = skb_get_rxhash(skb);
	if (rxq) {
		tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
		goto out;
	}

	if (likely(skb_rx_queue_recorded(skb))) {
		rxq = skb_get_rx_queue(skb);

		while (unlikely(rxq >= numvtaps))
			rxq -= numvtaps;

		tap = rcu_dereference(vlan->taps[rxq]);
		goto out;
	}

	tap = rcu_dereference(vlan->taps[0]);
out:
	return tap;
}
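/*
 * Worked example for the flow-hash path above (illustrative numbers):
 * with numvtaps == 4, a hash of 0x2f selects taps[0x2f % 4] == taps[3].
 * The hash is stable per flow, so packets of one flow keep landing on
 * the same queue as long as the number of active queues doesn't change.
 */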
/*
 * The net_device is going away, give up the reference
 * that it holds on all queues and safely set the pointer
 * from the queues to NULL.
 */
static void macvtap_del_queues(struct net_device *dev)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *q, *tmp, *qlist[MAX_MACVTAP_QUEUES];
	int i, j = 0;

	ASSERT_RTNL();
	list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) {
		list_del_init(&q->next);
		qlist[j++] = q;
		RCU_INIT_POINTER(q->vlan, NULL);
		if (q->enabled)
			vlan->numvtaps--;
		vlan->numqueues--;
	}
	for (i = 0; i < vlan->numvtaps; i++)
		RCU_INIT_POINTER(vlan->taps[i], NULL);
	BUG_ON(vlan->numvtaps);
	BUG_ON(vlan->numqueues);
	/* guarantee that any future macvtap_set_queue will fail */
	vlan->numvtaps = MAX_MACVTAP_QUEUES;

	for (--j; j >= 0; j--)
		sock_put(&qlist[j]->sk);
}

/*
 * Forward happens for data that gets sent from one macvlan
 * endpoint to another one in bridge mode. We just take
 * the skb and put it into the receive queue.
 */
static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *q = macvtap_get_queue(dev, skb);
	netdev_features_t features;
	if (!q)
		goto drop;

	if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len)
		goto drop;

	skb->dev = dev;
	/* Apply the forward feature mask so that we perform segmentation
	 * according to the user's wishes.
	 */
	features = netif_skb_features(skb) & vlan->tap_features;
	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs = __skb_gso_segment(skb, features, false);

		if (IS_ERR(segs))
			goto drop;

		if (!segs) {
			skb_queue_tail(&q->sk.sk_receive_queue, skb);
			goto wake_up;
		}

		kfree_skb(skb);
		while (segs) {
			struct sk_buff *nskb = segs->next;

			segs->next = NULL;
			skb_queue_tail(&q->sk.sk_receive_queue, segs);
			segs = nskb;
		}
	} else {
		skb_queue_tail(&q->sk.sk_receive_queue, skb);
	}

wake_up:
	wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
	return NET_RX_SUCCESS;

drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}

/*
 * Receive is for data from the external interface (lowerdev);
 * in the case of macvtap we can treat it the same way as
 * forward, which macvlan cannot.
 */
static int macvtap_receive(struct sk_buff *skb)
{
	skb_push(skb, ETH_HLEN);
	return macvtap_forward(skb->dev, skb);
}
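/*
 * Both halves of the receive path above converge on the same place:
 * frames bridged from a sibling macvlan arrive through
 * macvtap_forward(), frames from the lower device through
 * macvtap_receive(), and either way they sit on sk_receive_queue until
 * a reader picks them up via the character device or the socket
 * interface.
 */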
static int macvtap_get_minor(struct macvlan_dev *vlan)
{
	int retval = -ENOMEM;

	mutex_lock(&minor_lock);
	retval = idr_alloc(&minor_idr, vlan, 1, MACVTAP_NUM_DEVS, GFP_KERNEL);
	if (retval >= 0) {
		vlan->minor = retval;
	} else if (retval == -ENOSPC) {
		printk(KERN_ERR "too many macvtap devices\n");
		retval = -EINVAL;
	}
	mutex_unlock(&minor_lock);
	return retval < 0 ? retval : 0;
}

static void macvtap_free_minor(struct macvlan_dev *vlan)
{
	mutex_lock(&minor_lock);
	if (vlan->minor) {
		idr_remove(&minor_idr, vlan->minor);
		vlan->minor = 0;
	}
	mutex_unlock(&minor_lock);
}

static struct net_device *dev_get_by_macvtap_minor(int minor)
{
	struct net_device *dev = NULL;
	struct macvlan_dev *vlan;

	mutex_lock(&minor_lock);
	vlan = idr_find(&minor_idr, minor);
	if (vlan) {
		dev = vlan->dev;
		dev_hold(dev);
	}
	mutex_unlock(&minor_lock);
	return dev;
}

static int macvtap_newlink(struct net *src_net,
			   struct net_device *dev,
			   struct nlattr *tb[],
			   struct nlattr *data[])
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	INIT_LIST_HEAD(&vlan->queue_list);

	/* Since macvlan supports all offloads by default, make
	 * tap support all offloads also.
	 */
	vlan->tap_features = TUN_OFFLOADS;

	/* Don't put anything that may fail after macvlan_common_newlink
	 * because we can't undo what it does.
	 */
	return macvlan_common_newlink(src_net, dev, tb, data,
				      macvtap_receive, macvtap_forward);
}

static void macvtap_dellink(struct net_device *dev,
			    struct list_head *head)
{
	macvtap_del_queues(dev);
	macvlan_dellink(dev, head);
}

static void macvtap_setup(struct net_device *dev)
{
	macvlan_common_setup(dev);
	dev->tx_queue_len = TUN_READQ_SIZE;
}

static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
	.kind		= "macvtap",
	.setup		= macvtap_setup,
	.newlink	= macvtap_newlink,
	.dellink	= macvtap_dellink,
};

static void macvtap_sock_write_space(struct sock *sk)
{
	wait_queue_head_t *wqueue;

	if (!sock_writeable(sk) ||
	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
		return;

	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
}

static void macvtap_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_receive_queue);
}

static int macvtap_open(struct inode *inode, struct file *file)
{
	struct net *net = current->nsproxy->net_ns;
	struct net_device *dev = dev_get_by_macvtap_minor(iminor(inode));
	struct macvtap_queue *q;
	int err;

	err = -ENODEV;
	if (!dev)
		goto out;

	err = -ENOMEM;
	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
					     &macvtap_proto);
	if (!q)
		goto out;

	RCU_INIT_POINTER(q->sock.wq, &q->wq);
	init_waitqueue_head(&q->wq.wait);
	q->sock.type = SOCK_RAW;
	q->sock.state = SS_CONNECTED;
	q->sock.file = file;
	q->sock.ops = &macvtap_socket_ops;
	sock_init_data(&q->sock, &q->sk);
	q->sk.sk_write_space = macvtap_sock_write_space;
	q->sk.sk_destruct = macvtap_sock_destruct;
	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

	/*
	 * So far only KVM's virtio_net uses macvtap; enable zero copy
	 * between guest kernel and host kernel when the lower device
	 * supports zerocopy.
	 *
	 * The macvlan supports zerocopy iff the lower device supports
	 * zero copy, so we don't have to look at the lower device directly.
	 */
	if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
		sock_set_flag(&q->sk, SOCK_ZEROCOPY);

	err = macvtap_set_queue(dev, file, q);
	if (err)
		sock_put(&q->sk);

out:
	if (dev)
		dev_put(dev);

	return err;
}
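/*
 * A minimal, hypothetical userspace session against the device node
 * created further below (it is named after the ifindex, e.g. /dev/tap42):
 *
 *	int fd = open("/dev/tap42", O_RDWR);
 *	char buf[65536];
 *	ssize_t n = read(fd, buf, sizeof(buf));	 virtio_net_hdr + frame
 *	write(fd, buf, n);			 inject a frame back out
 *
 * Because macvtap_open() already attached the queue and defaulted the
 * flags to IFF_VNET_HDR | IFF_NO_PI | IFF_TAP, no TUNSETIFF is needed
 * first; each frame read is prefixed with a struct virtio_net_hdr.
 */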
static int macvtap_release(struct inode *inode, struct file *file)
{
	struct macvtap_queue *q = file->private_data;
	macvtap_put_queue(q);
	return 0;
}

static unsigned int macvtap_poll(struct file *file, poll_table *wait)
{
	struct macvtap_queue *q = file->private_data;
	unsigned int mask = POLLERR;

	if (!q)
		goto out;

	mask = 0;
	poll_wait(file, &q->wq.wait, wait);

	if (!skb_queue_empty(&q->sk.sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	if (sock_writeable(&q->sk) ||
	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
	     sock_writeable(&q->sk)))
		mask |= POLLOUT | POLLWRNORM;

out:
	return mask;
}

static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
						size_t len, size_t linear,
						int noblock, int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err);
	if (!skb)
		return NULL;

	skb_reserve(skb, prepad);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}

/* set skb frags from iovec; this can move to core network code for reuse */
static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
				  int offset, size_t count)
{
	int len = iov_length(from, count) - offset;
	int copy = skb_headlen(skb);
	int size, offset1 = 0;
	int i = 0;

	/* Skip over from offset */
	while (count && (offset >= from->iov_len)) {
		offset -= from->iov_len;
		++from;
		--count;
	}

	/* copy up to skb headlen */
	while (count && (copy > 0)) {
		size = min_t(unsigned int, copy, from->iov_len - offset);
		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
				   size))
			return -EFAULT;
		if (copy > size) {
			++from;
			--count;
			offset = 0;
		} else
			offset += size;
		copy -= size;
		offset1 += size;
	}

	if (len == offset1)
		return 0;

	while (count--) {
		struct page *page[MAX_SKB_FRAGS];
		int num_pages;
		unsigned long base;
		unsigned long truesize;

		len = from->iov_len - offset;
		if (!len) {
			offset = 0;
			++from;
			continue;
		}
		base = (unsigned long)from->iov_base + offset;
		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
		if (i + size > MAX_SKB_FRAGS)
			return -EMSGSIZE;
		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
		if (num_pages != size) {
			int j;

			for (j = 0; j < num_pages; j++)
				put_page(page[i + j]);
			return -EFAULT;
		}
		truesize = size * PAGE_SIZE;
		skb->data_len += len;
		skb->len += len;
		skb->truesize += truesize;
		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
		while (len) {
			int off = base & ~PAGE_MASK;
			int size = min_t(int, len, PAGE_SIZE - off);
			__skb_fill_page_desc(skb, i, page[i], off, size);
			skb_shinfo(skb)->nr_frags++;
			base += size;
			len -= size;
			i++;
		}
		offset = 0;
		++from;
	}
	return 0;
}
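/*
 * Illustrative example for the translation below: a sender pushing a
 * large TCPv4 stream would prefix each write with a virtio_net_hdr
 * carrying gso_type = VIRTIO_NET_HDR_GSO_TCPV4, gso_size = the MSS and
 * csum_start/csum_offset locating the TCP checksum field, which
 * macvtap_skb_from_vnet_hdr() turns into skb_shinfo() GSO state so
 * segmentation and checksumming can be left to the stack or hardware.
 */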
/*
 * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
 * be shared with the tun/tap driver.
 */
static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb,
				     struct virtio_net_hdr *vnet_hdr)
{
	unsigned short gso_type = 0;

	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
			gso_type = SKB_GSO_TCPV4;
			break;
		case VIRTIO_NET_HDR_GSO_TCPV6:
			gso_type = SKB_GSO_TCPV6;
			break;
		case VIRTIO_NET_HDR_GSO_UDP:
			gso_type = SKB_GSO_UDP;
			break;
		default:
			return -EINVAL;
		}

		if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
			gso_type |= SKB_GSO_TCP_ECN;

		if (vnet_hdr->gso_size == 0)
			return -EINVAL;
	}

	if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		if (!skb_partial_csum_set(skb, vnet_hdr->csum_start,
					  vnet_hdr->csum_offset))
			return -EINVAL;
	}

	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		skb_shinfo(skb)->gso_size = vnet_hdr->gso_size;
		skb_shinfo(skb)->gso_type = gso_type;

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}
	return 0;
}

static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
				   struct virtio_net_hdr *vnet_hdr)
{
	memset(vnet_hdr, 0, sizeof(*vnet_hdr));

	if (skb_is_gso(skb)) {
		struct skb_shared_info *sinfo = skb_shinfo(skb);

		/* This is a hint as to how much should be linear. */
		vnet_hdr->hdr_len = skb_headlen(skb);
		vnet_hdr->gso_size = sinfo->gso_size;
		if (sinfo->gso_type & SKB_GSO_TCPV4)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
		else if (sinfo->gso_type & SKB_GSO_TCPV6)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
		else if (sinfo->gso_type & SKB_GSO_UDP)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
		else
			BUG();
		if (sinfo->gso_type & SKB_GSO_TCP_ECN)
			vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
	} else
		vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
		vnet_hdr->csum_start = skb_checksum_start_offset(skb);
		vnet_hdr->csum_offset = skb->csum_offset;
	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
		vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
	} /* else everything is zero */

	return 0;
}
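/*
 * Compute the worst-case number of pages the iovec (past @offset) can
 * touch, counting partial head and tail pages. macvtap_get_user() uses
 * this to decide whether a zerocopy skb would still fit within
 * MAX_SKB_FRAGS.
 */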
static unsigned long iov_pages(const struct iovec *iv, int offset,
			       unsigned long nr_segs)
{
	unsigned long seg, base;
	int pages = 0, len, size;

	while (nr_segs && (offset >= iv->iov_len)) {
		offset -= iv->iov_len;
		++iv;
		--nr_segs;
	}

	for (seg = 0; seg < nr_segs; seg++) {
		base = (unsigned long)iv[seg].iov_base + offset;
		len = iv[seg].iov_len - offset;
		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
		pages += size;
		offset = 0;
	}

	return pages;
}
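/*
 * Note on the zerocopy cutoff used below: only the packet headers are
 * copied into the linear area (vnet_hdr.hdr_len bytes when the sender
 * supplied that hint, GOODCOPY_LEN otherwise), while the payload pages
 * are pinned and attached as frags by zerocopy_sg_from_iovec().
 */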
/* Get packet from user space buffer */
static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
				const struct iovec *iv, unsigned long total_len,
				size_t count, int noblock)
{
	struct sk_buff *skb;
	struct macvlan_dev *vlan;
	unsigned long len = total_len;
	int err;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int vnet_hdr_len = 0;
	int copylen = 0;
	bool zerocopy = false;
	size_t linear;

	if (q->flags & IFF_VNET_HDR) {
		vnet_hdr_len = q->vnet_hdr_sz;

		err = -EINVAL;
		if (len < vnet_hdr_len)
			goto err;
		len -= vnet_hdr_len;

		err = memcpy_fromiovecend((void *)&vnet_hdr, iv, 0,
					  sizeof(vnet_hdr));
		if (err < 0)
			goto err;
		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
							vnet_hdr.hdr_len)
			vnet_hdr.hdr_len = vnet_hdr.csum_start +
						vnet_hdr.csum_offset + 2;
		err = -EINVAL;
		if (vnet_hdr.hdr_len > len)
			goto err;
	}

	err = -EINVAL;
	if (unlikely(len < ETH_HLEN))
		goto err;

	err = -EMSGSIZE;
	if (unlikely(count > UIO_MAXIOV))
		goto err;

	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
		copylen = vnet_hdr.hdr_len ? vnet_hdr.hdr_len : GOODCOPY_LEN;
		linear = copylen;
		if (iov_pages(iv, vnet_hdr_len + copylen, count)
		    <= MAX_SKB_FRAGS)
			zerocopy = true;
	}

	if (!zerocopy) {
		copylen = len;
		linear = vnet_hdr.hdr_len;
	}

	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
				linear, noblock, &err);
	if (!skb)
		goto err;

	if (zerocopy)
		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
	else {
		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
						   len);
		if (!err && m && m->msg_control) {
			struct ubuf_info *uarg = m->msg_control;
			uarg->callback(uarg, false);
		}
	}

	if (err)
		goto err_kfree;

	skb_set_network_header(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	skb->protocol = eth_hdr(skb)->h_proto;

	if (vnet_hdr_len) {
		err = macvtap_skb_from_vnet_hdr(skb, &vnet_hdr);
		if (err)
			goto err_kfree;
	}

	skb_probe_transport_header(skb, ETH_HLEN);

	rcu_read_lock();
	vlan = rcu_dereference(q->vlan);
	/* copy skb_ubuf_info for callback when skb has no error */
	if (zerocopy) {
		skb_shinfo(skb)->destructor_arg = m->msg_control;
		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
	}
	if (vlan) {
		local_bh_disable();
		macvlan_start_xmit(skb, vlan->dev);
		local_bh_enable();
	} else {
		kfree_skb(skb);
	}
	rcu_read_unlock();

	return total_len;

err_kfree:
	kfree_skb(skb);

err:
	rcu_read_lock();
	vlan = rcu_dereference(q->vlan);
	if (vlan)
		vlan->dev->stats.tx_dropped++;
	rcu_read_unlock();

	return err;
}

static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
				 unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct macvtap_queue *q = file->private_data;

	return macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
				file->f_flags & O_NONBLOCK);
}
/* Put packet to the user space buffer */
static ssize_t macvtap_put_user(struct macvtap_queue *q,
				const struct sk_buff *skb,
				const struct iovec *iv, int len)
{
	struct macvlan_dev *vlan;
	int ret;
	int vnet_hdr_len = 0;
	int vlan_offset = 0;
	int copied;

	if (q->flags & IFF_VNET_HDR) {
		struct virtio_net_hdr vnet_hdr;
		vnet_hdr_len = q->vnet_hdr_sz;
		if ((len -= vnet_hdr_len) < 0)
			return -EINVAL;

		ret = macvtap_skb_to_vnet_hdr(skb, &vnet_hdr);
		if (ret)
			return ret;

		if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
			return -EFAULT;
	}
	copied = vnet_hdr_len;

	if (!vlan_tx_tag_present(skb))
		len = min_t(int, skb->len, len);
	else {
		int copy;
		struct {
			__be16 h_vlan_proto;
			__be16 h_vlan_TCI;
		} veth;
		veth.h_vlan_proto = skb->vlan_proto;
		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));

		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
		len = min_t(int, skb->len + VLAN_HLEN, len);

		copy = min_t(int, vlan_offset, len);
		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
		len -= copy;
		copied += copy;
		if (ret || !len)
			goto done;

		copy = min_t(int, sizeof(veth), len);
		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
		len -= copy;
		copied += copy;
		if (ret || !len)
			goto done;
	}

	ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
	copied += len;

done:
	rcu_read_lock();
	vlan = rcu_dereference(q->vlan);
	if (vlan) {
		preempt_disable();
		macvlan_count_rx(vlan, copied - vnet_hdr_len, ret == 0, 0);
		preempt_enable();
	}
	rcu_read_unlock();

	return ret ? ret : copied;
}

static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
			       const struct iovec *iv, unsigned long len,
			       int noblock)
{
	DEFINE_WAIT(wait);
	struct sk_buff *skb;
	ssize_t ret = 0;

	while (len) {
		if (!noblock)
			prepare_to_wait(sk_sleep(&q->sk), &wait,
					TASK_INTERRUPTIBLE);

		/* Read frames from the queue */
		skb = skb_dequeue(&q->sk.sk_receive_queue);
		if (!skb) {
			if (noblock) {
				ret = -EAGAIN;
				break;
			}
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}
			/* Nothing to read, let's sleep */
			schedule();
			continue;
		}
		ret = macvtap_put_user(q, skb, iv, len);
		kfree_skb(skb);
		break;
	}

	if (!noblock)
		finish_wait(sk_sleep(&q->sk), &wait);
	return ret;
}

static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
				unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct macvtap_queue *q = file->private_data;
	ssize_t len, ret = 0;

	len = iov_length(iv, count);
	if (len < 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = macvtap_do_read(q, iocb, iv, len, file->f_flags & O_NONBLOCK);
	ret = min_t(ssize_t, ret, len); /* XXX copied from tun.c. Why? */
out:
	return ret;
}

static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;

	ASSERT_RTNL();
	vlan = rtnl_dereference(q->vlan);
	if (vlan)
		dev_hold(vlan->dev);

	return vlan;
}

static void macvtap_put_vlan(struct macvlan_dev *vlan)
{
	dev_put(vlan->dev);
}

static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
{
	struct macvtap_queue *q = file->private_data;
	struct macvlan_dev *vlan;
	int ret;

	vlan = macvtap_get_vlan(q);
	if (!vlan)
		return -EINVAL;

	if (flags & IFF_ATTACH_QUEUE)
		ret = macvtap_enable_queue(vlan->dev, file, q);
	else if (flags & IFF_DETACH_QUEUE)
		ret = macvtap_disable_queue(q);
	else
		ret = -EINVAL;

	macvtap_put_vlan(vlan);
	return ret;
}
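/*
 * Hypothetical caller's view of set_offload() below: a hypervisor that
 * negotiated checksum and TSO offloads with its guest would issue
 *
 *	ioctl(fd, TUNSETOFFLOAD, TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6);
 *
 * which maps to NETIF_F_HW_CSUM | NETIF_F_TSO | NETIF_F_TSO6 here and,
 * per the comment below, also keeps GRO/LRO enabled so the reader may
 * receive aggregated frames.
 */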
static int set_offload(struct macvtap_queue *q, unsigned long arg)
{
	struct macvlan_dev *vlan;
	netdev_features_t features;
	netdev_features_t feature_mask = 0;

	vlan = rtnl_dereference(q->vlan);
	if (!vlan)
		return -ENOLINK;

	features = vlan->dev->features;

	if (arg & TUN_F_CSUM) {
		feature_mask = NETIF_F_HW_CSUM;

		if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
			if (arg & TUN_F_TSO_ECN)
				feature_mask |= NETIF_F_TSO_ECN;
			if (arg & TUN_F_TSO4)
				feature_mask |= NETIF_F_TSO;
			if (arg & TUN_F_TSO6)
				feature_mask |= NETIF_F_TSO6;
		}

		if (arg & TUN_F_UFO)
			feature_mask |= NETIF_F_UFO;
	}

	/* tun/tap driver inverts the usage for TSO offloads, where
	 * setting the TSO bit means that the userspace wants to
	 * accept TSO frames and turning it off means that user space
	 * does not support TSO.
	 * For macvtap, we have to invert it to mean the same thing:
	 * when user space turns off TSO, we turn off GSO/LRO so that
	 * user space will not receive TSO frames.
	 */
	if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO))
		features |= RX_OFFLOADS;
	else
		features &= ~RX_OFFLOADS;

	/* tap_features are the same as features on tun/tap and
	 * reflect the user's expectations.
	 */
	vlan->tap_features = vlan->dev->features &
			     (feature_mask | ~TUN_OFFLOADS);
	vlan->set_features = features;
	netdev_update_features(vlan->dev);

	return 0;
}

/*
 * provide compatibility with generic tun/tap interface
 */
static long macvtap_ioctl(struct file *file, unsigned int cmd,
			  unsigned long arg)
{
	struct macvtap_queue *q = file->private_data;
	struct macvlan_dev *vlan;
	void __user *argp = (void __user *)arg;
	struct ifreq __user *ifr = argp;
	unsigned int __user *up = argp;
	unsigned int u;
	int __user *sp = argp;
	int s;
	int ret;

	switch (cmd) {
	case TUNSETIFF:
		/* ignore the name, just look at flags */
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;

		ret = 0;
		if ((u & ~(IFF_VNET_HDR | IFF_MULTI_QUEUE)) !=
		    (IFF_NO_PI | IFF_TAP))
			ret = -EINVAL;
		else
			q->flags = u;

		return ret;

	case TUNGETIFF:
		rtnl_lock();
		vlan = macvtap_get_vlan(q);
		if (!vlan) {
			rtnl_unlock();
			return -ENOLINK;
		}

		ret = 0;
		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
		    put_user(q->flags, &ifr->ifr_flags))
			ret = -EFAULT;
		macvtap_put_vlan(vlan);
		rtnl_unlock();
		return ret;

	case TUNSETQUEUE:
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;
		rtnl_lock();
		ret = macvtap_ioctl_set_queue(file, u);
		rtnl_unlock();
		return ret;

	case TUNGETFEATURES:
		if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR |
			     IFF_MULTI_QUEUE, up))
			return -EFAULT;
		return 0;

	case TUNSETSNDBUF:
		if (get_user(u, up))
			return -EFAULT;

		q->sk.sk_sndbuf = u;
		return 0;

	case TUNGETVNETHDRSZ:
		s = q->vnet_hdr_sz;
		if (put_user(s, sp))
			return -EFAULT;
		return 0;

	case TUNSETVNETHDRSZ:
		if (get_user(s, sp))
			return -EFAULT;
		if (s < (int)sizeof(struct virtio_net_hdr))
			return -EINVAL;

		q->vnet_hdr_sz = s;
		return 0;

	case TUNSETOFFLOAD:
		/* let the user check for future flags */
		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
			    TUN_F_TSO_ECN | TUN_F_UFO))
			return -EINVAL;

		/* TODO: only accept frames with the features that
			 got enabled for forwarded frames */
		if (!(q->flags & IFF_VNET_HDR))
			return -EINVAL;
		rtnl_lock();
		ret = set_offload(q, arg);
		rtnl_unlock();
		return ret;

	default:
		return -EINVAL;
	}
}

#ifdef CONFIG_COMPAT
static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#endif

static const struct file_operations macvtap_fops = {
	.owner		= THIS_MODULE,
	.open		= macvtap_open,
	.release	= macvtap_release,
	.aio_read	= macvtap_aio_read,
	.aio_write	= macvtap_aio_write,
	.poll		= macvtap_poll,
	.llseek		= no_llseek,
	.unlocked_ioctl	= macvtap_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= macvtap_compat_ioctl,
#endif
};
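/*
 * The socket ops below give an in-kernel consumer (in practice a
 * virtio backend such as vhost-net; see the note in macvtap_open())
 * the same data path as the character device: it looks up the queue's
 * socket with macvtap_get_socket() and then drives it through
 * sock_sendmsg()/sock_recvmsg().
 */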
static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
			   struct msghdr *m, size_t total_len)
{
	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
	return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
				m->msg_flags & MSG_DONTWAIT);
}

static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
			   struct msghdr *m, size_t total_len,
			   int flags)
{
	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
	int ret;
	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
		return -EINVAL;
	ret = macvtap_do_read(q, iocb, m->msg_iov, total_len,
			      flags & MSG_DONTWAIT);
	if (ret > total_len) {
		m->msg_flags |= MSG_TRUNC;
		ret = flags & MSG_TRUNC ? ret : total_len;
	}
	return ret;
}

/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops macvtap_socket_ops = {
	.sendmsg = macvtap_sendmsg,
	.recvmsg = macvtap_recvmsg,
};

/*
 * Get an underlying socket object from a macvtap file. Returns an
 * error unless the file is attached to a device. The returned object
 * works like a packet socket; it can be used for
 * sock_sendmsg/sock_recvmsg. The caller is responsible for holding a
 * reference to the file for as long as the socket is in use.
 */
struct socket *macvtap_get_socket(struct file *file)
{
	struct macvtap_queue *q;
	if (file->f_op != &macvtap_fops)
		return ERR_PTR(-EINVAL);
	q = file->private_data;
	if (!q)
		return ERR_PTR(-EBADFD);
	return &q->sock;
}
EXPORT_SYMBOL_GPL(macvtap_get_socket);

static int macvtap_device_event(struct notifier_block *unused,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct macvlan_dev *vlan;
	struct device *classdev;
	dev_t devt;
	int err;

	if (dev->rtnl_link_ops != &macvtap_link_ops)
		return NOTIFY_DONE;

	vlan = netdev_priv(dev);

	switch (event) {
	case NETDEV_REGISTER:
		/* Create the device node here after the network device has
		 * been registered but before register_netdevice has
		 * finished running.
		 */
		err = macvtap_get_minor(vlan);
		if (err)
			return notifier_from_errno(err);

		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
		classdev = device_create(macvtap_class, &dev->dev, devt,
					 dev, "tap%d", dev->ifindex);
		if (IS_ERR(classdev)) {
			macvtap_free_minor(vlan);
			return notifier_from_errno(PTR_ERR(classdev));
		}
		break;
	case NETDEV_UNREGISTER:
		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
		device_destroy(macvtap_class, devt);
		macvtap_free_minor(vlan);
		break;
	}

	return NOTIFY_DONE;
}

static struct notifier_block macvtap_notifier_block __read_mostly = {
	.notifier_call	= macvtap_device_event,
};
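/*
 * Net effect of the notifier above: each macvtap link gets a character
 * device node named after its ifindex (e.g. /dev/tap42 for ifindex 42,
 * surfaced by devtmpfs/udev), created on NETDEV_REGISTER and destroyed
 * again on NETDEV_UNREGISTER.
 */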
static int macvtap_init(void)
{
	int err;

	err = alloc_chrdev_region(&macvtap_major, 0,
				  MACVTAP_NUM_DEVS, "macvtap");
	if (err)
		goto out1;

	cdev_init(&macvtap_cdev, &macvtap_fops);
	err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
	if (err)
		goto out2;

	macvtap_class = class_create(THIS_MODULE, "macvtap");
	if (IS_ERR(macvtap_class)) {
		err = PTR_ERR(macvtap_class);
		goto out3;
	}

	err = register_netdevice_notifier(&macvtap_notifier_block);
	if (err)
		goto out4;

	err = macvlan_link_register(&macvtap_link_ops);
	if (err)
		goto out5;

	return 0;

out5:
	unregister_netdevice_notifier(&macvtap_notifier_block);
out4:
	class_unregister(macvtap_class);
out3:
	cdev_del(&macvtap_cdev);
out2:
	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
out1:
	return err;
}
module_init(macvtap_init);

static void macvtap_exit(void)
{
	rtnl_link_unregister(&macvtap_link_ops);
	unregister_netdevice_notifier(&macvtap_notifier_block);
	class_unregister(macvtap_class);
	cdev_del(&macvtap_cdev);
	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
}
module_exit(macvtap_exit);

MODULE_ALIAS_RTNL_LINK("macvtap");
MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
MODULE_LICENSE("GPL");