#include <linux/etherdevice.h>
#include <linux/if_macvlan.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/fs.h>

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <linux/virtio_net.h>

/*
 * A macvtap queue is the central object of this driver. It connects
 * an open character device to a macvlan interface. There can be
 * multiple queues on one interface, which map back to queues
 * implemented in hardware on the underlying device.
 *
 * macvtap_proto is used to allocate queues through the sock allocation
 * mechanism.
 *
 * TODO: multiqueue support is currently not implemented, even though
 * macvtap is basically prepared for that. We will need to add this
 * here as well as in virtio-net and qemu to get line rate on 10gbit
 * adapters from a guest.
 */
struct macvtap_queue {
	struct sock sk;
	struct socket sock;
	struct socket_wq wq;
	int vnet_hdr_sz;
	struct macvlan_dev __rcu *vlan;
	struct file *file;
	unsigned int flags;
};

static struct proto macvtap_proto = {
	.name = "macvtap",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct macvtap_queue),
};

/*
 * Minor number matches netdev->ifindex, so we need a potentially
 * large value. This also makes it possible to split the
 * tap functionality out again in the future by offering it
 * from other drivers besides macvtap. As long as every device
 * only has one tap, the interface numbers ensure that the
 * device nodes are unique.
 */
static dev_t macvtap_major;
#define MACVTAP_NUM_DEVS 65536
#define GOODCOPY_LEN 128
static struct class *macvtap_class;
static struct cdev macvtap_cdev;

static const struct proto_ops macvtap_socket_ops;

/*
 * RCU usage:
 * The macvtap_queue and the macvlan_dev are loosely coupled, the
 * pointers from one to the other can only be read while rcu_read_lock
 * or macvtap_lock is held.
 *
 * Both the file and the macvlan_dev hold a reference on the macvtap_queue
 * through sock_hold(&q->sk). When the macvlan_dev goes away first,
 * q->vlan becomes inaccessible. When the file gets closed,
 * macvtap_get_queue() fails.
 *
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev. The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
 */
static DEFINE_SPINLOCK(macvtap_lock);
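/*
 * Illustrative sketch of the reader side implied by the rules above
 * (not called anywhere in this file): any lookup of q->vlan outside of
 * macvtap_lock must run under rcu_read_lock() and has to tolerate a
 * NULL result, since the macvlan_dev may already be unregistering:
 *
 *	rcu_read_lock();
 *	vlan = rcu_dereference(q->vlan);
 *	if (vlan)
 *		... use vlan->dev ...
 *	rcu_read_unlock();
 */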
/*
 * get_slot: return a slot in vlan->taps[]:
 *	- if 'q' is NULL, return the first empty slot;
 *	- otherwise, return the slot this pointer occupies.
 */
static int get_slot(struct macvlan_dev *vlan, struct macvtap_queue *q)
{
	int i;

	for (i = 0; i < MAX_MACVTAP_QUEUES; i++) {
		if (rcu_dereference(vlan->taps[i]) == q)
			return i;
	}

	/* Should never happen */
	BUG_ON(1);
}

static int macvtap_set_queue(struct net_device *dev, struct file *file,
			     struct macvtap_queue *q)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	int index;
	int err = -EBUSY;

	spin_lock(&macvtap_lock);
	if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
		goto out;

	err = 0;
	index = get_slot(vlan, NULL);
	rcu_assign_pointer(q->vlan, vlan);
	rcu_assign_pointer(vlan->taps[index], q);
	sock_hold(&q->sk);

	q->file = file;
	file->private_data = q;

	vlan->numvtaps++;

out:
	spin_unlock(&macvtap_lock);
	return err;
}

/*
 * The file owning the queue got closed, give up both
 * the reference that the file holds as well as the
 * one from the macvlan_dev if that still exists.
 *
 * Using the spinlock makes sure that we don't get
 * to the queue again after destroying it.
 */
static void macvtap_put_queue(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;

	spin_lock(&macvtap_lock);
	vlan = rcu_dereference_protected(q->vlan,
					 lockdep_is_held(&macvtap_lock));
	if (vlan) {
		int index = get_slot(vlan, q);

		rcu_assign_pointer(vlan->taps[index], NULL);
		rcu_assign_pointer(q->vlan, NULL);
		sock_put(&q->sk);
		--vlan->numvtaps;
	}

	spin_unlock(&macvtap_lock);

	synchronize_rcu();
	sock_put(&q->sk);
}

/*
 * Select a queue based on the rxq of the device on which this packet
 * arrived. If the incoming device is not multiqueue, calculate a flow
 * hash to select a queue. If all fails, find the first available queue.
 * Cache vlan->numvtaps since it can become zero during the execution
 * of this function.
 */
static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
					       struct sk_buff *skb)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *tap = NULL;
	int numvtaps = vlan->numvtaps;
	__u32 rxq;

	if (!numvtaps)
		goto out;

	if (likely(skb_rx_queue_recorded(skb))) {
		rxq = skb_get_rx_queue(skb);

		while (unlikely(rxq >= numvtaps))
			rxq -= numvtaps;

		tap = rcu_dereference(vlan->taps[rxq]);
		if (tap)
			goto out;
	}

	/* Check if we can use flow to select a queue */
	rxq = skb_get_rxhash(skb);
	if (rxq) {
		tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
		if (tap)
			goto out;
	}

	/* Everything failed - find first available queue */
	for (rxq = 0; rxq < MAX_MACVTAP_QUEUES; rxq++) {
		tap = rcu_dereference(vlan->taps[rxq]);
		if (tap)
			break;
	}

out:
	return tap;
}
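/*
 * Worked example for the recorded-rxq path above (illustrative): with
 * numvtaps == 3 and a recorded rx queue of 7, the subtraction loop
 * reduces rxq to 1 (7 - 3 - 3), a cheap substitute for rxq % numvtaps,
 * so hardware queue 7 consistently maps to tap slot 1.
 */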
/*
 * The net_device is going away, give up the reference
 * that it holds on all queues and safely set the pointer
 * from the queues to NULL.
 */
static void macvtap_del_queues(struct net_device *dev)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *q, *qlist[MAX_MACVTAP_QUEUES];
	int i, j = 0;

	/* macvtap_put_queue can free some slots, so go through all slots */
	spin_lock(&macvtap_lock);
	for (i = 0; i < MAX_MACVTAP_QUEUES && vlan->numvtaps; i++) {
		q = rcu_dereference_protected(vlan->taps[i],
					      lockdep_is_held(&macvtap_lock));
		if (q) {
			qlist[j++] = q;
			rcu_assign_pointer(vlan->taps[i], NULL);
			rcu_assign_pointer(q->vlan, NULL);
			vlan->numvtaps--;
		}
	}
	BUG_ON(vlan->numvtaps != 0);
	spin_unlock(&macvtap_lock);

	synchronize_rcu();

	for (--j; j >= 0; j--)
		sock_put(&qlist[j]->sk);
}

/*
 * Forward happens for data that gets sent from one macvlan
 * endpoint to another one in bridge mode. We just take
 * the skb and put it into the receive queue.
 */
static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
{
	struct macvtap_queue *q = macvtap_get_queue(dev, skb);
	if (!q)
		goto drop;

	if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len)
		goto drop;

	skb_queue_tail(&q->sk.sk_receive_queue, skb);
	wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
	return NET_RX_SUCCESS;

drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}

/*
 * Receive is for data from the external interface (lowerdev).
 * In the case of macvtap, we can treat it the same way as
 * forward, which macvlan cannot do.
 */
static int macvtap_receive(struct sk_buff *skb)
{
	skb_push(skb, ETH_HLEN);
	return macvtap_forward(skb->dev, skb);
}

static int macvtap_newlink(struct net *src_net,
			   struct net_device *dev,
			   struct nlattr *tb[],
			   struct nlattr *data[])
{
	struct device *classdev;
	dev_t devt;
	int err;

	err = macvlan_common_newlink(src_net, dev, tb, data,
				     macvtap_receive, macvtap_forward);
	if (err)
		goto out;

	devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);

	classdev = device_create(macvtap_class, &dev->dev, devt,
				 dev, "tap%d", dev->ifindex);
	if (IS_ERR(classdev)) {
		err = PTR_ERR(classdev);
		macvtap_del_queues(dev);
	}

out:
	return err;
}

static void macvtap_dellink(struct net_device *dev,
			    struct list_head *head)
{
	device_destroy(macvtap_class,
		       MKDEV(MAJOR(macvtap_major), dev->ifindex));

	macvtap_del_queues(dev);
	macvlan_dellink(dev, head);
}

static void macvtap_setup(struct net_device *dev)
{
	macvlan_common_setup(dev);
	dev->tx_queue_len = TUN_READQ_SIZE;
}

static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
	.kind		= "macvtap",
	.setup		= macvtap_setup,
	.newlink	= macvtap_newlink,
	.dellink	= macvtap_dellink,
};

static void macvtap_sock_write_space(struct sock *sk)
{
	wait_queue_head_t *wqueue;

	if (!sock_writeable(sk) ||
	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
		return;

	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
}
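/*
 * Note: this pairs with macvtap_poll() below. poll() sets
 * SOCK_ASYNC_NOSPACE when it finds the socket unwriteable; that bit
 * arms the wakeup here, and test_and_clear_bit() makes sure writers
 * are only woken once per poll() that actually observed a full socket.
 */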
static int macvtap_open(struct inode *inode, struct file *file)
{
	struct net *net = current->nsproxy->net_ns;
	struct net_device *dev = dev_get_by_index(net, iminor(inode));
	struct macvlan_dev *vlan;
	struct macvtap_queue *q;
	int err;

	err = -ENODEV;
	if (!dev)
		goto out;

	/* check if this is a macvtap device */
	err = -EINVAL;
	if (dev->rtnl_link_ops != &macvtap_link_ops)
		goto out;

	err = -ENOMEM;
	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
					     &macvtap_proto);
	if (!q)
		goto out;

	q->sock.wq = &q->wq;
	init_waitqueue_head(&q->wq.wait);
	q->sock.type = SOCK_RAW;
	q->sock.state = SS_CONNECTED;
	q->sock.file = file;
	q->sock.ops = &macvtap_socket_ops;
	sock_init_data(&q->sock, &q->sk);
	q->sk.sk_write_space = macvtap_sock_write_space;
	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

	/*
	 * so far only KVM virtio_net uses macvtap, enable zero copy between
	 * guest kernel and host kernel when lower device supports zerocopy
	 */
	vlan = netdev_priv(dev);
	if ((vlan->lowerdev->features & NETIF_F_HIGHDMA) &&
	    (vlan->lowerdev->features & NETIF_F_SG))
		sock_set_flag(&q->sk, SOCK_ZEROCOPY);

	err = macvtap_set_queue(dev, file, q);
	if (err)
		sock_put(&q->sk);

out:
	if (dev)
		dev_put(dev);

	return err;
}

static int macvtap_release(struct inode *inode, struct file *file)
{
	struct macvtap_queue *q = file->private_data;
	macvtap_put_queue(q);
	return 0;
}

static unsigned int macvtap_poll(struct file *file, poll_table *wait)
{
	struct macvtap_queue *q = file->private_data;
	unsigned int mask = POLLERR;

	if (!q)
		goto out;

	mask = 0;
	poll_wait(file, &q->wq.wait, wait);

	if (!skb_queue_empty(&q->sk.sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	if (sock_writeable(&q->sk) ||
	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
	     sock_writeable(&q->sk)))
		mask |= POLLOUT | POLLWRNORM;

out:
	return mask;
}
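/*
 * Userspace view (illustrative sketch, not part of this driver): the
 * queue behaves like any pollable character device, so a consumer can
 * multiplex reads the usual way; "tapfd" is assumed to be an open
 * /dev/tapN file descriptor:
 *
 *	struct pollfd pfd = { .fd = tapfd, .events = POLLIN };
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *		n = read(tapfd, buf, sizeof(buf));
 */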
static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
						size_t len, size_t linear,
						int noblock, int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err);
	if (!skb)
		return NULL;

	skb_reserve(skb, prepad);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}

/* set skb frags from iovec, this can move to core network code for reuse */
static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
				  int offset, size_t count)
{
	int len = iov_length(from, count) - offset;
	int copy = skb_headlen(skb);
	int size, offset1 = 0;
	int i = 0;
	skb_frag_t *f;

	/* Skip over from offset */
	while (count && (offset >= from->iov_len)) {
		offset -= from->iov_len;
		++from;
		--count;
	}

	/* copy up to skb headlen */
	while (count && (copy > 0)) {
		size = min_t(unsigned int, copy, from->iov_len - offset);
		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
				   size))
			return -EFAULT;
		if (copy > size) {
			++from;
			--count;
		}
		copy -= size;
		offset1 += size;
		offset = 0;
	}

	if (len == offset1)
		return 0;

	while (count--) {
		struct page *page[MAX_SKB_FRAGS];
		int num_pages;
		unsigned long base;

		len = from->iov_len - offset1;
		if (!len) {
			offset1 = 0;
			++from;
			continue;
		}
		base = (unsigned long)from->iov_base + offset1;
		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
		if ((num_pages != size) ||
		    (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags))
			/* put_page is in skb free */
			return -EFAULT;
		skb->data_len += len;
		skb->len += len;
		skb->truesize += len;
		/* increase sk_wmem_alloc */
		atomic_add(len, &skb->sk->sk_wmem_alloc);
		while (len) {
			f = &skb_shinfo(skb)->frags[i];
			f->page = page[i];
			f->page_offset = base & ~PAGE_MASK;
			f->size = min_t(int, len, PAGE_SIZE - f->page_offset);
			skb_shinfo(skb)->nr_frags++;
			base += f->size;
			len -= f->size;
			i++;
		}
		offset1 = 0;
		++from;
	}
	return 0;
}
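/*
 * Worked example for the pinning arithmetic above (assuming 4 KiB
 * pages): an iovec segment starting at page offset 0x800 with len
 * 0x1900 needs ((0x800 + 0x1900) + 0xfff) >> 12 = 3 pages, i.e. the
 * segment straddles three pages even though it is smaller than two
 * pages in size.
 */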
/*
 * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
 * be shared with the tun/tap driver.
 */
static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb,
				     struct virtio_net_hdr *vnet_hdr)
{
	unsigned short gso_type = 0;

	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
			gso_type = SKB_GSO_TCPV4;
			break;
		case VIRTIO_NET_HDR_GSO_TCPV6:
			gso_type = SKB_GSO_TCPV6;
			break;
		case VIRTIO_NET_HDR_GSO_UDP:
			gso_type = SKB_GSO_UDP;
			break;
		default:
			return -EINVAL;
		}

		if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
			gso_type |= SKB_GSO_TCP_ECN;

		if (vnet_hdr->gso_size == 0)
			return -EINVAL;
	}

	if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		if (!skb_partial_csum_set(skb, vnet_hdr->csum_start,
					  vnet_hdr->csum_offset))
			return -EINVAL;
	}

	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		skb_shinfo(skb)->gso_size = vnet_hdr->gso_size;
		skb_shinfo(skb)->gso_type = gso_type;

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}
	return 0;
}

static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
				   struct virtio_net_hdr *vnet_hdr)
{
	memset(vnet_hdr, 0, sizeof(*vnet_hdr));

	if (skb_is_gso(skb)) {
		struct skb_shared_info *sinfo = skb_shinfo(skb);

		/* This is a hint as to how much should be linear. */
		vnet_hdr->hdr_len = skb_headlen(skb);
		vnet_hdr->gso_size = sinfo->gso_size;
		if (sinfo->gso_type & SKB_GSO_TCPV4)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
		else if (sinfo->gso_type & SKB_GSO_TCPV6)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
		else if (sinfo->gso_type & SKB_GSO_UDP)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
		else
			BUG();
		if (sinfo->gso_type & SKB_GSO_TCP_ECN)
			vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
	} else
		vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
		vnet_hdr->csum_start = skb_checksum_start_offset(skb);
		vnet_hdr->csum_offset = skb->csum_offset;
	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
		vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
	} /* else everything is zero */

	return 0;
}
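/*
 * Example of the resulting header (illustrative, all values assumed):
 * a TCPv4 GSO skb with a 66 byte linear header, an MSS of 1448 and
 * CHECKSUM_PARTIAL would be described to the reader roughly as:
 *
 *	vnet_hdr = {
 *		.flags		= VIRTIO_NET_HDR_F_NEEDS_CSUM,
 *		.gso_type	= VIRTIO_NET_HDR_GSO_TCPV4,
 *		.hdr_len	= 66,
 *		.gso_size	= 1448,
 *		.csum_start	= 34,	start of the TCP header
 *		.csum_offset	= 16,	checksum offset within tcphdr
 *	};
 */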
/* Get packet from user space buffer */
static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
				const struct iovec *iv, unsigned long total_len,
				size_t count, int noblock)
{
	struct sk_buff *skb;
	struct macvlan_dev *vlan;
	unsigned long len = total_len;
	int err;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int vnet_hdr_len = 0;
	int copylen;
	bool zerocopy = false;

	if (q->flags & IFF_VNET_HDR) {
		vnet_hdr_len = q->vnet_hdr_sz;

		err = -EINVAL;
		if (len < vnet_hdr_len)
			goto err;
		len -= vnet_hdr_len;

		err = memcpy_fromiovecend((void *)&vnet_hdr, iv, 0,
					  sizeof(vnet_hdr));
		if (err < 0)
			goto err;
		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
							vnet_hdr.hdr_len)
			vnet_hdr.hdr_len = vnet_hdr.csum_start +
						vnet_hdr.csum_offset + 2;
		err = -EINVAL;
		if (vnet_hdr.hdr_len > len)
			goto err;
	}

	err = -EINVAL;
	if (unlikely(len < ETH_HLEN))
		goto err;

	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY))
		zerocopy = true;

	if (zerocopy) {
		/* There are 256 bytes to be copied into the skb, so there is
		 * enough room for expanding the skb head in case it is used.
		 * The rest of the buffer is mapped from userspace.
		 */
		copylen = vnet_hdr.hdr_len;
		if (!copylen)
			copylen = GOODCOPY_LEN;
	} else
		copylen = len;

	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
				vnet_hdr.hdr_len, noblock, &err);
	if (!skb)
		goto err;

	if (zerocopy) {
		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
	} else
		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
						   len);
	if (err)
		goto err_kfree;

	skb_set_network_header(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	skb->protocol = eth_hdr(skb)->h_proto;

	if (vnet_hdr_len) {
		err = macvtap_skb_from_vnet_hdr(skb, &vnet_hdr);
		if (err)
			goto err_kfree;
	}

	rcu_read_lock_bh();
	vlan = rcu_dereference_bh(q->vlan);
	/* copy skb_ubuf_info for callback when skb has no error */
	if (zerocopy)
		skb_shinfo(skb)->destructor_arg = m->msg_control;
	if (vlan)
		macvlan_start_xmit(skb, vlan->dev);
	else
		kfree_skb(skb);
	rcu_read_unlock_bh();

	return total_len;

err_kfree:
	kfree_skb(skb);

err:
	rcu_read_lock_bh();
	vlan = rcu_dereference_bh(q->vlan);
	if (vlan)
		vlan->dev->stats.tx_dropped++;
	rcu_read_unlock_bh();

	return err;
}

static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
				 unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct macvtap_queue *q = file->private_data;

	return macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
				file->f_flags & O_NONBLOCK);
}
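/*
 * Userspace write path (illustrative sketch): with IFF_VNET_HDR set
 * (the default here), every write must be prefixed by vnet_hdr_sz
 * bytes of virtio_net_hdr, conveniently done with writev(); "tapfd",
 * "frame" and "frame_len" are assumed:
 *
 *	struct virtio_net_hdr hdr = { .gso_type = VIRTIO_NET_HDR_GSO_NONE };
 *	struct iovec iov[2] = {
 *		{ .iov_base = &hdr,  .iov_len = sizeof(hdr) },
 *		{ .iov_base = frame, .iov_len = frame_len },
 *	};
 *	writev(tapfd, iov, 2);
 */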
/* Put packet to the user space buffer */
static ssize_t macvtap_put_user(struct macvtap_queue *q,
				const struct sk_buff *skb,
				const struct iovec *iv, int len)
{
	struct macvlan_dev *vlan;
	int ret;
	int vnet_hdr_len = 0;

	if (q->flags & IFF_VNET_HDR) {
		struct virtio_net_hdr vnet_hdr;
		vnet_hdr_len = q->vnet_hdr_sz;
		if ((len -= vnet_hdr_len) < 0)
			return -EINVAL;

		ret = macvtap_skb_to_vnet_hdr(skb, &vnet_hdr);
		if (ret)
			return ret;

		if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
			return -EFAULT;
	}

	len = min_t(int, skb->len, len);

	ret = skb_copy_datagram_const_iovec(skb, 0, iv, vnet_hdr_len, len);

	rcu_read_lock_bh();
	vlan = rcu_dereference_bh(q->vlan);
	if (vlan)
		macvlan_count_rx(vlan, len, ret == 0, 0);
	rcu_read_unlock_bh();

	return ret ? ret : (len + vnet_hdr_len);
}

static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
			       const struct iovec *iv, unsigned long len,
			       int noblock)
{
	DECLARE_WAITQUEUE(wait, current);
	struct sk_buff *skb;
	ssize_t ret = 0;

	add_wait_queue(sk_sleep(&q->sk), &wait);
	while (len) {
		current->state = TASK_INTERRUPTIBLE;

		/* Read frames from the queue */
		skb = skb_dequeue(&q->sk.sk_receive_queue);
		if (!skb) {
			if (noblock) {
				ret = -EAGAIN;
				break;
			}
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}
			/* Nothing to read, let's sleep */
			schedule();
			continue;
		}
		ret = macvtap_put_user(q, skb, iv, len);
		kfree_skb(skb);
		break;
	}

	current->state = TASK_RUNNING;
	remove_wait_queue(sk_sleep(&q->sk), &wait);
	return ret;
}

static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
				unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct macvtap_queue *q = file->private_data;
	ssize_t len, ret = 0;

	len = iov_length(iv, count);
	if (len < 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = macvtap_do_read(q, iocb, iv, len, file->f_flags & O_NONBLOCK);
	ret = min_t(ssize_t, ret, len); /* XXX copied from tun.c. Why? */
out:
	return ret;
}

/*
 * provide compatibility with generic tun/tap interface
 */
static long macvtap_ioctl(struct file *file, unsigned int cmd,
			  unsigned long arg)
{
	struct macvtap_queue *q = file->private_data;
	struct macvlan_dev *vlan;
	void __user *argp = (void __user *)arg;
	struct ifreq __user *ifr = argp;
	unsigned int __user *up = argp;
	unsigned int u;
	int __user *sp = argp;
	int s;
	int ret;

	switch (cmd) {
	case TUNSETIFF:
		/* ignore the name, just look at flags */
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;

		ret = 0;
		if ((u & ~IFF_VNET_HDR) != (IFF_NO_PI | IFF_TAP))
			ret = -EINVAL;
		else
			q->flags = u;

		return ret;

	case TUNGETIFF:
		rcu_read_lock_bh();
		vlan = rcu_dereference_bh(q->vlan);
		if (vlan)
			dev_hold(vlan->dev);
		rcu_read_unlock_bh();

		if (!vlan)
			return -ENOLINK;

		ret = 0;
		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
		    put_user(q->flags, &ifr->ifr_flags))
			ret = -EFAULT;
		dev_put(vlan->dev);
		return ret;

	case TUNGETFEATURES:
		if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up))
			return -EFAULT;
		return 0;

	case TUNSETSNDBUF:
		if (get_user(u, up))
			return -EFAULT;

		q->sk.sk_sndbuf = u;
		return 0;

	case TUNGETVNETHDRSZ:
		s = q->vnet_hdr_sz;
		if (put_user(s, sp))
			return -EFAULT;
		return 0;

	case TUNSETVNETHDRSZ:
		if (get_user(s, sp))
			return -EFAULT;
		if (s < (int)sizeof(struct virtio_net_hdr))
			return -EINVAL;

		q->vnet_hdr_sz = s;
		return 0;

	case TUNSETOFFLOAD:
		/* let the user check for future flags */
		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
			    TUN_F_TSO_ECN | TUN_F_UFO))
			return -EINVAL;

		/* TODO: only accept frames with the features that
		   got enabled for forwarded frames */
		if (!(q->flags & IFF_VNET_HDR))
			return -EINVAL;
		return 0;

	default:
		return -EINVAL;
	}
}
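/*
 * Userspace setup example (illustrative sketch): a macvtap device with
 * ifindex N appears as /dev/tapN. Assuming ifindex 4, a typical open
 * sequence negotiates flags and header size through the tun-compatible
 * ioctls above:
 *
 *	int fd = open("/dev/tap4", O_RDWR);
 *	struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR };
 *	ioctl(fd, TUNSETIFF, &ifr);		the name is ignored here
 *	int hdrsz = sizeof(struct virtio_net_hdr);
 *	ioctl(fd, TUNSETVNETHDRSZ, &hdrsz);
 */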
#ifdef CONFIG_COMPAT
static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#endif

static const struct file_operations macvtap_fops = {
	.owner		= THIS_MODULE,
	.open		= macvtap_open,
	.release	= macvtap_release,
	.aio_read	= macvtap_aio_read,
	.aio_write	= macvtap_aio_write,
	.poll		= macvtap_poll,
	.llseek		= no_llseek,
	.unlocked_ioctl	= macvtap_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= macvtap_compat_ioctl,
#endif
};

static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
			   struct msghdr *m, size_t total_len)
{
	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
	return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
				m->msg_flags & MSG_DONTWAIT);
}

static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
			   struct msghdr *m, size_t total_len,
			   int flags)
{
	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
	int ret;

	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
		return -EINVAL;
	ret = macvtap_do_read(q, iocb, m->msg_iov, total_len,
			      flags & MSG_DONTWAIT);
	if (ret > total_len) {
		m->msg_flags |= MSG_TRUNC;
		ret = flags & MSG_TRUNC ? ret : total_len;
	}
	return ret;
}

/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops macvtap_socket_ops = {
	.sendmsg = macvtap_sendmsg,
	.recvmsg = macvtap_recvmsg,
};

/* Get an underlying socket object from tun file. Returns error unless file is
 * attached to a device. The returned object works like a packet socket, it
 * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for
 * holding a reference to the file for as long as the socket is in use. */
struct socket *macvtap_get_socket(struct file *file)
{
	struct macvtap_queue *q;
	if (file->f_op != &macvtap_fops)
		return ERR_PTR(-EINVAL);
	q = file->private_data;
	if (!q)
		return ERR_PTR(-EBADFD);
	return &q->sock;
}
EXPORT_SYMBOL_GPL(macvtap_get_socket);

static int macvtap_init(void)
{
	int err;

	err = alloc_chrdev_region(&macvtap_major, 0,
				  MACVTAP_NUM_DEVS, "macvtap");
	if (err)
		goto out1;

	cdev_init(&macvtap_cdev, &macvtap_fops);
	err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
	if (err)
		goto out2;

	macvtap_class = class_create(THIS_MODULE, "macvtap");
	if (IS_ERR(macvtap_class)) {
		err = PTR_ERR(macvtap_class);
		goto out3;
	}

	err = macvlan_link_register(&macvtap_link_ops);
	if (err)
		goto out4;

	return 0;

out4:
	class_unregister(macvtap_class);
out3:
	cdev_del(&macvtap_cdev);
out2:
	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
out1:
	return err;
}
module_init(macvtap_init);

static void macvtap_exit(void)
{
	rtnl_link_unregister(&macvtap_link_ops);
	class_unregister(macvtap_class);
	cdev_del(&macvtap_cdev);
	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
}
module_exit(macvtap_exit);

MODULE_ALIAS_RTNL_LINK("macvtap");
MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
MODULE_LICENSE("GPL");
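/*
 * Usage note for macvtap_get_socket() above (illustrative sketch): an
 * in-kernel consumer such as vhost-net resolves its fd to this socket
 * and then drives the queue through the proto_ops, for as long as it
 * keeps its reference on "file":
 *
 *	struct socket *sock = macvtap_get_socket(file);
 *	if (IS_ERR(sock))
 *		return PTR_ERR(sock);
 *	err = sock->ops->sendmsg(NULL, sock, &msg, total_len);
 */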