tun.c (943170998b200190f99d3fe7e771437e2c51f319) | tun.c (90e33d45940793def6f773b2d528e9f3c84ffdc7) |
---|---|
1/* 2 * TUN - Universal TUN/TAP device driver. 3 * Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com> 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation; either version 2 of the License, or 8 * (at your option) any later version. --- 61 unchanged lines hidden (view full) --- 70#include <net/netns/generic.h> 71#include <net/rtnetlink.h> 72#include <net/sock.h> 73#include <linux/seq_file.h> 74#include <linux/uio.h> 75#include <linux/skb_array.h> 76#include <linux/bpf.h> 77#include <linux/bpf_trace.h> | 1/* 2 * TUN - Universal TUN/TAP device driver. 3 * Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com> 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation; either version 2 of the License, or 8 * (at your option) any later version. --- 61 unchanged lines hidden (view full) --- 70#include <net/netns/generic.h> 71#include <net/rtnetlink.h> 72#include <net/sock.h> 73#include <linux/seq_file.h> 74#include <linux/uio.h> 75#include <linux/skb_array.h> 76#include <linux/bpf.h> 77#include <linux/bpf_trace.h> |
78#include <linux/mutex.h> |
|
78 79#include <linux/uaccess.h> 80 81/* Uncomment to enable debugging */ 82/* #define TUN_DEBUG 1 */ 83 84#ifdef TUN_DEBUG 85static int debug; --- 30 unchanged lines hidden (view full) --- 116 * overload it to mean fasync when stored there. 117 */ 118#define TUN_FASYNC IFF_ATTACH_QUEUE 119/* High bits in flags field are unused. */ 120#define TUN_VNET_LE 0x80000000 121#define TUN_VNET_BE 0x40000000 122 123#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ | 79 80#include <linux/uaccess.h> 81 82/* Uncomment to enable debugging */ 83/* #define TUN_DEBUG 1 */ 84 85#ifdef TUN_DEBUG 86static int debug; --- 30 unchanged lines hidden (view full) --- 117 * overload it to mean fasync when stored there. 118 */ 119#define TUN_FASYNC IFF_ATTACH_QUEUE 120/* High bits in flags field are unused. */ 121#define TUN_VNET_LE 0x80000000 122#define TUN_VNET_BE 0x40000000 123 124#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ |
124 IFF_MULTI_QUEUE | IFF_NAPI) | 125 IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) 126 |
125#define GOODCOPY_LEN 128 126 127#define FLT_EXACT_COUNT 8 128struct tap_filter { 129 unsigned int count; /* Number of addrs. Zero means disabled */ 130 u32 mask[2]; /* Mask of the hashed addrs */ 131 unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; 132}; --- 35 unchanged lines hidden (view full) --- 168 struct fasync_struct *fasync; 169 /* only used for fasnyc */ 170 unsigned int flags; 171 union { 172 u16 queue_index; 173 unsigned int ifindex; 174 }; 175 struct napi_struct napi; | 127#define GOODCOPY_LEN 128 128 129#define FLT_EXACT_COUNT 8 130struct tap_filter { 131 unsigned int count; /* Number of addrs. Zero means disabled */ 132 u32 mask[2]; /* Mask of the hashed addrs */ 133 unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; 134}; --- 35 unchanged lines hidden (view full) --- 170 struct fasync_struct *fasync; 171 /* only used for fasnyc */ 172 unsigned int flags; 173 union { 174 u16 queue_index; 175 unsigned int ifindex; 176 }; 177 struct napi_struct napi; |
178 struct mutex napi_mutex; /* Protects access to the above napi */ |
|
176 struct list_head next; 177 struct tun_struct *detached; 178 struct skb_array tx_array; 179}; 180 181struct tun_flow_entry { 182 struct hlist_node hash_link; 183 struct rcu_head rcu; --- 88 unchanged lines hidden (view full) --- 272 273static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, 274 bool napi_en) 275{ 276 if (napi_en) { 277 netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll, 278 NAPI_POLL_WEIGHT); 279 napi_enable(&tfile->napi); | 179 struct list_head next; 180 struct tun_struct *detached; 181 struct skb_array tx_array; 182}; 183 184struct tun_flow_entry { 185 struct hlist_node hash_link; 186 struct rcu_head rcu; --- 88 unchanged lines hidden (view full) --- 275 276static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, 277 bool napi_en) 278{ 279 if (napi_en) { 280 netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll, 281 NAPI_POLL_WEIGHT); 282 napi_enable(&tfile->napi); |
283 mutex_init(&tfile->napi_mutex); |
|
280 } 281} 282 283static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile) 284{ 285 if (tun->flags & IFF_NAPI) 286 napi_disable(&tfile->napi); 287} 288 289static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile) 290{ 291 if (tun->flags & IFF_NAPI) 292 netif_napi_del(&tfile->napi); 293} 294 | 284 } 285} 286 287static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile) 288{ 289 if (tun->flags & IFF_NAPI) 290 napi_disable(&tfile->napi); 291} 292 293static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile) 294{ 295 if (tun->flags & IFF_NAPI) 296 netif_napi_del(&tfile->napi); 297} 298 |
299static bool tun_napi_frags_enabled(const struct tun_struct *tun) 300{ 301 return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS; 302} 303 |
|
295#ifdef CONFIG_TUN_VNET_CROSS_LE 296static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) 297{ 298 return tun->flags & TUN_VNET_BE ? false : 299 virtio_legacy_is_little_endian(); 300} 301 302static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) --- 728 unchanged lines hidden (view full) --- 1031 * 2) the tun socket gets a sendmsg call from user space 1032 * If NAPI is not enabled, since both of those are synchronous 1033 * operations, we are guaranteed never to have pending data when we poll 1034 * for it so there is nothing to do here but return. 1035 * We need this though so netpoll recognizes us as an interface that 1036 * supports polling, which enables bridge devices in virt setups to 1037 * still use netconsole 1038 * If NAPI is enabled, however, we need to schedule polling for all | 304#ifdef CONFIG_TUN_VNET_CROSS_LE 305static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) 306{ 307 return tun->flags & TUN_VNET_BE ? false : 308 virtio_legacy_is_little_endian(); 309} 310 311static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) --- 728 unchanged lines hidden (view full) --- 1040 * 2) the tun socket gets a sendmsg call from user space 1041 * If NAPI is not enabled, since both of those are synchronous 1042 * operations, we are guaranteed never to have pending data when we poll 1043 * for it so there is nothing to do here but return. 1044 * We need this though so netpoll recognizes us as an interface that 1045 * supports polling, which enables bridge devices in virt setups to 1046 * still use netconsole 1047 * If NAPI is enabled, however, we need to schedule polling for all |
1039 * queues. | 1048 * queues unless we are using napi_gro_frags(), which we call in 1049 * process context and not in NAPI context. |
1040 */ 1041 struct tun_struct *tun = netdev_priv(dev); 1042 1043 if (tun->flags & IFF_NAPI) { 1044 struct tun_file *tfile; 1045 int i; 1046 | 1050 */ 1051 struct tun_struct *tun = netdev_priv(dev); 1052 1053 if (tun->flags & IFF_NAPI) { 1054 struct tun_file *tfile; 1055 int i; 1056 |
1057 if (tun_napi_frags_enabled(tun)) 1058 return; 1059 |
|
1047 rcu_read_lock(); 1048 for (i = 0; i < tun->numqueues; i++) { 1049 tfile = rcu_dereference(tun->tfiles[i]); 1050 napi_schedule(&tfile->napi); 1051 } 1052 rcu_read_unlock(); 1053 } 1054 return; --- 206 unchanged lines hidden (view full) --- 1261 1262 if (tun->dev->reg_state != NETREG_REGISTERED) 1263 mask = POLLERR; 1264 1265 tun_put(tun); 1266 return mask; 1267} 1268 | 1060 rcu_read_lock(); 1061 for (i = 0; i < tun->numqueues; i++) { 1062 tfile = rcu_dereference(tun->tfiles[i]); 1063 napi_schedule(&tfile->napi); 1064 } 1065 rcu_read_unlock(); 1066 } 1067 return; --- 206 unchanged lines hidden (view full) --- 1274 1275 if (tun->dev->reg_state != NETREG_REGISTERED) 1276 mask = POLLERR; 1277 1278 tun_put(tun); 1279 return mask; 1280} 1281 |
1282static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, 1283 size_t len, 1284 const struct iov_iter *it) 1285{ 1286 struct sk_buff *skb; 1287 size_t linear; 1288 int err; 1289 int i; 1290 1291 if (it->nr_segs > MAX_SKB_FRAGS + 1) 1292 return ERR_PTR(-ENOMEM); 1293 1294 local_bh_disable(); 1295 skb = napi_get_frags(&tfile->napi); 1296 local_bh_enable(); 1297 if (!skb) 1298 return ERR_PTR(-ENOMEM); 1299 1300 linear = iov_iter_single_seg_count(it); 1301 err = __skb_grow(skb, linear); 1302 if (err) 1303 goto free; 1304 1305 skb->len = len; 1306 skb->data_len = len - linear; 1307 skb->truesize += skb->data_len; 1308 1309 for (i = 1; i < it->nr_segs; i++) { 1310 size_t fragsz = it->iov[i].iov_len; 1311 unsigned long offset; 1312 struct page *page; 1313 void *data; 1314 1315 if (fragsz == 0 || fragsz > PAGE_SIZE) { 1316 err = -EINVAL; 1317 goto free; 1318 } 1319 1320 local_bh_disable(); 1321 data = napi_alloc_frag(fragsz); 1322 local_bh_enable(); 1323 if (!data) { 1324 err = -ENOMEM; 1325 goto free; 1326 } 1327 1328 page = virt_to_head_page(data); 1329 offset = data - page_address(page); 1330 skb_fill_page_desc(skb, i - 1, page, offset, fragsz); 1331 } 1332 1333 return skb; 1334free: 1335 /* frees skb and all frags allocated with napi_alloc_frag() */ 1336 napi_free_frags(&tfile->napi); 1337 return ERR_PTR(err); 1338} 1339 |
|
1269/* prepad is the amount to reserve at front. len is length after that. 1270 * linear is a hint as to how much to copy (usually headers). */ 1271static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, 1272 size_t prepad, size_t len, 1273 size_t linear, int noblock) 1274{ 1275 struct sock *sk = tfile->socket.sk; 1276 struct sk_buff *skb; --- 196 unchanged lines hidden (view full) --- 1473 struct virtio_net_hdr gso = { 0 }; 1474 struct tun_pcpu_stats *stats; 1475 int good_linear; 1476 int copylen; 1477 bool zerocopy = false; 1478 int err; 1479 u32 rxhash; 1480 int skb_xdp = 1; | 1340/* prepad is the amount to reserve at front. len is length after that. 1341 * linear is a hint as to how much to copy (usually headers). */ 1342static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, 1343 size_t prepad, size_t len, 1344 size_t linear, int noblock) 1345{ 1346 struct sock *sk = tfile->socket.sk; 1347 struct sk_buff *skb; --- 196 unchanged lines hidden (view full) --- 1544 struct virtio_net_hdr gso = { 0 }; 1545 struct tun_pcpu_stats *stats; 1546 int good_linear; 1547 int copylen; 1548 bool zerocopy = false; 1549 int err; 1550 u32 rxhash; 1551 int skb_xdp = 1; |
1552 bool frags = tun_napi_frags_enabled(tun); |
|
1481 1482 if (!(tun->dev->flags & IFF_UP)) 1483 return -EIO; 1484 1485 if (!(tun->flags & IFF_NO_PI)) { 1486 if (len < sizeof(pi)) 1487 return -EINVAL; 1488 len -= sizeof(pi); --- 41 unchanged lines hidden (view full) --- 1530 if (copylen > good_linear) 1531 copylen = good_linear; 1532 linear = copylen; 1533 iov_iter_advance(&i, copylen); 1534 if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) 1535 zerocopy = true; 1536 } 1537 | 1553 1554 if (!(tun->dev->flags & IFF_UP)) 1555 return -EIO; 1556 1557 if (!(tun->flags & IFF_NO_PI)) { 1558 if (len < sizeof(pi)) 1559 return -EINVAL; 1560 len -= sizeof(pi); --- 41 unchanged lines hidden (view full) --- 1602 if (copylen > good_linear) 1603 copylen = good_linear; 1604 linear = copylen; 1605 iov_iter_advance(&i, copylen); 1606 if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) 1607 zerocopy = true; 1608 } 1609 |
1538 if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { | 1610 if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { |
1539 /* For the packet that is not easy to be processed 1540 * (e.g gso or jumbo packet), we will do it at after 1541 * skb was created with generic XDP routine. 1542 */ 1543 skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp); 1544 if (IS_ERR(skb)) { 1545 this_cpu_inc(tun->pcpu_stats->rx_dropped); 1546 return PTR_ERR(skb); --- 4 unchanged lines hidden (view full) --- 1551 if (!zerocopy) { 1552 copylen = len; 1553 if (tun16_to_cpu(tun, gso.hdr_len) > good_linear) 1554 linear = good_linear; 1555 else 1556 linear = tun16_to_cpu(tun, gso.hdr_len); 1557 } 1558 | 1611 /* For the packet that is not easy to be processed 1612 * (e.g gso or jumbo packet), we will do it at after 1613 * skb was created with generic XDP routine. 1614 */ 1615 skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp); 1616 if (IS_ERR(skb)) { 1617 this_cpu_inc(tun->pcpu_stats->rx_dropped); 1618 return PTR_ERR(skb); --- 4 unchanged lines hidden (view full) --- 1623 if (!zerocopy) { 1624 copylen = len; 1625 if (tun16_to_cpu(tun, gso.hdr_len) > good_linear) 1626 linear = good_linear; 1627 else 1628 linear = tun16_to_cpu(tun, gso.hdr_len); 1629 } 1630 |
1559 skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); | 1631 if (frags) { 1632 mutex_lock(&tfile->napi_mutex); 1633 skb = tun_napi_alloc_frags(tfile, copylen, from); 1634 /* tun_napi_alloc_frags() enforces a layout for the skb. 1635 * If zerocopy is enabled, then this layout will be 1636 * overwritten by zerocopy_sg_from_iter(). 1637 */ 1638 zerocopy = false; 1639 } else { 1640 skb = tun_alloc_skb(tfile, align, copylen, linear, 1641 noblock); 1642 } 1643 |
1560 if (IS_ERR(skb)) { 1561 if (PTR_ERR(skb) != -EAGAIN) 1562 this_cpu_inc(tun->pcpu_stats->rx_dropped); | 1644 if (IS_ERR(skb)) { 1645 if (PTR_ERR(skb) != -EAGAIN) 1646 this_cpu_inc(tun->pcpu_stats->rx_dropped); |
1647 if (frags) 1648 mutex_unlock(&tfile->napi_mutex); |
|
1563 return PTR_ERR(skb); 1564 } 1565 1566 if (zerocopy) 1567 err = zerocopy_sg_from_iter(skb, from); 1568 else 1569 err = skb_copy_datagram_from_iter(skb, 0, from, len); 1570 1571 if (err) { 1572 this_cpu_inc(tun->pcpu_stats->rx_dropped); 1573 kfree_skb(skb); | 1649 return PTR_ERR(skb); 1650 } 1651 1652 if (zerocopy) 1653 err = zerocopy_sg_from_iter(skb, from); 1654 else 1655 err = skb_copy_datagram_from_iter(skb, 0, from, len); 1656 1657 if (err) { 1658 this_cpu_inc(tun->pcpu_stats->rx_dropped); 1659 kfree_skb(skb); |
1660 if (frags) { 1661 tfile->napi.skb = NULL; 1662 mutex_unlock(&tfile->napi_mutex); 1663 } 1664 |
|
1574 return -EFAULT; 1575 } 1576 } 1577 1578 if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { 1579 this_cpu_inc(tun->pcpu_stats->rx_frame_errors); 1580 kfree_skb(skb); | 1665 return -EFAULT; 1666 } 1667 } 1668 1669 if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { 1670 this_cpu_inc(tun->pcpu_stats->rx_frame_errors); 1671 kfree_skb(skb); |
1672 if (frags) { 1673 tfile->napi.skb = NULL; 1674 mutex_unlock(&tfile->napi_mutex); 1675 } 1676 |
|
1581 return -EINVAL; 1582 } 1583 1584 switch (tun->flags & TUN_TYPE_MASK) { 1585 case IFF_TUN: 1586 if (tun->flags & IFF_NO_PI) { 1587 switch (skb->data[0] & 0xf0) { 1588 case 0x40: --- 9 unchanged lines hidden (view full) --- 1598 } 1599 } 1600 1601 skb_reset_mac_header(skb); 1602 skb->protocol = pi.proto; 1603 skb->dev = tun->dev; 1604 break; 1605 case IFF_TAP: | 1677 return -EINVAL; 1678 } 1679 1680 switch (tun->flags & TUN_TYPE_MASK) { 1681 case IFF_TUN: 1682 if (tun->flags & IFF_NO_PI) { 1683 switch (skb->data[0] & 0xf0) { 1684 case 0x40: --- 9 unchanged lines hidden (view full) --- 1694 } 1695 } 1696 1697 skb_reset_mac_header(skb); 1698 skb->protocol = pi.proto; 1699 skb->dev = tun->dev; 1700 break; 1701 case IFF_TAP: |
1606 skb->protocol = eth_type_trans(skb, tun->dev); | 1702 if (!frags) 1703 skb->protocol = eth_type_trans(skb, tun->dev); |
1607 break; 1608 } 1609 1610 /* copy skb_ubuf_info for callback when skb has no error */ 1611 if (zerocopy) { 1612 skb_shinfo(skb)->destructor_arg = msg_control; 1613 skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; 1614 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; --- 18 unchanged lines hidden (view full) --- 1633 return total_len; 1634 } 1635 } 1636 rcu_read_unlock(); 1637 } 1638 1639 rxhash = __skb_get_hash_symmetric(skb); 1640 | 1704 break; 1705 } 1706 1707 /* copy skb_ubuf_info for callback when skb has no error */ 1708 if (zerocopy) { 1709 skb_shinfo(skb)->destructor_arg = msg_control; 1710 skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; 1711 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; --- 18 unchanged lines hidden (view full) --- 1730 return total_len; 1731 } 1732 } 1733 rcu_read_unlock(); 1734 } 1735 1736 rxhash = __skb_get_hash_symmetric(skb); 1737 |
1641 if (tun->flags & IFF_NAPI) { | 1738 if (frags) { 1739 /* Exercise flow dissector code path. */ 1740 u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb)); 1741 1742 if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) { 1743 this_cpu_inc(tun->pcpu_stats->rx_dropped); 1744 napi_free_frags(&tfile->napi); 1745 mutex_unlock(&tfile->napi_mutex); 1746 WARN_ON(1); 1747 return -ENOMEM; 1748 } 1749 1750 local_bh_disable(); 1751 napi_gro_frags(&tfile->napi); 1752 local_bh_enable(); 1753 mutex_unlock(&tfile->napi_mutex); 1754 } else if (tun->flags & IFF_NAPI) { |
1642 struct sk_buff_head *queue = &tfile->sk.sk_write_queue; 1643 int queue_len; 1644 1645 spin_lock_bh(&queue->lock); 1646 __skb_queue_tail(queue, skb); 1647 queue_len = skb_queue_len(queue); 1648 spin_unlock(&queue->lock); 1649 --- 406 unchanged lines hidden (view full) --- 2056 struct tun_struct *tun; 2057 struct tun_file *tfile = file->private_data; 2058 struct net_device *dev; 2059 int err; 2060 2061 if (tfile->detached) 2062 return -EINVAL; 2063 | 1755 struct sk_buff_head *queue = &tfile->sk.sk_write_queue; 1756 int queue_len; 1757 1758 spin_lock_bh(&queue->lock); 1759 __skb_queue_tail(queue, skb); 1760 queue_len = skb_queue_len(queue); 1761 spin_unlock(&queue->lock); 1762 --- 406 unchanged lines hidden (view full) --- 2169 struct tun_struct *tun; 2170 struct tun_file *tfile = file->private_data; 2171 struct net_device *dev; 2172 int err; 2173 2174 if (tfile->detached) 2175 return -EINVAL; 2176 |
2177 if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) { 2178 if (!capable(CAP_NET_ADMIN)) 2179 return -EPERM; 2180 2181 if (!(ifr->ifr_flags & IFF_NAPI) || 2182 (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP) 2183 return -EINVAL; 2184 } 2185 |
|
2064 dev = __dev_get_by_name(net, ifr->ifr_name); 2065 if (dev) { 2066 if (ifr->ifr_flags & IFF_TUN_EXCL) 2067 return -EBUSY; 2068 if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops) 2069 tun = netdev_priv(dev); 2070 else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops) 2071 tun = netdev_priv(dev); --- 903 unchanged lines hidden --- | 2186 dev = __dev_get_by_name(net, ifr->ifr_name); 2187 if (dev) { 2188 if (ifr->ifr_flags & IFF_TUN_EXCL) 2189 return -EBUSY; 2190 if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops) 2191 tun = netdev_priv(dev); 2192 else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops) 2193 tun = netdev_priv(dev); --- 903 unchanged lines hidden --- |