tun.c: diff between commits 943170998b200190f99d3fe7e771437e2c51f319 (before) and 90e33d45940793def6f773b2d528e9f3c84ffdc7 (after)
1/*
2 * TUN - Universal TUN/TAP device driver.
3 * Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.

--- 61 unchanged lines hidden ---

70#include <net/netns/generic.h>
71#include <net/rtnetlink.h>
72#include <net/sock.h>
73#include <linux/seq_file.h>
74#include <linux/uio.h>
75#include <linux/skb_array.h>
76#include <linux/bpf.h>
77#include <linux/bpf_trace.h>
78#include <linux/mutex.h>
79
80#include <linux/uaccess.h>
81
82/* Uncomment to enable debugging */
83/* #define TUN_DEBUG 1 */
84
85#ifdef TUN_DEBUG
86static int debug;

--- 30 unchanged lines hidden ---

117 * overload it to mean fasync when stored there.
118 */
119#define TUN_FASYNC IFF_ATTACH_QUEUE
120/* High bits in flags field are unused. */
121#define TUN_VNET_LE 0x80000000
122#define TUN_VNET_BE 0x40000000
123
124#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
124 IFF_MULTI_QUEUE | IFF_NAPI)
125 IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
126
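
For context, the per-queue flags listed in TUN_FEATURES are passed in from user space via ifr_flags with the TUNSETIFF ioctl when a queue is created or attached. A minimal user-space sketch that requests the new IFF_NAPI_FRAGS mode (the helper name below is invented, and IFF_NAPI/IFF_NAPI_FRAGS are only present in <linux/if_tun.h> on kernels that carry these patches):

    /* Hypothetical helper, not part of tun.c. */
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/if.h>
    #include <linux/if_tun.h>

    static int open_tap_napi_frags(const char *name)
    {
            struct ifreq ifr;
            int fd = open("/dev/net/tun", O_RDWR);

            if (fd < 0)
                    return -1;

            memset(&ifr, 0, sizeof(ifr));
            /* IFF_NAPI_FRAGS is only accepted together with IFF_NAPI on a tap. */
            ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_NAPI | IFF_NAPI_FRAGS;
            strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);

            if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
                    close(fd);
                    return -1;
            }
            return fd;
    }

If the ioctl is rejected, the caller can drop IFF_NAPI_FRAGS and retry; the validation that triggers this is added to tun_set_iff() later in this diff.
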
127#define GOODCOPY_LEN 128
128
129#define FLT_EXACT_COUNT 8
130struct tap_filter {
131 unsigned int count; /* Number of addrs. Zero means disabled */
132 u32 mask[2]; /* Mask of the hashed addrs */
133 unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN];
134};

--- 35 unchanged lines hidden ---

170 struct fasync_struct *fasync;
171	 /* only used for fasync */
172 unsigned int flags;
173 union {
174 u16 queue_index;
175 unsigned int ifindex;
176 };
177 struct napi_struct napi;
178 struct mutex napi_mutex; /* Protects access to the above napi */
179 struct list_head next;
180 struct tun_struct *detached;
181 struct skb_array tx_array;
182};
183
184struct tun_flow_entry {
185 struct hlist_node hash_link;
186 struct rcu_head rcu;

--- 88 unchanged lines hidden ---

275
276static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
277 bool napi_en)
278{
279 if (napi_en) {
280 netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
281 NAPI_POLL_WEIGHT);
282 napi_enable(&tfile->napi);
283 mutex_init(&tfile->napi_mutex);
284 }
285}
286
287static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile)
288{
289 if (tun->flags & IFF_NAPI)
290 napi_disable(&tfile->napi);
291}
292
293static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
294{
295 if (tun->flags & IFF_NAPI)
296 netif_napi_del(&tfile->napi);
297}
298
299static bool tun_napi_frags_enabled(const struct tun_struct *tun)
300{
301 return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
302}
303
304#ifdef CONFIG_TUN_VNET_CROSS_LE
305static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
306{
307 return tun->flags & TUN_VNET_BE ? false :
308 virtio_legacy_is_little_endian();
309}
310
311static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)

--- 728 unchanged lines hidden ---

1040 * 2) the tun socket gets a sendmsg call from user space
1041 * If NAPI is not enabled, since both of those are synchronous
1042 * operations, we are guaranteed never to have pending data when we poll
1043 * for it so there is nothing to do here but return.
1044 * We need this though so netpoll recognizes us as an interface that
1045 * supports polling, which enables bridge devices in virt setups to
1046	 * still use netconsole.
1047 * If NAPI is enabled, however, we need to schedule polling for all
1039 * queues.
1048 * queues unless we are using napi_gro_frags(), which we call in
1049 * process context and not in NAPI context.
1050 */
1051 struct tun_struct *tun = netdev_priv(dev);
1052
1053 if (tun->flags & IFF_NAPI) {
1054 struct tun_file *tfile;
1055 int i;
1056
1057 if (tun_napi_frags_enabled(tun))
1058 return;
1059
1060 rcu_read_lock();
1061 for (i = 0; i < tun->numqueues; i++) {
1062 tfile = rcu_dereference(tun->tfiles[i]);
1063 napi_schedule(&tfile->napi);
1064 }
1065 rcu_read_unlock();
1066 }
1067 return;

--- 206 unchanged lines hidden ---

1274
1275 if (tun->dev->reg_state != NETREG_REGISTERED)
1276 mask = POLLERR;
1277
1278 tun_put(tun);
1279 return mask;
1280}
1281
1282static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
1283 size_t len,
1284 const struct iov_iter *it)
1285{
1286 struct sk_buff *skb;
1287 size_t linear;
1288 int err;
1289 int i;
1290
1291 if (it->nr_segs > MAX_SKB_FRAGS + 1)
1292 return ERR_PTR(-ENOMEM);
1293
1294 local_bh_disable();
1295 skb = napi_get_frags(&tfile->napi);
1296 local_bh_enable();
1297 if (!skb)
1298 return ERR_PTR(-ENOMEM);
1299
1300 linear = iov_iter_single_seg_count(it);
1301 err = __skb_grow(skb, linear);
1302 if (err)
1303 goto free;
1304
1305 skb->len = len;
1306 skb->data_len = len - linear;
1307 skb->truesize += skb->data_len;
1308
1309 for (i = 1; i < it->nr_segs; i++) {
1310 size_t fragsz = it->iov[i].iov_len;
1311 unsigned long offset;
1312 struct page *page;
1313 void *data;
1314
1315 if (fragsz == 0 || fragsz > PAGE_SIZE) {
1316 err = -EINVAL;
1317 goto free;
1318 }
1319
1320 local_bh_disable();
1321 data = napi_alloc_frag(fragsz);
1322 local_bh_enable();
1323 if (!data) {
1324 err = -ENOMEM;
1325 goto free;
1326 }
1327
1328 page = virt_to_head_page(data);
1329 offset = data - page_address(page);
1330 skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
1331 }
1332
1333 return skb;
1334free:
1335 /* frees skb and all frags allocated with napi_alloc_frag() */
1336 napi_free_frags(&tfile->napi);
1337 return ERR_PTR(err);
1338}
1339
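
tun_napi_alloc_frags() above is what lets a multi-segment write from user space arrive as a fragmented skb for napi_gro_frags(): the first iovec segment becomes the linear area, each further segment becomes one page fragment, and the function rejects writes with more than MAX_SKB_FRAGS + 1 segments or with any fragment that is empty or larger than PAGE_SIZE. A rough user-space sketch of a matching writev() call (the function and parameter names are illustrative only):

    #include <sys/types.h>
    #include <sys/uio.h>

    /* 'fd' is a tap queue opened with IFF_NAPI | IFF_NAPI_FRAGS (see the
     * TUNSETIFF sketch earlier). The first segment becomes the linear area;
     * each later segment becomes one page fragment and, per the checks
     * above, must be between 1 byte and PAGE_SIZE in length. */
    static ssize_t write_fragmented(int fd, const void *hdr, size_t hdr_len,
                                    const void *payload, size_t payload_len)
    {
            struct iovec iov[2] = {
                    { .iov_base = (void *)hdr,     .iov_len = hdr_len },
                    { .iov_base = (void *)payload, .iov_len = payload_len },
            };

            return writev(fd, iov, 2);
    }
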
1340/* prepad is the amount to reserve at front. len is length after that.
1341 * linear is a hint as to how much to copy (usually headers). */
1342static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
1343 size_t prepad, size_t len,
1344 size_t linear, int noblock)
1345{
1346 struct sock *sk = tfile->socket.sk;
1347 struct sk_buff *skb;

--- 196 unchanged lines hidden ---

1544 struct virtio_net_hdr gso = { 0 };
1545 struct tun_pcpu_stats *stats;
1546 int good_linear;
1547 int copylen;
1548 bool zerocopy = false;
1549 int err;
1550 u32 rxhash;
1551 int skb_xdp = 1;
1552 bool frags = tun_napi_frags_enabled(tun);
1553
1554 if (!(tun->dev->flags & IFF_UP))
1555 return -EIO;
1556
1557 if (!(tun->flags & IFF_NO_PI)) {
1558 if (len < sizeof(pi))
1559 return -EINVAL;
1560 len -= sizeof(pi);

--- 41 unchanged lines hidden ---

1602 if (copylen > good_linear)
1603 copylen = good_linear;
1604 linear = copylen;
1605 iov_iter_advance(&i, copylen);
1606 if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
1607 zerocopy = true;
1608 }
1609
1538 if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
1610 if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
1611	 /* For packets that are not easy to process
1612	 * (e.g. gso or jumbo packets), we will do it after
1613	 * the skb has been created, with the generic XDP routine.
1614 */
1615 skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp);
1616 if (IS_ERR(skb)) {
1617 this_cpu_inc(tun->pcpu_stats->rx_dropped);
1618 return PTR_ERR(skb);

--- 4 unchanged lines hidden ---

1623 if (!zerocopy) {
1624 copylen = len;
1625 if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
1626 linear = good_linear;
1627 else
1628 linear = tun16_to_cpu(tun, gso.hdr_len);
1629 }
1630
1559 skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
1631 if (frags) {
1632 mutex_lock(&tfile->napi_mutex);
1633 skb = tun_napi_alloc_frags(tfile, copylen, from);
1634 /* tun_napi_alloc_frags() enforces a layout for the skb.
1635 * If zerocopy is enabled, then this layout will be
1636 * overwritten by zerocopy_sg_from_iter().
1637 */
1638 zerocopy = false;
1639 } else {
1640 skb = tun_alloc_skb(tfile, align, copylen, linear,
1641 noblock);
1642 }
1643
1644 if (IS_ERR(skb)) {
1645 if (PTR_ERR(skb) != -EAGAIN)
1646 this_cpu_inc(tun->pcpu_stats->rx_dropped);
1647 if (frags)
1648 mutex_unlock(&tfile->napi_mutex);
1649 return PTR_ERR(skb);
1650 }
1651
1652 if (zerocopy)
1653 err = zerocopy_sg_from_iter(skb, from);
1654 else
1655 err = skb_copy_datagram_from_iter(skb, 0, from, len);
1656
1657 if (err) {
1658 this_cpu_inc(tun->pcpu_stats->rx_dropped);
1659 kfree_skb(skb);
1660 if (frags) {
1661 tfile->napi.skb = NULL;
1662 mutex_unlock(&tfile->napi_mutex);
1663 }
1664
1665 return -EFAULT;
1666 }
1667 }
1668
1669 if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
1670 this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
1671 kfree_skb(skb);
1672 if (frags) {
1673 tfile->napi.skb = NULL;
1674 mutex_unlock(&tfile->napi_mutex);
1675 }
1676
1677 return -EINVAL;
1678 }
1679
1680 switch (tun->flags & TUN_TYPE_MASK) {
1681 case IFF_TUN:
1682 if (tun->flags & IFF_NO_PI) {
1683 switch (skb->data[0] & 0xf0) {
1684 case 0x40:

--- 9 unchanged lines hidden (view full) ---

1694 }
1695 }
1696
1697 skb_reset_mac_header(skb);
1698 skb->protocol = pi.proto;
1699 skb->dev = tun->dev;
1700 break;
1701 case IFF_TAP:
1606 skb->protocol = eth_type_trans(skb, tun->dev);
1702 if (!frags)
1703 skb->protocol = eth_type_trans(skb, tun->dev);
1704 break;
1705 }
1706
1707 /* copy skb_ubuf_info for callback when skb has no error */
1708 if (zerocopy) {
1709 skb_shinfo(skb)->destructor_arg = msg_control;
1710 skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
1711 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;

--- 18 unchanged lines hidden ---

1730 return total_len;
1731 }
1732 }
1733 rcu_read_unlock();
1734 }
1735
1736 rxhash = __skb_get_hash_symmetric(skb);
1737
1641 if (tun->flags & IFF_NAPI) {
1738 if (frags) {
1739 /* Exercise flow dissector code path. */
1740 u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
1741
1742 if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) {
1743 this_cpu_inc(tun->pcpu_stats->rx_dropped);
1744 napi_free_frags(&tfile->napi);
1745 mutex_unlock(&tfile->napi_mutex);
1746 WARN_ON(1);
1747 return -ENOMEM;
1748 }
1749
1750 local_bh_disable();
1751 napi_gro_frags(&tfile->napi);
1752 local_bh_enable();
1753 mutex_unlock(&tfile->napi_mutex);
1754 } else if (tun->flags & IFF_NAPI) {
1755 struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
1756 int queue_len;
1757
1758 spin_lock_bh(&queue->lock);
1759 __skb_queue_tail(queue, skb);
1760 queue_len = skb_queue_len(queue);
1761 spin_unlock(&queue->lock);
1762

--- 406 unchanged lines hidden ---

2169 struct tun_struct *tun;
2170 struct tun_file *tfile = file->private_data;
2171 struct net_device *dev;
2172 int err;
2173
2174 if (tfile->detached)
2175 return -EINVAL;
2176
2177 if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
2178 if (!capable(CAP_NET_ADMIN))
2179 return -EPERM;
2180
2181 if (!(ifr->ifr_flags & IFF_NAPI) ||
2182 (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
2183 return -EINVAL;
2184 }
2185
2186 dev = __dev_get_by_name(net, ifr->ifr_name);
2187 if (dev) {
2188 if (ifr->ifr_flags & IFF_TUN_EXCL)
2189 return -EBUSY;
2190 if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
2191 tun = netdev_priv(dev);
2192 else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
2193 tun = netdev_priv(dev);

--- 903 unchanged lines hidden ---
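
The checks added to tun_set_iff() above mean TUNSETIFF now fails with -EPERM when IFF_NAPI_FRAGS is requested without CAP_NET_ADMIN, and with -EINVAL when it is requested without IFF_NAPI or for a tun rather than a tap device. A sketch of how a caller might cope (the helper is hypothetical, not part of this driver):

    #include <errno.h>
    #include <sys/ioctl.h>
    #include <linux/if.h>
    #include <linux/if_tun.h>

    /* Hypothetical caller-side fallback: if IFF_NAPI_FRAGS is refused
     * (EPERM without CAP_NET_ADMIN, EINVAL without IFF_NAPI or on a tun
     * device), retry once in plain NAPI mode. */
    static int attach_queue(int fd, struct ifreq *ifr)
    {
            if (ioctl(fd, TUNSETIFF, ifr) == 0)
                    return 0;

            if ((ifr->ifr_flags & IFF_NAPI_FRAGS) &&
                (errno == EPERM || errno == EINVAL)) {
                    ifr->ifr_flags &= ~IFF_NAPI_FRAGS;
                    return ioctl(fd, TUNSETIFF, ifr);
            }

            return -1;
    }
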