xref: /openbmc/linux/net/packet/af_packet.c (revision 9144f784f852f9a125cabe9927b986d909bfa439)
12874c5fdSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds  * INET		An implementation of the TCP/IP protocol suite for the LINUX
41da177e4SLinus Torvalds  *		operating system.  INET is implemented using the  BSD Socket
51da177e4SLinus Torvalds  *		interface as the means of communication with the user level.
61da177e4SLinus Torvalds  *
71da177e4SLinus Torvalds  *		PACKET - implements raw packet sockets.
81da177e4SLinus Torvalds  *
902c30a84SJesper Juhl  * Authors:	Ross Biro
101da177e4SLinus Torvalds  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
111da177e4SLinus Torvalds  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
121da177e4SLinus Torvalds  *
131da177e4SLinus Torvalds  * Fixes:
141da177e4SLinus Torvalds  *		Alan Cox	:	verify_area() now used correctly
151da177e4SLinus Torvalds  *		Alan Cox	:	new skbuff lists, look ma no backlogs!
161da177e4SLinus Torvalds  *		Alan Cox	:	tidied skbuff lists.
171da177e4SLinus Torvalds  *		Alan Cox	:	Now uses generic datagram routines I
181da177e4SLinus Torvalds  *					added. Also fixed the peek/read crash
191da177e4SLinus Torvalds  *					from all old Linux datagram code.
201da177e4SLinus Torvalds  *		Alan Cox	:	Uses the improved datagram code.
211da177e4SLinus Torvalds  *		Alan Cox	:	Added NULL's for socket options.
221da177e4SLinus Torvalds  *		Alan Cox	:	Re-commented the code.
231da177e4SLinus Torvalds  *		Alan Cox	:	Use new kernel side addressing
241da177e4SLinus Torvalds  *		Rob Janssen	:	Correct MTU usage.
251da177e4SLinus Torvalds  *		Dave Platt	:	Counter leaks caused by incorrect
261da177e4SLinus Torvalds  *					interrupt locking and some slightly
271da177e4SLinus Torvalds  *					dubious gcc output. Can you read
281da177e4SLinus Torvalds  *					compiler: it said _VOLATILE_
291da177e4SLinus Torvalds  *	Richard Kooijman	:	Timestamp fixes.
301da177e4SLinus Torvalds  *		Alan Cox	:	New buffers. Use sk->mac.raw.
311da177e4SLinus Torvalds  *		Alan Cox	:	sendmsg/recvmsg support.
321da177e4SLinus Torvalds  *		Alan Cox	:	Protocol setting support
331da177e4SLinus Torvalds  *	Alexey Kuznetsov	:	Untied from IPv4 stack.
341da177e4SLinus Torvalds  *	Cyrus Durgin		:	Fixed kerneld for kmod.
351da177e4SLinus Torvalds  *	Michal Ostrowski        :       Module initialization cleanup.
361da177e4SLinus Torvalds  *         Ulises Alonso        :       Frame number limit removal and
371da177e4SLinus Torvalds  *                                      packet_set_ring memory leak.
380fb375fbSEric W. Biederman  *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
390fb375fbSEric W. Biederman  *					The convention is that longer addresses
400fb375fbSEric W. Biederman  *					will simply extend the hardware address
410fb375fbSEric W. Biederman  *					byte arrays at the end of sockaddr_ll
420fb375fbSEric W. Biederman  *					and packet_mreq.
4369e3c75fSJohann Baudy  *		Johann Baudy	:	Added TX RING.
44f6fb8f10Schetan loke  *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
45f6fb8f10Schetan loke  *					layer.
46f6fb8f10Schetan loke  *					Copyright (C) 2011, <lokec@ccs.neu.edu>
471da177e4SLinus Torvalds  */
481da177e4SLinus Torvalds 
49dc41c4a9SBaruch Siach #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
50dc41c4a9SBaruch Siach 
51cc69837fSJakub Kicinski #include <linux/ethtool.h>
52b6459415SJakub Kicinski #include <linux/filter.h>
531da177e4SLinus Torvalds #include <linux/types.h>
541da177e4SLinus Torvalds #include <linux/mm.h>
554fc268d2SRandy Dunlap #include <linux/capability.h>
561da177e4SLinus Torvalds #include <linux/fcntl.h>
571da177e4SLinus Torvalds #include <linux/socket.h>
581da177e4SLinus Torvalds #include <linux/in.h>
591da177e4SLinus Torvalds #include <linux/inet.h>
601da177e4SLinus Torvalds #include <linux/netdevice.h>
611da177e4SLinus Torvalds #include <linux/if_packet.h>
621da177e4SLinus Torvalds #include <linux/wireless.h>
63ffbc6111SHerbert Xu #include <linux/kernel.h>
641da177e4SLinus Torvalds #include <linux/kmod.h>
655a0e3ad6STejun Heo #include <linux/slab.h>
660e3125c7SNeil Horman #include <linux/vmalloc.h>
67457c4cbcSEric W. Biederman #include <net/net_namespace.h>
681da177e4SLinus Torvalds #include <net/ip.h>
691da177e4SLinus Torvalds #include <net/protocol.h>
701da177e4SLinus Torvalds #include <linux/skbuff.h>
711da177e4SLinus Torvalds #include <net/sock.h>
721da177e4SLinus Torvalds #include <linux/errno.h>
731da177e4SLinus Torvalds #include <linux/timer.h>
747c0f6ba6SLinus Torvalds #include <linux/uaccess.h>
751da177e4SLinus Torvalds #include <asm/ioctls.h>
761da177e4SLinus Torvalds #include <asm/page.h>
77a1f8e7f7SAl Viro #include <asm/cacheflush.h>
781da177e4SLinus Torvalds #include <asm/io.h>
791da177e4SLinus Torvalds #include <linux/proc_fs.h>
801da177e4SLinus Torvalds #include <linux/seq_file.h>
811da177e4SLinus Torvalds #include <linux/poll.h>
821da177e4SLinus Torvalds #include <linux/module.h>
831da177e4SLinus Torvalds #include <linux/init.h>
84905db440SHerbert Xu #include <linux/mutex.h>
8505423b24SEric Dumazet #include <linux/if_vlan.h>
86bfd5f4a3SSridhar Samudrala #include <linux/virtio_net.h>
87ed85b565SRichard Cochran #include <linux/errqueue.h>
88614f60faSScott McMillan #include <linux/net_tstamp.h>
89b0138408SDaniel Borkmann #include <linux/percpu.h>
901da177e4SLinus Torvalds #ifdef CONFIG_INET
911da177e4SLinus Torvalds #include <net/inet_common.h>
921da177e4SLinus Torvalds #endif
9347dceb8eSWillem de Bruijn #include <linux/bpf.h>
94719c44d3SWillem de Bruijn #include <net/compat.h>
950d7308c0SPablo Neira Ayuso #include <linux/netfilter_netdev.h>
961da177e4SLinus Torvalds 
972787b04bSPavel Emelyanov #include "internal.h"
982787b04bSPavel Emelyanov 
991da177e4SLinus Torvalds /*
1001da177e4SLinus Torvalds    Assumptions:
101d5496990SEyal Birger    - If the device has no dev->header_ops->create, there is no LL header
102d5496990SEyal Birger      visible above the device. In this case, its hard_header_len should be 0.
103b4c58814SXie He      The device may prepend its own header internally. In this case, its
104b4c58814SXie He      needed_headroom should be set to the space needed for it to add its
105b4c58814SXie He      internal header.
106b4c58814SXie He      For example, a WiFi driver pretending to be an Ethernet driver should
107b4c58814SXie He      set its hard_header_len to be the Ethernet header length, and set its
108b4c58814SXie He      needed_headroom to be (the real WiFi header length - the fake Ethernet
109b4c58814SXie He      header length).
1101da177e4SLinus Torvalds    - packet socket receives packets with pulled ll header,
1111da177e4SLinus Torvalds      so that SOCK_RAW should push it back.
1121da177e4SLinus Torvalds 
1131da177e4SLinus Torvalds On receive:
1141da177e4SLinus Torvalds -----------
1151da177e4SLinus Torvalds 
116d5496990SEyal Birger Incoming, dev_has_header(dev) == true
117b0e380b1SArnaldo Carvalho de Melo    mac_header -> ll header
1181da177e4SLinus Torvalds    data       -> data
1191da177e4SLinus Torvalds 
120d5496990SEyal Birger Outgoing, dev_has_header(dev) == true
121b0e380b1SArnaldo Carvalho de Melo    mac_header -> ll header
1221da177e4SLinus Torvalds    data       -> ll header
1231da177e4SLinus Torvalds 
124d5496990SEyal Birger Incoming, dev_has_header(dev) == false
125b79a80bdSXie He    mac_header -> data
126b79a80bdSXie He      However drivers often make it point to the ll header.
127b79a80bdSXie He      This is incorrect because the ll header should be invisible to us.
1281da177e4SLinus Torvalds    data       -> data
1291da177e4SLinus Torvalds 
130d5496990SEyal Birger Outgoing, dev_has_header(dev) == false
131b79a80bdSXie He    mac_header -> data. ll header is invisible to us.
1321da177e4SLinus Torvalds    data       -> data
1331da177e4SLinus Torvalds 
Summary
  If dev_has_header(dev) == false we are unable to restore the ll header,
    because it is invisible to us.
1371da177e4SLinus Torvalds 
1381da177e4SLinus Torvalds 
1391da177e4SLinus Torvalds On transmit:
1401da177e4SLinus Torvalds ------------
1411da177e4SLinus Torvalds 
14221c85974SXie He dev_has_header(dev) == true
143b0e380b1SArnaldo Carvalho de Melo    mac_header -> ll header
1441da177e4SLinus Torvalds    data       -> ll header
1451da177e4SLinus Torvalds 
14621c85974SXie He dev_has_header(dev) == false (ll header is invisible to us)
147b0e380b1SArnaldo Carvalho de Melo    mac_header -> data
1481da177e4SLinus Torvalds    data       -> data
1491da177e4SLinus Torvalds 
15009599729SXie He    We should set network_header on output to the correct position,
1511da177e4SLinus Torvalds    packet classifier depends on it.
1521da177e4SLinus Torvalds  */
1531da177e4SLinus Torvalds 
1541da177e4SLinus Torvalds /* Private packet socket structures. */
1551da177e4SLinus Torvalds 
1560fb375fbSEric W. Biederman /* identical to struct packet_mreq except it has
1570fb375fbSEric W. Biederman  * a longer address field.
1580fb375fbSEric W. Biederman  */
15940d4e3dfSEric Dumazet struct packet_mreq_max {
1600fb375fbSEric W. Biederman 	int		mr_ifindex;
1610fb375fbSEric W. Biederman 	unsigned short	mr_type;
1620fb375fbSEric W. Biederman 	unsigned short	mr_alen;
1630fb375fbSEric W. Biederman 	unsigned char	mr_address[MAX_ADDR_LEN];
1641da177e4SLinus Torvalds };
165a2efcfa0SDavid S. Miller 
/*
 * One frame address viewed through each TPACKET header version.
 * Callers store the frame pointer in @raw and then dereference the
 * member that matches the socket's tp_version.
 */
union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};
172184f489eSDaniel Borkmann 
/* Defined later in this file; declared here for earlier users. */
static int packet_set_ring(struct sock *sk,
			   union tpacket_req_u *req_u,
			   int closing,
			   int tx_ring);
17569e3c75fSJohann Baudy 
/* Alignment, in bytes, used when sizing the block header and private area. */
#define V3_ALIGNMENT	(8)

/* sizeof(struct tpacket_block_desc) rounded up to V3_ALIGNMENT. */
#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

/* Block header length plus the private area size rounded up to V3_ALIGNMENT. */
#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

/*
 * Field accessors; x must be a pointer to an object that has the
 * hdr.bh1 members (and offset_to_priv for BLOCK_O2PRIV) used below.
 */
#define BLOCK_STATUS(x)	((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)	((x)->offset_to_priv)
189f6fb8f10Schetan loke 
19069e3c75fSJohann Baudy struct packet_sock;
19177f65ebdSWillem de Bruijn static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
19277f65ebdSWillem de Bruijn 		       struct packet_type *pt, struct net_device *orig_dev);
1931da177e4SLinus Torvalds 
194f6fb8f10Schetan loke static void *packet_previous_frame(struct packet_sock *po,
195f6fb8f10Schetan loke 		struct packet_ring_buffer *rb,
196f6fb8f10Schetan loke 		int status);
197f6fb8f10Schetan loke static void packet_increment_head(struct packet_ring_buffer *buff);
198878cd3baSRosen, Rami static int prb_curr_blk_in_use(struct tpacket_block_desc *);
199bc59ba39Schetan loke static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
200f6fb8f10Schetan loke 			struct packet_sock *);
201bc59ba39Schetan loke static void prb_retire_current_block(struct tpacket_kbdq_core *,
202f6fb8f10Schetan loke 		struct packet_sock *, unsigned int status);
203bc59ba39Schetan loke static int prb_queue_frozen(struct tpacket_kbdq_core *);
204bc59ba39Schetan loke static void prb_open_block(struct tpacket_kbdq_core *,
205bc59ba39Schetan loke 		struct tpacket_block_desc *);
20617bfd8c8SKees Cook static void prb_retire_rx_blk_timer_expired(struct timer_list *);
207bc59ba39Schetan loke static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
208bc59ba39Schetan loke static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
209bc59ba39Schetan loke static void prb_clear_rxhash(struct tpacket_kbdq_core *,
210bc59ba39Schetan loke 		struct tpacket3_hdr *);
211bc59ba39Schetan loke static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
212bc59ba39Schetan loke 		struct tpacket3_hdr *);
2131da177e4SLinus Torvalds static void packet_flush_mclist(struct sock *sk);
214865b03f2SMagnus Karlsson static u16 packet_pick_tx_queue(struct sk_buff *skb);
2151da177e4SLinus Torvalds 
216ffbc6111SHerbert Xu struct packet_skb_cb {
217ffbc6111SHerbert Xu 	union {
218ffbc6111SHerbert Xu 		struct sockaddr_pkt pkt;
2192472d761SEyal Birger 		union {
2202472d761SEyal Birger 			/* Trick: alias skb original length with
2212472d761SEyal Birger 			 * ll.sll_family and ll.protocol in order
2222472d761SEyal Birger 			 * to save room.
2232472d761SEyal Birger 			 */
2242472d761SEyal Birger 			unsigned int origlen;
225ffbc6111SHerbert Xu 			struct sockaddr_ll ll;
2262472d761SEyal Birger 		};
227ffbc6111SHerbert Xu 	} sa;
228ffbc6111SHerbert Xu };
229ffbc6111SHerbert Xu 
/* Shorthand for virtio_legacy_is_little_endian(). */
#define vio_le() virtio_legacy_is_little_endian()

/* View an skb's control buffer as a struct packet_skb_cb. */
#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

/* Address of the prb_bdqc member of ring buffer x, as a tpacket_kbdq_core. */
#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))

/* Block descriptor stored in the buffer of entry bid of x->pkbdq. */
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))

/* Block descriptor of the entry selected by x->kactive_blk_num. */
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))

/* Index after kactive_blk_num, wrapping to 0 past knum_blocks - 1. */
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)
242f6fb8f10Schetan loke 
/* Fanout helpers, defined later in this file. */
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);
245dc99f600SDavid S. Miller 
#ifdef CONFIG_NETFILTER_EGRESS
/*
 * Walk the skb list starting at @skb, detach every element and pass it
 * to nf_hook_egress() under rcu_read_lock().  Elements for which the
 * hook returns non-NULL are chained together, in their original order.
 *
 * Returns the head of the resulting list, or NULL if nothing was kept.
 */
static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb)
{
	struct sk_buff *accepted = NULL;
	/* Where the next kept skb gets linked: the list head first, then ->next. */
	struct sk_buff **link = &accepted;
	int verdict;

	rcu_read_lock();
	while (skb != NULL) {
		struct sk_buff *following = skb->next;

		skb_mark_not_on_list(skb);

		if (nf_hook_egress(skb, &verdict, skb->dev)) {
			*link = skb;
			link = &skb->next;
		}

		skb = following;
	}
	rcu_read_unlock();

	return accepted;
}
#endif
2720d7308c0SPablo Neira Ayuso 
packet_xmit(const struct packet_sock * po,struct sk_buff * skb)273105a201eSEric Dumazet static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb)
274d346a3faSDaniel Borkmann {
275105a201eSEric Dumazet 	if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS))
276105a201eSEric Dumazet 		return dev_queue_xmit(skb);
277105a201eSEric Dumazet 
2780d7308c0SPablo Neira Ayuso #ifdef CONFIG_NETFILTER_EGRESS
2790d7308c0SPablo Neira Ayuso 	if (nf_hook_egress_active()) {
2800d7308c0SPablo Neira Ayuso 		skb = nf_hook_direct_egress(skb);
2810d7308c0SPablo Neira Ayuso 		if (!skb)
2820d7308c0SPablo Neira Ayuso 			return NET_XMIT_DROP;
2830d7308c0SPablo Neira Ayuso 	}
2840d7308c0SPablo Neira Ayuso #endif
285865b03f2SMagnus Karlsson 	return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
286d346a3faSDaniel Borkmann }
287d346a3faSDaniel Borkmann 
packet_cached_dev_get(struct packet_sock * po)28866e56cd4SDaniel Borkmann static struct net_device *packet_cached_dev_get(struct packet_sock *po)
28966e56cd4SDaniel Borkmann {
29066e56cd4SDaniel Borkmann 	struct net_device *dev;
29166e56cd4SDaniel Borkmann 
29266e56cd4SDaniel Borkmann 	rcu_read_lock();
29366e56cd4SDaniel Borkmann 	dev = rcu_dereference(po->cached_dev);
29466e56cd4SDaniel Borkmann 	dev_hold(dev);
29566e56cd4SDaniel Borkmann 	rcu_read_unlock();
29666e56cd4SDaniel Borkmann 
29766e56cd4SDaniel Borkmann 	return dev;
29866e56cd4SDaniel Borkmann }
29966e56cd4SDaniel Borkmann 
packet_cached_dev_assign(struct packet_sock * po,struct net_device * dev)30066e56cd4SDaniel Borkmann static void packet_cached_dev_assign(struct packet_sock *po,
30166e56cd4SDaniel Borkmann 				     struct net_device *dev)
30266e56cd4SDaniel Borkmann {
30366e56cd4SDaniel Borkmann 	rcu_assign_pointer(po->cached_dev, dev);
30466e56cd4SDaniel Borkmann }
30566e56cd4SDaniel Borkmann 
packet_cached_dev_reset(struct packet_sock * po)30666e56cd4SDaniel Borkmann static void packet_cached_dev_reset(struct packet_sock *po)
30766e56cd4SDaniel Borkmann {
30866e56cd4SDaniel Borkmann 	RCU_INIT_POINTER(po->cached_dev, NULL);
30966e56cd4SDaniel Borkmann }
31066e56cd4SDaniel Borkmann 
packet_pick_tx_queue(struct sk_buff * skb)311865b03f2SMagnus Karlsson static u16 packet_pick_tx_queue(struct sk_buff *skb)
3120fd5d57bSDaniel Borkmann {
313865b03f2SMagnus Karlsson 	struct net_device *dev = skb->dev;
3140fd5d57bSDaniel Borkmann 	const struct net_device_ops *ops = dev->netdev_ops;
315b71b5837SPaolo Abeni 	int cpu = raw_smp_processor_id();
3160fd5d57bSDaniel Borkmann 	u16 queue_index;
3170fd5d57bSDaniel Borkmann 
318b71b5837SPaolo Abeni #ifdef CONFIG_XPS
319b71b5837SPaolo Abeni 	skb->sender_cpu = cpu + 1;
320b71b5837SPaolo Abeni #endif
321b71b5837SPaolo Abeni 	skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
3220fd5d57bSDaniel Borkmann 	if (ops->ndo_select_queue) {
323a350ecceSPaolo Abeni 		queue_index = ops->ndo_select_queue(dev, skb, NULL);
3240fd5d57bSDaniel Borkmann 		queue_index = netdev_cap_txqueue(dev, queue_index);
3250fd5d57bSDaniel Borkmann 	} else {
326b71b5837SPaolo Abeni 		queue_index = netdev_pick_tx(dev, skb, NULL);
3270fd5d57bSDaniel Borkmann 	}
3280fd5d57bSDaniel Borkmann 
329865b03f2SMagnus Karlsson 	return queue_index;
3300fd5d57bSDaniel Borkmann }
3310fd5d57bSDaniel Borkmann 
332a6361f0cSWillem de Bruijn /* __register_prot_hook must be invoked through register_prot_hook
333ce06b03eSDavid S. Miller  * or from a context in which asynchronous accesses to the packet
334ce06b03eSDavid S. Miller  * socket is not possible (packet_create()).
335ce06b03eSDavid S. Miller  */
__register_prot_hook(struct sock * sk)336a6361f0cSWillem de Bruijn static void __register_prot_hook(struct sock *sk)
337ce06b03eSDavid S. Miller {
338ce06b03eSDavid S. Miller 	struct packet_sock *po = pkt_sk(sk);
339e40526cbSDaniel Borkmann 
34061edf479SEric Dumazet 	if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
34166e56cd4SDaniel Borkmann 		if (po->fanout)
342dc99f600SDavid S. Miller 			__fanout_link(sk, po);
34366e56cd4SDaniel Borkmann 		else
344ce06b03eSDavid S. Miller 			dev_add_pack(&po->prot_hook);
345e40526cbSDaniel Borkmann 
346ce06b03eSDavid S. Miller 		sock_hold(sk);
34761edf479SEric Dumazet 		packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1);
348ce06b03eSDavid S. Miller 	}
349ce06b03eSDavid S. Miller }
350ce06b03eSDavid S. Miller 
register_prot_hook(struct sock * sk)351a6361f0cSWillem de Bruijn static void register_prot_hook(struct sock *sk)
352a6361f0cSWillem de Bruijn {
353a6361f0cSWillem de Bruijn 	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
354a6361f0cSWillem de Bruijn 	__register_prot_hook(sk);
355a6361f0cSWillem de Bruijn }
356a6361f0cSWillem de Bruijn 
357a6361f0cSWillem de Bruijn /* If the sync parameter is true, we will temporarily drop
358ce06b03eSDavid S. Miller  * the po->bind_lock and do a synchronize_net to make sure no
359ce06b03eSDavid S. Miller  * asynchronous packet processing paths still refer to the elements
360ce06b03eSDavid S. Miller  * of po->prot_hook.  If the sync parameter is false, it is the
361ce06b03eSDavid S. Miller  * callers responsibility to take care of this.
362ce06b03eSDavid S. Miller  */
__unregister_prot_hook(struct sock * sk,bool sync)363ce06b03eSDavid S. Miller static void __unregister_prot_hook(struct sock *sk, bool sync)
364ce06b03eSDavid S. Miller {
365ce06b03eSDavid S. Miller 	struct packet_sock *po = pkt_sk(sk);
366ce06b03eSDavid S. Miller 
367a6361f0cSWillem de Bruijn 	lockdep_assert_held_once(&po->bind_lock);
368a6361f0cSWillem de Bruijn 
36961edf479SEric Dumazet 	packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0);
37066e56cd4SDaniel Borkmann 
37166e56cd4SDaniel Borkmann 	if (po->fanout)
372dc99f600SDavid S. Miller 		__fanout_unlink(sk, po);
37366e56cd4SDaniel Borkmann 	else
374ce06b03eSDavid S. Miller 		__dev_remove_pack(&po->prot_hook);
375e40526cbSDaniel Borkmann 
376ce06b03eSDavid S. Miller 	__sock_put(sk);
377ce06b03eSDavid S. Miller 
378ce06b03eSDavid S. Miller 	if (sync) {
379ce06b03eSDavid S. Miller 		spin_unlock(&po->bind_lock);
380ce06b03eSDavid S. Miller 		synchronize_net();
381ce06b03eSDavid S. Miller 		spin_lock(&po->bind_lock);
382ce06b03eSDavid S. Miller 	}
383ce06b03eSDavid S. Miller }
384ce06b03eSDavid S. Miller 
/*
 * Unregister the socket's protocol hook, but only if the socket is
 * currently marked PACKET_SOCK_RUNNING.  @sync is passed through to
 * __unregister_prot_hook().
 */
static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!packet_sock_flag(po, PACKET_SOCK_RUNNING))
		return;

	__unregister_prot_hook(sk, sync);
}
392ce06b03eSDavid S. Miller 
pgv_to_page(void * addr)3936e58040bSMichael S. Tsirkin static inline struct page * __pure pgv_to_page(void *addr)
3940af55bb5SChangli Gao {
3950af55bb5SChangli Gao 	if (is_vmalloc_addr(addr))
3960af55bb5SChangli Gao 		return vmalloc_to_page(addr);
3970af55bb5SChangli Gao 	return virt_to_page(addr);
3980af55bb5SChangli Gao }
3990af55bb5SChangli Gao 
__packet_set_status(struct packet_sock * po,void * frame,int status)400bbd6ef87SPatrick McHardy static void __packet_set_status(struct packet_sock *po, void *frame, int status)
401bbd6ef87SPatrick McHardy {
402184f489eSDaniel Borkmann 	union tpacket_uhdr h;
403bbd6ef87SPatrick McHardy 
4048a989617SEric Dumazet 	/* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */
4058a989617SEric Dumazet 
406bbd6ef87SPatrick McHardy 	h.raw = frame;
407bbd6ef87SPatrick McHardy 	switch (po->tp_version) {
408bbd6ef87SPatrick McHardy 	case TPACKET_V1:
4098a989617SEric Dumazet 		WRITE_ONCE(h.h1->tp_status, status);
4100af55bb5SChangli Gao 		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
411bbd6ef87SPatrick McHardy 		break;
412bbd6ef87SPatrick McHardy 	case TPACKET_V2:
4138a989617SEric Dumazet 		WRITE_ONCE(h.h2->tp_status, status);
4140af55bb5SChangli Gao 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
415bbd6ef87SPatrick McHardy 		break;
416f6fb8f10Schetan loke 	case TPACKET_V3:
4178a989617SEric Dumazet 		WRITE_ONCE(h.h3->tp_status, status);
4187f953ab2SSowmini Varadhan 		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
4197f953ab2SSowmini Varadhan 		break;
42069e3c75fSJohann Baudy 	default:
421f6fb8f10Schetan loke 		WARN(1, "TPACKET version not supported.\n");
42269e3c75fSJohann Baudy 		BUG();
42369e3c75fSJohann Baudy 	}
42469e3c75fSJohann Baudy 
42569e3c75fSJohann Baudy 	smp_wmb();
42669e3c75fSJohann Baudy }
42769e3c75fSJohann Baudy 
__packet_get_status(const struct packet_sock * po,void * frame)42896f657e6SEric Dumazet static int __packet_get_status(const struct packet_sock *po, void *frame)
42969e3c75fSJohann Baudy {
430184f489eSDaniel Borkmann 	union tpacket_uhdr h;
43169e3c75fSJohann Baudy 
43269e3c75fSJohann Baudy 	smp_rmb();
43369e3c75fSJohann Baudy 
4348a989617SEric Dumazet 	/* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */
4358a989617SEric Dumazet 
43669e3c75fSJohann Baudy 	h.raw = frame;
43769e3c75fSJohann Baudy 	switch (po->tp_version) {
43869e3c75fSJohann Baudy 	case TPACKET_V1:
4390af55bb5SChangli Gao 		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
4408a989617SEric Dumazet 		return READ_ONCE(h.h1->tp_status);
44169e3c75fSJohann Baudy 	case TPACKET_V2:
4420af55bb5SChangli Gao 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
4438a989617SEric Dumazet 		return READ_ONCE(h.h2->tp_status);
444f6fb8f10Schetan loke 	case TPACKET_V3:
4457f953ab2SSowmini Varadhan 		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
4468a989617SEric Dumazet 		return READ_ONCE(h.h3->tp_status);
44769e3c75fSJohann Baudy 	default:
448f6fb8f10Schetan loke 		WARN(1, "TPACKET version not supported.\n");
44969e3c75fSJohann Baudy 		BUG();
45069e3c75fSJohann Baudy 		return 0;
451bbd6ef87SPatrick McHardy 	}
4521da177e4SLinus Torvalds }
45369e3c75fSJohann Baudy 
tpacket_get_timestamp(struct sk_buff * skb,struct timespec64 * ts,unsigned int flags)454d413fcb4SArnd Bergmann static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
4557a51384cSDaniel Borkmann 				   unsigned int flags)
4567a51384cSDaniel Borkmann {
4577a51384cSDaniel Borkmann 	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
4587a51384cSDaniel Borkmann 
45968a360e8SWillem de Bruijn 	if (shhwtstamps &&
46068a360e8SWillem de Bruijn 	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
461d413fcb4SArnd Bergmann 	    ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
462b9c32fb2SDaniel Borkmann 		return TP_STATUS_TS_RAW_HARDWARE;
4637a51384cSDaniel Borkmann 
464171c3b15SRichard Sanger 	if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
46527942a15SMartin KaFai Lau 	    ktime_to_timespec64_cond(skb_tstamp(skb), ts))
466b9c32fb2SDaniel Borkmann 		return TP_STATUS_TS_SOFTWARE;
4677a51384cSDaniel Borkmann 
468b9c32fb2SDaniel Borkmann 	return 0;
4697a51384cSDaniel Borkmann }
4707a51384cSDaniel Borkmann 
__packet_set_timestamp(struct packet_sock * po,void * frame,struct sk_buff * skb)471b9c32fb2SDaniel Borkmann static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
4727a51384cSDaniel Borkmann 				    struct sk_buff *skb)
4732e31396fSWillem de Bruijn {
4742e31396fSWillem de Bruijn 	union tpacket_uhdr h;
475d413fcb4SArnd Bergmann 	struct timespec64 ts;
476b9c32fb2SDaniel Borkmann 	__u32 ts_status;
4772e31396fSWillem de Bruijn 
4781051ce4aSEric Dumazet 	if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp))))
479b9c32fb2SDaniel Borkmann 		return 0;
4802e31396fSWillem de Bruijn 
4812e31396fSWillem de Bruijn 	h.raw = frame;
482d413fcb4SArnd Bergmann 	/*
483d413fcb4SArnd Bergmann 	 * versions 1 through 3 overflow the timestamps in y2106, since they
484d413fcb4SArnd Bergmann 	 * all store the seconds in a 32-bit unsigned integer.
485d413fcb4SArnd Bergmann 	 * If we create a version 4, that should have a 64-bit timestamp,
486d413fcb4SArnd Bergmann 	 * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
487d413fcb4SArnd Bergmann 	 * nanoseconds.
488d413fcb4SArnd Bergmann 	 */
4892e31396fSWillem de Bruijn 	switch (po->tp_version) {
4902e31396fSWillem de Bruijn 	case TPACKET_V1:
4912e31396fSWillem de Bruijn 		h.h1->tp_sec = ts.tv_sec;
4922e31396fSWillem de Bruijn 		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
4932e31396fSWillem de Bruijn 		break;
4942e31396fSWillem de Bruijn 	case TPACKET_V2:
4952e31396fSWillem de Bruijn 		h.h2->tp_sec = ts.tv_sec;
4962e31396fSWillem de Bruijn 		h.h2->tp_nsec = ts.tv_nsec;
4972e31396fSWillem de Bruijn 		break;
4982e31396fSWillem de Bruijn 	case TPACKET_V3:
49957ea884bSDaniel Borkmann 		h.h3->tp_sec = ts.tv_sec;
50057ea884bSDaniel Borkmann 		h.h3->tp_nsec = ts.tv_nsec;
50157ea884bSDaniel Borkmann 		break;
5022e31396fSWillem de Bruijn 	default:
5032e31396fSWillem de Bruijn 		WARN(1, "TPACKET version not supported.\n");
5042e31396fSWillem de Bruijn 		BUG();
5052e31396fSWillem de Bruijn 	}
5062e31396fSWillem de Bruijn 
5072e31396fSWillem de Bruijn 	/* one flush is safe, as both fields always lie on the same cacheline */
5082e31396fSWillem de Bruijn 	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
5092e31396fSWillem de Bruijn 	smp_wmb();
510b9c32fb2SDaniel Borkmann 
511b9c32fb2SDaniel Borkmann 	return ts_status;
5122e31396fSWillem de Bruijn }
5132e31396fSWillem de Bruijn 
packet_lookup_frame(const struct packet_sock * po,const struct packet_ring_buffer * rb,unsigned int position,int status)514d4b5bd98SEric Dumazet static void *packet_lookup_frame(const struct packet_sock *po,
515d4b5bd98SEric Dumazet 				 const struct packet_ring_buffer *rb,
51669e3c75fSJohann Baudy 				 unsigned int position,
51769e3c75fSJohann Baudy 				 int status)
51869e3c75fSJohann Baudy {
51969e3c75fSJohann Baudy 	unsigned int pg_vec_pos, frame_offset;
520184f489eSDaniel Borkmann 	union tpacket_uhdr h;
52169e3c75fSJohann Baudy 
52269e3c75fSJohann Baudy 	pg_vec_pos = position / rb->frames_per_block;
52369e3c75fSJohann Baudy 	frame_offset = position % rb->frames_per_block;
52469e3c75fSJohann Baudy 
5250e3125c7SNeil Horman 	h.raw = rb->pg_vec[pg_vec_pos].buffer +
5260e3125c7SNeil Horman 		(frame_offset * rb->frame_size);
52769e3c75fSJohann Baudy 
52869e3c75fSJohann Baudy 	if (status != __packet_get_status(po, h.raw))
52969e3c75fSJohann Baudy 		return NULL;
53069e3c75fSJohann Baudy 
53169e3c75fSJohann Baudy 	return h.raw;
53269e3c75fSJohann Baudy }
53369e3c75fSJohann Baudy 
packet_current_frame(struct packet_sock * po,struct packet_ring_buffer * rb,int status)534eea49cc9SOlof Johansson static void *packet_current_frame(struct packet_sock *po,
53569e3c75fSJohann Baudy 		struct packet_ring_buffer *rb,
53669e3c75fSJohann Baudy 		int status)
53769e3c75fSJohann Baudy {
53869e3c75fSJohann Baudy 	return packet_lookup_frame(po, rb, rb->head, status);
53969e3c75fSJohann Baudy }
54069e3c75fSJohann Baudy 
vlan_get_tci(const struct sk_buff * skb,struct net_device * dev)5417aa78d0dSEric Dumazet static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev)
5425a041d25SChengen Du {
5435a041d25SChengen Du 	struct vlan_hdr vhdr, *vh;
5445a041d25SChengen Du 	unsigned int header_len;
5455a041d25SChengen Du 
5465a041d25SChengen Du 	if (!dev)
5475a041d25SChengen Du 		return 0;
5485a041d25SChengen Du 
5495a041d25SChengen Du 	/* In the SOCK_DGRAM scenario, skb data starts at the network
5505a041d25SChengen Du 	 * protocol, which is after the VLAN headers. The outer VLAN
5515a041d25SChengen Du 	 * header is at the hard_header_len offset in non-variable
5525a041d25SChengen Du 	 * length link layer headers. If it's a VLAN device, the
5535a041d25SChengen Du 	 * min_header_len should be used to exclude the VLAN header
5545a041d25SChengen Du 	 * size.
5555a041d25SChengen Du 	 */
5565a041d25SChengen Du 	if (dev->min_header_len == dev->hard_header_len)
5575a041d25SChengen Du 		header_len = dev->hard_header_len;
5585a041d25SChengen Du 	else if (is_vlan_dev(dev))
5595a041d25SChengen Du 		header_len = dev->min_header_len;
5605a041d25SChengen Du 	else
5615a041d25SChengen Du 		return 0;
5625a041d25SChengen Du 
5637aa78d0dSEric Dumazet 	vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len,
5647aa78d0dSEric Dumazet 				sizeof(vhdr), &vhdr);
5655a041d25SChengen Du 	if (unlikely(!vh))
5665a041d25SChengen Du 		return 0;
5675a041d25SChengen Du 
5685a041d25SChengen Du 	return ntohs(vh->h_vlan_TCI);
5695a041d25SChengen Du }
5705a041d25SChengen Du 
vlan_get_protocol_dgram(const struct sk_buff * skb)571*a693b876SEric Dumazet static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
5725a041d25SChengen Du {
5735a041d25SChengen Du 	__be16 proto = skb->protocol;
5745a041d25SChengen Du 
575*a693b876SEric Dumazet 	if (unlikely(eth_type_vlan(proto)))
576*a693b876SEric Dumazet 		proto = __vlan_get_protocol_offset(skb, proto,
577*a693b876SEric Dumazet 						   skb_mac_offset(skb), NULL);
5785a041d25SChengen Du 
5795a041d25SChengen Du 	return proto;
5805a041d25SChengen Du }
5815a041d25SChengen Du 
prb_del_retire_blk_timer(struct tpacket_kbdq_core * pkc)582bc59ba39Schetan loke static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
583f6fb8f10Schetan loke {
584f6fb8f10Schetan loke 	del_timer_sync(&pkc->retire_blk_timer);
585f6fb8f10Schetan loke }
586f6fb8f10Schetan loke 
prb_shutdown_retire_blk_timer(struct packet_sock * po,struct sk_buff_head * rb_queue)587f6fb8f10Schetan loke static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
588f6fb8f10Schetan loke 		struct sk_buff_head *rb_queue)
589f6fb8f10Schetan loke {
590bc59ba39Schetan loke 	struct tpacket_kbdq_core *pkc;
591f6fb8f10Schetan loke 
59273d0fcf2STobias Klauser 	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
593f6fb8f10Schetan loke 
594ec6f809fSVeaceslav Falico 	spin_lock_bh(&rb_queue->lock);
595f6fb8f10Schetan loke 	pkc->delete_blk_timer = 1;
596ec6f809fSVeaceslav Falico 	spin_unlock_bh(&rb_queue->lock);
597f6fb8f10Schetan loke 
598f6fb8f10Schetan loke 	prb_del_retire_blk_timer(pkc);
599f6fb8f10Schetan loke }
600f6fb8f10Schetan loke 
prb_setup_retire_blk_timer(struct packet_sock * po)601e8e85cc5SManinder Singh static void prb_setup_retire_blk_timer(struct packet_sock *po)
602f6fb8f10Schetan loke {
603bc59ba39Schetan loke 	struct tpacket_kbdq_core *pkc;
604f6fb8f10Schetan loke 
605e8e85cc5SManinder Singh 	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
60617bfd8c8SKees Cook 	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
60717bfd8c8SKees Cook 		    0);
60817bfd8c8SKees Cook 	pkc->retire_blk_timer.expires = jiffies;
609f6fb8f10Schetan loke }
610f6fb8f10Schetan loke 
prb_calc_retire_blk_tmo(struct packet_sock * po,int blk_size_in_bytes)611f6fb8f10Schetan loke static int prb_calc_retire_blk_tmo(struct packet_sock *po,
612f6fb8f10Schetan loke 				int blk_size_in_bytes)
613f6fb8f10Schetan loke {
614f6fb8f10Schetan loke 	struct net_device *dev;
6150914d2bbSMao Wenan 	unsigned int mbits, div;
6167cad1bacSDavid Decotigny 	struct ethtool_link_ksettings ecmd;
6174bc71cb9SJiri Pirko 	int err;
618f6fb8f10Schetan loke 
6194bc71cb9SJiri Pirko 	rtnl_lock();
6204bc71cb9SJiri Pirko 	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
6214bc71cb9SJiri Pirko 	if (unlikely(!dev)) {
6224bc71cb9SJiri Pirko 		rtnl_unlock();
623f6fb8f10Schetan loke 		return DEFAULT_PRB_RETIRE_TOV;
6244bc71cb9SJiri Pirko 	}
6257cad1bacSDavid Decotigny 	err = __ethtool_get_link_ksettings(dev, &ecmd);
6264bc71cb9SJiri Pirko 	rtnl_unlock();
6270914d2bbSMao Wenan 	if (err)
6280914d2bbSMao Wenan 		return DEFAULT_PRB_RETIRE_TOV;
6290914d2bbSMao Wenan 
6300914d2bbSMao Wenan 	/* If the link speed is so slow you don't really
631f6fb8f10Schetan loke 	 * need to worry about perf anyways
632f6fb8f10Schetan loke 	 */
6337cad1bacSDavid Decotigny 	if (ecmd.base.speed < SPEED_1000 ||
6340914d2bbSMao Wenan 	    ecmd.base.speed == SPEED_UNKNOWN)
635b43d1f9fSMao Wenan 		return DEFAULT_PRB_RETIRE_TOV;
636f6fb8f10Schetan loke 
6370914d2bbSMao Wenan 	div = ecmd.base.speed / 1000;
638f6fb8f10Schetan loke 	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
639f6fb8f10Schetan loke 
640f6fb8f10Schetan loke 	if (div)
641f6fb8f10Schetan loke 		mbits /= div;
642f6fb8f10Schetan loke 
643f6fb8f10Schetan loke 	if (div)
6440914d2bbSMao Wenan 		return mbits + 1;
6450914d2bbSMao Wenan 	return mbits;
646f6fb8f10Schetan loke }
647f6fb8f10Schetan loke 
prb_init_ft_ops(struct tpacket_kbdq_core * p1,union tpacket_req_u * req_u)648bc59ba39Schetan loke static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
649f6fb8f10Schetan loke 			union tpacket_req_u *req_u)
650f6fb8f10Schetan loke {
651f6fb8f10Schetan loke 	p1->feature_req_word = req_u->req3.tp_feature_req_word;
652f6fb8f10Schetan loke }
653f6fb8f10Schetan loke 
init_prb_bdqc(struct packet_sock * po,struct packet_ring_buffer * rb,struct pgv * pg_vec,union tpacket_req_u * req_u)654f6fb8f10Schetan loke static void init_prb_bdqc(struct packet_sock *po,
655f6fb8f10Schetan loke 			struct packet_ring_buffer *rb,
656f6fb8f10Schetan loke 			struct pgv *pg_vec,
657e8e85cc5SManinder Singh 			union tpacket_req_u *req_u)
658f6fb8f10Schetan loke {
65922781a5bSDuan Jiong 	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
660bc59ba39Schetan loke 	struct tpacket_block_desc *pbd;
661f6fb8f10Schetan loke 
662f6fb8f10Schetan loke 	memset(p1, 0x0, sizeof(*p1));
663f6fb8f10Schetan loke 
664f6fb8f10Schetan loke 	p1->knxt_seq_num = 1;
665f6fb8f10Schetan loke 	p1->pkbdq = pg_vec;
666bc59ba39Schetan loke 	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
667e3192690SJoe Perches 	p1->pkblk_start	= pg_vec[0].buffer;
668f6fb8f10Schetan loke 	p1->kblk_size = req_u->req3.tp_block_size;
669f6fb8f10Schetan loke 	p1->knum_blocks	= req_u->req3.tp_block_nr;
670f6fb8f10Schetan loke 	p1->hdrlen = po->tp_hdrlen;
671f6fb8f10Schetan loke 	p1->version = po->tp_version;
672f6fb8f10Schetan loke 	p1->last_kactive_blk_num = 0;
673ee80fbf3SDaniel Borkmann 	po->stats.stats3.tp_freeze_q_cnt = 0;
674f6fb8f10Schetan loke 	if (req_u->req3.tp_retire_blk_tov)
675f6fb8f10Schetan loke 		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
676f6fb8f10Schetan loke 	else
677f6fb8f10Schetan loke 		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
678f6fb8f10Schetan loke 						req_u->req3.tp_block_size);
679f6fb8f10Schetan loke 	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
680f6fb8f10Schetan loke 	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
681632ca50fSJohn Ogness 	rwlock_init(&p1->blk_fill_in_prog_lock);
682f6fb8f10Schetan loke 
683dc808110SEric Dumazet 	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
684f6fb8f10Schetan loke 	prb_init_ft_ops(p1, req_u);
685e8e85cc5SManinder Singh 	prb_setup_retire_blk_timer(po);
686f6fb8f10Schetan loke 	prb_open_block(p1, pbd);
687f6fb8f10Schetan loke }
688f6fb8f10Schetan loke 
689f6fb8f10Schetan loke /*  Do NOT update the last_blk_num first.
690f6fb8f10Schetan loke  *  Assumes sk_buff_head lock is held.
691f6fb8f10Schetan loke  */
_prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core * pkc)692bc59ba39Schetan loke static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
693f6fb8f10Schetan loke {
694f6fb8f10Schetan loke 	mod_timer(&pkc->retire_blk_timer,
695f6fb8f10Schetan loke 			jiffies + pkc->tov_in_jiffies);
696f6fb8f10Schetan loke 	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
697f6fb8f10Schetan loke }
698f6fb8f10Schetan loke 
699f6fb8f10Schetan loke /*
700f6fb8f10Schetan loke  * Timer logic:
701f6fb8f10Schetan loke  * 1) We refresh the timer only when we open a block.
702f6fb8f10Schetan loke  *    By doing this we don't waste cycles refreshing the timer
703f6fb8f10Schetan loke  *	  on packet-by-packet basis.
704f6fb8f10Schetan loke  *
705f6fb8f10Schetan loke  * With a 1MB block-size, on a 1Gbps line, it will take
706f6fb8f10Schetan loke  * i) ~8 ms to fill a block + ii) memcpy etc.
707f6fb8f10Schetan loke  * In this cut we are not accounting for the memcpy time.
708f6fb8f10Schetan loke  *
709f6fb8f10Schetan loke  * So, if the user sets the 'tmo' to 10ms then the timer
710f6fb8f10Schetan loke  * will never fire while the block is still getting filled
711f6fb8f10Schetan loke  * (which is what we want). However, the user could choose
712f6fb8f10Schetan loke  * to close a block early and that's fine.
713f6fb8f10Schetan loke  *
714f6fb8f10Schetan loke  * But when the timer does fire, we check whether or not to refresh it.
715f6fb8f10Schetan loke  * Since the tmo granularity is in msecs, it is not too expensive
716f6fb8f10Schetan loke  * to refresh the timer, lets say every '8' msecs.
717f6fb8f10Schetan loke  * Either the user can set the 'tmo' or we can derive it based on
718f6fb8f10Schetan loke  * a) line-speed and b) block-size.
719f6fb8f10Schetan loke  * prb_calc_retire_blk_tmo() calculates the tmo.
720f6fb8f10Schetan loke  *
721f6fb8f10Schetan loke  */
prb_retire_rx_blk_timer_expired(struct timer_list * t)72217bfd8c8SKees Cook static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
723f6fb8f10Schetan loke {
72417bfd8c8SKees Cook 	struct packet_sock *po =
72517bfd8c8SKees Cook 		from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
72622781a5bSDuan Jiong 	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
727f6fb8f10Schetan loke 	unsigned int frozen;
728bc59ba39Schetan loke 	struct tpacket_block_desc *pbd;
729f6fb8f10Schetan loke 
730f6fb8f10Schetan loke 	spin_lock(&po->sk.sk_receive_queue.lock);
731f6fb8f10Schetan loke 
732f6fb8f10Schetan loke 	frozen = prb_queue_frozen(pkc);
733f6fb8f10Schetan loke 	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
734f6fb8f10Schetan loke 
735f6fb8f10Schetan loke 	if (unlikely(pkc->delete_blk_timer))
736f6fb8f10Schetan loke 		goto out;
737f6fb8f10Schetan loke 
738f6fb8f10Schetan loke 	/* We only need to plug the race when the block is partially filled.
739f6fb8f10Schetan loke 	 * tpacket_rcv:
740f6fb8f10Schetan loke 	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
741f6fb8f10Schetan loke 	 *		copy_bits() is in progress ...
742f6fb8f10Schetan loke 	 *		timer fires on other cpu:
743f6fb8f10Schetan loke 	 *		we can't retire the current block because copy_bits
744f6fb8f10Schetan loke 	 *		is in progress.
745f6fb8f10Schetan loke 	 *
746f6fb8f10Schetan loke 	 */
747f6fb8f10Schetan loke 	if (BLOCK_NUM_PKTS(pbd)) {
748f6fb8f10Schetan loke 		/* Waiting for skb_copy_bits to finish... */
749632ca50fSJohn Ogness 		write_lock(&pkc->blk_fill_in_prog_lock);
750632ca50fSJohn Ogness 		write_unlock(&pkc->blk_fill_in_prog_lock);
751f6fb8f10Schetan loke 	}
752f6fb8f10Schetan loke 
753f6fb8f10Schetan loke 	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
754f6fb8f10Schetan loke 		if (!frozen) {
75541a50d62SAlexander Drozdov 			if (!BLOCK_NUM_PKTS(pbd)) {
75641a50d62SAlexander Drozdov 				/* An empty block. Just refresh the timer. */
75741a50d62SAlexander Drozdov 				goto refresh_timer;
75841a50d62SAlexander Drozdov 			}
759f6fb8f10Schetan loke 			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
760f6fb8f10Schetan loke 			if (!prb_dispatch_next_block(pkc, po))
761f6fb8f10Schetan loke 				goto refresh_timer;
762f6fb8f10Schetan loke 			else
763f6fb8f10Schetan loke 				goto out;
764f6fb8f10Schetan loke 		} else {
765f6fb8f10Schetan loke 			/* Case 1. Queue was frozen because user-space was
766f6fb8f10Schetan loke 			 *	   lagging behind.
767f6fb8f10Schetan loke 			 */
768878cd3baSRosen, Rami 			if (prb_curr_blk_in_use(pbd)) {
769f6fb8f10Schetan loke 				/*
770f6fb8f10Schetan loke 				 * Ok, user-space is still behind.
771f6fb8f10Schetan loke 				 * So just refresh the timer.
772f6fb8f10Schetan loke 				 */
773f6fb8f10Schetan loke 				goto refresh_timer;
774f6fb8f10Schetan loke 			} else {
775f6fb8f10Schetan loke 			       /* Case 2. queue was frozen,user-space caught up,
776f6fb8f10Schetan loke 				* now the link went idle && the timer fired.
777f6fb8f10Schetan loke 				* We don't have a block to close.So we open this
778f6fb8f10Schetan loke 				* block and restart the timer.
779f6fb8f10Schetan loke 				* opening a block thaws the queue,restarts timer
780f6fb8f10Schetan loke 				* Thawing/timer-refresh is a side effect.
781f6fb8f10Schetan loke 				*/
782f6fb8f10Schetan loke 				prb_open_block(pkc, pbd);
783f6fb8f10Schetan loke 				goto out;
784f6fb8f10Schetan loke 			}
785f6fb8f10Schetan loke 		}
786f6fb8f10Schetan loke 	}
787f6fb8f10Schetan loke 
788f6fb8f10Schetan loke refresh_timer:
789f6fb8f10Schetan loke 	_prb_refresh_rx_retire_blk_timer(pkc);
790f6fb8f10Schetan loke 
791f6fb8f10Schetan loke out:
792f6fb8f10Schetan loke 	spin_unlock(&po->sk.sk_receive_queue.lock);
793f6fb8f10Schetan loke }
794f6fb8f10Schetan loke 
/*
 * Publish @status in the block header of @pbd1.
 *
 * When ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE is 1, the data pages of the block
 * are flushed before the status is written and the header page is flushed
 * afterwards, each flush followed by smp_wmb().
 */
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
			    struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *page, *limit;

	/* Skip the block header (we know the header WILL fit in 4K) */
	page = (u8 *)pbd1 + PAGE_SIZE;
	limit = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);

	while (page < limit) {
		flush_dcache_page(pgv_to_page(page));
		page += PAGE_SIZE;
	}

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	page = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(page));

	smp_wmb();
#endif
}
828f6fb8f10Schetan loke 
829f6fb8f10Schetan loke /*
830f6fb8f10Schetan loke  * Side effect:
831f6fb8f10Schetan loke  *
832f6fb8f10Schetan loke  * 1) flush the block
833f6fb8f10Schetan loke  * 2) Increment active_blk_num
834f6fb8f10Schetan loke  *
835f6fb8f10Schetan loke  * Note:We DONT refresh the timer on purpose.
836f6fb8f10Schetan loke  *	Because almost always the next block will be opened.
837f6fb8f10Schetan loke  */
prb_close_block(struct tpacket_kbdq_core * pkc1,struct tpacket_block_desc * pbd1,struct packet_sock * po,unsigned int stat)838bc59ba39Schetan loke static void prb_close_block(struct tpacket_kbdq_core *pkc1,
839bc59ba39Schetan loke 		struct tpacket_block_desc *pbd1,
840f6fb8f10Schetan loke 		struct packet_sock *po, unsigned int stat)
841f6fb8f10Schetan loke {
842f6fb8f10Schetan loke 	__u32 status = TP_STATUS_USER | stat;
843f6fb8f10Schetan loke 
844f6fb8f10Schetan loke 	struct tpacket3_hdr *last_pkt;
845bc59ba39Schetan loke 	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
846da413eecSDan Collins 	struct sock *sk = &po->sk;
847f6fb8f10Schetan loke 
8488e8e2951SEric Dumazet 	if (atomic_read(&po->tp_drops))
849f6fb8f10Schetan loke 		status |= TP_STATUS_LOSING;
850f6fb8f10Schetan loke 
851f6fb8f10Schetan loke 	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
852f6fb8f10Schetan loke 	last_pkt->tp_next_offset = 0;
853f6fb8f10Schetan loke 
854f6fb8f10Schetan loke 	/* Get the ts of the last pkt */
855f6fb8f10Schetan loke 	if (BLOCK_NUM_PKTS(pbd1)) {
856f6fb8f10Schetan loke 		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
857f6fb8f10Schetan loke 		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
858f6fb8f10Schetan loke 	} else {
85941a50d62SAlexander Drozdov 		/* Ok, we tmo'd - so get the current time.
86041a50d62SAlexander Drozdov 		 *
86141a50d62SAlexander Drozdov 		 * It shouldn't really happen as we don't close empty
86241a50d62SAlexander Drozdov 		 * blocks. See prb_retire_rx_blk_timer_expired().
86341a50d62SAlexander Drozdov 		 */
864d413fcb4SArnd Bergmann 		struct timespec64 ts;
865d413fcb4SArnd Bergmann 		ktime_get_real_ts64(&ts);
866f6fb8f10Schetan loke 		h1->ts_last_pkt.ts_sec = ts.tv_sec;
867f6fb8f10Schetan loke 		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
868f6fb8f10Schetan loke 	}
869f6fb8f10Schetan loke 
870f6fb8f10Schetan loke 	smp_wmb();
871f6fb8f10Schetan loke 
872f6fb8f10Schetan loke 	/* Flush the block */
873f6fb8f10Schetan loke 	prb_flush_block(pkc1, pbd1, status);
874f6fb8f10Schetan loke 
875da413eecSDan Collins 	sk->sk_data_ready(sk);
876da413eecSDan Collins 
877f6fb8f10Schetan loke 	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
878f6fb8f10Schetan loke }
879f6fb8f10Schetan loke 
prb_thaw_queue(struct tpacket_kbdq_core * pkc)880eea49cc9SOlof Johansson static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
881f6fb8f10Schetan loke {
882f6fb8f10Schetan loke 	pkc->reset_pending_on_curr_blk = 0;
883f6fb8f10Schetan loke }
884f6fb8f10Schetan loke 
885f6fb8f10Schetan loke /*
886f6fb8f10Schetan loke  * Side effect of opening a block:
887f6fb8f10Schetan loke  *
888f6fb8f10Schetan loke  * 1) prb_queue is thawed.
889f6fb8f10Schetan loke  * 2) retire_blk_timer is refreshed.
890f6fb8f10Schetan loke  *
891f6fb8f10Schetan loke  */
prb_open_block(struct tpacket_kbdq_core * pkc1,struct tpacket_block_desc * pbd1)892bc59ba39Schetan loke static void prb_open_block(struct tpacket_kbdq_core *pkc1,
893bc59ba39Schetan loke 	struct tpacket_block_desc *pbd1)
894f6fb8f10Schetan loke {
895d413fcb4SArnd Bergmann 	struct timespec64 ts;
896bc59ba39Schetan loke 	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
897f6fb8f10Schetan loke 
898f6fb8f10Schetan loke 	smp_rmb();
899f6fb8f10Schetan loke 
900f6fb8f10Schetan loke 	/* We could have just memset this but we will lose the
901f6fb8f10Schetan loke 	 * flexibility of making the priv area sticky
902f6fb8f10Schetan loke 	 */
9038da3056cSDaniel Borkmann 
904f6fb8f10Schetan loke 	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
905f6fb8f10Schetan loke 	BLOCK_NUM_PKTS(pbd1) = 0;
906f6fb8f10Schetan loke 	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
9078da3056cSDaniel Borkmann 
908d413fcb4SArnd Bergmann 	ktime_get_real_ts64(&ts);
9098da3056cSDaniel Borkmann 
910f6fb8f10Schetan loke 	h1->ts_first_pkt.ts_sec = ts.tv_sec;
911f6fb8f10Schetan loke 	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
9128da3056cSDaniel Borkmann 
913f6fb8f10Schetan loke 	pkc1->pkblk_start = (char *)pbd1;
914e3192690SJoe Perches 	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
9158da3056cSDaniel Borkmann 
916f6fb8f10Schetan loke 	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
917f6fb8f10Schetan loke 	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
9188da3056cSDaniel Borkmann 
919f6fb8f10Schetan loke 	pbd1->version = pkc1->version;
920f6fb8f10Schetan loke 	pkc1->prev = pkc1->nxt_offset;
921f6fb8f10Schetan loke 	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
9228da3056cSDaniel Borkmann 
923f6fb8f10Schetan loke 	prb_thaw_queue(pkc1);
924f6fb8f10Schetan loke 	_prb_refresh_rx_retire_blk_timer(pkc1);
925f6fb8f10Schetan loke 
926f6fb8f10Schetan loke 	smp_wmb();
927f6fb8f10Schetan loke }
928f6fb8f10Schetan loke 
929f6fb8f10Schetan loke /*
930f6fb8f10Schetan loke  * Queue freeze logic:
931f6fb8f10Schetan loke  * 1) Assume tp_block_nr = 8 blocks.
932f6fb8f10Schetan loke  * 2) At time 't0', user opens Rx ring.
933f6fb8f10Schetan loke  * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
934f6fb8f10Schetan loke  * 4) user-space is either sleeping or processing block '0'.
935f6fb8f10Schetan loke  * 5) tpacket_rcv is currently filling block '7', since there is no space left,
936f6fb8f10Schetan loke  *    it will close block-7,loop around and try to fill block '0'.
937f6fb8f10Schetan loke  *    call-flow:
938f6fb8f10Schetan loke  *    __packet_lookup_frame_in_block
939f6fb8f10Schetan loke  *      prb_retire_current_block()
940f6fb8f10Schetan loke  *      prb_dispatch_next_block()
941f6fb8f10Schetan loke  *        |->(BLOCK_STATUS == USER) evaluates to true
942f6fb8f10Schetan loke  *    5.1) Since block-0 is currently in-use, we just freeze the queue.
943f6fb8f10Schetan loke  * 6) Now there are two cases:
944f6fb8f10Schetan loke  *    6.1) Link goes idle right after the queue is frozen.
945f6fb8f10Schetan loke  *         But remember, the last open_block() refreshed the timer.
946f6fb8f10Schetan loke  *         When this timer expires,it will refresh itself so that we can
947f6fb8f10Schetan loke  *         re-open block-0 in near future.
948f6fb8f10Schetan loke  *    6.2) Link is busy and keeps on receiving packets. This is a simple
949f6fb8f10Schetan loke  *         case and __packet_lookup_frame_in_block will check if block-0
950f6fb8f10Schetan loke  *         is free and can now be re-used.
951f6fb8f10Schetan loke  */
prb_freeze_queue(struct tpacket_kbdq_core * pkc,struct packet_sock * po)952eea49cc9SOlof Johansson static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
953f6fb8f10Schetan loke 				  struct packet_sock *po)
954f6fb8f10Schetan loke {
955f6fb8f10Schetan loke 	pkc->reset_pending_on_curr_blk = 1;
956ee80fbf3SDaniel Borkmann 	po->stats.stats3.tp_freeze_q_cnt++;
957f6fb8f10Schetan loke }
958f6fb8f10Schetan loke 
/* Packet length rounded up to the V3 frame alignment. */
#define TOTAL_PKT_LEN_INCL_ALIGN(len) (ALIGN((len), V3_ALIGNMENT))
960f6fb8f10Schetan loke 
961f6fb8f10Schetan loke /*
962f6fb8f10Schetan loke  * If the next block is free then we will dispatch it
963f6fb8f10Schetan loke  * and return a good offset.
964f6fb8f10Schetan loke  * Else, we will freeze the queue.
965f6fb8f10Schetan loke  * So, caller must check the return value.
966f6fb8f10Schetan loke  */
prb_dispatch_next_block(struct tpacket_kbdq_core * pkc,struct packet_sock * po)967bc59ba39Schetan loke static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
968f6fb8f10Schetan loke 		struct packet_sock *po)
969f6fb8f10Schetan loke {
970bc59ba39Schetan loke 	struct tpacket_block_desc *pbd;
971f6fb8f10Schetan loke 
972f6fb8f10Schetan loke 	smp_rmb();
973f6fb8f10Schetan loke 
974f6fb8f10Schetan loke 	/* 1. Get current block num */
975f6fb8f10Schetan loke 	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
976f6fb8f10Schetan loke 
977f6fb8f10Schetan loke 	/* 2. If this block is currently in_use then freeze the queue */
978f6fb8f10Schetan loke 	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
979f6fb8f10Schetan loke 		prb_freeze_queue(pkc, po);
980f6fb8f10Schetan loke 		return NULL;
981f6fb8f10Schetan loke 	}
982f6fb8f10Schetan loke 
983f6fb8f10Schetan loke 	/*
984f6fb8f10Schetan loke 	 * 3.
985f6fb8f10Schetan loke 	 * open this block and return the offset where the first packet
986f6fb8f10Schetan loke 	 * needs to get stored.
987f6fb8f10Schetan loke 	 */
988f6fb8f10Schetan loke 	prb_open_block(pkc, pbd);
989f6fb8f10Schetan loke 	return (void *)pkc->nxt_offset;
990f6fb8f10Schetan loke }
991f6fb8f10Schetan loke 
prb_retire_current_block(struct tpacket_kbdq_core * pkc,struct packet_sock * po,unsigned int status)992bc59ba39Schetan loke static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
993f6fb8f10Schetan loke 		struct packet_sock *po, unsigned int status)
994f6fb8f10Schetan loke {
995bc59ba39Schetan loke 	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
996f6fb8f10Schetan loke 
997f6fb8f10Schetan loke 	/* retire/close the current block */
998f6fb8f10Schetan loke 	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
999f6fb8f10Schetan loke 		/*
1000f6fb8f10Schetan loke 		 * Plug the case where copy_bits() is in progress on
1001f6fb8f10Schetan loke 		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
1002f6fb8f10Schetan loke 		 * have space to copy the pkt in the current block and
1003f6fb8f10Schetan loke 		 * called prb_retire_current_block()
1004f6fb8f10Schetan loke 		 *
1005f6fb8f10Schetan loke 		 * We don't need to worry about the TMO case because
1006f6fb8f10Schetan loke 		 * the timer-handler already handled this case.
1007f6fb8f10Schetan loke 		 */
1008f6fb8f10Schetan loke 		if (!(status & TP_STATUS_BLK_TMO)) {
1009f6fb8f10Schetan loke 			/* Waiting for skb_copy_bits to finish... */
1010632ca50fSJohn Ogness 			write_lock(&pkc->blk_fill_in_prog_lock);
1011632ca50fSJohn Ogness 			write_unlock(&pkc->blk_fill_in_prog_lock);
1012f6fb8f10Schetan loke 		}
1013f6fb8f10Schetan loke 		prb_close_block(pkc, pbd, po, status);
1014f6fb8f10Schetan loke 		return;
1015f6fb8f10Schetan loke 	}
1016f6fb8f10Schetan loke }
1017f6fb8f10Schetan loke 
prb_curr_blk_in_use(struct tpacket_block_desc * pbd)1018878cd3baSRosen, Rami static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
1019f6fb8f10Schetan loke {
1020f6fb8f10Schetan loke 	return TP_STATUS_USER & BLOCK_STATUS(pbd);
1021f6fb8f10Schetan loke }
1022f6fb8f10Schetan loke 
prb_queue_frozen(struct tpacket_kbdq_core * pkc)1023eea49cc9SOlof Johansson static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
1024f6fb8f10Schetan loke {
1025f6fb8f10Schetan loke 	return pkc->reset_pending_on_curr_blk;
1026f6fb8f10Schetan loke }
1027f6fb8f10Schetan loke 
prb_clear_blk_fill_status(struct packet_ring_buffer * rb)1028eea49cc9SOlof Johansson static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
102988fd1cb8SJohn Ogness 	__releases(&pkc->blk_fill_in_prog_lock)
1030f6fb8f10Schetan loke {
1031bc59ba39Schetan loke 	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
1032632ca50fSJohn Ogness 
1033632ca50fSJohn Ogness 	read_unlock(&pkc->blk_fill_in_prog_lock);
1034f6fb8f10Schetan loke }
1035f6fb8f10Schetan loke 
prb_fill_rxhash(struct tpacket_kbdq_core * pkc,struct tpacket3_hdr * ppd)1036eea49cc9SOlof Johansson static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
1037f6fb8f10Schetan loke 			struct tpacket3_hdr *ppd)
1038f6fb8f10Schetan loke {
10393958afa1STom Herbert 	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
1040f6fb8f10Schetan loke }
1041f6fb8f10Schetan loke 
prb_clear_rxhash(struct tpacket_kbdq_core * pkc,struct tpacket3_hdr * ppd)1042eea49cc9SOlof Johansson static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
1043f6fb8f10Schetan loke 			struct tpacket3_hdr *ppd)
1044f6fb8f10Schetan loke {
1045f6fb8f10Schetan loke 	ppd->hv1.tp_rxhash = 0;
1046f6fb8f10Schetan loke }
1047f6fb8f10Schetan loke 
prb_fill_vlan_info(struct tpacket_kbdq_core * pkc,struct tpacket3_hdr * ppd)1048eea49cc9SOlof Johansson static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
1049f6fb8f10Schetan loke 			struct tpacket3_hdr *ppd)
1050f6fb8f10Schetan loke {
10515a041d25SChengen Du 	struct packet_sock *po = container_of(pkc, struct packet_sock, rx_ring.prb_bdqc);
10525a041d25SChengen Du 
1053df8a39deSJiri Pirko 	if (skb_vlan_tag_present(pkc->skb)) {
1054df8a39deSJiri Pirko 		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
1055a0cdfcf3SAtzm Watanabe 		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1056a0cdfcf3SAtzm Watanabe 		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
10575a041d25SChengen Du 	} else if (unlikely(po->sk.sk_type == SOCK_DGRAM && eth_type_vlan(pkc->skb->protocol))) {
10585a041d25SChengen Du 		ppd->hv1.tp_vlan_tci = vlan_get_tci(pkc->skb, pkc->skb->dev);
10595a041d25SChengen Du 		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->protocol);
10605a041d25SChengen Du 		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
1061f6fb8f10Schetan loke 	} else {
10629e67030aSdanborkmann@iogearbox.net 		ppd->hv1.tp_vlan_tci = 0;
1063a0cdfcf3SAtzm Watanabe 		ppd->hv1.tp_vlan_tpid = 0;
10649e67030aSdanborkmann@iogearbox.net 		ppd->tp_status = TP_STATUS_AVAILABLE;
1065f6fb8f10Schetan loke 	}
1066f6fb8f10Schetan loke }
1067f6fb8f10Schetan loke 
prb_run_all_ft_ops(struct tpacket_kbdq_core * pkc,struct tpacket3_hdr * ppd)1068bc59ba39Schetan loke static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
1069f6fb8f10Schetan loke 			struct tpacket3_hdr *ppd)
1070f6fb8f10Schetan loke {
1071a0cdfcf3SAtzm Watanabe 	ppd->hv1.tp_padding = 0;
1072f6fb8f10Schetan loke 	prb_fill_vlan_info(pkc, ppd);
1073f6fb8f10Schetan loke 
1074f6fb8f10Schetan loke 	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1075f6fb8f10Schetan loke 		prb_fill_rxhash(pkc, ppd);
1076f6fb8f10Schetan loke 	else
1077f6fb8f10Schetan loke 		prb_clear_rxhash(pkc, ppd);
1078f6fb8f10Schetan loke }
1079f6fb8f10Schetan loke 
prb_fill_curr_block(char * curr,struct tpacket_kbdq_core * pkc,struct tpacket_block_desc * pbd,unsigned int len)1080eea49cc9SOlof Johansson static void prb_fill_curr_block(char *curr,
1081bc59ba39Schetan loke 				struct tpacket_kbdq_core *pkc,
1082bc59ba39Schetan loke 				struct tpacket_block_desc *pbd,
1083f6fb8f10Schetan loke 				unsigned int len)
108488fd1cb8SJohn Ogness 	__acquires(&pkc->blk_fill_in_prog_lock)
1085f6fb8f10Schetan loke {
1086f6fb8f10Schetan loke 	struct tpacket3_hdr *ppd;
1087f6fb8f10Schetan loke 
1088f6fb8f10Schetan loke 	ppd  = (struct tpacket3_hdr *)curr;
1089f6fb8f10Schetan loke 	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1090f6fb8f10Schetan loke 	pkc->prev = curr;
1091f6fb8f10Schetan loke 	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1092f6fb8f10Schetan loke 	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1093f6fb8f10Schetan loke 	BLOCK_NUM_PKTS(pbd) += 1;
1094632ca50fSJohn Ogness 	read_lock(&pkc->blk_fill_in_prog_lock);
1095f6fb8f10Schetan loke 	prb_run_all_ft_ops(pkc, ppd);
1096f6fb8f10Schetan loke }
1097f6fb8f10Schetan loke 
1098f6fb8f10Schetan loke /* Assumes caller has the sk->rx_queue.lock */
__packet_lookup_frame_in_block(struct packet_sock * po,struct sk_buff * skb,unsigned int len)1099f6fb8f10Schetan loke static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1100f6fb8f10Schetan loke 					    struct sk_buff *skb,
1101f6fb8f10Schetan loke 					    unsigned int len
1102f6fb8f10Schetan loke 					    )
1103f6fb8f10Schetan loke {
1104bc59ba39Schetan loke 	struct tpacket_kbdq_core *pkc;
1105bc59ba39Schetan loke 	struct tpacket_block_desc *pbd;
1106f6fb8f10Schetan loke 	char *curr, *end;
1107f6fb8f10Schetan loke 
1108e3192690SJoe Perches 	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1109f6fb8f10Schetan loke 	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1110f6fb8f10Schetan loke 
1111f6fb8f10Schetan loke 	/* Queue is frozen when user space is lagging behind */
1112f6fb8f10Schetan loke 	if (prb_queue_frozen(pkc)) {
1113f6fb8f10Schetan loke 		/*
1114f6fb8f10Schetan loke 		 * Check if that last block which caused the queue to freeze,
1115f6fb8f10Schetan loke 		 * is still in_use by user-space.
1116f6fb8f10Schetan loke 		 */
1117878cd3baSRosen, Rami 		if (prb_curr_blk_in_use(pbd)) {
1118f6fb8f10Schetan loke 			/* Can't record this packet */
1119f6fb8f10Schetan loke 			return NULL;
1120f6fb8f10Schetan loke 		} else {
1121f6fb8f10Schetan loke 			/*
1122f6fb8f10Schetan loke 			 * Ok, the block was released by user-space.
1123f6fb8f10Schetan loke 			 * Now let's open that block.
1124f6fb8f10Schetan loke 			 * opening a block also thaws the queue.
1125f6fb8f10Schetan loke 			 * Thawing is a side effect.
1126f6fb8f10Schetan loke 			 */
1127f6fb8f10Schetan loke 			prb_open_block(pkc, pbd);
1128f6fb8f10Schetan loke 		}
1129f6fb8f10Schetan loke 	}
1130f6fb8f10Schetan loke 
1131f6fb8f10Schetan loke 	smp_mb();
1132f6fb8f10Schetan loke 	curr = pkc->nxt_offset;
1133f6fb8f10Schetan loke 	pkc->skb = skb;
1134e3192690SJoe Perches 	end = (char *)pbd + pkc->kblk_size;
1135f6fb8f10Schetan loke 
1136f6fb8f10Schetan loke 	/* first try the current block */
1137f6fb8f10Schetan loke 	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1138f6fb8f10Schetan loke 		prb_fill_curr_block(curr, pkc, pbd, len);
1139f6fb8f10Schetan loke 		return (void *)curr;
1140f6fb8f10Schetan loke 	}
1141f6fb8f10Schetan loke 
1142f6fb8f10Schetan loke 	/* Ok, close the current block */
1143f6fb8f10Schetan loke 	prb_retire_current_block(pkc, po, 0);
1144f6fb8f10Schetan loke 
1145f6fb8f10Schetan loke 	/* Now, try to dispatch the next block */
1146f6fb8f10Schetan loke 	curr = (char *)prb_dispatch_next_block(pkc, po);
1147f6fb8f10Schetan loke 	if (curr) {
1148f6fb8f10Schetan loke 		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1149f6fb8f10Schetan loke 		prb_fill_curr_block(curr, pkc, pbd, len);
1150f6fb8f10Schetan loke 		return (void *)curr;
1151f6fb8f10Schetan loke 	}
1152f6fb8f10Schetan loke 
1153f6fb8f10Schetan loke 	/*
1154f6fb8f10Schetan loke 	 * No free blocks are available.user_space hasn't caught up yet.
1155f6fb8f10Schetan loke 	 * Queue was just frozen and now this packet will get dropped.
1156f6fb8f10Schetan loke 	 */
1157f6fb8f10Schetan loke 	return NULL;
1158f6fb8f10Schetan loke }
1159f6fb8f10Schetan loke 
packet_current_rx_frame(struct packet_sock * po,struct sk_buff * skb,int status,unsigned int len)1160eea49cc9SOlof Johansson static void *packet_current_rx_frame(struct packet_sock *po,
1161f6fb8f10Schetan loke 					    struct sk_buff *skb,
1162f6fb8f10Schetan loke 					    int status, unsigned int len)
1163f6fb8f10Schetan loke {
1164f6fb8f10Schetan loke 	char *curr = NULL;
1165f6fb8f10Schetan loke 	switch (po->tp_version) {
1166f6fb8f10Schetan loke 	case TPACKET_V1:
1167f6fb8f10Schetan loke 	case TPACKET_V2:
1168f6fb8f10Schetan loke 		curr = packet_lookup_frame(po, &po->rx_ring,
1169f6fb8f10Schetan loke 					po->rx_ring.head, status);
1170f6fb8f10Schetan loke 		return curr;
1171f6fb8f10Schetan loke 	case TPACKET_V3:
117246088059SMao Wenan 		return __packet_lookup_frame_in_block(po, skb, len);
1173f6fb8f10Schetan loke 	default:
1174f6fb8f10Schetan loke 		WARN(1, "TPACKET version not supported\n");
1175f6fb8f10Schetan loke 		BUG();
117699aa3473SYing Xue 		return NULL;
1177f6fb8f10Schetan loke 	}
1178f6fb8f10Schetan loke }
1179f6fb8f10Schetan loke 
prb_lookup_block(const struct packet_sock * po,const struct packet_ring_buffer * rb,unsigned int idx,int status)1180dcf70cefSEric Dumazet static void *prb_lookup_block(const struct packet_sock *po,
1181dcf70cefSEric Dumazet 			      const struct packet_ring_buffer *rb,
118277f65ebdSWillem de Bruijn 			      unsigned int idx,
1183f6fb8f10Schetan loke 			      int status)
1184f6fb8f10Schetan loke {
1185bc59ba39Schetan loke 	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
118677f65ebdSWillem de Bruijn 	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1187f6fb8f10Schetan loke 
1188f6fb8f10Schetan loke 	if (status != BLOCK_STATUS(pbd))
1189f6fb8f10Schetan loke 		return NULL;
1190f6fb8f10Schetan loke 	return pbd;
1191f6fb8f10Schetan loke }
1192f6fb8f10Schetan loke 
prb_previous_blk_num(struct packet_ring_buffer * rb)1193eea49cc9SOlof Johansson static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1194f6fb8f10Schetan loke {
1195f6fb8f10Schetan loke 	unsigned int prev;
1196f6fb8f10Schetan loke 	if (rb->prb_bdqc.kactive_blk_num)
1197f6fb8f10Schetan loke 		prev = rb->prb_bdqc.kactive_blk_num-1;
1198f6fb8f10Schetan loke 	else
1199f6fb8f10Schetan loke 		prev = rb->prb_bdqc.knum_blocks-1;
1200f6fb8f10Schetan loke 	return prev;
1201f6fb8f10Schetan loke }
1202f6fb8f10Schetan loke 
/*
 * Look up the block just before the active one and return it if it is in
 * state @status, NULL otherwise.
 *
 * The caller is assumed to hold the rx_queue.lock.
 */
static void *__prb_previous_block(struct packet_sock *po,
				  struct packet_ring_buffer *rb,
				  int status)
{
	return prb_lookup_block(po, rb, prb_previous_blk_num(rb), status);
}
1211f6fb8f10Schetan loke 
packet_previous_rx_frame(struct packet_sock * po,struct packet_ring_buffer * rb,int status)1212eea49cc9SOlof Johansson static void *packet_previous_rx_frame(struct packet_sock *po,
1213f6fb8f10Schetan loke 					     struct packet_ring_buffer *rb,
1214f6fb8f10Schetan loke 					     int status)
1215f6fb8f10Schetan loke {
1216f6fb8f10Schetan loke 	if (po->tp_version <= TPACKET_V2)
1217f6fb8f10Schetan loke 		return packet_previous_frame(po, rb, status);
1218f6fb8f10Schetan loke 
1219f6fb8f10Schetan loke 	return __prb_previous_block(po, rb, status);
1220f6fb8f10Schetan loke }
1221f6fb8f10Schetan loke 
packet_increment_rx_head(struct packet_sock * po,struct packet_ring_buffer * rb)1222eea49cc9SOlof Johansson static void packet_increment_rx_head(struct packet_sock *po,
1223f6fb8f10Schetan loke 					    struct packet_ring_buffer *rb)
1224f6fb8f10Schetan loke {
1225f6fb8f10Schetan loke 	switch (po->tp_version) {
1226f6fb8f10Schetan loke 	case TPACKET_V1:
1227f6fb8f10Schetan loke 	case TPACKET_V2:
1228f6fb8f10Schetan loke 		return packet_increment_head(rb);
1229f6fb8f10Schetan loke 	case TPACKET_V3:
1230f6fb8f10Schetan loke 	default:
1231f6fb8f10Schetan loke 		WARN(1, "TPACKET version not supported.\n");
1232f6fb8f10Schetan loke 		BUG();
1233f6fb8f10Schetan loke 		return;
1234f6fb8f10Schetan loke 	}
1235f6fb8f10Schetan loke }
1236f6fb8f10Schetan loke 
packet_previous_frame(struct packet_sock * po,struct packet_ring_buffer * rb,int status)1237eea49cc9SOlof Johansson static void *packet_previous_frame(struct packet_sock *po,
123869e3c75fSJohann Baudy 		struct packet_ring_buffer *rb,
123969e3c75fSJohann Baudy 		int status)
124069e3c75fSJohann Baudy {
124169e3c75fSJohann Baudy 	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
124269e3c75fSJohann Baudy 	return packet_lookup_frame(po, rb, previous, status);
124369e3c75fSJohann Baudy }
124469e3c75fSJohann Baudy 
packet_increment_head(struct packet_ring_buffer * buff)1245eea49cc9SOlof Johansson static void packet_increment_head(struct packet_ring_buffer *buff)
124669e3c75fSJohann Baudy {
124769e3c75fSJohann Baudy 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
124869e3c75fSJohann Baudy }
124969e3c75fSJohann Baudy 
packet_inc_pending(struct packet_ring_buffer * rb)1250b0138408SDaniel Borkmann static void packet_inc_pending(struct packet_ring_buffer *rb)
1251b0138408SDaniel Borkmann {
1252b0138408SDaniel Borkmann 	this_cpu_inc(*rb->pending_refcnt);
1253b0138408SDaniel Borkmann }
1254b0138408SDaniel Borkmann 
packet_dec_pending(struct packet_ring_buffer * rb)1255b0138408SDaniel Borkmann static void packet_dec_pending(struct packet_ring_buffer *rb)
1256b0138408SDaniel Borkmann {
1257b0138408SDaniel Borkmann 	this_cpu_dec(*rb->pending_refcnt);
1258b0138408SDaniel Borkmann }
1259b0138408SDaniel Borkmann 
packet_read_pending(const struct packet_ring_buffer * rb)1260b0138408SDaniel Borkmann static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1261b0138408SDaniel Borkmann {
1262b0138408SDaniel Borkmann 	unsigned int refcnt = 0;
1263b0138408SDaniel Borkmann 	int cpu;
1264b0138408SDaniel Borkmann 
1265b0138408SDaniel Borkmann 	/* We don't use pending refcount in rx_ring. */
1266b0138408SDaniel Borkmann 	if (rb->pending_refcnt == NULL)
1267b0138408SDaniel Borkmann 		return 0;
1268b0138408SDaniel Borkmann 
1269b0138408SDaniel Borkmann 	for_each_possible_cpu(cpu)
1270b0138408SDaniel Borkmann 		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1271b0138408SDaniel Borkmann 
1272b0138408SDaniel Borkmann 	return refcnt;
1273b0138408SDaniel Borkmann }
1274b0138408SDaniel Borkmann 
packet_alloc_pending(struct packet_sock * po)1275b0138408SDaniel Borkmann static int packet_alloc_pending(struct packet_sock *po)
1276b0138408SDaniel Borkmann {
1277b0138408SDaniel Borkmann 	po->rx_ring.pending_refcnt = NULL;
1278b0138408SDaniel Borkmann 
1279b0138408SDaniel Borkmann 	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1280b0138408SDaniel Borkmann 	if (unlikely(po->tx_ring.pending_refcnt == NULL))
1281b0138408SDaniel Borkmann 		return -ENOBUFS;
1282b0138408SDaniel Borkmann 
1283b0138408SDaniel Borkmann 	return 0;
1284b0138408SDaniel Borkmann }
1285b0138408SDaniel Borkmann 
packet_free_pending(struct packet_sock * po)1286b0138408SDaniel Borkmann static void packet_free_pending(struct packet_sock *po)
1287b0138408SDaniel Borkmann {
1288b0138408SDaniel Borkmann 	free_percpu(po->tx_ring.pending_refcnt);
1289b0138408SDaniel Borkmann }
1290b0138408SDaniel Borkmann 
/*
 * Receive-room classification used by the *_has_room() helpers.
 *
 * ROOM_POW_OFF is a right-shift amount: "normal" room means more than
 * 1/(1 << ROOM_POW_OFF), i.e. a quarter, of the ring or receive buffer is
 * still free.
 */
#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0	/* nothing left */
#define ROOM_LOW	0x1	/* some room, but below the threshold */
#define ROOM_NORMAL	0x2	/* above the threshold */
12959954729bSWillem de Bruijn 
__tpacket_has_room(const struct packet_sock * po,int pow_off)1296d4b5bd98SEric Dumazet static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
12979954729bSWillem de Bruijn {
12989954729bSWillem de Bruijn 	int idx, len;
12999954729bSWillem de Bruijn 
1300d4b5bd98SEric Dumazet 	len = READ_ONCE(po->rx_ring.frame_max) + 1;
1301d4b5bd98SEric Dumazet 	idx = READ_ONCE(po->rx_ring.head);
13029954729bSWillem de Bruijn 	if (pow_off)
13039954729bSWillem de Bruijn 		idx += len >> pow_off;
13049954729bSWillem de Bruijn 	if (idx >= len)
13059954729bSWillem de Bruijn 		idx -= len;
13069954729bSWillem de Bruijn 	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
13079954729bSWillem de Bruijn }
13089954729bSWillem de Bruijn 
__tpacket_v3_has_room(const struct packet_sock * po,int pow_off)1309dcf70cefSEric Dumazet static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
13109954729bSWillem de Bruijn {
13119954729bSWillem de Bruijn 	int idx, len;
13129954729bSWillem de Bruijn 
1313dcf70cefSEric Dumazet 	len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1314dcf70cefSEric Dumazet 	idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
13159954729bSWillem de Bruijn 	if (pow_off)
13169954729bSWillem de Bruijn 		idx += len >> pow_off;
13179954729bSWillem de Bruijn 	if (idx >= len)
13189954729bSWillem de Bruijn 		idx -= len;
13199954729bSWillem de Bruijn 	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
13209954729bSWillem de Bruijn }
13219954729bSWillem de Bruijn 
__packet_rcv_has_room(const struct packet_sock * po,const struct sk_buff * skb)13220338a145SEric Dumazet static int __packet_rcv_has_room(const struct packet_sock *po,
13230338a145SEric Dumazet 				 const struct sk_buff *skb)
132477f65ebdSWillem de Bruijn {
13250338a145SEric Dumazet 	const struct sock *sk = &po->sk;
13269954729bSWillem de Bruijn 	int ret = ROOM_NONE;
132777f65ebdSWillem de Bruijn 
13289954729bSWillem de Bruijn 	if (po->prot_hook.func != tpacket_rcv) {
13290338a145SEric Dumazet 		int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
13300338a145SEric Dumazet 		int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
13312ccdbaa6SWillem de Bruijn 				   - (skb ? skb->truesize : 0);
13320338a145SEric Dumazet 
13330338a145SEric Dumazet 		if (avail > (rcvbuf >> ROOM_POW_OFF))
13349954729bSWillem de Bruijn 			return ROOM_NORMAL;
13359954729bSWillem de Bruijn 		else if (avail > 0)
13369954729bSWillem de Bruijn 			return ROOM_LOW;
13379954729bSWillem de Bruijn 		else
13389954729bSWillem de Bruijn 			return ROOM_NONE;
13399954729bSWillem de Bruijn 	}
134077f65ebdSWillem de Bruijn 
13419954729bSWillem de Bruijn 	if (po->tp_version == TPACKET_V3) {
13429954729bSWillem de Bruijn 		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
13439954729bSWillem de Bruijn 			ret = ROOM_NORMAL;
13449954729bSWillem de Bruijn 		else if (__tpacket_v3_has_room(po, 0))
13459954729bSWillem de Bruijn 			ret = ROOM_LOW;
13469954729bSWillem de Bruijn 	} else {
13479954729bSWillem de Bruijn 		if (__tpacket_has_room(po, ROOM_POW_OFF))
13489954729bSWillem de Bruijn 			ret = ROOM_NORMAL;
13499954729bSWillem de Bruijn 		else if (__tpacket_has_room(po, 0))
13509954729bSWillem de Bruijn 			ret = ROOM_LOW;
13519954729bSWillem de Bruijn 	}
13522ccdbaa6SWillem de Bruijn 
13532ccdbaa6SWillem de Bruijn 	return ret;
13542ccdbaa6SWillem de Bruijn }
13552ccdbaa6SWillem de Bruijn 
packet_rcv_has_room(struct packet_sock * po,struct sk_buff * skb)13562ccdbaa6SWillem de Bruijn static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
13572ccdbaa6SWillem de Bruijn {
1358791a3e9fSEric Dumazet 	bool pressure;
1359791a3e9fSEric Dumazet 	int ret;
13602ccdbaa6SWillem de Bruijn 
13612ccdbaa6SWillem de Bruijn 	ret = __packet_rcv_has_room(po, skb);
13623a2bb84eSEric Dumazet 	pressure = ret != ROOM_NORMAL;
13633a2bb84eSEric Dumazet 
1364791a3e9fSEric Dumazet 	if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure)
1365791a3e9fSEric Dumazet 		packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure);
136677f65ebdSWillem de Bruijn 
13679954729bSWillem de Bruijn 	return ret;
136877f65ebdSWillem de Bruijn }
136977f65ebdSWillem de Bruijn 
packet_rcv_try_clear_pressure(struct packet_sock * po)13709bb6cd65SEric Dumazet static void packet_rcv_try_clear_pressure(struct packet_sock *po)
13719bb6cd65SEric Dumazet {
1372791a3e9fSEric Dumazet 	if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) &&
13739bb6cd65SEric Dumazet 	    __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
1374791a3e9fSEric Dumazet 		packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false);
13759bb6cd65SEric Dumazet }
13769bb6cd65SEric Dumazet 
packet_sock_destruct(struct sock * sk)13771da177e4SLinus Torvalds static void packet_sock_destruct(struct sock *sk)
13781da177e4SLinus Torvalds {
1379ed85b565SRichard Cochran 	skb_queue_purge(&sk->sk_error_queue);
1380ed85b565SRichard Cochran 
1381547b792cSIlpo Järvinen 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
138214afee4bSReshetova, Elena 	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
13831da177e4SLinus Torvalds 
13841da177e4SLinus Torvalds 	if (!sock_flag(sk, SOCK_DEAD)) {
138540d4e3dfSEric Dumazet 		pr_err("Attempt to release alive packet socket: %p\n", sk);
13861da177e4SLinus Torvalds 		return;
13871da177e4SLinus Torvalds 	}
13881da177e4SLinus Torvalds }
13891da177e4SLinus Torvalds 
fanout_flow_is_huge(struct packet_sock * po,struct sk_buff * skb)13903b3a5b0aSWillem de Bruijn static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
13913b3a5b0aSWillem de Bruijn {
1392b756ad92SEric Dumazet 	u32 *history = po->rollover->history;
1393b756ad92SEric Dumazet 	u32 victim, rxhash;
13943b3a5b0aSWillem de Bruijn 	int i, count = 0;
13953b3a5b0aSWillem de Bruijn 
13963b3a5b0aSWillem de Bruijn 	rxhash = skb_get_hash(skb);
13973b3a5b0aSWillem de Bruijn 	for (i = 0; i < ROLLOVER_HLEN; i++)
1398b756ad92SEric Dumazet 		if (READ_ONCE(history[i]) == rxhash)
13993b3a5b0aSWillem de Bruijn 			count++;
14003b3a5b0aSWillem de Bruijn 
14018032bf12SJason A. Donenfeld 	victim = get_random_u32_below(ROLLOVER_HLEN);
1402b756ad92SEric Dumazet 
1403b756ad92SEric Dumazet 	/* Avoid dirtying the cache line if possible */
1404b756ad92SEric Dumazet 	if (READ_ONCE(history[victim]) != rxhash)
1405b756ad92SEric Dumazet 		WRITE_ONCE(history[victim], rxhash);
1406b756ad92SEric Dumazet 
14073b3a5b0aSWillem de Bruijn 	return count > (ROLLOVER_HLEN >> 1);
14083b3a5b0aSWillem de Bruijn }
14093b3a5b0aSWillem de Bruijn 
fanout_demux_hash(struct packet_fanout * f,struct sk_buff * skb,unsigned int num)141077f65ebdSWillem de Bruijn static unsigned int fanout_demux_hash(struct packet_fanout *f,
141177f65ebdSWillem de Bruijn 				      struct sk_buff *skb,
141277f65ebdSWillem de Bruijn 				      unsigned int num)
1413dc99f600SDavid S. Miller {
1414eb70db87SDavid S. Miller 	return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
1415dc99f600SDavid S. Miller }
1416dc99f600SDavid S. Miller 
fanout_demux_lb(struct packet_fanout * f,struct sk_buff * skb,unsigned int num)141777f65ebdSWillem de Bruijn static unsigned int fanout_demux_lb(struct packet_fanout *f,
141877f65ebdSWillem de Bruijn 				    struct sk_buff *skb,
141977f65ebdSWillem de Bruijn 				    unsigned int num)
1420dc99f600SDavid S. Miller {
1421468479e6SWillem de Bruijn 	unsigned int val = atomic_inc_return(&f->rr_cur);
1422dc99f600SDavid S. Miller 
1423468479e6SWillem de Bruijn 	return val % num;
1424dc99f600SDavid S. Miller }
1425dc99f600SDavid S. Miller 
/*
 * PACKET_FANOUT_CPU: pick the member by the id of the CPU handling the
 * packet, modulo @num. @f and @skb are not used; @num must be non-zero.
 */
static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	unsigned int cpu = smp_processor_id();

	return cpu % num;
}
143295ec3eb4SDavid S. Miller 
/*
 * PACKET_FANOUT_RND: pick a random member index below @num.
 * @f and @skb are not used.
 */
static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return get_random_u32_below(num);
}
14395df0ddfbSDaniel Borkmann 
/*
 * Rollover selection: find a group member that can take @skb when the
 * member at @idx is short on room.
 *
 * @f:        fanout group; f->arr[] is read under RCU
 * @skb:      packet being dispatched
 * @idx:      index of the originally selected member
 * @try_self: when true, first give the member at @idx a chance: it is kept
 *            if it has ROOM_NORMAL, or ROOM_LOW and the flow is not "huge".
 *            If it is rejected it is also excluded from the search below.
 * @num:      number of members to consider (must be non-zero)
 *
 * The search starts at the origin's remembered rollover->sock (clamped to
 * @num - 1), walks the array circularly and takes the first member that is
 * not flagged as under pressure and reports ROOM_NORMAL. On success the
 * origin's rollover statistics and remembered position are updated and the
 * new index is returned. If nobody qualifies, num_failed is bumped and
 * @idx is returned unchanged.
 */
static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *origin, *cand, *skip = NULL;
	unsigned int pos, start, room = ROOM_NONE;

	origin = pkt_sk(rcu_dereference(f->arr[idx]));

	if (try_self) {
		room = packet_rcv_has_room(origin, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(origin, skb)))
			return idx;
		skip = origin;
	}

	pos = start = min_t(int, origin->rollover->sock, num - 1);
	do {
		cand = pkt_sk(rcu_dereference(f->arr[pos]));
		if (cand != skip &&
		    !packet_sock_flag(cand, PACKET_SOCK_PRESSURE) &&
		    packet_rcv_has_room(cand, skb) == ROOM_NORMAL) {
			/* Remember where we ended up for the next search */
			if (pos != start)
				origin->rollover->sock = pos;
			atomic_long_inc(&origin->rollover->num);
			/* ROOM_LOW here means self was skipped for a huge flow */
			if (room == ROOM_LOW)
				atomic_long_inc(&origin->rollover->num_huge);
			return pos;
		}

		if (++pos == num)
			pos = 0;
	} while (pos != start);

	atomic_long_inc(&origin->rollover->num_failed);
	return idx;
}
147977f65ebdSWillem de Bruijn 
/*
 * PACKET_FANOUT_QM: pick the member by the skb's queue mapping, modulo
 * @num. @f is not used; @num must be non-zero.
 */
static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int queue = skb_get_queue_mapping(skb);

	return queue % num;
}
14862d36097dSNeil Horman 
fanout_demux_bpf(struct packet_fanout * f,struct sk_buff * skb,unsigned int num)148747dceb8eSWillem de Bruijn static unsigned int fanout_demux_bpf(struct packet_fanout *f,
148847dceb8eSWillem de Bruijn 				     struct sk_buff *skb,
148947dceb8eSWillem de Bruijn 				     unsigned int num)
149047dceb8eSWillem de Bruijn {
149147dceb8eSWillem de Bruijn 	struct bpf_prog *prog;
149247dceb8eSWillem de Bruijn 	unsigned int ret = 0;
149347dceb8eSWillem de Bruijn 
149447dceb8eSWillem de Bruijn 	rcu_read_lock();
149547dceb8eSWillem de Bruijn 	prog = rcu_dereference(f->bpf_prog);
149647dceb8eSWillem de Bruijn 	if (prog)
1497ff936a04SAlexei Starovoitov 		ret = bpf_prog_run_clear_cb(prog, skb) % num;
149847dceb8eSWillem de Bruijn 	rcu_read_unlock();
149947dceb8eSWillem de Bruijn 
150047dceb8eSWillem de Bruijn 	return ret;
150147dceb8eSWillem de Bruijn }
150247dceb8eSWillem de Bruijn 
fanout_has_flag(struct packet_fanout * f,u16 flag)150377f65ebdSWillem de Bruijn static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
150477f65ebdSWillem de Bruijn {
150577f65ebdSWillem de Bruijn 	return f->flags & (flag >> 8);
150695ec3eb4SDavid S. Miller }
150795ec3eb4SDavid S. Miller 
packet_rcv_fanout(struct sk_buff * skb,struct net_device * dev,struct packet_type * pt,struct net_device * orig_dev)150895ec3eb4SDavid S. Miller static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1509dc99f600SDavid S. Miller 			     struct packet_type *pt, struct net_device *orig_dev)
1510dc99f600SDavid S. Miller {
1511dc99f600SDavid S. Miller 	struct packet_fanout *f = pt->af_packet_priv;
1512f98f4514SEric Dumazet 	unsigned int num = READ_ONCE(f->num_members);
151319bcf9f2SEric W. Biederman 	struct net *net = read_pnet(&f->net);
1514dc99f600SDavid S. Miller 	struct packet_sock *po;
151577f65ebdSWillem de Bruijn 	unsigned int idx;
1516dc99f600SDavid S. Miller 
151719bcf9f2SEric W. Biederman 	if (!net_eq(dev_net(dev), net) || !num) {
1518dc99f600SDavid S. Miller 		kfree_skb(skb);
1519dc99f600SDavid S. Miller 		return 0;
1520dc99f600SDavid S. Miller 	}
1521dc99f600SDavid S. Miller 
152277f65ebdSWillem de Bruijn 	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
152319bcf9f2SEric W. Biederman 		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
15247736d33fSDavid S. Miller 		if (!skb)
15257736d33fSDavid S. Miller 			return 0;
15267736d33fSDavid S. Miller 	}
15273f34b24aSAlexander Drozdov 	switch (f->type) {
15283f34b24aSAlexander Drozdov 	case PACKET_FANOUT_HASH:
15293f34b24aSAlexander Drozdov 	default:
153077f65ebdSWillem de Bruijn 		idx = fanout_demux_hash(f, skb, num);
153195ec3eb4SDavid S. Miller 		break;
153295ec3eb4SDavid S. Miller 	case PACKET_FANOUT_LB:
153377f65ebdSWillem de Bruijn 		idx = fanout_demux_lb(f, skb, num);
153495ec3eb4SDavid S. Miller 		break;
153595ec3eb4SDavid S. Miller 	case PACKET_FANOUT_CPU:
153677f65ebdSWillem de Bruijn 		idx = fanout_demux_cpu(f, skb, num);
153777f65ebdSWillem de Bruijn 		break;
15385df0ddfbSDaniel Borkmann 	case PACKET_FANOUT_RND:
15395df0ddfbSDaniel Borkmann 		idx = fanout_demux_rnd(f, skb, num);
15405df0ddfbSDaniel Borkmann 		break;
15412d36097dSNeil Horman 	case PACKET_FANOUT_QM:
15422d36097dSNeil Horman 		idx = fanout_demux_qm(f, skb, num);
15432d36097dSNeil Horman 		break;
154477f65ebdSWillem de Bruijn 	case PACKET_FANOUT_ROLLOVER:
1545ad377cabSWillem de Bruijn 		idx = fanout_demux_rollover(f, skb, 0, false, num);
154695ec3eb4SDavid S. Miller 		break;
154747dceb8eSWillem de Bruijn 	case PACKET_FANOUT_CBPF:
1548f2e52095SWillem de Bruijn 	case PACKET_FANOUT_EBPF:
154947dceb8eSWillem de Bruijn 		idx = fanout_demux_bpf(f, skb, num);
155047dceb8eSWillem de Bruijn 		break;
155195ec3eb4SDavid S. Miller 	}
155295ec3eb4SDavid S. Miller 
1553ad377cabSWillem de Bruijn 	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1554ad377cabSWillem de Bruijn 		idx = fanout_demux_rollover(f, skb, idx, true, num);
1555dc99f600SDavid S. Miller 
155694f633eaSEric Dumazet 	po = pkt_sk(rcu_dereference(f->arr[idx]));
1557dc99f600SDavid S. Miller 	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1558dc99f600SDavid S. Miller }
1559dc99f600SDavid S. Miller 
1560fff3321dSPavel Emelyanov DEFINE_MUTEX(fanout_mutex);
1561fff3321dSPavel Emelyanov EXPORT_SYMBOL_GPL(fanout_mutex);
1562dc99f600SDavid S. Miller static LIST_HEAD(fanout_list);
15634a69a864SMike Maloney static u16 fanout_next_id;
1564dc99f600SDavid S. Miller 
__fanout_link(struct sock * sk,struct packet_sock * po)1565dc99f600SDavid S. Miller static void __fanout_link(struct sock *sk, struct packet_sock *po)
1566dc99f600SDavid S. Miller {
1567dc99f600SDavid S. Miller 	struct packet_fanout *f = po->fanout;
1568dc99f600SDavid S. Miller 
1569dc99f600SDavid S. Miller 	spin_lock(&f->lock);
157094f633eaSEric Dumazet 	rcu_assign_pointer(f->arr[f->num_members], sk);
1571dc99f600SDavid S. Miller 	smp_wmb();
1572dc99f600SDavid S. Miller 	f->num_members++;
15732bd624b4SAnoob Soman 	if (f->num_members == 1)
15742bd624b4SAnoob Soman 		dev_add_pack(&f->prot_hook);
1575dc99f600SDavid S. Miller 	spin_unlock(&f->lock);
1576dc99f600SDavid S. Miller }
1577dc99f600SDavid S. Miller 
__fanout_unlink(struct sock * sk,struct packet_sock * po)1578dc99f600SDavid S. Miller static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1579dc99f600SDavid S. Miller {
1580dc99f600SDavid S. Miller 	struct packet_fanout *f = po->fanout;
1581dc99f600SDavid S. Miller 	int i;
1582dc99f600SDavid S. Miller 
1583dc99f600SDavid S. Miller 	spin_lock(&f->lock);
1584dc99f600SDavid S. Miller 	for (i = 0; i < f->num_members; i++) {
158594f633eaSEric Dumazet 		if (rcu_dereference_protected(f->arr[i],
158694f633eaSEric Dumazet 					      lockdep_is_held(&f->lock)) == sk)
1587dc99f600SDavid S. Miller 			break;
1588dc99f600SDavid S. Miller 	}
1589dc99f600SDavid S. Miller 	BUG_ON(i >= f->num_members);
159094f633eaSEric Dumazet 	rcu_assign_pointer(f->arr[i],
159194f633eaSEric Dumazet 			   rcu_dereference_protected(f->arr[f->num_members - 1],
159294f633eaSEric Dumazet 						     lockdep_is_held(&f->lock)));
1593dc99f600SDavid S. Miller 	f->num_members--;
15942bd624b4SAnoob Soman 	if (f->num_members == 0)
15952bd624b4SAnoob Soman 		__dev_remove_pack(&f->prot_hook);
1596dc99f600SDavid S. Miller 	spin_unlock(&f->lock);
1597dc99f600SDavid S. Miller }
1598dc99f600SDavid S. Miller 
match_fanout_group(struct packet_type * ptype,struct sock * sk)1599a0dfb263SFengguang Wu static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1600c0de08d0SEric Leblond {
1601161642e2SEric Dumazet 	if (sk->sk_family != PF_PACKET)
1602c0de08d0SEric Leblond 		return false;
1603161642e2SEric Dumazet 
1604161642e2SEric Dumazet 	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1605c0de08d0SEric Leblond }
1606c0de08d0SEric Leblond 
fanout_init_data(struct packet_fanout * f)160747dceb8eSWillem de Bruijn static void fanout_init_data(struct packet_fanout *f)
160847dceb8eSWillem de Bruijn {
160947dceb8eSWillem de Bruijn 	switch (f->type) {
161047dceb8eSWillem de Bruijn 	case PACKET_FANOUT_LB:
161147dceb8eSWillem de Bruijn 		atomic_set(&f->rr_cur, 0);
161247dceb8eSWillem de Bruijn 		break;
161347dceb8eSWillem de Bruijn 	case PACKET_FANOUT_CBPF:
1614f2e52095SWillem de Bruijn 	case PACKET_FANOUT_EBPF:
161547dceb8eSWillem de Bruijn 		RCU_INIT_POINTER(f->bpf_prog, NULL);
161647dceb8eSWillem de Bruijn 		break;
161747dceb8eSWillem de Bruijn 	}
161847dceb8eSWillem de Bruijn }
161947dceb8eSWillem de Bruijn 
__fanout_set_data_bpf(struct packet_fanout * f,struct bpf_prog * new)162047dceb8eSWillem de Bruijn static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
162147dceb8eSWillem de Bruijn {
162247dceb8eSWillem de Bruijn 	struct bpf_prog *old;
162347dceb8eSWillem de Bruijn 
162447dceb8eSWillem de Bruijn 	spin_lock(&f->lock);
162547dceb8eSWillem de Bruijn 	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
162647dceb8eSWillem de Bruijn 	rcu_assign_pointer(f->bpf_prog, new);
162747dceb8eSWillem de Bruijn 	spin_unlock(&f->lock);
162847dceb8eSWillem de Bruijn 
162947dceb8eSWillem de Bruijn 	if (old) {
163047dceb8eSWillem de Bruijn 		synchronize_net();
163147dceb8eSWillem de Bruijn 		bpf_prog_destroy(old);
163247dceb8eSWillem de Bruijn 	}
163347dceb8eSWillem de Bruijn }
163447dceb8eSWillem de Bruijn 
/*
 * Attach a classic BPF program, supplied by the caller as a sock_fprog in
 * @data/@len, to the fanout group of @po.
 *
 * Returns 0 on success, -EPERM when the socket has SOCK_FILTER_LOCKED set,
 * or the error reported while copying or building the program.
 */
static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
				unsigned int len)
{
	struct sock_fprog fprog;
	struct bpf_prog *prog;
	int err;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	err = copy_bpf_fprog_from_user(&fprog, data, len);
	if (err)
		return err;

	err = bpf_prog_create_from_user(&prog, &fprog, NULL, false);
	if (err)
		return err;

	__fanout_set_data_bpf(po->fanout, prog);
	return 0;
}
165647dceb8eSWillem de Bruijn 
/*
 * Attach an eBPF program, identified by a file descriptor passed in
 * @data/@len, to the fanout group of @po. The program must be of type
 * BPF_PROG_TYPE_SOCKET_FILTER.
 *
 * Returns 0 on success, -EPERM when the socket has SOCK_FILTER_LOCKED set,
 * -EINVAL when @len is not the size of a u32, -EFAULT when the fd cannot
 * be copied in, or the error from the program lookup.
 */
static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
				unsigned int len)
{
	struct bpf_prog *prog;
	u32 fd;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	if (len != sizeof(fd))
		return -EINVAL;

	if (copy_from_sockptr(&fd, data, len))
		return -EFAULT;

	prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	__fanout_set_data_bpf(po->fanout, prog);
	return 0;
}
1677f2e52095SWillem de Bruijn 
/*
 * Set the type-specific data of the fanout group @po belongs to. Only the
 * BPF fanout types take data; every other type yields -EINVAL.
 *
 * po->fanout is dereferenced unconditionally, so it must be non-NULL.
 */
static int fanout_set_data(struct packet_sock *po, sockptr_t data,
			   unsigned int len)
{
	int ret = -EINVAL;

	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		ret = fanout_set_data_cbpf(po, data, len);
		break;
	case PACKET_FANOUT_EBPF:
		ret = fanout_set_data_ebpf(po, data, len);
		break;
	default:
		break;
	}

	return ret;
}
169047dceb8eSWillem de Bruijn 
fanout_release_data(struct packet_fanout * f)169147dceb8eSWillem de Bruijn static void fanout_release_data(struct packet_fanout *f)
169247dceb8eSWillem de Bruijn {
169347dceb8eSWillem de Bruijn 	switch (f->type) {
169447dceb8eSWillem de Bruijn 	case PACKET_FANOUT_CBPF:
1695f2e52095SWillem de Bruijn 	case PACKET_FANOUT_EBPF:
169647dceb8eSWillem de Bruijn 		__fanout_set_data_bpf(f, NULL);
169707d53ae4Szhong jiang 	}
169847dceb8eSWillem de Bruijn }
169947dceb8eSWillem de Bruijn 
__fanout_id_is_free(struct sock * sk,u16 candidate_id)17004a69a864SMike Maloney static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
17014a69a864SMike Maloney {
17024a69a864SMike Maloney 	struct packet_fanout *f;
17034a69a864SMike Maloney 
17044a69a864SMike Maloney 	list_for_each_entry(f, &fanout_list, list) {
17054a69a864SMike Maloney 		if (f->id == candidate_id &&
17064a69a864SMike Maloney 		    read_pnet(&f->net) == sock_net(sk)) {
17074a69a864SMike Maloney 			return false;
17084a69a864SMike Maloney 		}
17094a69a864SMike Maloney 	}
17104a69a864SMike Maloney 	return true;
17114a69a864SMike Maloney }
17124a69a864SMike Maloney 
fanout_find_new_id(struct sock * sk,u16 * new_id)17134a69a864SMike Maloney static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
17144a69a864SMike Maloney {
17154a69a864SMike Maloney 	u16 id = fanout_next_id;
17164a69a864SMike Maloney 
17174a69a864SMike Maloney 	do {
17184a69a864SMike Maloney 		if (__fanout_id_is_free(sk, id)) {
17194a69a864SMike Maloney 			*new_id = id;
17204a69a864SMike Maloney 			fanout_next_id = id + 1;
17214a69a864SMike Maloney 			return true;
17224a69a864SMike Maloney 		}
17234a69a864SMike Maloney 
17244a69a864SMike Maloney 		id++;
17254a69a864SMike Maloney 	} while (id != fanout_next_id);
17264a69a864SMike Maloney 
17274a69a864SMike Maloney 	return false;
17284a69a864SMike Maloney }
17294a69a864SMike Maloney 
fanout_add(struct sock * sk,struct fanout_args * args)17309c661b0bSTanner Love static int fanout_add(struct sock *sk, struct fanout_args *args)
1731dc99f600SDavid S. Miller {
1732d199fab6SEric Dumazet 	struct packet_rollover *rollover = NULL;
1733dc99f600SDavid S. Miller 	struct packet_sock *po = pkt_sk(sk);
17349c661b0bSTanner Love 	u16 type_flags = args->type_flags;
1735dc99f600SDavid S. Miller 	struct packet_fanout *f, *match;
17367736d33fSDavid S. Miller 	u8 type = type_flags & 0xff;
173777f65ebdSWillem de Bruijn 	u8 flags = type_flags >> 8;
17389c661b0bSTanner Love 	u16 id = args->id;
1739dc99f600SDavid S. Miller 	int err;
1740dc99f600SDavid S. Miller 
1741dc99f600SDavid S. Miller 	switch (type) {
174277f65ebdSWillem de Bruijn 	case PACKET_FANOUT_ROLLOVER:
174377f65ebdSWillem de Bruijn 		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
174477f65ebdSWillem de Bruijn 			return -EINVAL;
17455af5a020SGustavo A. R. Silva 		break;
1746dc99f600SDavid S. Miller 	case PACKET_FANOUT_HASH:
1747dc99f600SDavid S. Miller 	case PACKET_FANOUT_LB:
174895ec3eb4SDavid S. Miller 	case PACKET_FANOUT_CPU:
17495df0ddfbSDaniel Borkmann 	case PACKET_FANOUT_RND:
17502d36097dSNeil Horman 	case PACKET_FANOUT_QM:
175147dceb8eSWillem de Bruijn 	case PACKET_FANOUT_CBPF:
1752f2e52095SWillem de Bruijn 	case PACKET_FANOUT_EBPF:
1753dc99f600SDavid S. Miller 		break;
1754dc99f600SDavid S. Miller 	default:
1755dc99f600SDavid S. Miller 		return -EINVAL;
1756dc99f600SDavid S. Miller 	}
1757dc99f600SDavid S. Miller 
1758d199fab6SEric Dumazet 	mutex_lock(&fanout_mutex);
1759dc99f600SDavid S. Miller 
1760d199fab6SEric Dumazet 	err = -EALREADY;
1761dc99f600SDavid S. Miller 	if (po->fanout)
1762d199fab6SEric Dumazet 		goto out;
1763dc99f600SDavid S. Miller 
17644633c9e0SWillem de Bruijn 	if (type == PACKET_FANOUT_ROLLOVER ||
17654633c9e0SWillem de Bruijn 	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1766d199fab6SEric Dumazet 		err = -ENOMEM;
1767d199fab6SEric Dumazet 		rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1768d199fab6SEric Dumazet 		if (!rollover)
1769d199fab6SEric Dumazet 			goto out;
1770d199fab6SEric Dumazet 		atomic_long_set(&rollover->num, 0);
1771d199fab6SEric Dumazet 		atomic_long_set(&rollover->num_huge, 0);
1772d199fab6SEric Dumazet 		atomic_long_set(&rollover->num_failed, 0);
17730648ab70SWillem de Bruijn 	}
17740648ab70SWillem de Bruijn 
17754a69a864SMike Maloney 	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
17764a69a864SMike Maloney 		if (id != 0) {
17774a69a864SMike Maloney 			err = -EINVAL;
17784a69a864SMike Maloney 			goto out;
17794a69a864SMike Maloney 		}
17804a69a864SMike Maloney 		if (!fanout_find_new_id(sk, &id)) {
17814a69a864SMike Maloney 			err = -ENOMEM;
17824a69a864SMike Maloney 			goto out;
17834a69a864SMike Maloney 		}
17844a69a864SMike Maloney 		/* ephemeral flag for the first socket in the group: drop it */
17854a69a864SMike Maloney 		flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
17864a69a864SMike Maloney 	}
17874a69a864SMike Maloney 
1788dc99f600SDavid S. Miller 	match = NULL;
1789dc99f600SDavid S. Miller 	list_for_each_entry(f, &fanout_list, list) {
1790dc99f600SDavid S. Miller 		if (f->id == id &&
1791dc99f600SDavid S. Miller 		    read_pnet(&f->net) == sock_net(sk)) {
1792dc99f600SDavid S. Miller 			match = f;
1793dc99f600SDavid S. Miller 			break;
1794dc99f600SDavid S. Miller 		}
1795dc99f600SDavid S. Miller 	}
1796afe62c68SEric Dumazet 	err = -EINVAL;
17979c661b0bSTanner Love 	if (match) {
17989c661b0bSTanner Love 		if (match->flags != flags)
1799afe62c68SEric Dumazet 			goto out;
18009c661b0bSTanner Love 		if (args->max_num_members &&
18019c661b0bSTanner Love 		    args->max_num_members != match->max_num_members)
18029c661b0bSTanner Love 			goto out;
18039c661b0bSTanner Love 	} else {
18049c661b0bSTanner Love 		if (args->max_num_members > PACKET_FANOUT_MAX)
18059c661b0bSTanner Love 			goto out;
18069c661b0bSTanner Love 		if (!args->max_num_members)
18079c661b0bSTanner Love 			/* legacy PACKET_FANOUT_MAX */
18089c661b0bSTanner Love 			args->max_num_members = 256;
1809afe62c68SEric Dumazet 		err = -ENOMEM;
18109c661b0bSTanner Love 		match = kvzalloc(struct_size(match, arr, args->max_num_members),
18119c661b0bSTanner Love 				 GFP_KERNEL);
1812afe62c68SEric Dumazet 		if (!match)
1813afe62c68SEric Dumazet 			goto out;
1814dc99f600SDavid S. Miller 		write_pnet(&match->net, sock_net(sk));
1815dc99f600SDavid S. Miller 		match->id = id;
1816dc99f600SDavid S. Miller 		match->type = type;
181777f65ebdSWillem de Bruijn 		match->flags = flags;
1818dc99f600SDavid S. Miller 		INIT_LIST_HEAD(&match->list);
1819dc99f600SDavid S. Miller 		spin_lock_init(&match->lock);
1820fb5c2c17SReshetova, Elena 		refcount_set(&match->sk_ref, 0);
182147dceb8eSWillem de Bruijn 		fanout_init_data(match);
1822dc99f600SDavid S. Miller 		match->prot_hook.type = po->prot_hook.type;
1823dc99f600SDavid S. Miller 		match->prot_hook.dev = po->prot_hook.dev;
182495ec3eb4SDavid S. Miller 		match->prot_hook.func = packet_rcv_fanout;
1825dc99f600SDavid S. Miller 		match->prot_hook.af_packet_priv = match;
182647934e06SCongyu Liu 		match->prot_hook.af_packet_net = read_pnet(&match->net);
1827c0de08d0SEric Leblond 		match->prot_hook.id_match = match_fanout_group;
18289c661b0bSTanner Love 		match->max_num_members = args->max_num_members;
182958ba4263SWillem de Bruijn 		match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING;
1830dc99f600SDavid S. Miller 		list_add(&match->list, &fanout_list);
1831dc99f600SDavid S. Miller 	}
1832dc99f600SDavid S. Miller 	err = -EINVAL;
1833008ba2a1SWillem de Bruijn 
1834008ba2a1SWillem de Bruijn 	spin_lock(&po->bind_lock);
183561edf479SEric Dumazet 	if (packet_sock_flag(po, PACKET_SOCK_RUNNING) &&
1836008ba2a1SWillem de Bruijn 	    match->type == type &&
1837dc99f600SDavid S. Miller 	    match->prot_hook.type == po->prot_hook.type &&
1838dc99f600SDavid S. Miller 	    match->prot_hook.dev == po->prot_hook.dev) {
1839dc99f600SDavid S. Miller 		err = -ENOSPC;
18409c661b0bSTanner Love 		if (refcount_read(&match->sk_ref) < match->max_num_members) {
1841dc99f600SDavid S. Miller 			__dev_remove_pack(&po->prot_hook);
1842e42e70adSEric Dumazet 
1843e42e70adSEric Dumazet 			/* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
1844e42e70adSEric Dumazet 			WRITE_ONCE(po->fanout, match);
1845e42e70adSEric Dumazet 
184657f015f5SMike Maloney 			po->rollover = rollover;
184757f015f5SMike Maloney 			rollover = NULL;
1848fb5c2c17SReshetova, Elena 			refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
1849dc99f600SDavid S. Miller 			__fanout_link(sk, po);
1850dc99f600SDavid S. Miller 			err = 0;
1851dc99f600SDavid S. Miller 		}
1852dc99f600SDavid S. Miller 	}
1853008ba2a1SWillem de Bruijn 	spin_unlock(&po->bind_lock);
1854008ba2a1SWillem de Bruijn 
1855008ba2a1SWillem de Bruijn 	if (err && !refcount_read(&match->sk_ref)) {
1856008ba2a1SWillem de Bruijn 		list_del(&match->list);
18579c661b0bSTanner Love 		kvfree(match);
1858008ba2a1SWillem de Bruijn 	}
1859008ba2a1SWillem de Bruijn 
1860afe62c68SEric Dumazet out:
186157f015f5SMike Maloney 	kfree(rollover);
1862d199fab6SEric Dumazet 	mutex_unlock(&fanout_mutex);
1863dc99f600SDavid S. Miller 	return err;
1864dc99f600SDavid S. Miller }
1865dc99f600SDavid S. Miller 
18662bd624b4SAnoob Soman /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
18672bd624b4SAnoob Soman  * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
18682bd624b4SAnoob Soman  * It is the responsibility of the caller to call fanout_release_data() and
18692bd624b4SAnoob Soman  * free the returned packet_fanout (after synchronize_net())
18702bd624b4SAnoob Soman  */
fanout_release(struct sock * sk)18712bd624b4SAnoob Soman static struct packet_fanout *fanout_release(struct sock *sk)
1872dc99f600SDavid S. Miller {
1873dc99f600SDavid S. Miller 	struct packet_sock *po = pkt_sk(sk);
1874dc99f600SDavid S. Miller 	struct packet_fanout *f;
1875dc99f600SDavid S. Miller 
1876fff3321dSPavel Emelyanov 	mutex_lock(&fanout_mutex);
1877d199fab6SEric Dumazet 	f = po->fanout;
1878d199fab6SEric Dumazet 	if (f) {
1879dc99f600SDavid S. Miller 		po->fanout = NULL;
1880dc99f600SDavid S. Miller 
1881fb5c2c17SReshetova, Elena 		if (refcount_dec_and_test(&f->sk_ref))
1882dc99f600SDavid S. Miller 			list_del(&f->list);
18832bd624b4SAnoob Soman 		else
18842bd624b4SAnoob Soman 			f = NULL;
1885dc99f600SDavid S. Miller 	}
1886d199fab6SEric Dumazet 	mutex_unlock(&fanout_mutex);
18872bd624b4SAnoob Soman 
18882bd624b4SAnoob Soman 	return f;
1889d199fab6SEric Dumazet }
18901da177e4SLinus Torvalds 
packet_extra_vlan_len_allowed(const struct net_device * dev,struct sk_buff * skb)18913c70c132SDaniel Borkmann static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
18923c70c132SDaniel Borkmann 					  struct sk_buff *skb)
18933c70c132SDaniel Borkmann {
18943c70c132SDaniel Borkmann 	/* Earlier code assumed this would be a VLAN pkt, double-check
18953c70c132SDaniel Borkmann 	 * this now that we have the actual packet in hand. We can only
18963c70c132SDaniel Borkmann 	 * do this check on Ethernet devices.
18973c70c132SDaniel Borkmann 	 */
18983c70c132SDaniel Borkmann 	if (unlikely(dev->type != ARPHRD_ETHER))
18993c70c132SDaniel Borkmann 		return false;
19003c70c132SDaniel Borkmann 
19013c70c132SDaniel Borkmann 	skb_reset_mac_header(skb);
19023c70c132SDaniel Borkmann 	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
19033c70c132SDaniel Borkmann }
19043c70c132SDaniel Borkmann 
190590ddc4f0SEric Dumazet static const struct proto_ops packet_ops;
19061da177e4SLinus Torvalds 
190790ddc4f0SEric Dumazet static const struct proto_ops packet_ops_spkt;
19081da177e4SLinus Torvalds 
packet_rcv_spkt(struct sk_buff * skb,struct net_device * dev,struct packet_type * pt,struct net_device * orig_dev)190940d4e3dfSEric Dumazet static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
191040d4e3dfSEric Dumazet 			   struct packet_type *pt, struct net_device *orig_dev)
19111da177e4SLinus Torvalds {
19121da177e4SLinus Torvalds 	struct sock *sk;
19131da177e4SLinus Torvalds 	struct sockaddr_pkt *spkt;
19141da177e4SLinus Torvalds 
19151da177e4SLinus Torvalds 	/*
19161da177e4SLinus Torvalds 	 *	When we registered the protocol we saved the socket in the data
19171da177e4SLinus Torvalds 	 *	field for just this event.
19181da177e4SLinus Torvalds 	 */
19191da177e4SLinus Torvalds 
19201da177e4SLinus Torvalds 	sk = pt->af_packet_priv;
19211da177e4SLinus Torvalds 
19221da177e4SLinus Torvalds 	/*
19231da177e4SLinus Torvalds 	 *	Yank back the headers [hope the device set this
19241da177e4SLinus Torvalds 	 *	right or kerboom...]
19251da177e4SLinus Torvalds 	 *
19261da177e4SLinus Torvalds 	 *	Incoming packets have ll header pulled,
19271da177e4SLinus Torvalds 	 *	push it back.
19281da177e4SLinus Torvalds 	 *
192998e399f8SArnaldo Carvalho de Melo 	 *	For outgoing ones skb->data == skb_mac_header(skb)
19301da177e4SLinus Torvalds 	 *	so that this procedure is noop.
19311da177e4SLinus Torvalds 	 */
19321da177e4SLinus Torvalds 
19331da177e4SLinus Torvalds 	if (skb->pkt_type == PACKET_LOOPBACK)
19341da177e4SLinus Torvalds 		goto out;
19351da177e4SLinus Torvalds 
193609ad9bc7SOctavian Purdila 	if (!net_eq(dev_net(dev), sock_net(sk)))
1937d12d01d6SDenis V. Lunev 		goto out;
1938d12d01d6SDenis V. Lunev 
193940d4e3dfSEric Dumazet 	skb = skb_share_check(skb, GFP_ATOMIC);
194040d4e3dfSEric Dumazet 	if (skb == NULL)
19411da177e4SLinus Torvalds 		goto oom;
19421da177e4SLinus Torvalds 
19431da177e4SLinus Torvalds 	/* drop any routing info */
1944adf30907SEric Dumazet 	skb_dst_drop(skb);
19451da177e4SLinus Torvalds 
194684531c24SPhil Oester 	/* drop conntrack reference */
1947895b5c9fSFlorian Westphal 	nf_reset_ct(skb);
194884531c24SPhil Oester 
1949ffbc6111SHerbert Xu 	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
19501da177e4SLinus Torvalds 
195198e399f8SArnaldo Carvalho de Melo 	skb_push(skb, skb->data - skb_mac_header(skb));
19521da177e4SLinus Torvalds 
19531da177e4SLinus Torvalds 	/*
19541da177e4SLinus Torvalds 	 *	The SOCK_PACKET socket receives _all_ frames.
19551da177e4SLinus Torvalds 	 */
19561da177e4SLinus Torvalds 
19571da177e4SLinus Torvalds 	spkt->spkt_family = dev->type;
19588fc9d51eSWolfram Sang 	strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
19591da177e4SLinus Torvalds 	spkt->spkt_protocol = skb->protocol;
19601da177e4SLinus Torvalds 
19611da177e4SLinus Torvalds 	/*
19621da177e4SLinus Torvalds 	 *	Charge the memory to the socket. This is done specifically
19631da177e4SLinus Torvalds 	 *	to prevent sockets using all the memory up.
19641da177e4SLinus Torvalds 	 */
19651da177e4SLinus Torvalds 
19661da177e4SLinus Torvalds 	if (sock_queue_rcv_skb(sk, skb) == 0)
19671da177e4SLinus Torvalds 		return 0;
19681da177e4SLinus Torvalds 
19691da177e4SLinus Torvalds out:
19701da177e4SLinus Torvalds 	kfree_skb(skb);
19711da177e4SLinus Torvalds oom:
19721da177e4SLinus Torvalds 	return 0;
19731da177e4SLinus Torvalds }
19741da177e4SLinus Torvalds 
packet_parse_headers(struct sk_buff * skb,struct socket * sock)197575c65772SMaxim Mikityanskiy static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
197675c65772SMaxim Mikityanskiy {
1977dfed913eSHangbin Liu 	int depth;
1978dfed913eSHangbin Liu 
197918bed891SYoshiki Komachi 	if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
198018bed891SYoshiki Komachi 	    sock->type == SOCK_RAW) {
198175c65772SMaxim Mikityanskiy 		skb_reset_mac_header(skb);
198275c65772SMaxim Mikityanskiy 		skb->protocol = dev_parse_header_protocol(skb);
198375c65772SMaxim Mikityanskiy 	}
198475c65772SMaxim Mikityanskiy 
1985dfed913eSHangbin Liu 	/* Move network header to the right position for VLAN tagged packets */
1986dfed913eSHangbin Liu 	if (likely(skb->dev->type == ARPHRD_ETHER) &&
1987dfed913eSHangbin Liu 	    eth_type_vlan(skb->protocol) &&
19884063384eSEric Dumazet 	    vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
1989dfed913eSHangbin Liu 		skb_set_network_header(skb, depth);
1990dfed913eSHangbin Liu 
199175c65772SMaxim Mikityanskiy 	skb_probe_transport_header(skb);
199275c65772SMaxim Mikityanskiy }
19931da177e4SLinus Torvalds 
19941da177e4SLinus Torvalds /*
19951da177e4SLinus Torvalds  *	Output a raw packet to a device layer. This bypasses all the other
19961da177e4SLinus Torvalds  *	protocol layers and you must therefore supply it with a complete frame
19971da177e4SLinus Torvalds  */
19981da177e4SLinus Torvalds 
packet_sendmsg_spkt(struct socket * sock,struct msghdr * msg,size_t len)19991b784140SYing Xue static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
20001b784140SYing Xue 			       size_t len)
20011da177e4SLinus Torvalds {
20021da177e4SLinus Torvalds 	struct sock *sk = sock->sk;
2003342dfc30SSteffen Hurrle 	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
20041a35ca80SEric Dumazet 	struct sk_buff *skb = NULL;
20051da177e4SLinus Torvalds 	struct net_device *dev;
2006c14ac945SSoheil Hassas Yeganeh 	struct sockcm_cookie sockc;
20070e11c91eSAl Viro 	__be16 proto = 0;
20081da177e4SLinus Torvalds 	int err;
20093bdc0ebaSBen Greear 	int extra_len = 0;
20101da177e4SLinus Torvalds 
20111da177e4SLinus Torvalds 	/*
20121da177e4SLinus Torvalds 	 *	Get and verify the address.
20131da177e4SLinus Torvalds 	 */
20141da177e4SLinus Torvalds 
201540d4e3dfSEric Dumazet 	if (saddr) {
20161da177e4SLinus Torvalds 		if (msg->msg_namelen < sizeof(struct sockaddr))
201740d4e3dfSEric Dumazet 			return -EINVAL;
20181da177e4SLinus Torvalds 		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
20191da177e4SLinus Torvalds 			proto = saddr->spkt_protocol;
202040d4e3dfSEric Dumazet 	} else
202140d4e3dfSEric Dumazet 		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
20221da177e4SLinus Torvalds 
20231da177e4SLinus Torvalds 	/*
20241da177e4SLinus Torvalds 	 *	Find the device first to size check it
20251da177e4SLinus Torvalds 	 */
20261da177e4SLinus Torvalds 
2027de74e92aSdanborkmann@iogearbox.net 	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
20281a35ca80SEric Dumazet retry:
2029654d1f8aSEric Dumazet 	rcu_read_lock();
2030654d1f8aSEric Dumazet 	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
20311da177e4SLinus Torvalds 	err = -ENODEV;
20321da177e4SLinus Torvalds 	if (dev == NULL)
20331da177e4SLinus Torvalds 		goto out_unlock;
20341da177e4SLinus Torvalds 
2035d5e76b0aSDavid S. Miller 	err = -ENETDOWN;
2036d5e76b0aSDavid S. Miller 	if (!(dev->flags & IFF_UP))
2037d5e76b0aSDavid S. Miller 		goto out_unlock;
2038d5e76b0aSDavid S. Miller 
20391da177e4SLinus Torvalds 	/*
20401da177e4SLinus Torvalds 	 * You may not queue a frame bigger than the mtu. This is the lowest level
20411da177e4SLinus Torvalds 	 * raw protocol and you must do your own fragmentation at this level.
20421da177e4SLinus Torvalds 	 */
20431da177e4SLinus Torvalds 
20443bdc0ebaSBen Greear 	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
20453bdc0ebaSBen Greear 		if (!netif_supports_nofcs(dev)) {
20463bdc0ebaSBen Greear 			err = -EPROTONOSUPPORT;
20473bdc0ebaSBen Greear 			goto out_unlock;
20483bdc0ebaSBen Greear 		}
20493bdc0ebaSBen Greear 		extra_len = 4; /* We're doing our own CRC */
20503bdc0ebaSBen Greear 	}
20513bdc0ebaSBen Greear 
20521da177e4SLinus Torvalds 	err = -EMSGSIZE;
20533bdc0ebaSBen Greear 	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
20541da177e4SLinus Torvalds 		goto out_unlock;
20551da177e4SLinus Torvalds 
20561a35ca80SEric Dumazet 	if (!skb) {
20571a35ca80SEric Dumazet 		size_t reserved = LL_RESERVED_SPACE(dev);
20584ce40912SHerbert Xu 		int tlen = dev->needed_tailroom;
20591a35ca80SEric Dumazet 		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
20601da177e4SLinus Torvalds 
20611a35ca80SEric Dumazet 		rcu_read_unlock();
20624ce40912SHerbert Xu 		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
20631da177e4SLinus Torvalds 		if (skb == NULL)
20641a35ca80SEric Dumazet 			return -ENOBUFS;
20651a35ca80SEric Dumazet 		/* FIXME: Save some space for broken drivers that write a hard
20661a35ca80SEric Dumazet 		 * header at transmission time by themselves. PPP is the notable
20671a35ca80SEric Dumazet 		 * one here. This should really be fixed at the driver level.
20681da177e4SLinus Torvalds 		 */
20691a35ca80SEric Dumazet 		skb_reserve(skb, reserved);
2070c1d2bbe1SArnaldo Carvalho de Melo 		skb_reset_network_header(skb);
20711da177e4SLinus Torvalds 
20721da177e4SLinus Torvalds 		/* Try to align data part correctly */
20731a35ca80SEric Dumazet 		if (hhlen) {
20741a35ca80SEric Dumazet 			skb->data -= hhlen;
20751a35ca80SEric Dumazet 			skb->tail -= hhlen;
20761a35ca80SEric Dumazet 			if (len < hhlen)
2077c1d2bbe1SArnaldo Carvalho de Melo 				skb_reset_network_header(skb);
20781da177e4SLinus Torvalds 		}
20796ce8e9ceSAl Viro 		err = memcpy_from_msg(skb_put(skb, len), msg, len);
20801a35ca80SEric Dumazet 		if (err)
20811a35ca80SEric Dumazet 			goto out_free;
20821a35ca80SEric Dumazet 		goto retry;
20831a35ca80SEric Dumazet 	}
20841a35ca80SEric Dumazet 
20856a341729SKuniyuki Iwashima 	if (!dev_validate_header(dev, skb->data, len) || !skb->len) {
20869ed988cdSWillem de Bruijn 		err = -EINVAL;
20879ed988cdSWillem de Bruijn 		goto out_unlock;
20889ed988cdSWillem de Bruijn 	}
20893c70c132SDaniel Borkmann 	if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
20903c70c132SDaniel Borkmann 	    !packet_extra_vlan_len_allowed(dev, skb)) {
209157f89bfaSBen Greear 		err = -EMSGSIZE;
209257f89bfaSBen Greear 		goto out_unlock;
209357f89bfaSBen Greear 	}
20941a35ca80SEric Dumazet 
2095657a0667SWillem de Bruijn 	sockcm_init(&sockc, sk);
2096c14ac945SSoheil Hassas Yeganeh 	if (msg->msg_controllen) {
2097c14ac945SSoheil Hassas Yeganeh 		err = sock_cmsg_send(sk, msg, &sockc);
2098f8e7718cSSoheil Hassas Yeganeh 		if (unlikely(err))
2099c14ac945SSoheil Hassas Yeganeh 			goto out_unlock;
2100c14ac945SSoheil Hassas Yeganeh 	}
2101c14ac945SSoheil Hassas Yeganeh 
21021da177e4SLinus Torvalds 	skb->protocol = proto;
21031da177e4SLinus Torvalds 	skb->dev = dev;
21048bf43be7SEric Dumazet 	skb->priority = READ_ONCE(sk->sk_priority);
21053c5b4d69SEric Dumazet 	skb->mark = READ_ONCE(sk->sk_mark);
21063d0ba8c0SRichard Cochran 	skb->tstamp = sockc.transmit_time;
2107bf84a010SDaniel Borkmann 
21088f932f76SWillem de Bruijn 	skb_setup_tx_timestamp(skb, sockc.tsflags);
21091da177e4SLinus Torvalds 
21103bdc0ebaSBen Greear 	if (unlikely(extra_len == 4))
21113bdc0ebaSBen Greear 		skb->no_fcs = 1;
21123bdc0ebaSBen Greear 
211375c65772SMaxim Mikityanskiy 	packet_parse_headers(skb, sock);
2114c1aad275SJason Wang 
21151da177e4SLinus Torvalds 	dev_queue_xmit(skb);
2116654d1f8aSEric Dumazet 	rcu_read_unlock();
211740d4e3dfSEric Dumazet 	return len;
21181da177e4SLinus Torvalds 
21191da177e4SLinus Torvalds out_unlock:
2120654d1f8aSEric Dumazet 	rcu_read_unlock();
21211a35ca80SEric Dumazet out_free:
21221a35ca80SEric Dumazet 	kfree_skb(skb);
21231da177e4SLinus Torvalds 	return err;
21241da177e4SLinus Torvalds }
21251da177e4SLinus Torvalds 
run_filter(struct sk_buff * skb,const struct sock * sk,unsigned int res)2126ff936a04SAlexei Starovoitov static unsigned int run_filter(struct sk_buff *skb,
212762ab0812SEric Dumazet 			       const struct sock *sk,
2128dbcb5855SDavid S. Miller 			       unsigned int res)
21291da177e4SLinus Torvalds {
21301da177e4SLinus Torvalds 	struct sk_filter *filter;
21311da177e4SLinus Torvalds 
213280f8f102SEric Dumazet 	rcu_read_lock();
213380f8f102SEric Dumazet 	filter = rcu_dereference(sk->sk_filter);
2134dbcb5855SDavid S. Miller 	if (filter != NULL)
2135ff936a04SAlexei Starovoitov 		res = bpf_prog_run_clear_cb(filter->prog, skb);
213680f8f102SEric Dumazet 	rcu_read_unlock();
21371da177e4SLinus Torvalds 
2138dbcb5855SDavid S. Miller 	return res;
21391da177e4SLinus Torvalds }
21401da177e4SLinus Torvalds 
/*
 * Prepend a virtio-net header for @skb to the message being received.
 *
 * *@len is the space still available in @msg and is reduced by
 * @vnet_hdr_sz before the header is built.  The header is produced by
 * virtio_net_hdr_from_skb() and its first @vnet_hdr_sz bytes are copied
 * to @msg.
 *
 * Returns -EINVAL if *@len is smaller than @vnet_hdr_sz or the header
 * cannot be built, otherwise the result of memcpy_to_msg().
 */
static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
			   size_t *len, int vnet_hdr_sz)
{
	struct virtio_net_hdr_mrg_rxbuf hdr = { .num_buffers = 0 };
	int err;

	if (*len < vnet_hdr_sz)
		return -EINVAL;
	*len -= vnet_hdr_sz;

	err = virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&hdr,
				      vio_le(), true, 0);
	if (err)
		return -EINVAL;

	return memcpy_to_msg(msg, (void *)&hdr, vnet_hdr_sz);
}
215516cc1400SWillem de Bruijn 
21561da177e4SLinus Torvalds /*
215762ab0812SEric Dumazet  * This function makes lazy skb cloning in hope that most of packets
215862ab0812SEric Dumazet  * are discarded by BPF.
215962ab0812SEric Dumazet  *
216062ab0812SEric Dumazet  * Note tricky part: we DO mangle shared skb! skb->data, skb->len
216162ab0812SEric Dumazet  * and skb->cb are mangled. It works because (and until) packets
216262ab0812SEric Dumazet  * falling here are owned by current CPU. Output packets are cloned
216362ab0812SEric Dumazet  * by dev_queue_xmit_nit(), input packets are processed by net_bh
21640e4161d0SWang Hai  * sequentially, so that if we return skb to original state on exit,
216562ab0812SEric Dumazet  * we will not harm anyone.
21661da177e4SLinus Torvalds  */
21671da177e4SLinus Torvalds 
packet_rcv(struct sk_buff * skb,struct net_device * dev,struct packet_type * pt,struct net_device * orig_dev)216840d4e3dfSEric Dumazet static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
216940d4e3dfSEric Dumazet 		      struct packet_type *pt, struct net_device *orig_dev)
21701da177e4SLinus Torvalds {
21711da177e4SLinus Torvalds 	struct sock *sk;
21721da177e4SLinus Torvalds 	struct sockaddr_ll *sll;
21731da177e4SLinus Torvalds 	struct packet_sock *po;
21741da177e4SLinus Torvalds 	u8 *skb_head = skb->data;
21751da177e4SLinus Torvalds 	int skb_len = skb->len;
2176dbcb5855SDavid S. Miller 	unsigned int snaplen, res;
2177da37845fSWeongyo Jeong 	bool is_drop_n_account = false;
21781da177e4SLinus Torvalds 
21791da177e4SLinus Torvalds 	if (skb->pkt_type == PACKET_LOOPBACK)
21801da177e4SLinus Torvalds 		goto drop;
21811da177e4SLinus Torvalds 
21821da177e4SLinus Torvalds 	sk = pt->af_packet_priv;
21831da177e4SLinus Torvalds 	po = pkt_sk(sk);
21841da177e4SLinus Torvalds 
218509ad9bc7SOctavian Purdila 	if (!net_eq(dev_net(dev), sock_net(sk)))
2186d12d01d6SDenis V. Lunev 		goto drop;
2187d12d01d6SDenis V. Lunev 
21881da177e4SLinus Torvalds 	skb->dev = dev;
21891da177e4SLinus Torvalds 
2190d5496990SEyal Birger 	if (dev_has_header(dev)) {
21911da177e4SLinus Torvalds 		/* The device has an explicit notion of ll header,
219262ab0812SEric Dumazet 		 * exported to higher levels.
219362ab0812SEric Dumazet 		 *
219462ab0812SEric Dumazet 		 * Otherwise, the device hides details of its frame
219562ab0812SEric Dumazet 		 * structure, so that corresponding packet head is
219662ab0812SEric Dumazet 		 * never delivered to user.
21971da177e4SLinus Torvalds 		 */
21981da177e4SLinus Torvalds 		if (sk->sk_type != SOCK_DGRAM)
219998e399f8SArnaldo Carvalho de Melo 			skb_push(skb, skb->data - skb_mac_header(skb));
22001da177e4SLinus Torvalds 		else if (skb->pkt_type == PACKET_OUTGOING) {
22011da177e4SLinus Torvalds 			/* Special case: outgoing packets have ll header at head */
2202bbe735e4SArnaldo Carvalho de Melo 			skb_pull(skb, skb_network_offset(skb));
22031da177e4SLinus Torvalds 		}
22041da177e4SLinus Torvalds 	}
22051da177e4SLinus Torvalds 
22061da177e4SLinus Torvalds 	snaplen = skb->len;
22071da177e4SLinus Torvalds 
2208dbcb5855SDavid S. Miller 	res = run_filter(skb, sk, snaplen);
2209dbcb5855SDavid S. Miller 	if (!res)
22101da177e4SLinus Torvalds 		goto drop_n_restore;
2211dbcb5855SDavid S. Miller 	if (snaplen > res)
2212dbcb5855SDavid S. Miller 		snaplen = res;
22131da177e4SLinus Torvalds 
22140fd7bac6SEric Dumazet 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
22151da177e4SLinus Torvalds 		goto drop_n_acct;
22161da177e4SLinus Torvalds 
22171da177e4SLinus Torvalds 	if (skb_shared(skb)) {
22181da177e4SLinus Torvalds 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
22191da177e4SLinus Torvalds 		if (nskb == NULL)
22201da177e4SLinus Torvalds 			goto drop_n_acct;
22211da177e4SLinus Torvalds 
22221da177e4SLinus Torvalds 		if (skb_head != skb->data) {
22231da177e4SLinus Torvalds 			skb->data = skb_head;
22241da177e4SLinus Torvalds 			skb->len = skb_len;
22251da177e4SLinus Torvalds 		}
2226abc4e4faSEric Dumazet 		consume_skb(skb);
22271da177e4SLinus Torvalds 		skb = nskb;
22281da177e4SLinus Torvalds 	}
22291da177e4SLinus Torvalds 
2230b4772ef8SEyal Birger 	sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
2231ffbc6111SHerbert Xu 
2232ffbc6111SHerbert Xu 	sll = &PACKET_SKB_CB(skb)->sa.ll;
22331da177e4SLinus Torvalds 	sll->sll_hatype = dev->type;
22341da177e4SLinus Torvalds 	sll->sll_pkttype = skb->pkt_type;
2235ee5675ecSEric Dumazet 	if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
223680feaacbSPeter P. Waskiewicz Jr 		sll->sll_ifindex = orig_dev->ifindex;
223780feaacbSPeter P. Waskiewicz Jr 	else
22381da177e4SLinus Torvalds 		sll->sll_ifindex = dev->ifindex;
22391da177e4SLinus Torvalds 
2240b95cce35SStephen Hemminger 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
22411da177e4SLinus Torvalds 
22422472d761SEyal Birger 	/* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
22432472d761SEyal Birger 	 * Use their space for storing the original skb length.
22442472d761SEyal Birger 	 */
22452472d761SEyal Birger 	PACKET_SKB_CB(skb)->sa.origlen = skb->len;
22468dc41944SHerbert Xu 
22471da177e4SLinus Torvalds 	if (pskb_trim(skb, snaplen))
22481da177e4SLinus Torvalds 		goto drop_n_acct;
22491da177e4SLinus Torvalds 
22501da177e4SLinus Torvalds 	skb_set_owner_r(skb, sk);
22511da177e4SLinus Torvalds 	skb->dev = NULL;
2252adf30907SEric Dumazet 	skb_dst_drop(skb);
22531da177e4SLinus Torvalds 
225484531c24SPhil Oester 	/* drop conntrack reference */
2255895b5c9fSFlorian Westphal 	nf_reset_ct(skb);
225684531c24SPhil Oester 
22571da177e4SLinus Torvalds 	spin_lock(&sk->sk_receive_queue.lock);
2258ee80fbf3SDaniel Borkmann 	po->stats.stats1.tp_packets++;
22593bc3b96fSEyal Birger 	sock_skb_set_dropcount(sk, skb);
226027942a15SMartin KaFai Lau 	skb_clear_delivery_time(skb);
22611da177e4SLinus Torvalds 	__skb_queue_tail(&sk->sk_receive_queue, skb);
22621da177e4SLinus Torvalds 	spin_unlock(&sk->sk_receive_queue.lock);
2263676d2369SDavid S. Miller 	sk->sk_data_ready(sk);
22641da177e4SLinus Torvalds 	return 0;
22651da177e4SLinus Torvalds 
22661da177e4SLinus Torvalds drop_n_acct:
2267da37845fSWeongyo Jeong 	is_drop_n_account = true;
22688e8e2951SEric Dumazet 	atomic_inc(&po->tp_drops);
22697091fbd8SWillem de Bruijn 	atomic_inc(&sk->sk_drops);
22701da177e4SLinus Torvalds 
22711da177e4SLinus Torvalds drop_n_restore:
22721da177e4SLinus Torvalds 	if (skb_head != skb->data && skb_shared(skb)) {
22731da177e4SLinus Torvalds 		skb->data = skb_head;
22741da177e4SLinus Torvalds 		skb->len = skb_len;
22751da177e4SLinus Torvalds 	}
22761da177e4SLinus Torvalds drop:
2277da37845fSWeongyo Jeong 	if (!is_drop_n_account)
2278ead2ceb0SNeil Horman 		consume_skb(skb);
2279da37845fSWeongyo Jeong 	else
2280da37845fSWeongyo Jeong 		kfree_skb(skb);
22811da177e4SLinus Torvalds 	return 0;
22821da177e4SLinus Torvalds }
22831da177e4SLinus Torvalds 
tpacket_rcv(struct sk_buff * skb,struct net_device * dev,struct packet_type * pt,struct net_device * orig_dev)228440d4e3dfSEric Dumazet static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
228540d4e3dfSEric Dumazet 		       struct packet_type *pt, struct net_device *orig_dev)
22861da177e4SLinus Torvalds {
22871da177e4SLinus Torvalds 	struct sock *sk;
22881da177e4SLinus Torvalds 	struct packet_sock *po;
22891da177e4SLinus Torvalds 	struct sockaddr_ll *sll;
2290184f489eSDaniel Borkmann 	union tpacket_uhdr h;
22911da177e4SLinus Torvalds 	u8 *skb_head = skb->data;
22921da177e4SLinus Torvalds 	int skb_len = skb->len;
2293dbcb5855SDavid S. Miller 	unsigned int snaplen, res;
2294f6fb8f10Schetan loke 	unsigned long status = TP_STATUS_USER;
2295acf69c94SOr Cohen 	unsigned short macoff, hdrlen;
2296acf69c94SOr Cohen 	unsigned int netoff;
22971da177e4SLinus Torvalds 	struct sk_buff *copy_skb = NULL;
2298d413fcb4SArnd Bergmann 	struct timespec64 ts;
2299b9c32fb2SDaniel Borkmann 	__u32 ts_status;
2300da37845fSWeongyo Jeong 	bool is_drop_n_account = false;
230161fad681SWillem de Bruijn 	unsigned int slot_id = 0;
2302dfc39d40SJianfeng Tan 	int vnet_hdr_sz = 0;
23031da177e4SLinus Torvalds 
230451846355SAtzm Watanabe 	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
230551846355SAtzm Watanabe 	 * We may add members to them until current aligned size without forcing
230651846355SAtzm Watanabe 	 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
230751846355SAtzm Watanabe 	 */
230851846355SAtzm Watanabe 	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
230951846355SAtzm Watanabe 	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
231051846355SAtzm Watanabe 
23111da177e4SLinus Torvalds 	if (skb->pkt_type == PACKET_LOOPBACK)
23121da177e4SLinus Torvalds 		goto drop;
23131da177e4SLinus Torvalds 
23141da177e4SLinus Torvalds 	sk = pt->af_packet_priv;
23151da177e4SLinus Torvalds 	po = pkt_sk(sk);
23161da177e4SLinus Torvalds 
231709ad9bc7SOctavian Purdila 	if (!net_eq(dev_net(dev), sock_net(sk)))
2318d12d01d6SDenis V. Lunev 		goto drop;
2319d12d01d6SDenis V. Lunev 
2320d5496990SEyal Birger 	if (dev_has_header(dev)) {
23211da177e4SLinus Torvalds 		if (sk->sk_type != SOCK_DGRAM)
232298e399f8SArnaldo Carvalho de Melo 			skb_push(skb, skb->data - skb_mac_header(skb));
23231da177e4SLinus Torvalds 		else if (skb->pkt_type == PACKET_OUTGOING) {
23241da177e4SLinus Torvalds 			/* Special case: outgoing packets have ll header at head */
2325bbe735e4SArnaldo Carvalho de Melo 			skb_pull(skb, skb_network_offset(skb));
23268dc41944SHerbert Xu 		}
23278dc41944SHerbert Xu 	}
23288dc41944SHerbert Xu 
23291da177e4SLinus Torvalds 	snaplen = skb->len;
23301da177e4SLinus Torvalds 
2331dbcb5855SDavid S. Miller 	res = run_filter(skb, sk, snaplen);
2332dbcb5855SDavid S. Miller 	if (!res)
23331da177e4SLinus Torvalds 		goto drop_n_restore;
233468c2e5deSAlexander Drozdov 
23352c51c627SEric Dumazet 	/* If we are flooded, just give up */
23362c51c627SEric Dumazet 	if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
23372c51c627SEric Dumazet 		atomic_inc(&po->tp_drops);
23382c51c627SEric Dumazet 		goto drop_n_restore;
23392c51c627SEric Dumazet 	}
23402c51c627SEric Dumazet 
234168c2e5deSAlexander Drozdov 	if (skb->ip_summed == CHECKSUM_PARTIAL)
234268c2e5deSAlexander Drozdov 		status |= TP_STATUS_CSUMNOTREADY;
2343682f048bSAlexander Drozdov 	else if (skb->pkt_type != PACKET_OUTGOING &&
2344b85f628aSWillem de Bruijn 		 skb_csum_unnecessary(skb))
2345682f048bSAlexander Drozdov 		status |= TP_STATUS_CSUM_VALID;
23468e08bb75SXin Long 	if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
23478e08bb75SXin Long 		status |= TP_STATUS_GSO_TCP;
234868c2e5deSAlexander Drozdov 
2349dbcb5855SDavid S. Miller 	if (snaplen > res)
2350dbcb5855SDavid S. Miller 		snaplen = res;
23511da177e4SLinus Torvalds 
23521da177e4SLinus Torvalds 	if (sk->sk_type == SOCK_DGRAM) {
23538913336aSPatrick McHardy 		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
23548913336aSPatrick McHardy 				  po->tp_reserve;
23551da177e4SLinus Torvalds 	} else {
235695c96174SEric Dumazet 		unsigned int maclen = skb_network_offset(skb);
2357bbd6ef87SPatrick McHardy 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
23588913336aSPatrick McHardy 				       (maclen < 16 ? 16 : maclen)) +
23598913336aSPatrick McHardy 				       po->tp_reserve;
2360dfc39d40SJianfeng Tan 		vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
2361dfc39d40SJianfeng Tan 		if (vnet_hdr_sz)
2362dfc39d40SJianfeng Tan 			netoff += vnet_hdr_sz;
23631da177e4SLinus Torvalds 		macoff = netoff - maclen;
23641da177e4SLinus Torvalds 	}
2365acf69c94SOr Cohen 	if (netoff > USHRT_MAX) {
2366acf69c94SOr Cohen 		atomic_inc(&po->tp_drops);
2367acf69c94SOr Cohen 		goto drop_n_restore;
2368acf69c94SOr Cohen 	}
2369f6fb8f10Schetan loke 	if (po->tp_version <= TPACKET_V2) {
237069e3c75fSJohann Baudy 		if (macoff + snaplen > po->rx_ring.frame_size) {
23711da177e4SLinus Torvalds 			if (po->copy_thresh &&
23720fd7bac6SEric Dumazet 			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
23731da177e4SLinus Torvalds 				if (skb_shared(skb)) {
23741da177e4SLinus Torvalds 					copy_skb = skb_clone(skb, GFP_ATOMIC);
23751da177e4SLinus Torvalds 				} else {
23761da177e4SLinus Torvalds 					copy_skb = skb_get(skb);
23771da177e4SLinus Torvalds 					skb_head = skb->data;
23781da177e4SLinus Torvalds 				}
2379c700525fSEric Dumazet 				if (copy_skb) {
2380c700525fSEric Dumazet 					memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
2381c700525fSEric Dumazet 					       sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
23821da177e4SLinus Torvalds 					skb_set_owner_r(copy_skb, sk);
23831da177e4SLinus Torvalds 				}
2384c700525fSEric Dumazet 			}
238569e3c75fSJohann Baudy 			snaplen = po->rx_ring.frame_size - macoff;
2386edbd58beSBenjamin Poirier 			if ((int)snaplen < 0) {
23871da177e4SLinus Torvalds 				snaplen = 0;
2388dfc39d40SJianfeng Tan 				vnet_hdr_sz = 0;
2389edbd58beSBenjamin Poirier 			}
23901da177e4SLinus Torvalds 		}
2391dc808110SEric Dumazet 	} else if (unlikely(macoff + snaplen >
2392dc808110SEric Dumazet 			    GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2393dc808110SEric Dumazet 		u32 nval;
2394dc808110SEric Dumazet 
2395dc808110SEric Dumazet 		nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2396dc808110SEric Dumazet 		pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2397dc808110SEric Dumazet 			    snaplen, nval, macoff);
2398dc808110SEric Dumazet 		snaplen = nval;
2399dc808110SEric Dumazet 		if (unlikely((int)snaplen < 0)) {
2400dc808110SEric Dumazet 			snaplen = 0;
2401dc808110SEric Dumazet 			macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2402dfc39d40SJianfeng Tan 			vnet_hdr_sz = 0;
2403dc808110SEric Dumazet 		}
2404f6fb8f10Schetan loke 	}
24051da177e4SLinus Torvalds 	spin_lock(&sk->sk_receive_queue.lock);
2406f6fb8f10Schetan loke 	h.raw = packet_current_rx_frame(po, skb,
2407f6fb8f10Schetan loke 					TP_STATUS_KERNEL, (macoff+snaplen));
2408bbd6ef87SPatrick McHardy 	if (!h.raw)
240958d19b19SWillem de Bruijn 		goto drop_n_account;
241046e4c421SWillem de Bruijn 
241161fad681SWillem de Bruijn 	if (po->tp_version <= TPACKET_V2) {
241261fad681SWillem de Bruijn 		slot_id = po->rx_ring.head;
241361fad681SWillem de Bruijn 		if (test_bit(slot_id, po->rx_ring.rx_owner_map))
241461fad681SWillem de Bruijn 			goto drop_n_account;
241561fad681SWillem de Bruijn 		__set_bit(slot_id, po->rx_ring.rx_owner_map);
241661fad681SWillem de Bruijn 	}
241761fad681SWillem de Bruijn 
2418dfc39d40SJianfeng Tan 	if (vnet_hdr_sz &&
241946e4c421SWillem de Bruijn 	    virtio_net_hdr_from_skb(skb, h.raw + macoff -
242046e4c421SWillem de Bruijn 				    sizeof(struct virtio_net_hdr),
242188fd1cb8SJohn Ogness 				    vio_le(), true, 0)) {
242288fd1cb8SJohn Ogness 		if (po->tp_version == TPACKET_V3)
242388fd1cb8SJohn Ogness 			prb_clear_blk_fill_status(&po->rx_ring);
242446e4c421SWillem de Bruijn 		goto drop_n_account;
242588fd1cb8SJohn Ogness 	}
242646e4c421SWillem de Bruijn 
2427f6fb8f10Schetan loke 	if (po->tp_version <= TPACKET_V2) {
2428f6fb8f10Schetan loke 		packet_increment_rx_head(po, &po->rx_ring);
2429f6fb8f10Schetan loke 	/*
2430f6fb8f10Schetan loke 	 * LOSING will be reported till you read the stats,
2431f6fb8f10Schetan loke 	 * because it's COR - Clear On Read.
2432f6fb8f10Schetan loke 	 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2433f6fb8f10Schetan loke 	 * at packet level.
2434f6fb8f10Schetan loke 	 */
24358e8e2951SEric Dumazet 		if (atomic_read(&po->tp_drops))
2436f6fb8f10Schetan loke 			status |= TP_STATUS_LOSING;
2437f6fb8f10Schetan loke 	}
2438945d015eSEric Dumazet 
2439ee80fbf3SDaniel Borkmann 	po->stats.stats1.tp_packets++;
24401da177e4SLinus Torvalds 	if (copy_skb) {
24411da177e4SLinus Torvalds 		status |= TP_STATUS_COPY;
244227942a15SMartin KaFai Lau 		skb_clear_delivery_time(copy_skb);
24431da177e4SLinus Torvalds 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
24441da177e4SLinus Torvalds 	}
24451da177e4SLinus Torvalds 	spin_unlock(&sk->sk_receive_queue.lock);
24461da177e4SLinus Torvalds 
2447bbd6ef87SPatrick McHardy 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2448b9c32fb2SDaniel Borkmann 
2449171c3b15SRichard Sanger 	/* Always timestamp; prefer an existing software timestamp taken
2450171c3b15SRichard Sanger 	 * closer to the time of capture.
2451171c3b15SRichard Sanger 	 */
2452171c3b15SRichard Sanger 	ts_status = tpacket_get_timestamp(skb, &ts,
24531051ce4aSEric Dumazet 					  READ_ONCE(po->tp_tstamp) |
24541051ce4aSEric Dumazet 					  SOF_TIMESTAMPING_SOFTWARE);
2455171c3b15SRichard Sanger 	if (!ts_status)
2456d413fcb4SArnd Bergmann 		ktime_get_real_ts64(&ts);
24571da177e4SLinus Torvalds 
2458b9c32fb2SDaniel Borkmann 	status |= ts_status;
2459b9c32fb2SDaniel Borkmann 
2460bbd6ef87SPatrick McHardy 	switch (po->tp_version) {
2461bbd6ef87SPatrick McHardy 	case TPACKET_V1:
2462bbd6ef87SPatrick McHardy 		h.h1->tp_len = skb->len;
2463bbd6ef87SPatrick McHardy 		h.h1->tp_snaplen = snaplen;
2464bbd6ef87SPatrick McHardy 		h.h1->tp_mac = macoff;
2465bbd6ef87SPatrick McHardy 		h.h1->tp_net = netoff;
24664b457bdfSDaniel Borkmann 		h.h1->tp_sec = ts.tv_sec;
24674b457bdfSDaniel Borkmann 		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2468bbd6ef87SPatrick McHardy 		hdrlen = sizeof(*h.h1);
2469bbd6ef87SPatrick McHardy 		break;
2470bbd6ef87SPatrick McHardy 	case TPACKET_V2:
2471bbd6ef87SPatrick McHardy 		h.h2->tp_len = skb->len;
2472bbd6ef87SPatrick McHardy 		h.h2->tp_snaplen = snaplen;
2473bbd6ef87SPatrick McHardy 		h.h2->tp_mac = macoff;
2474bbd6ef87SPatrick McHardy 		h.h2->tp_net = netoff;
2475bbd6ef87SPatrick McHardy 		h.h2->tp_sec = ts.tv_sec;
2476bbd6ef87SPatrick McHardy 		h.h2->tp_nsec = ts.tv_nsec;
2477df8a39deSJiri Pirko 		if (skb_vlan_tag_present(skb)) {
2478df8a39deSJiri Pirko 			h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2479a0cdfcf3SAtzm Watanabe 			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2480a0cdfcf3SAtzm Watanabe 			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
24815a041d25SChengen Du 		} else if (unlikely(sk->sk_type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
24825a041d25SChengen Du 			h.h2->tp_vlan_tci = vlan_get_tci(skb, skb->dev);
24835a041d25SChengen Du 			h.h2->tp_vlan_tpid = ntohs(skb->protocol);
24845a041d25SChengen Du 			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2485a3bcc23eSBen Greear 		} else {
2486a3bcc23eSBen Greear 			h.h2->tp_vlan_tci = 0;
2487a0cdfcf3SAtzm Watanabe 			h.h2->tp_vlan_tpid = 0;
2488a3bcc23eSBen Greear 		}
2489e4d26f4bSAtzm Watanabe 		memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2490bbd6ef87SPatrick McHardy 		hdrlen = sizeof(*h.h2);
2491bbd6ef87SPatrick McHardy 		break;
2492f6fb8f10Schetan loke 	case TPACKET_V3:
2493f6fb8f10Schetan loke 		/* tp_nxt_offset,vlan are already populated above.
2494f6fb8f10Schetan loke 		 * So DONT clear those fields here
2495f6fb8f10Schetan loke 		 */
2496f6fb8f10Schetan loke 		h.h3->tp_status |= status;
2497f6fb8f10Schetan loke 		h.h3->tp_len = skb->len;
2498f6fb8f10Schetan loke 		h.h3->tp_snaplen = snaplen;
2499f6fb8f10Schetan loke 		h.h3->tp_mac = macoff;
2500f6fb8f10Schetan loke 		h.h3->tp_net = netoff;
2501f6fb8f10Schetan loke 		h.h3->tp_sec  = ts.tv_sec;
2502f6fb8f10Schetan loke 		h.h3->tp_nsec = ts.tv_nsec;
2503e4d26f4bSAtzm Watanabe 		memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2504f6fb8f10Schetan loke 		hdrlen = sizeof(*h.h3);
2505f6fb8f10Schetan loke 		break;
2506bbd6ef87SPatrick McHardy 	default:
2507bbd6ef87SPatrick McHardy 		BUG();
2508bbd6ef87SPatrick McHardy 	}
25091da177e4SLinus Torvalds 
2510bbd6ef87SPatrick McHardy 	sll = h.raw + TPACKET_ALIGN(hdrlen);
2511b95cce35SStephen Hemminger 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
25121da177e4SLinus Torvalds 	sll->sll_family = AF_PACKET;
25131da177e4SLinus Torvalds 	sll->sll_hatype = dev->type;
25145a041d25SChengen Du 	sll->sll_protocol = (sk->sk_type == SOCK_DGRAM) ?
25155a041d25SChengen Du 		vlan_get_protocol_dgram(skb) : skb->protocol;
25161da177e4SLinus Torvalds 	sll->sll_pkttype = skb->pkt_type;
2517ee5675ecSEric Dumazet 	if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
251880feaacbSPeter P. Waskiewicz Jr 		sll->sll_ifindex = orig_dev->ifindex;
251980feaacbSPeter P. Waskiewicz Jr 	else
25201da177e4SLinus Torvalds 		sll->sll_ifindex = dev->ifindex;
25211da177e4SLinus Torvalds 
2522e16aa207SRalf Baechle 	smp_mb();
2523f0d4eb29SDaniel Borkmann 
2524f6dafa95SChangli Gao #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2525f0d4eb29SDaniel Borkmann 	if (po->tp_version <= TPACKET_V2) {
25260af55bb5SChangli Gao 		u8 *start, *end;
25271da177e4SLinus Torvalds 
2528f0d4eb29SDaniel Borkmann 		end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2529f0d4eb29SDaniel Borkmann 					macoff + snaplen);
2530f0d4eb29SDaniel Borkmann 
25310af55bb5SChangli Gao 		for (start = h.raw; start < end; start += PAGE_SIZE)
25320af55bb5SChangli Gao 			flush_dcache_page(pgv_to_page(start));
2533f6fb8f10Schetan loke 	}
2534cc9f01b2SChetan Loke 	smp_wmb();
2535f6dafa95SChangli Gao #endif
2536f0d4eb29SDaniel Borkmann 
2537da413eecSDan Collins 	if (po->tp_version <= TPACKET_V2) {
253861fad681SWillem de Bruijn 		spin_lock(&sk->sk_receive_queue.lock);
2539cc9f01b2SChetan Loke 		__packet_set_status(po, h.raw, status);
254061fad681SWillem de Bruijn 		__clear_bit(slot_id, po->rx_ring.rx_owner_map);
254161fad681SWillem de Bruijn 		spin_unlock(&sk->sk_receive_queue.lock);
2542676d2369SDavid S. Miller 		sk->sk_data_ready(sk);
254388fd1cb8SJohn Ogness 	} else if (po->tp_version == TPACKET_V3) {
2544da413eecSDan Collins 		prb_clear_blk_fill_status(&po->rx_ring);
2545da413eecSDan Collins 	}
25461da177e4SLinus Torvalds 
25471da177e4SLinus Torvalds drop_n_restore:
25481da177e4SLinus Torvalds 	if (skb_head != skb->data && skb_shared(skb)) {
25491da177e4SLinus Torvalds 		skb->data = skb_head;
25501da177e4SLinus Torvalds 		skb->len = skb_len;
25511da177e4SLinus Torvalds 	}
25521da177e4SLinus Torvalds drop:
2553da37845fSWeongyo Jeong 	if (!is_drop_n_account)
2554da37845fSWeongyo Jeong 		consume_skb(skb);
2555da37845fSWeongyo Jeong 	else
25561da177e4SLinus Torvalds 		kfree_skb(skb);
25571da177e4SLinus Torvalds 	return 0;
25581da177e4SLinus Torvalds 
255958d19b19SWillem de Bruijn drop_n_account:
25601da177e4SLinus Torvalds 	spin_unlock(&sk->sk_receive_queue.lock);
25618e8e2951SEric Dumazet 	atomic_inc(&po->tp_drops);
25628e8e2951SEric Dumazet 	is_drop_n_account = true;
25631da177e4SLinus Torvalds 
2564676d2369SDavid S. Miller 	sk->sk_data_ready(sk);
25651da177e4SLinus Torvalds 	kfree_skb(copy_skb);
25661da177e4SLinus Torvalds 	goto drop_n_restore;
25671da177e4SLinus Torvalds }
25681da177e4SLinus Torvalds 
tpacket_destruct_skb(struct sk_buff * skb)256969e3c75fSJohann Baudy static void tpacket_destruct_skb(struct sk_buff *skb)
257069e3c75fSJohann Baudy {
257169e3c75fSJohann Baudy 	struct packet_sock *po = pkt_sk(skb->sk);
257269e3c75fSJohann Baudy 
257369e3c75fSJohann Baudy 	if (likely(po->tx_ring.pg_vec)) {
2574f0d4eb29SDaniel Borkmann 		void *ph;
2575b9c32fb2SDaniel Borkmann 		__u32 ts;
2576b9c32fb2SDaniel Borkmann 
25775cd8d46eSWillem de Bruijn 		ph = skb_zcopy_get_nouarg(skb);
2578b0138408SDaniel Borkmann 		packet_dec_pending(&po->tx_ring);
2579b9c32fb2SDaniel Borkmann 
2580b9c32fb2SDaniel Borkmann 		ts = __packet_set_timestamp(po, ph, skb);
2581b9c32fb2SDaniel Borkmann 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
258289ed5b51SNeil Horman 
258389ed5b51SNeil Horman 		complete(&po->skb_completion);
258469e3c75fSJohann Baudy 	}
258569e3c75fSJohann Baudy 
258669e3c75fSJohann Baudy 	sock_wfree(skb);
258769e3c75fSJohann Baudy }
258869e3c75fSJohann Baudy 
/* Sanity-check a virtio_net header against the packet length.
 *
 * @vnet_hdr: header to check; hdr_len may be rewritten in place
 * @len:      number of packet bytes that follow the header
 *
 * When VIRTIO_NET_HDR_F_NEEDS_CSUM is set and csum_start + csum_offset + 2
 * is larger than hdr_len, hdr_len is raised to that value.
 *
 * Returns 0 on success, -EINVAL if the (possibly adjusted) hdr_len is
 * larger than @len.
 */
static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
{
	if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		/* first byte past the checksum field */
		int csum_end = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
			       __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) +
			       2;

		if (csum_end > __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))
			vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
							      csum_end);
	}

	return __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len ?
	       -EINVAL : 0;
}
260416cc1400SWillem de Bruijn 
/* Pull the virtio_net header off the front of a message being sent.
 *
 * @msg:         message whose iterator is consumed
 * @len:         in: total message length; out: length without the
 *               @vnet_hdr_sz header bytes
 * @vnet_hdr:    filled with the first sizeof(*vnet_hdr) bytes of the message
 * @vnet_hdr_sz: header size configured on the socket
 *
 * Returns 0 on success, -EINVAL if the message is shorter than
 * @vnet_hdr_sz or the header fails __packet_snd_vnet_parse(), and
 * -EFAULT if the header cannot be copied from the iterator.
 */
static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
				 struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
{
	/* header bytes beyond struct virtio_net_hdr; not copied, only skipped */
	size_t tail = vnet_hdr_sz - sizeof(struct virtio_net_hdr);
	int err;

	if (*len < vnet_hdr_sz)
		return -EINVAL;
	*len -= vnet_hdr_sz;

	if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
		return -EFAULT;

	err = __packet_snd_vnet_parse(vnet_hdr, *len);

	/* move iter to point to the start of mac header */
	if (!err && tail)
		iov_iter_advance(&msg->msg_iter, tail);

	return err;
}
262716cc1400SWillem de Bruijn 
/* Build an skb for one TX ring frame.
 *
 * @po:      packet socket that owns the ring
 * @skb:     freshly allocated skb to fill
 * @frame:   ring frame; stored in the skb via skb_zcopy_set_nouarg()
 * @dev:     output device
 * @data:    start of the packet bytes inside the frame
 * @tp_len:  number of packet bytes at @data
 * @proto:   protocol to set on the skb
 * @addr:    destination link-layer address, passed to dev_hard_header()
 *           for SOCK_DGRAM sockets
 * @hlen:    headroom to reserve
 * @copylen: for non-SOCK_DGRAM sockets, number of leading bytes that go
 *           into the linear area
 * @sockc:   supplies the transmit time and timestamping flags
 *
 * For SOCK_DGRAM the link-layer header is built with dev_hard_header().
 * Otherwise, when @copylen is non-zero, min(@copylen, @tp_len) bytes are
 * copied from @data into the linear area and checked with
 * dev_validate_header().  All remaining bytes are attached as page
 * fragments that reference the ring pages (get_page() is taken on each).
 *
 * Returns @tp_len on success, or a negative errno: -EINVAL if header
 * construction or validation fails, the skb_store_bits() error, or -EFAULT
 * when more than MAX_SKB_FRAGS fragments would be needed.
 */
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
		void *frame, struct net_device *dev, void *data, int tp_len,
		__be16 proto, unsigned char *addr, int hlen, int copylen,
		const struct sockcm_cookie *sockc)
{
	struct socket *sock = po->sk.sk_socket;
	int remaining = tp_len;
	int pg_off;
	int err;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = READ_ONCE(po->sk.sk_priority);
	skb->mark = READ_ONCE(po->sk.sk_mark);
	skb->tstamp = sockc->transmit_time;
	skb_setup_tx_timestamp(skb, sockc->tsflags);
	skb_zcopy_set_nouarg(skb, frame);

	skb_reserve(skb, hlen);
	skb_reset_network_header(skb);

	if (sock->type == SOCK_DGRAM) {
		err = dev_hard_header(skb, dev, ntohs(proto), addr, NULL,
				      tp_len);
		if (unlikely(err < 0))
			return -EINVAL;
	} else if (copylen) {
		int linear = min_t(int, copylen, tp_len);

		skb_push(skb, dev->hard_header_len);
		skb_put(skb, copylen - dev->hard_header_len);
		err = skb_store_bits(skb, 0, data, linear);
		if (unlikely(err))
			return err;
		if (!dev_validate_header(dev, skb->data, linear))
			return -EINVAL;

		data += linear;
		remaining -= linear;
	}

	/* Account for everything that will be attached as fragments. */
	skb->data_len = remaining;
	skb->len += remaining;
	skb->truesize += remaining;
	refcount_add(remaining, &po->sk.sk_wmem_alloc);

	/* Only the first fragment may start in the middle of a page. */
	pg_off = offset_in_page(data);

	while (likely(remaining)) {
		int frag_idx = skb_shinfo(skb)->nr_frags;
		int room = PAGE_SIZE - pg_off;
		int chunk = min_t(int, remaining, room);
		struct page *page;

		if (unlikely(frag_idx >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceed the number of skb frags(%u)\n",
			       (unsigned int)MAX_SKB_FRAGS);
			return -EFAULT;
		}

		page = pgv_to_page(data);
		data += chunk;
		flush_dcache_page(page);
		get_page(page);
		skb_fill_page_desc(skb, frag_idx, page, pg_off, chunk);

		remaining -= chunk;
		pg_off = 0;
	}

	packet_parse_headers(skb, sock);

	return tp_len;
}
270769e3c75fSJohann Baudy 
tpacket_parse_header(struct packet_sock * po,void * frame,int size_max,void ** data)27088d39b4a6SWillem de Bruijn static int tpacket_parse_header(struct packet_sock *po, void *frame,
27098d39b4a6SWillem de Bruijn 				int size_max, void **data)
27108d39b4a6SWillem de Bruijn {
27118d39b4a6SWillem de Bruijn 	union tpacket_uhdr ph;
27128d39b4a6SWillem de Bruijn 	int tp_len, off;
27138d39b4a6SWillem de Bruijn 
27148d39b4a6SWillem de Bruijn 	ph.raw = frame;
27158d39b4a6SWillem de Bruijn 
27168d39b4a6SWillem de Bruijn 	switch (po->tp_version) {
27177f953ab2SSowmini Varadhan 	case TPACKET_V3:
27187f953ab2SSowmini Varadhan 		if (ph.h3->tp_next_offset != 0) {
27197f953ab2SSowmini Varadhan 			pr_warn_once("variable sized slot not supported");
27207f953ab2SSowmini Varadhan 			return -EINVAL;
27217f953ab2SSowmini Varadhan 		}
27227f953ab2SSowmini Varadhan 		tp_len = ph.h3->tp_len;
27237f953ab2SSowmini Varadhan 		break;
27248d39b4a6SWillem de Bruijn 	case TPACKET_V2:
27258d39b4a6SWillem de Bruijn 		tp_len = ph.h2->tp_len;
27268d39b4a6SWillem de Bruijn 		break;
27278d39b4a6SWillem de Bruijn 	default:
27288d39b4a6SWillem de Bruijn 		tp_len = ph.h1->tp_len;
27298d39b4a6SWillem de Bruijn 		break;
27308d39b4a6SWillem de Bruijn 	}
27318d39b4a6SWillem de Bruijn 	if (unlikely(tp_len > size_max)) {
27328d39b4a6SWillem de Bruijn 		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
27338d39b4a6SWillem de Bruijn 		return -EMSGSIZE;
27348d39b4a6SWillem de Bruijn 	}
27358d39b4a6SWillem de Bruijn 
273674383446SEric Dumazet 	if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) {
27378d39b4a6SWillem de Bruijn 		int off_min, off_max;
27388d39b4a6SWillem de Bruijn 
27398d39b4a6SWillem de Bruijn 		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
27408d39b4a6SWillem de Bruijn 		off_max = po->tx_ring.frame_size - tp_len;
27418d39b4a6SWillem de Bruijn 		if (po->sk.sk_type == SOCK_DGRAM) {
27428d39b4a6SWillem de Bruijn 			switch (po->tp_version) {
27437f953ab2SSowmini Varadhan 			case TPACKET_V3:
27447f953ab2SSowmini Varadhan 				off = ph.h3->tp_net;
27457f953ab2SSowmini Varadhan 				break;
27468d39b4a6SWillem de Bruijn 			case TPACKET_V2:
27478d39b4a6SWillem de Bruijn 				off = ph.h2->tp_net;
27488d39b4a6SWillem de Bruijn 				break;
27498d39b4a6SWillem de Bruijn 			default:
27508d39b4a6SWillem de Bruijn 				off = ph.h1->tp_net;
27518d39b4a6SWillem de Bruijn 				break;
27528d39b4a6SWillem de Bruijn 			}
27538d39b4a6SWillem de Bruijn 		} else {
27548d39b4a6SWillem de Bruijn 			switch (po->tp_version) {
27557f953ab2SSowmini Varadhan 			case TPACKET_V3:
27567f953ab2SSowmini Varadhan 				off = ph.h3->tp_mac;
27577f953ab2SSowmini Varadhan 				break;
27588d39b4a6SWillem de Bruijn 			case TPACKET_V2:
27598d39b4a6SWillem de Bruijn 				off = ph.h2->tp_mac;
27608d39b4a6SWillem de Bruijn 				break;
27618d39b4a6SWillem de Bruijn 			default:
27628d39b4a6SWillem de Bruijn 				off = ph.h1->tp_mac;
27638d39b4a6SWillem de Bruijn 				break;
27648d39b4a6SWillem de Bruijn 			}
27658d39b4a6SWillem de Bruijn 		}
27668d39b4a6SWillem de Bruijn 		if (unlikely((off < off_min) || (off_max < off)))
27678d39b4a6SWillem de Bruijn 			return -EINVAL;
27688d39b4a6SWillem de Bruijn 	} else {
27698d39b4a6SWillem de Bruijn 		off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
27708d39b4a6SWillem de Bruijn 	}
27718d39b4a6SWillem de Bruijn 
27728d39b4a6SWillem de Bruijn 	*data = frame + off;
27738d39b4a6SWillem de Bruijn 	return tp_len;
27748d39b4a6SWillem de Bruijn }
27758d39b4a6SWillem de Bruijn 
tpacket_snd(struct packet_sock * po,struct msghdr * msg)277669e3c75fSJohann Baudy static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
277769e3c75fSJohann Baudy {
277889ed5b51SNeil Horman 	struct sk_buff *skb = NULL;
277969e3c75fSJohann Baudy 	struct net_device *dev;
27801d036d25SWillem de Bruijn 	struct virtio_net_hdr *vnet_hdr = NULL;
2781c14ac945SSoheil Hassas Yeganeh 	struct sockcm_cookie sockc;
278269e3c75fSJohann Baudy 	__be16 proto;
278309effa67SDavid S. Miller 	int err, reserve = 0;
278469e3c75fSJohann Baudy 	void *ph;
2785342dfc30SSteffen Hurrle 	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
278687a2fd28SDaniel Borkmann 	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2787dfc39d40SJianfeng Tan 	int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
2788486efdc8SWillem de Bruijn 	unsigned char *addr = NULL;
278969e3c75fSJohann Baudy 	int tp_len, size_max;
27908d39b4a6SWillem de Bruijn 	void *data;
279169e3c75fSJohann Baudy 	int len_sum = 0;
27929e67030aSdanborkmann@iogearbox.net 	int status = TP_STATUS_AVAILABLE;
27931d036d25SWillem de Bruijn 	int hlen, tlen, copylen = 0;
279489ed5b51SNeil Horman 	long timeo = 0;
279569e3c75fSJohann Baudy 
279669e3c75fSJohann Baudy 	mutex_lock(&po->pg_vec_lock);
279769e3c75fSJohann Baudy 
279832d3182cSEric Dumazet 	/* packet_sendmsg() check on tx_ring.pg_vec was lockless,
279932d3182cSEric Dumazet 	 * we need to confirm it under protection of pg_vec_lock.
280032d3182cSEric Dumazet 	 */
280132d3182cSEric Dumazet 	if (unlikely(!po->tx_ring.pg_vec)) {
280232d3182cSEric Dumazet 		err = -EBUSY;
280332d3182cSEric Dumazet 		goto out;
280432d3182cSEric Dumazet 	}
280566e56cd4SDaniel Borkmann 	if (likely(saddr == NULL)) {
2806e40526cbSDaniel Borkmann 		dev	= packet_cached_dev_get(po);
2807c7d2ef5dSEric Dumazet 		proto	= READ_ONCE(po->num);
280869e3c75fSJohann Baudy 	} else {
280969e3c75fSJohann Baudy 		err = -EINVAL;
281069e3c75fSJohann Baudy 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
281169e3c75fSJohann Baudy 			goto out;
281269e3c75fSJohann Baudy 		if (msg->msg_namelen < (saddr->sll_halen
281369e3c75fSJohann Baudy 					+ offsetof(struct sockaddr_ll,
281469e3c75fSJohann Baudy 						sll_addr)))
281569e3c75fSJohann Baudy 			goto out;
281669e3c75fSJohann Baudy 		proto	= saddr->sll_protocol;
2817827d9780SBen Greear 		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2818486efdc8SWillem de Bruijn 		if (po->sk.sk_socket->type == SOCK_DGRAM) {
2819486efdc8SWillem de Bruijn 			if (dev && msg->msg_namelen < dev->addr_len +
2820486efdc8SWillem de Bruijn 				   offsetof(struct sockaddr_ll, sll_addr))
2821d972f3dcSJason Gunthorpe 				goto out_put;
2822486efdc8SWillem de Bruijn 			addr = saddr->sll_addr;
2823486efdc8SWillem de Bruijn 		}
282469e3c75fSJohann Baudy 	}
282569e3c75fSJohann Baudy 
282669e3c75fSJohann Baudy 	err = -ENXIO;
282769e3c75fSJohann Baudy 	if (unlikely(dev == NULL))
282869e3c75fSJohann Baudy 		goto out;
282969e3c75fSJohann Baudy 	err = -ENETDOWN;
283069e3c75fSJohann Baudy 	if (unlikely(!(dev->flags & IFF_UP)))
283169e3c75fSJohann Baudy 		goto out_put;
283269e3c75fSJohann Baudy 
2833657a0667SWillem de Bruijn 	sockcm_init(&sockc, &po->sk);
2834d19b183cSDouglas Caetano dos Santos 	if (msg->msg_controllen) {
2835d19b183cSDouglas Caetano dos Santos 		err = sock_cmsg_send(&po->sk, msg, &sockc);
2836d19b183cSDouglas Caetano dos Santos 		if (unlikely(err))
2837d19b183cSDouglas Caetano dos Santos 			goto out_put;
2838d19b183cSDouglas Caetano dos Santos 	}
2839d19b183cSDouglas Caetano dos Santos 
28405cfb4c8dSDaniel Borkmann 	if (po->sk.sk_socket->type == SOCK_RAW)
28415cfb4c8dSDaniel Borkmann 		reserve = dev->hard_header_len;
284269e3c75fSJohann Baudy 	size_max = po->tx_ring.frame_size
2843b5dd884eSGabor Gombas 		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
284469e3c75fSJohann Baudy 
2845dfc39d40SJianfeng Tan 	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
28465cfb4c8dSDaniel Borkmann 		size_max = dev->mtu + reserve + VLAN_HLEN;
284709effa67SDavid S. Miller 
284889ed5b51SNeil Horman 	reinit_completion(&po->skb_completion);
284989ed5b51SNeil Horman 
285069e3c75fSJohann Baudy 	do {
285169e3c75fSJohann Baudy 		ph = packet_current_frame(po, &po->tx_ring,
285269e3c75fSJohann Baudy 					  TP_STATUS_SEND_REQUEST);
285369e3c75fSJohann Baudy 		if (unlikely(ph == NULL)) {
285489ed5b51SNeil Horman 			if (need_wait && skb) {
285589ed5b51SNeil Horman 				timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
285689ed5b51SNeil Horman 				timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
285789ed5b51SNeil Horman 				if (timeo <= 0) {
285889ed5b51SNeil Horman 					err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
285989ed5b51SNeil Horman 					goto out_put;
286089ed5b51SNeil Horman 				}
286189ed5b51SNeil Horman 			}
286289ed5b51SNeil Horman 			/* check for additional frames */
286369e3c75fSJohann Baudy 			continue;
286469e3c75fSJohann Baudy 		}
286569e3c75fSJohann Baudy 
28668d39b4a6SWillem de Bruijn 		skb = NULL;
28678d39b4a6SWillem de Bruijn 		tp_len = tpacket_parse_header(po, ph, size_max, &data);
28688d39b4a6SWillem de Bruijn 		if (tp_len < 0)
28698d39b4a6SWillem de Bruijn 			goto tpacket_error;
28708d39b4a6SWillem de Bruijn 
287169e3c75fSJohann Baudy 		status = TP_STATUS_SEND_REQUEST;
2872ae641949SHerbert Xu 		hlen = LL_RESERVED_SPACE(dev);
2873ae641949SHerbert Xu 		tlen = dev->needed_tailroom;
2874dfc39d40SJianfeng Tan 		if (vnet_hdr_sz) {
28751d036d25SWillem de Bruijn 			vnet_hdr = data;
2876dfc39d40SJianfeng Tan 			data += vnet_hdr_sz;
2877dfc39d40SJianfeng Tan 			tp_len -= vnet_hdr_sz;
28781d036d25SWillem de Bruijn 			if (tp_len < 0 ||
28791d036d25SWillem de Bruijn 			    __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
28801d036d25SWillem de Bruijn 				tp_len = -EINVAL;
28811d036d25SWillem de Bruijn 				goto tpacket_error;
28821d036d25SWillem de Bruijn 			}
28831d036d25SWillem de Bruijn 			copylen = __virtio16_to_cpu(vio_le(),
28841d036d25SWillem de Bruijn 						    vnet_hdr->hdr_len);
28851d036d25SWillem de Bruijn 		}
28861d036d25SWillem de Bruijn 		copylen = max_t(int, copylen, dev->hard_header_len);
288769e3c75fSJohann Baudy 		skb = sock_alloc_send_skb(&po->sk,
28881d036d25SWillem de Bruijn 				hlen + tlen + sizeof(struct sockaddr_ll) +
28891d036d25SWillem de Bruijn 				(copylen - dev->hard_header_len),
2890fbf33a28SKretschmer, Mathias 				!need_wait, &err);
289169e3c75fSJohann Baudy 
2892fbf33a28SKretschmer, Mathias 		if (unlikely(skb == NULL)) {
2893fbf33a28SKretschmer, Mathias 			/* we assume the socket was initially writeable ... */
2894fbf33a28SKretschmer, Mathias 			if (likely(len_sum > 0))
2895fbf33a28SKretschmer, Mathias 				err = len_sum;
289669e3c75fSJohann Baudy 			goto out_status;
2897fbf33a28SKretschmer, Mathias 		}
28988d39b4a6SWillem de Bruijn 		tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2899c14ac945SSoheil Hassas Yeganeh 					  addr, hlen, copylen, &sockc);
2900dbd46ab4SAlexander Drozdov 		if (likely(tp_len >= 0) &&
29015cfb4c8dSDaniel Borkmann 		    tp_len > dev->mtu + reserve &&
2902dfc39d40SJianfeng Tan 		    !vnet_hdr_sz &&
29033c70c132SDaniel Borkmann 		    !packet_extra_vlan_len_allowed(dev, skb))
290452f1454fSDaniel Borkmann 			tp_len = -EMSGSIZE;
29053c70c132SDaniel Borkmann 
290669e3c75fSJohann Baudy 		if (unlikely(tp_len < 0)) {
29078d39b4a6SWillem de Bruijn tpacket_error:
2908164bddacSEric Dumazet 			if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) {
290969e3c75fSJohann Baudy 				__packet_set_status(po, ph,
291069e3c75fSJohann Baudy 						TP_STATUS_AVAILABLE);
291169e3c75fSJohann Baudy 				packet_increment_head(&po->tx_ring);
291269e3c75fSJohann Baudy 				kfree_skb(skb);
291369e3c75fSJohann Baudy 				continue;
291469e3c75fSJohann Baudy 			} else {
291569e3c75fSJohann Baudy 				status = TP_STATUS_WRONG_FORMAT;
291669e3c75fSJohann Baudy 				err = tp_len;
291769e3c75fSJohann Baudy 				goto out_status;
291869e3c75fSJohann Baudy 			}
291969e3c75fSJohann Baudy 		}
292069e3c75fSJohann Baudy 
2921dfc39d40SJianfeng Tan 		if (vnet_hdr_sz) {
29229d2f67e4SJianfeng Tan 			if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
29231d036d25SWillem de Bruijn 				tp_len = -EINVAL;
29241d036d25SWillem de Bruijn 				goto tpacket_error;
29251d036d25SWillem de Bruijn 			}
29269d2f67e4SJianfeng Tan 			virtio_net_hdr_set_proto(skb, vnet_hdr);
29279d2f67e4SJianfeng Tan 		}
29281d036d25SWillem de Bruijn 
292969e3c75fSJohann Baudy 		skb->destructor = tpacket_destruct_skb;
293069e3c75fSJohann Baudy 		__packet_set_status(po, ph, TP_STATUS_SENDING);
2931b0138408SDaniel Borkmann 		packet_inc_pending(&po->tx_ring);
293269e3c75fSJohann Baudy 
293369e3c75fSJohann Baudy 		status = TP_STATUS_SEND_REQUEST;
2934105a201eSEric Dumazet 		err = packet_xmit(po, skb);
293529e8e659SHangbin Liu 		if (unlikely(err != 0)) {
293629e8e659SHangbin Liu 			if (err > 0)
2937eb70df13SJarek Poplawski 				err = net_xmit_errno(err);
2938eb70df13SJarek Poplawski 			if (err && __packet_get_status(po, ph) ==
2939eb70df13SJarek Poplawski 				   TP_STATUS_AVAILABLE) {
2940eb70df13SJarek Poplawski 				/* skb was destructed already */
2941eb70df13SJarek Poplawski 				skb = NULL;
2942eb70df13SJarek Poplawski 				goto out_status;
2943eb70df13SJarek Poplawski 			}
2944eb70df13SJarek Poplawski 			/*
2945eb70df13SJarek Poplawski 			 * skb was dropped but not destructed yet;
2946eb70df13SJarek Poplawski 			 * let's treat it like congestion or err < 0
2947eb70df13SJarek Poplawski 			 */
2948eb70df13SJarek Poplawski 			err = 0;
2949eb70df13SJarek Poplawski 		}
295069e3c75fSJohann Baudy 		packet_increment_head(&po->tx_ring);
295169e3c75fSJohann Baudy 		len_sum += tp_len;
2952b0138408SDaniel Borkmann 	} while (likely((ph != NULL) ||
2953b0138408SDaniel Borkmann 		/* Note: packet_read_pending() might be slow if we have
2954b0138408SDaniel Borkmann 		 * to call it as it's per_cpu variable, but in fast-path
2955b0138408SDaniel Borkmann 		 * we already short-circuit the loop with the first
2956b0138408SDaniel Borkmann 		 * condition, and luckily don't have to go that path
2957b0138408SDaniel Borkmann 		 * anyway.
2958b0138408SDaniel Borkmann 		 */
2959b0138408SDaniel Borkmann 		 (need_wait && packet_read_pending(&po->tx_ring))));
296069e3c75fSJohann Baudy 
296169e3c75fSJohann Baudy 	err = len_sum;
296269e3c75fSJohann Baudy 	goto out_put;
296369e3c75fSJohann Baudy 
296469e3c75fSJohann Baudy out_status:
296569e3c75fSJohann Baudy 	__packet_set_status(po, ph, status);
296669e3c75fSJohann Baudy 	kfree_skb(skb);
296769e3c75fSJohann Baudy out_put:
296869e3c75fSJohann Baudy 	dev_put(dev);
296969e3c75fSJohann Baudy out:
297069e3c75fSJohann Baudy 	mutex_unlock(&po->pg_vec_lock);
297169e3c75fSJohann Baudy 	return err;
297269e3c75fSJohann Baudy }
29731da177e4SLinus Torvalds 
/*
 * Allocate a send skb of @len payload bytes for socket @sk.
 *
 * @prepad:  extra bytes added to the linear allocation on top of the
 *           linear payload.
 * @reserve: bytes passed to skb_reserve() before the linear area is put.
 * @len:     total payload length.
 * @linear:  requested size of the linear part; 0 means "everything linear".
 * @noblock: forwarded unchanged to sock_alloc_send_pskb().
 * @err:     forwarded unchanged to sock_alloc_send_pskb().
 *
 * Returns the skb with skb->len accounting for the whole payload
 * (linear + paged), or NULL if the allocation failed.
 */
static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
					size_t reserve, size_t len,
					size_t linear, int noblock,
					int *err)
{
	/* Upper bound on the number of bytes kept in the paged part. */
	const size_t max_paged = MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
	struct sk_buff *skb;
	size_t paged;

	/* Small packet, or no split requested: keep everything linear. */
	if (!linear || prepad + len < PAGE_SIZE)
		linear = len;

	/* Whatever exceeds the paged cap is moved into the linear part. */
	paged = len - linear;
	if (paged > max_paged) {
		paged = max_paged;
		linear = len - max_paged;
	}

	skb = sock_alloc_send_pskb(sk, prepad + linear, paged, noblock,
				   err, PAGE_ALLOC_COSTLY_ORDER);
	if (!skb)
		return NULL;

	skb_reserve(skb, reserve);
	skb_put(skb, linear);

	/* skb_put() covered only the linear bytes; add the paged ones. */
	skb->data_len = paged;
	skb->len += paged;

	return skb;
}
2999bfd5f4a3SSridhar Samudrala 
/*
 * Transmit one packet built from @msg (the path packet_sendmsg() takes
 * when the socket has no tx ring).
 *
 * Device and protocol come from the socket (packet_cached_dev_get() and
 * po->num) when no address is supplied in msg->msg_name, otherwise from
 * the supplied sockaddr_ll.
 *
 * Returns @len on success -- grown by vnet_hdr_sz when the socket has a
 * virtio-net header size configured -- or a negative errno.
 */
static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, msg->msg_name);
	struct packet_sock *po = pkt_sk(sk);
	int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
	struct virtio_net_hdr vnet_hdr = { 0 };
	struct sockcm_cookie sockc;
	struct net_device *dev;
	struct sk_buff *skb;
	unsigned char *hw_addr = NULL;
	__be16 proto;
	int headroom, tailroom, linear_len;
	int copy_off = 0;
	int fcs_len = 0;
	int reserve = 0;
	int err;

	/*
	 *	Resolve the outgoing device and protocol.
	 */

	if (likely(!sll)) {
		dev	= packet_cached_dev_get(po);
		proto	= READ_ONCE(po->num);
	} else {
		err = -EINVAL;
		/* The name must hold a full sockaddr_ll and the claimed
		 * hardware address length.
		 */
		if (msg->msg_namelen < sizeof(struct sockaddr_ll) ||
		    msg->msg_namelen < (sll->sll_halen +
					offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		proto	= sll->sll_protocol;
		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
		if (sock->type == SOCK_DGRAM) {
			/* The name must also cover the device's own
			 * address length before sll_addr is used.
			 */
			if (dev && msg->msg_namelen < dev->addr_len +
				   offsetof(struct sockaddr_ll, sll_addr))
				goto out_unlock;
			hw_addr = sll->sll_addr;
		}
	}

	if (unlikely(!dev)) {
		err = -ENXIO;
		goto out_unlock;
	}
	if (unlikely(!(dev->flags & IFF_UP))) {
		err = -ENETDOWN;
		goto out_unlock;
	}

	/* Socket-level defaults first, then any control messages. */
	sockcm_init(&sockc, sk);
	sockc.mark = READ_ONCE(sk->sk_mark);
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err))
			goto out_unlock;
	}

	/* For SOCK_RAW the link-layer header is part of the user data. */
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	if (vnet_hdr_sz) {
		err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz);
		if (err)
			goto out_unlock;
	}

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		fcs_len = 4; /* We're doing our own CRC */
	}

	/* Coarse size check before allocating; allows for a VLAN tag. */
	if (!vnet_hdr.gso_type &&
	    (len > dev->mtu + reserve + VLAN_HLEN + fcs_len)) {
		err = -EMSGSIZE;
		goto out_unlock;
	}

	headroom = LL_RESERVED_SPACE(dev);
	tailroom = dev->needed_tailroom;
	linear_len = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
	linear_len = max(linear_len, min_t(int, len, dev->hard_header_len));

	/* Preset in case the allocator fails without writing through &err. */
	err = -ENOBUFS;
	skb = packet_alloc_skb(sk, headroom + tailroom, headroom, len,
			       linear_len, msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		goto out_unlock;

	skb_reset_network_header(skb);

	if (sock->type == SOCK_DGRAM) {
		/* Build the link-layer header on the caller's behalf. */
		copy_off = dev_hard_header(skb, dev, ntohs(proto), hw_addr,
					   NULL, len);
		if (unlikely(copy_off < 0)) {
			err = -EINVAL;
			goto out_free;
		}
	} else if (reserve) {
		skb_reserve(skb, -reserve);
		if (len < reserve + sizeof(struct ipv6hdr) &&
		    dev->min_header_len != dev->hard_header_len)
			skb_reset_network_header(skb);
	}

	/* Returns -EFAULT on error */
	err = skb_copy_datagram_from_iter(skb, copy_off, &msg->msg_iter, len);
	if (err)
		goto out_free;

	/* Raw senders supplied their own header: validate it first. */
	if (sock->type == SOCK_RAW &&
	    !dev_validate_header(dev, skb->data, len)) {
		err = -EINVAL;
		goto out_free;
	}
	/* Never transmit an empty skb. */
	if (!skb->len) {
		err = -EINVAL;
		goto out_free;
	}

	skb_setup_tx_timestamp(skb, sockc.tsflags);

	/* Exact size check; the helper decides if VLAN slack applies. */
	if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + fcs_len) &&
	    !packet_extra_vlan_len_allowed(dev, skb)) {
		err = -EMSGSIZE;
		goto out_free;
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = READ_ONCE(sk->sk_priority);
	skb->mark = sockc.mark;
	skb->tstamp = sockc.transmit_time;

	if (unlikely(fcs_len == 4))
		skb->no_fcs = 1;

	packet_parse_headers(skb, sock);

	if (vnet_hdr_sz) {
		err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
		if (err)
			goto out_free;
		/* Report the virtio header as sent as well. */
		len += vnet_hdr_sz;
		virtio_net_hdr_set_proto(skb, &vnet_hdr);
	}

	/* The skb is not freed here once it has gone to packet_xmit(). */
	err = packet_xmit(po, skb);
	if (unlikely(err > 0))
		err = net_xmit_errno(err);
	if (unlikely(err))
		goto out_unlock;

	dev_put(dev);

	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	dev_put(dev);
out:
	return err;
}
31581da177e4SLinus Torvalds 
/*
 * sendmsg entry point: use tpacket_snd() when the socket has a tx ring
 * (tx_ring.pg_vec is set), otherwise packet_snd().
 */
static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct packet_sock *po = pkt_sk(sock->sk);

	/* tx_ring.pg_vec is read here without pg_vec_lock, which is racy;
	 * tpacket_snd() will redo the check safely.
	 */
	if (!data_race(po->tx_ring.pg_vec))
		return packet_snd(sock, msg, len);

	return tpacket_snd(po, msg);
}
317269e3c75fSJohann Baudy 
31731da177e4SLinus Torvalds /*
31741da177e4SLinus Torvalds  *	Close a PACKET socket. This is fairly simple. We immediately go
31751da177e4SLinus Torvalds  *	to 'closed' state and remove our protocol entry in the device list.
31761da177e4SLinus Torvalds  */
31771da177e4SLinus Torvalds 
packet_release(struct socket * sock)31781da177e4SLinus Torvalds static int packet_release(struct socket *sock)
31791da177e4SLinus Torvalds {
31801da177e4SLinus Torvalds 	struct sock *sk = sock->sk;
31811da177e4SLinus Torvalds 	struct packet_sock *po;
31822bd624b4SAnoob Soman 	struct packet_fanout *f;
3183d12d01d6SDenis V. Lunev 	struct net *net;
3184f6fb8f10Schetan loke 	union tpacket_req_u req_u;
31851da177e4SLinus Torvalds 
31861da177e4SLinus Torvalds 	if (!sk)
31871da177e4SLinus Torvalds 		return 0;
31881da177e4SLinus Torvalds 
31893b1e0a65SYOSHIFUJI Hideaki 	net = sock_net(sk);
31901da177e4SLinus Torvalds 	po = pkt_sk(sk);
31911da177e4SLinus Torvalds 
31920fa7fa98SPavel Emelyanov 	mutex_lock(&net->packet.sklist_lock);
3193808f5114Sstephen hemminger 	sk_del_node_init_rcu(sk);
31940fa7fa98SPavel Emelyanov 	mutex_unlock(&net->packet.sklist_lock);
31950fa7fa98SPavel Emelyanov 
3196920de804SEric Dumazet 	sock_prot_inuse_add(net, sk->sk_prot, -1);
31971da177e4SLinus Torvalds 
3198808f5114Sstephen hemminger 	spin_lock(&po->bind_lock);
3199ce06b03eSDavid S. Miller 	unregister_prot_hook(sk, false);
320066e56cd4SDaniel Borkmann 	packet_cached_dev_reset(po);
320166e56cd4SDaniel Borkmann 
3202160ff18aSBen Greear 	if (po->prot_hook.dev) {
3203d62607c3SJakub Kicinski 		netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
3204160ff18aSBen Greear 		po->prot_hook.dev = NULL;
3205160ff18aSBen Greear 	}
3206808f5114Sstephen hemminger 	spin_unlock(&po->bind_lock);
32071da177e4SLinus Torvalds 
32081da177e4SLinus Torvalds 	packet_flush_mclist(sk);
32091da177e4SLinus Torvalds 
32105171b37dSEric Dumazet 	lock_sock(sk);
32119665d5d6SPhil Sutter 	if (po->rx_ring.pg_vec) {
3212f6fb8f10Schetan loke 		memset(&req_u, 0, sizeof(req_u));
3213f6fb8f10Schetan loke 		packet_set_ring(sk, &req_u, 1, 0);
32149665d5d6SPhil Sutter 	}
321569e3c75fSJohann Baudy 
32169665d5d6SPhil Sutter 	if (po->tx_ring.pg_vec) {
32179665d5d6SPhil Sutter 		memset(&req_u, 0, sizeof(req_u));
3218f6fb8f10Schetan loke 		packet_set_ring(sk, &req_u, 1, 1);
32199665d5d6SPhil Sutter 	}
32205171b37dSEric Dumazet 	release_sock(sk);
32211da177e4SLinus Torvalds 
32222bd624b4SAnoob Soman 	f = fanout_release(sk);
3223dc99f600SDavid S. Miller 
3224808f5114Sstephen hemminger 	synchronize_net();
32252bd624b4SAnoob Soman 
322657f015f5SMike Maloney 	kfree(po->rollover);
3227afa0925cSWillem de Bruijn 	if (f) {
32282bd624b4SAnoob Soman 		fanout_release_data(f);
32299c661b0bSTanner Love 		kvfree(f);
32302bd624b4SAnoob Soman 	}
32311da177e4SLinus Torvalds 	/*
32321da177e4SLinus Torvalds 	 *	Now the socket is dead. No more input will appear.
32331da177e4SLinus Torvalds 	 */
32341da177e4SLinus Torvalds 	sock_orphan(sk);
32351da177e4SLinus Torvalds 	sock->sk = NULL;
32361da177e4SLinus Torvalds 
32371da177e4SLinus Torvalds 	/* Purge queues */
32381da177e4SLinus Torvalds 
32391da177e4SLinus Torvalds 	skb_queue_purge(&sk->sk_receive_queue);
3240b0138408SDaniel Borkmann 	packet_free_pending(po);
32411da177e4SLinus Torvalds 
32421da177e4SLinus Torvalds 	sock_put(sk);
32431da177e4SLinus Torvalds 	return 0;
32441da177e4SLinus Torvalds }
32451da177e4SLinus Torvalds 
32461da177e4SLinus Torvalds /*
32471da177e4SLinus Torvalds  *	Attach a packet hook.
32481da177e4SLinus Torvalds  */
32491da177e4SLinus Torvalds 
/*
 * Bind @sk to a device and protocol.
 *
 * @name:    device name to look up, or NULL.
 * @ifindex: device index to look up when @name is NULL; 0 means no
 *           device.
 * @proto:   protocol to bind; 0 keeps the socket's current po->num.
 *
 * Returns 0 on success, -EINVAL if the socket is part of a fanout group,
 * or -ENODEV if the requested device cannot be found.
 */
static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
			  __be16 proto)
{
	struct packet_sock *po = pkt_sk(sk);
	struct net_device *dev = NULL;
	bool dev_gone = false;
	bool rehook;
	int err = 0;

	lock_sock(sk);
	spin_lock(&po->bind_lock);
	if (!proto)
		proto = po->num;

	rcu_read_lock();

	if (po->fanout) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (name)
		dev = dev_get_by_name_rcu(sock_net(sk), name);
	else if (ifindex)
		dev = dev_get_by_index_rcu(sock_net(sk), ifindex);

	/* A device was asked for but none was found. */
	if ((name || ifindex) && !dev) {
		err = -ENODEV;
		goto out_unlock;
	}

	/* Nothing to do if both protocol and device are unchanged. */
	rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev;
	if (!rehook)
		goto out_unlock;

	dev_hold(dev);
	if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
		/* NOTE(review): the RCU read section is dropped around
		 * the unhook, presumably because it may block -- confirm.
		 */
		rcu_read_unlock();
		/* prevents packet_notifier() from calling
		 * register_prot_hook()
		 */
		WRITE_ONCE(po->num, 0);
		__unregister_prot_hook(sk, true);
		rcu_read_lock();
		/* Look the device up again after the RCU gap. */
		if (dev)
			dev_gone = !dev_get_by_index_rcu(sock_net(sk),
							 dev->ifindex);
	}

	BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING));
	WRITE_ONCE(po->num, proto);
	po->prot_hook.type = proto;

	/* Drop the old device reference before recording the new one. */
	netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);

	if (unlikely(dev_gone)) {
		po->prot_hook.dev = NULL;
		WRITE_ONCE(po->ifindex, -1);
		packet_cached_dev_reset(po);
	} else {
		netdev_hold(dev, &po->prot_hook.dev_tracker,
			    GFP_ATOMIC);
		po->prot_hook.dev = dev;
		WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
		packet_cached_dev_assign(po, dev);
	}
	dev_put(dev);

	/* With protocol 0 the hook is not registered. */
	if (!proto)
		goto out_unlock;

	if (dev_gone || (dev && !(dev->flags & IFF_UP))) {
		/* Device vanished or is down: flag the error on the socket. */
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk_error_report(sk);
	} else {
		register_prot_hook(sk);
	}

out_unlock:
	rcu_read_unlock();
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return err;
}
33391da177e4SLinus Torvalds 
33401da177e4SLinus Torvalds /*
33411da177e4SLinus Torvalds  *	Bind a packet socket to a device
33421da177e4SLinus Torvalds  */
33431da177e4SLinus Torvalds 
packet_bind_spkt(struct socket * sock,struct sockaddr * uaddr,int addr_len)334440d4e3dfSEric Dumazet static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
334540d4e3dfSEric Dumazet 			    int addr_len)
33461da177e4SLinus Torvalds {
33471da177e4SLinus Torvalds 	struct sock *sk = sock->sk;
3348b5f0de6dSKees Cook 	char name[sizeof(uaddr->sa_data_min) + 1];
33491da177e4SLinus Torvalds 
33501da177e4SLinus Torvalds 	/*
33511da177e4SLinus Torvalds 	 *	Check legality
33521da177e4SLinus Torvalds 	 */
33531da177e4SLinus Torvalds 
33541da177e4SLinus Torvalds 	if (addr_len != sizeof(struct sockaddr))
33551da177e4SLinus Torvalds 		return -EINVAL;
3356540e2894SAlexander Potapenko 	/* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3357540e2894SAlexander Potapenko 	 * zero-terminated.
3358540e2894SAlexander Potapenko 	 */
3359b5f0de6dSKees Cook 	memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min));
3360b5f0de6dSKees Cook 	name[sizeof(uaddr->sa_data_min)] = 0;
33611da177e4SLinus Torvalds 
33626ffc57eaSEric Dumazet 	return packet_do_bind(sk, name, 0, 0);
33631da177e4SLinus Torvalds }
33641da177e4SLinus Torvalds 
packet_bind(struct socket * sock,struct sockaddr * uaddr,int addr_len)33651da177e4SLinus Torvalds static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
33661da177e4SLinus Torvalds {
33671da177e4SLinus Torvalds 	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
33681da177e4SLinus Torvalds 	struct sock *sk = sock->sk;
33691da177e4SLinus Torvalds 
33701da177e4SLinus Torvalds 	/*
33711da177e4SLinus Torvalds 	 *	Check legality
33721da177e4SLinus Torvalds 	 */
33731da177e4SLinus Torvalds 
33741da177e4SLinus Torvalds 	if (addr_len < sizeof(struct sockaddr_ll))
33751da177e4SLinus Torvalds 		return -EINVAL;
33761da177e4SLinus Torvalds 	if (sll->sll_family != AF_PACKET)
33771da177e4SLinus Torvalds 		return -EINVAL;
33781da177e4SLinus Torvalds 
33796ffc57eaSEric Dumazet 	return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol);
33801da177e4SLinus Torvalds }
33811da177e4SLinus Torvalds 
33821da177e4SLinus Torvalds static struct proto packet_proto = {
33831da177e4SLinus Torvalds 	.name	  = "PACKET",
33841da177e4SLinus Torvalds 	.owner	  = THIS_MODULE,
33851da177e4SLinus Torvalds 	.obj_size = sizeof(struct packet_sock),
33861da177e4SLinus Torvalds };
33871da177e4SLinus Torvalds 
33881da177e4SLinus Torvalds /*
33891da177e4SLinus Torvalds  *	Create a packet of type SOCK_PACKET.
33901da177e4SLinus Torvalds  */
33911da177e4SLinus Torvalds 
packet_create(struct net * net,struct socket * sock,int protocol,int kern)33923f378b68SEric Paris static int packet_create(struct net *net, struct socket *sock, int protocol,
33933f378b68SEric Paris 			 int kern)
33941da177e4SLinus Torvalds {
33951da177e4SLinus Torvalds 	struct sock *sk;
33961da177e4SLinus Torvalds 	struct packet_sock *po;
33970e11c91eSAl Viro 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
33981da177e4SLinus Torvalds 	int err;
33991da177e4SLinus Torvalds 
3400df008c91SEric W. Biederman 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
34011da177e4SLinus Torvalds 		return -EPERM;
3402be02097cSDavid S. Miller 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3403be02097cSDavid S. Miller 	    sock->type != SOCK_PACKET)
34041da177e4SLinus Torvalds 		return -ESOCKTNOSUPPORT;
34051da177e4SLinus Torvalds 
34061da177e4SLinus Torvalds 	sock->state = SS_UNCONNECTED;
34071da177e4SLinus Torvalds 
34081da177e4SLinus Torvalds 	err = -ENOBUFS;
340911aa9c28SEric W. Biederman 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
34101da177e4SLinus Torvalds 	if (sk == NULL)
34111da177e4SLinus Torvalds 		goto out;
34121da177e4SLinus Torvalds 
34131da177e4SLinus Torvalds 	sock->ops = &packet_ops;
34141da177e4SLinus Torvalds 	if (sock->type == SOCK_PACKET)
34151da177e4SLinus Torvalds 		sock->ops = &packet_ops_spkt;
3416be02097cSDavid S. Miller 
3417157f08dbSIgnat Korchagin 	po = pkt_sk(sk);
3418157f08dbSIgnat Korchagin 	err = packet_alloc_pending(po);
3419157f08dbSIgnat Korchagin 	if (err)
3420157f08dbSIgnat Korchagin 		goto out_sk_free;
3421157f08dbSIgnat Korchagin 
34221da177e4SLinus Torvalds 	sock_init_data(sock, sk);
34231da177e4SLinus Torvalds 
342489ed5b51SNeil Horman 	init_completion(&po->skb_completion);
34251da177e4SLinus Torvalds 	sk->sk_family = PF_PACKET;
34260e11c91eSAl Viro 	po->num = proto;
342766e56cd4SDaniel Borkmann 
342866e56cd4SDaniel Borkmann 	packet_cached_dev_reset(po);
34291da177e4SLinus Torvalds 
34301da177e4SLinus Torvalds 	sk->sk_destruct = packet_sock_destruct;
34311da177e4SLinus Torvalds 
34321da177e4SLinus Torvalds 	/*
34331da177e4SLinus Torvalds 	 *	Attach a protocol block
34341da177e4SLinus Torvalds 	 */
34351da177e4SLinus Torvalds 
34361da177e4SLinus Torvalds 	spin_lock_init(&po->bind_lock);
3437905db440SHerbert Xu 	mutex_init(&po->pg_vec_lock);
34380648ab70SWillem de Bruijn 	po->rollover = NULL;
34391da177e4SLinus Torvalds 	po->prot_hook.func = packet_rcv;
3440be02097cSDavid S. Miller 
34411da177e4SLinus Torvalds 	if (sock->type == SOCK_PACKET)
34421da177e4SLinus Torvalds 		po->prot_hook.func = packet_rcv_spkt;
3443be02097cSDavid S. Miller 
34441da177e4SLinus Torvalds 	po->prot_hook.af_packet_priv = sk;
344547934e06SCongyu Liu 	po->prot_hook.af_packet_net = sock_net(sk);
34461da177e4SLinus Torvalds 
34470e11c91eSAl Viro 	if (proto) {
34480e11c91eSAl Viro 		po->prot_hook.type = proto;
3449a6361f0cSWillem de Bruijn 		__register_prot_hook(sk);
34501da177e4SLinus Torvalds 	}
34511da177e4SLinus Torvalds 
34520fa7fa98SPavel Emelyanov 	mutex_lock(&net->packet.sklist_lock);
3453a4dc6a49SMaxime Chevallier 	sk_add_node_tail_rcu(sk, &net->packet.sklist);
34540fa7fa98SPavel Emelyanov 	mutex_unlock(&net->packet.sklist_lock);
34550fa7fa98SPavel Emelyanov 
34563680453cSEric Dumazet 	sock_prot_inuse_add(net, &packet_proto, 1);
3457808f5114Sstephen hemminger 
345840d4e3dfSEric Dumazet 	return 0;
3459157f08dbSIgnat Korchagin out_sk_free:
3460b0138408SDaniel Borkmann 	sk_free(sk);
34611da177e4SLinus Torvalds out:
34621da177e4SLinus Torvalds 	return err;
34631da177e4SLinus Torvalds }
34641da177e4SLinus Torvalds 
34651da177e4SLinus Torvalds /*
34661da177e4SLinus Torvalds  *	Pull a packet from our receive queue and hand it to the user.
34671da177e4SLinus Torvalds  *	If necessary we block.
34681da177e4SLinus Torvalds  */
34691da177e4SLinus Torvalds 
packet_recvmsg(struct socket * sock,struct msghdr * msg,size_t len,int flags)34701b784140SYing Xue static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
34711b784140SYing Xue 			  int flags)
34721da177e4SLinus Torvalds {
34731da177e4SLinus Torvalds 	struct sock *sk = sock->sk;
34741da177e4SLinus Torvalds 	struct sk_buff *skb;
34751da177e4SLinus Torvalds 	int copied, err;
3476dfc39d40SJianfeng Tan 	int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
34772472d761SEyal Birger 	unsigned int origlen = 0;
34781da177e4SLinus Torvalds 
34791da177e4SLinus Torvalds 	err = -EINVAL;
3480ed85b565SRichard Cochran 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
34811da177e4SLinus Torvalds 		goto out;
34821da177e4SLinus Torvalds 
34831da177e4SLinus Torvalds #if 0
34841da177e4SLinus Torvalds 	/* What error should we return now? EUNATTACH? */
34851da177e4SLinus Torvalds 	if (pkt_sk(sk)->ifindex < 0)
34861da177e4SLinus Torvalds 		return -ENODEV;
34871da177e4SLinus Torvalds #endif
34881da177e4SLinus Torvalds 
3489ed85b565SRichard Cochran 	if (flags & MSG_ERRQUEUE) {
3490cb820f8eSRichard Cochran 		err = sock_recv_errqueue(sk, msg, len,
3491cb820f8eSRichard Cochran 					 SOL_PACKET, PACKET_TX_TIMESTAMP);
3492ed85b565SRichard Cochran 		goto out;
3493ed85b565SRichard Cochran 	}
3494ed85b565SRichard Cochran 
34951da177e4SLinus Torvalds 	/*
34961da177e4SLinus Torvalds 	 *	Call the generic datagram receiver. This handles all sorts
34971da177e4SLinus Torvalds 	 *	of horrible races and re-entrancy so we can forget about it
34981da177e4SLinus Torvalds 	 *	in the protocol layers.
34991da177e4SLinus Torvalds 	 *
35001da177e4SLinus Torvalds 	 *	Now it will return ENETDOWN, if device have just gone down,
35011da177e4SLinus Torvalds 	 *	but then it will block.
35021da177e4SLinus Torvalds 	 */
35031da177e4SLinus Torvalds 
3504f4b41f06SOliver Hartkopp 	skb = skb_recv_datagram(sk, flags, &err);
35051da177e4SLinus Torvalds 
35061da177e4SLinus Torvalds 	/*
35071da177e4SLinus Torvalds 	 *	An error occurred so return it. Because skb_recv_datagram()
35081da177e4SLinus Torvalds 	 *	handles the blocking we don't see and worry about blocking
35091da177e4SLinus Torvalds 	 *	retries.
35101da177e4SLinus Torvalds 	 */
35111da177e4SLinus Torvalds 
35121da177e4SLinus Torvalds 	if (skb == NULL)
35131da177e4SLinus Torvalds 		goto out;
35141da177e4SLinus Torvalds 
35159bb6cd65SEric Dumazet 	packet_rcv_try_clear_pressure(pkt_sk(sk));
35162ccdbaa6SWillem de Bruijn 
3517dfc39d40SJianfeng Tan 	if (vnet_hdr_len) {
3518dfc39d40SJianfeng Tan 		err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
351916cc1400SWillem de Bruijn 		if (err)
3520bfd5f4a3SSridhar Samudrala 			goto out_free;
3521bfd5f4a3SSridhar Samudrala 	}
3522bfd5f4a3SSridhar Samudrala 
3523f3d33426SHannes Frederic Sowa 	/* You lose any data beyond the buffer you gave. If it worries
3524f3d33426SHannes Frederic Sowa 	 * a user program they can ask the device for its MTU
3525f3d33426SHannes Frederic Sowa 	 * anyway.
35260fb375fbSEric W. Biederman 	 */
35271da177e4SLinus Torvalds 	copied = skb->len;
352840d4e3dfSEric Dumazet 	if (copied > len) {
35291da177e4SLinus Torvalds 		copied = len;
35301da177e4SLinus Torvalds 		msg->msg_flags |= MSG_TRUNC;
35311da177e4SLinus Torvalds 	}
35321da177e4SLinus Torvalds 
353351f3d02bSDavid S. Miller 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
35341da177e4SLinus Torvalds 	if (err)
35351da177e4SLinus Torvalds 		goto out_free;
35361da177e4SLinus Torvalds 
35372472d761SEyal Birger 	if (sock->type != SOCK_PACKET) {
35382472d761SEyal Birger 		struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
35392472d761SEyal Birger 
35402472d761SEyal Birger 		/* Original length was stored in sockaddr_ll fields */
35412472d761SEyal Birger 		origlen = PACKET_SKB_CB(skb)->sa.origlen;
35422472d761SEyal Birger 		sll->sll_family = AF_PACKET;
35435a041d25SChengen Du 		sll->sll_protocol = (sock->type == SOCK_DGRAM) ?
35445a041d25SChengen Du 			vlan_get_protocol_dgram(skb) : skb->protocol;
35452472d761SEyal Birger 	}
35462472d761SEyal Birger 
35476fd1d51cSErin MacNeil 	sock_recv_cmsgs(msg, sk, skb);
35481da177e4SLinus Torvalds 
3549f3d33426SHannes Frederic Sowa 	if (msg->msg_name) {
3550c700525fSEric Dumazet 		const size_t max_len = min(sizeof(skb->cb),
3551c700525fSEric Dumazet 					   sizeof(struct sockaddr_storage));
3552b2cf86e1SWillem de Bruijn 		int copy_len;
3553b2cf86e1SWillem de Bruijn 
3554f3d33426SHannes Frederic Sowa 		/* If the address length field is there to be filled
3555f3d33426SHannes Frederic Sowa 		 * in, we fill it in now.
3556f3d33426SHannes Frederic Sowa 		 */
3557f3d33426SHannes Frederic Sowa 		if (sock->type == SOCK_PACKET) {
3558342dfc30SSteffen Hurrle 			__sockaddr_check_size(sizeof(struct sockaddr_pkt));
3559f3d33426SHannes Frederic Sowa 			msg->msg_namelen = sizeof(struct sockaddr_pkt);
3560b2cf86e1SWillem de Bruijn 			copy_len = msg->msg_namelen;
3561f3d33426SHannes Frederic Sowa 		} else {
3562f3d33426SHannes Frederic Sowa 			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
35632472d761SEyal Birger 
3564f3d33426SHannes Frederic Sowa 			msg->msg_namelen = sll->sll_halen +
3565f3d33426SHannes Frederic Sowa 				offsetof(struct sockaddr_ll, sll_addr);
3566b2cf86e1SWillem de Bruijn 			copy_len = msg->msg_namelen;
3567b2cf86e1SWillem de Bruijn 			if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3568b2cf86e1SWillem de Bruijn 				memset(msg->msg_name +
3569b2cf86e1SWillem de Bruijn 				       offsetof(struct sockaddr_ll, sll_addr),
3570b2cf86e1SWillem de Bruijn 				       0, sizeof(sll->sll_addr));
3571b2cf86e1SWillem de Bruijn 				msg->msg_namelen = sizeof(struct sockaddr_ll);
3572f3d33426SHannes Frederic Sowa 			}
3573b2cf86e1SWillem de Bruijn 		}
3574c700525fSEric Dumazet 		if (WARN_ON_ONCE(copy_len > max_len)) {
3575c700525fSEric Dumazet 			copy_len = max_len;
3576c700525fSEric Dumazet 			msg->msg_namelen = copy_len;
3577c700525fSEric Dumazet 		}
3578b2cf86e1SWillem de Bruijn 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
3579f3d33426SHannes Frederic Sowa 	}
35801da177e4SLinus Torvalds 
3581fd53c297SEric Dumazet 	if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) {
3582ffbc6111SHerbert Xu 		struct tpacket_auxdata aux;
3583ffbc6111SHerbert Xu 
3584ffbc6111SHerbert Xu 		aux.tp_status = TP_STATUS_USER;
3585ffbc6111SHerbert Xu 		if (skb->ip_summed == CHECKSUM_PARTIAL)
3586ffbc6111SHerbert Xu 			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3587682f048bSAlexander Drozdov 		else if (skb->pkt_type != PACKET_OUTGOING &&
3588b85f628aSWillem de Bruijn 			 skb_csum_unnecessary(skb))
3589682f048bSAlexander Drozdov 			aux.tp_status |= TP_STATUS_CSUM_VALID;
35908e08bb75SXin Long 		if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
35918e08bb75SXin Long 			aux.tp_status |= TP_STATUS_GSO_TCP;
3592682f048bSAlexander Drozdov 
35932472d761SEyal Birger 		aux.tp_len = origlen;
3594ffbc6111SHerbert Xu 		aux.tp_snaplen = skb->len;
3595ffbc6111SHerbert Xu 		aux.tp_mac = 0;
3596bbe735e4SArnaldo Carvalho de Melo 		aux.tp_net = skb_network_offset(skb);
3597df8a39deSJiri Pirko 		if (skb_vlan_tag_present(skb)) {
3598df8a39deSJiri Pirko 			aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3599a0cdfcf3SAtzm Watanabe 			aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3600a0cdfcf3SAtzm Watanabe 			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
36015a041d25SChengen Du 		} else if (unlikely(sock->type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
36025a041d25SChengen Du 			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
36035a041d25SChengen Du 			struct net_device *dev;
36045a041d25SChengen Du 
36055a041d25SChengen Du 			rcu_read_lock();
36065a041d25SChengen Du 			dev = dev_get_by_index_rcu(sock_net(sk), sll->sll_ifindex);
36075a041d25SChengen Du 			if (dev) {
36085a041d25SChengen Du 				aux.tp_vlan_tci = vlan_get_tci(skb, dev);
36095a041d25SChengen Du 				aux.tp_vlan_tpid = ntohs(skb->protocol);
36105a041d25SChengen Du 				aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
36115a041d25SChengen Du 			} else {
36125a041d25SChengen Du 				aux.tp_vlan_tci = 0;
36135a041d25SChengen Du 				aux.tp_vlan_tpid = 0;
36145a041d25SChengen Du 			}
36155a041d25SChengen Du 			rcu_read_unlock();
3616a3bcc23eSBen Greear 		} else {
3617a3bcc23eSBen Greear 			aux.tp_vlan_tci = 0;
3618a0cdfcf3SAtzm Watanabe 			aux.tp_vlan_tpid = 0;
3619a3bcc23eSBen Greear 		}
3620ffbc6111SHerbert Xu 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
36218dc41944SHerbert Xu 	}
36228dc41944SHerbert Xu 
36231da177e4SLinus Torvalds 	/*
36241da177e4SLinus Torvalds 	 *	Free or return the buffer as appropriate. Again this
36251da177e4SLinus Torvalds 	 *	hides all the races and re-entrancy issues from us.
36261da177e4SLinus Torvalds 	 */
3627bfd5f4a3SSridhar Samudrala 	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
36281da177e4SLinus Torvalds 
36291da177e4SLinus Torvalds out_free:
36301da177e4SLinus Torvalds 	skb_free_datagram(sk, skb);
36311da177e4SLinus Torvalds out:
36321da177e4SLinus Torvalds 	return err;
36331da177e4SLinus Torvalds }
36341da177e4SLinus Torvalds 
packet_getname_spkt(struct socket * sock,struct sockaddr * uaddr,int peer)36351da177e4SLinus Torvalds static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
36369b2c45d4SDenys Vlasenko 			       int peer)
36371da177e4SLinus Torvalds {
36381da177e4SLinus Torvalds 	struct net_device *dev;
36391da177e4SLinus Torvalds 	struct sock *sk	= sock->sk;
36401da177e4SLinus Torvalds 
36411da177e4SLinus Torvalds 	if (peer)
36421da177e4SLinus Torvalds 		return -EOPNOTSUPP;
36431da177e4SLinus Torvalds 
36441da177e4SLinus Torvalds 	uaddr->sa_family = AF_PACKET;
3645b5f0de6dSKees Cook 	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min));
3646654d1f8aSEric Dumazet 	rcu_read_lock();
3647e032f7c9SEric Dumazet 	dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
3648654d1f8aSEric Dumazet 	if (dev)
3649b5f0de6dSKees Cook 		strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min));
3650654d1f8aSEric Dumazet 	rcu_read_unlock();
36511da177e4SLinus Torvalds 
36529b2c45d4SDenys Vlasenko 	return sizeof(*uaddr);
36531da177e4SLinus Torvalds }
36541da177e4SLinus Torvalds 
packet_getname(struct socket * sock,struct sockaddr * uaddr,int peer)36551da177e4SLinus Torvalds static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
36569b2c45d4SDenys Vlasenko 			  int peer)
36571da177e4SLinus Torvalds {
36581da177e4SLinus Torvalds 	struct net_device *dev;
36591da177e4SLinus Torvalds 	struct sock *sk = sock->sk;
36601da177e4SLinus Torvalds 	struct packet_sock *po = pkt_sk(sk);
366113cfa97bSCyrill Gorcunov 	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3662e032f7c9SEric Dumazet 	int ifindex;
36631da177e4SLinus Torvalds 
36641da177e4SLinus Torvalds 	if (peer)
36651da177e4SLinus Torvalds 		return -EOPNOTSUPP;
36661da177e4SLinus Torvalds 
3667e032f7c9SEric Dumazet 	ifindex = READ_ONCE(po->ifindex);
36681da177e4SLinus Torvalds 	sll->sll_family = AF_PACKET;
3669e032f7c9SEric Dumazet 	sll->sll_ifindex = ifindex;
3670c7d2ef5dSEric Dumazet 	sll->sll_protocol = READ_ONCE(po->num);
367167286640SVasiliy Kulikov 	sll->sll_pkttype = 0;
3672654d1f8aSEric Dumazet 	rcu_read_lock();
3673e032f7c9SEric Dumazet 	dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
36741da177e4SLinus Torvalds 	if (dev) {
36751da177e4SLinus Torvalds 		sll->sll_hatype = dev->type;
36761da177e4SLinus Torvalds 		sll->sll_halen = dev->addr_len;
3677e2bca487SKuniyuki Iwashima 
3678e2bca487SKuniyuki Iwashima 		/* Let __fortify_memcpy_chk() know the actual buffer size. */
3679e2bca487SKuniyuki Iwashima 		memcpy(((struct sockaddr_storage *)sll)->__data +
3680e2bca487SKuniyuki Iwashima 		       offsetof(struct sockaddr_ll, sll_addr) -
3681e2bca487SKuniyuki Iwashima 		       offsetofend(struct sockaddr_ll, sll_family),
3682e2bca487SKuniyuki Iwashima 		       dev->dev_addr, dev->addr_len);
36831da177e4SLinus Torvalds 	} else {
36841da177e4SLinus Torvalds 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
36851da177e4SLinus Torvalds 		sll->sll_halen = 0;
36861da177e4SLinus Torvalds 	}
3687654d1f8aSEric Dumazet 	rcu_read_unlock();
36881da177e4SLinus Torvalds 
36899b2c45d4SDenys Vlasenko 	return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
36901da177e4SLinus Torvalds }
36911da177e4SLinus Torvalds 
packet_dev_mc(struct net_device * dev,struct packet_mclist * i,int what)36922aeb0b88SWang Chen static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
36932aeb0b88SWang Chen 			 int what)
36941da177e4SLinus Torvalds {
36951da177e4SLinus Torvalds 	switch (i->type) {
36961da177e4SLinus Torvalds 	case PACKET_MR_MULTICAST:
36971162563fSJiri Pirko 		if (i->alen != dev->addr_len)
36981162563fSJiri Pirko 			return -EINVAL;
36991da177e4SLinus Torvalds 		if (what > 0)
370022bedad3SJiri Pirko 			return dev_mc_add(dev, i->addr);
37011da177e4SLinus Torvalds 		else
370222bedad3SJiri Pirko 			return dev_mc_del(dev, i->addr);
37031da177e4SLinus Torvalds 		break;
37041da177e4SLinus Torvalds 	case PACKET_MR_PROMISC:
37052aeb0b88SWang Chen 		return dev_set_promiscuity(dev, what);
37061da177e4SLinus Torvalds 	case PACKET_MR_ALLMULTI:
37072aeb0b88SWang Chen 		return dev_set_allmulti(dev, what);
3708d95ed927SEric W. Biederman 	case PACKET_MR_UNICAST:
37091162563fSJiri Pirko 		if (i->alen != dev->addr_len)
37101162563fSJiri Pirko 			return -EINVAL;
3711d95ed927SEric W. Biederman 		if (what > 0)
3712a748ee24SJiri Pirko 			return dev_uc_add(dev, i->addr);
3713d95ed927SEric W. Biederman 		else
3714a748ee24SJiri Pirko 			return dev_uc_del(dev, i->addr);
3715d95ed927SEric W. Biederman 		break;
371640d4e3dfSEric Dumazet 	default:
371740d4e3dfSEric Dumazet 		break;
37181da177e4SLinus Torvalds 	}
37192aeb0b88SWang Chen 	return 0;
37201da177e4SLinus Torvalds }
37211da177e4SLinus Torvalds 
packet_dev_mclist_delete(struct net_device * dev,struct packet_mclist ** mlp)372282f17091SFrancesco Ruggeri static void packet_dev_mclist_delete(struct net_device *dev,
372382f17091SFrancesco Ruggeri 				     struct packet_mclist **mlp)
37241da177e4SLinus Torvalds {
372582f17091SFrancesco Ruggeri 	struct packet_mclist *ml;
372682f17091SFrancesco Ruggeri 
372782f17091SFrancesco Ruggeri 	while ((ml = *mlp) != NULL) {
372882f17091SFrancesco Ruggeri 		if (ml->ifindex == dev->ifindex) {
372982f17091SFrancesco Ruggeri 			packet_dev_mc(dev, ml, -1);
373082f17091SFrancesco Ruggeri 			*mlp = ml->next;
373182f17091SFrancesco Ruggeri 			kfree(ml);
373282f17091SFrancesco Ruggeri 		} else
373382f17091SFrancesco Ruggeri 			mlp = &ml->next;
37341da177e4SLinus Torvalds 	}
37351da177e4SLinus Torvalds }
37361da177e4SLinus Torvalds 
packet_mc_add(struct sock * sk,struct packet_mreq_max * mreq)37370fb375fbSEric W. Biederman static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
37381da177e4SLinus Torvalds {
37391da177e4SLinus Torvalds 	struct packet_sock *po = pkt_sk(sk);
37401da177e4SLinus Torvalds 	struct packet_mclist *ml, *i;
37411da177e4SLinus Torvalds 	struct net_device *dev;
37421da177e4SLinus Torvalds 	int err;
37431da177e4SLinus Torvalds 
37441da177e4SLinus Torvalds 	rtnl_lock();
37451da177e4SLinus Torvalds 
37461da177e4SLinus Torvalds 	err = -ENODEV;
37473b1e0a65SYOSHIFUJI Hideaki 	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
37481da177e4SLinus Torvalds 	if (!dev)
37491da177e4SLinus Torvalds 		goto done;
37501da177e4SLinus Torvalds 
37511da177e4SLinus Torvalds 	err = -EINVAL;
37521162563fSJiri Pirko 	if (mreq->mr_alen > dev->addr_len)
37531da177e4SLinus Torvalds 		goto done;
37541da177e4SLinus Torvalds 
37551da177e4SLinus Torvalds 	err = -ENOBUFS;
37568b3a7005SKris Katterjohn 	i = kmalloc(sizeof(*i), GFP_KERNEL);
37571da177e4SLinus Torvalds 	if (i == NULL)
37581da177e4SLinus Torvalds 		goto done;
37591da177e4SLinus Torvalds 
37601da177e4SLinus Torvalds 	err = 0;
37611da177e4SLinus Torvalds 	for (ml = po->mclist; ml; ml = ml->next) {
37621da177e4SLinus Torvalds 		if (ml->ifindex == mreq->mr_ifindex &&
37631da177e4SLinus Torvalds 		    ml->type == mreq->mr_type &&
37641da177e4SLinus Torvalds 		    ml->alen == mreq->mr_alen &&
37651da177e4SLinus Torvalds 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
37661da177e4SLinus Torvalds 			ml->count++;
37671da177e4SLinus Torvalds 			/* Free the new element ... */
37681da177e4SLinus Torvalds 			kfree(i);
37691da177e4SLinus Torvalds 			goto done;
37701da177e4SLinus Torvalds 		}
37711da177e4SLinus Torvalds 	}
37721da177e4SLinus Torvalds 
37731da177e4SLinus Torvalds 	i->type = mreq->mr_type;
37741da177e4SLinus Torvalds 	i->ifindex = mreq->mr_ifindex;
37751da177e4SLinus Torvalds 	i->alen = mreq->mr_alen;
37761da177e4SLinus Torvalds 	memcpy(i->addr, mreq->mr_address, i->alen);
3777309cf37fSMathias Krause 	memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
37781da177e4SLinus Torvalds 	i->count = 1;
37791da177e4SLinus Torvalds 	i->next = po->mclist;
37801da177e4SLinus Torvalds 	po->mclist = i;
37812aeb0b88SWang Chen 	err = packet_dev_mc(dev, i, 1);
37822aeb0b88SWang Chen 	if (err) {
37832aeb0b88SWang Chen 		po->mclist = i->next;
37842aeb0b88SWang Chen 		kfree(i);
37852aeb0b88SWang Chen 	}
37861da177e4SLinus Torvalds 
37871da177e4SLinus Torvalds done:
37881da177e4SLinus Torvalds 	rtnl_unlock();
37891da177e4SLinus Torvalds 	return err;
37901da177e4SLinus Torvalds }
37911da177e4SLinus Torvalds 
packet_mc_drop(struct sock * sk,struct packet_mreq_max * mreq)37920fb375fbSEric W. Biederman static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
37931da177e4SLinus Torvalds {
37941da177e4SLinus Torvalds 	struct packet_mclist *ml, **mlp;
37951da177e4SLinus Torvalds 
37961da177e4SLinus Torvalds 	rtnl_lock();
37971da177e4SLinus Torvalds 
37981da177e4SLinus Torvalds 	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
37991da177e4SLinus Torvalds 		if (ml->ifindex == mreq->mr_ifindex &&
38001da177e4SLinus Torvalds 		    ml->type == mreq->mr_type &&
38011da177e4SLinus Torvalds 		    ml->alen == mreq->mr_alen &&
38021da177e4SLinus Torvalds 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
38031da177e4SLinus Torvalds 			if (--ml->count == 0) {
38041da177e4SLinus Torvalds 				struct net_device *dev;
38051da177e4SLinus Torvalds 				*mlp = ml->next;
3806ad959e76SEric Dumazet 				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3807ad959e76SEric Dumazet 				if (dev)
38081da177e4SLinus Torvalds 					packet_dev_mc(dev, ml, -1);
38091da177e4SLinus Torvalds 				kfree(ml);
38101da177e4SLinus Torvalds 			}
381182f17091SFrancesco Ruggeri 			break;
381282f17091SFrancesco Ruggeri 		}
381382f17091SFrancesco Ruggeri 	}
38141da177e4SLinus Torvalds 	rtnl_unlock();
38151da177e4SLinus Torvalds 	return 0;
38161da177e4SLinus Torvalds }
38171da177e4SLinus Torvalds 
packet_flush_mclist(struct sock * sk)38181da177e4SLinus Torvalds static void packet_flush_mclist(struct sock *sk)
38191da177e4SLinus Torvalds {
38201da177e4SLinus Torvalds 	struct packet_sock *po = pkt_sk(sk);
38211da177e4SLinus Torvalds 	struct packet_mclist *ml;
38221da177e4SLinus Torvalds 
38231da177e4SLinus Torvalds 	if (!po->mclist)
38241da177e4SLinus Torvalds 		return;
38251da177e4SLinus Torvalds 
38261da177e4SLinus Torvalds 	rtnl_lock();
38271da177e4SLinus Torvalds 	while ((ml = po->mclist) != NULL) {
38281da177e4SLinus Torvalds 		struct net_device *dev;
38291da177e4SLinus Torvalds 
38301da177e4SLinus Torvalds 		po->mclist = ml->next;
3831ad959e76SEric Dumazet 		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3832ad959e76SEric Dumazet 		if (dev != NULL)
38331da177e4SLinus Torvalds 			packet_dev_mc(dev, ml, -1);
38341da177e4SLinus Torvalds 		kfree(ml);
38351da177e4SLinus Torvalds 	}
38361da177e4SLinus Torvalds 	rtnl_unlock();
38371da177e4SLinus Torvalds }
38381da177e4SLinus Torvalds 
38391da177e4SLinus Torvalds static int
packet_setsockopt(struct socket * sock,int level,int optname,sockptr_t optval,unsigned int optlen)3840a7b75c5aSChristoph Hellwig packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
3841a7b75c5aSChristoph Hellwig 		  unsigned int optlen)
38421da177e4SLinus Torvalds {
38431da177e4SLinus Torvalds 	struct sock *sk = sock->sk;
38448dc41944SHerbert Xu 	struct packet_sock *po = pkt_sk(sk);
38451da177e4SLinus Torvalds 	int ret;
38461da177e4SLinus Torvalds 
38471da177e4SLinus Torvalds 	if (level != SOL_PACKET)
38481da177e4SLinus Torvalds 		return -ENOPROTOOPT;
38491da177e4SLinus Torvalds 
38501da177e4SLinus Torvalds 	switch (optname) {
38511da177e4SLinus Torvalds 	case PACKET_ADD_MEMBERSHIP:
38521da177e4SLinus Torvalds 	case PACKET_DROP_MEMBERSHIP:
38531da177e4SLinus Torvalds 	{
38540fb375fbSEric W. Biederman 		struct packet_mreq_max mreq;
38550fb375fbSEric W. Biederman 		int len = optlen;
38560fb375fbSEric W. Biederman 		memset(&mreq, 0, sizeof(mreq));
38570fb375fbSEric W. Biederman 		if (len < sizeof(struct packet_mreq))
38581da177e4SLinus Torvalds 			return -EINVAL;
38590fb375fbSEric W. Biederman 		if (len > sizeof(mreq))
38600fb375fbSEric W. Biederman 			len = sizeof(mreq);
3861a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&mreq, optval, len))
38621da177e4SLinus Torvalds 			return -EFAULT;
38630fb375fbSEric W. Biederman 		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
38640fb375fbSEric W. Biederman 			return -EINVAL;
38651da177e4SLinus Torvalds 		if (optname == PACKET_ADD_MEMBERSHIP)
38661da177e4SLinus Torvalds 			ret = packet_mc_add(sk, &mreq);
38671da177e4SLinus Torvalds 		else
38681da177e4SLinus Torvalds 			ret = packet_mc_drop(sk, &mreq);
38691da177e4SLinus Torvalds 		return ret;
38701da177e4SLinus Torvalds 	}
3871a2efcfa0SDavid S. Miller 
38721da177e4SLinus Torvalds 	case PACKET_RX_RING:
387369e3c75fSJohann Baudy 	case PACKET_TX_RING:
38741da177e4SLinus Torvalds 	{
3875f6fb8f10Schetan loke 		union tpacket_req_u req_u;
38761da177e4SLinus Torvalds 
3877c572d3a9SEric Dumazet 		ret = -EINVAL;
38785171b37dSEric Dumazet 		lock_sock(sk);
3879f6fb8f10Schetan loke 		switch (po->tp_version) {
3880f6fb8f10Schetan loke 		case TPACKET_V1:
3881f6fb8f10Schetan loke 		case TPACKET_V2:
3882c572d3a9SEric Dumazet 			if (optlen < sizeof(req_u.req))
3883c572d3a9SEric Dumazet 				break;
3884c572d3a9SEric Dumazet 			ret = copy_from_sockptr(&req_u.req, optval,
3885c572d3a9SEric Dumazet 						sizeof(req_u.req)) ?
3886c572d3a9SEric Dumazet 						-EINVAL : 0;
3887f6fb8f10Schetan loke 			break;
3888f6fb8f10Schetan loke 		case TPACKET_V3:
3889f6fb8f10Schetan loke 		default:
3890c572d3a9SEric Dumazet 			if (optlen < sizeof(req_u.req3))
3891c572d3a9SEric Dumazet 				break;
3892c572d3a9SEric Dumazet 			ret = copy_from_sockptr(&req_u.req3, optval,
3893c572d3a9SEric Dumazet 						sizeof(req_u.req3)) ?
3894c572d3a9SEric Dumazet 						-EINVAL : 0;
3895f6fb8f10Schetan loke 			break;
3896f6fb8f10Schetan loke 		}
3897c572d3a9SEric Dumazet 		if (!ret)
38985171b37dSEric Dumazet 			ret = packet_set_ring(sk, &req_u, 0,
3899f6fb8f10Schetan loke 					      optname == PACKET_TX_RING);
39005171b37dSEric Dumazet 		release_sock(sk);
39015171b37dSEric Dumazet 		return ret;
39025171b37dSEric Dumazet 	}
39031da177e4SLinus Torvalds 	case PACKET_COPY_THRESH:
39041da177e4SLinus Torvalds 	{
39051da177e4SLinus Torvalds 		int val;
39061da177e4SLinus Torvalds 
39071da177e4SLinus Torvalds 		if (optlen != sizeof(val))
39081da177e4SLinus Torvalds 			return -EINVAL;
3909a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&val, optval, sizeof(val)))
39101da177e4SLinus Torvalds 			return -EFAULT;
39111da177e4SLinus Torvalds 
39121da177e4SLinus Torvalds 		pkt_sk(sk)->copy_thresh = val;
39131da177e4SLinus Torvalds 		return 0;
39141da177e4SLinus Torvalds 	}
3915bbd6ef87SPatrick McHardy 	case PACKET_VERSION:
3916bbd6ef87SPatrick McHardy 	{
3917bbd6ef87SPatrick McHardy 		int val;
3918bbd6ef87SPatrick McHardy 
3919bbd6ef87SPatrick McHardy 		if (optlen != sizeof(val))
3920bbd6ef87SPatrick McHardy 			return -EINVAL;
3921a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&val, optval, sizeof(val)))
3922bbd6ef87SPatrick McHardy 			return -EFAULT;
3923bbd6ef87SPatrick McHardy 		switch (val) {
3924bbd6ef87SPatrick McHardy 		case TPACKET_V1:
3925bbd6ef87SPatrick McHardy 		case TPACKET_V2:
3926f6fb8f10Schetan loke 		case TPACKET_V3:
392784ac7260SPhilip Pettersson 			break;
3928bbd6ef87SPatrick McHardy 		default:
3929bbd6ef87SPatrick McHardy 			return -EINVAL;
3930bbd6ef87SPatrick McHardy 		}
393184ac7260SPhilip Pettersson 		lock_sock(sk);
393284ac7260SPhilip Pettersson 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
393384ac7260SPhilip Pettersson 			ret = -EBUSY;
393484ac7260SPhilip Pettersson 		} else {
393584ac7260SPhilip Pettersson 			po->tp_version = val;
393684ac7260SPhilip Pettersson 			ret = 0;
393784ac7260SPhilip Pettersson 		}
393884ac7260SPhilip Pettersson 		release_sock(sk);
393984ac7260SPhilip Pettersson 		return ret;
3940bbd6ef87SPatrick McHardy 	}
39418913336aSPatrick McHardy 	case PACKET_RESERVE:
39428913336aSPatrick McHardy 	{
39438913336aSPatrick McHardy 		unsigned int val;
39448913336aSPatrick McHardy 
39458913336aSPatrick McHardy 		if (optlen != sizeof(val))
39468913336aSPatrick McHardy 			return -EINVAL;
3947a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&val, optval, sizeof(val)))
39488913336aSPatrick McHardy 			return -EFAULT;
3949bcc5364bSAndrey Konovalov 		if (val > INT_MAX)
3950bcc5364bSAndrey Konovalov 			return -EINVAL;
3951c27927e3SWillem de Bruijn 		lock_sock(sk);
3952c27927e3SWillem de Bruijn 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3953c27927e3SWillem de Bruijn 			ret = -EBUSY;
3954c27927e3SWillem de Bruijn 		} else {
39558913336aSPatrick McHardy 			po->tp_reserve = val;
3956c27927e3SWillem de Bruijn 			ret = 0;
3957c27927e3SWillem de Bruijn 		}
3958c27927e3SWillem de Bruijn 		release_sock(sk);
3959c27927e3SWillem de Bruijn 		return ret;
39608913336aSPatrick McHardy 	}
396169e3c75fSJohann Baudy 	case PACKET_LOSS:
396269e3c75fSJohann Baudy 	{
396369e3c75fSJohann Baudy 		unsigned int val;
396469e3c75fSJohann Baudy 
396569e3c75fSJohann Baudy 		if (optlen != sizeof(val))
396669e3c75fSJohann Baudy 			return -EINVAL;
3967a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&val, optval, sizeof(val)))
396869e3c75fSJohann Baudy 			return -EFAULT;
3969a6361f0cSWillem de Bruijn 
3970a6361f0cSWillem de Bruijn 		lock_sock(sk);
3971a6361f0cSWillem de Bruijn 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3972a6361f0cSWillem de Bruijn 			ret = -EBUSY;
3973a6361f0cSWillem de Bruijn 		} else {
3974164bddacSEric Dumazet 			packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val);
3975a6361f0cSWillem de Bruijn 			ret = 0;
3976a6361f0cSWillem de Bruijn 		}
3977a6361f0cSWillem de Bruijn 		release_sock(sk);
3978a6361f0cSWillem de Bruijn 		return ret;
397969e3c75fSJohann Baudy 	}
39808dc41944SHerbert Xu 	case PACKET_AUXDATA:
39818dc41944SHerbert Xu 	{
39828dc41944SHerbert Xu 		int val;
39838dc41944SHerbert Xu 
39848dc41944SHerbert Xu 		if (optlen < sizeof(val))
39858dc41944SHerbert Xu 			return -EINVAL;
3986a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&val, optval, sizeof(val)))
39878dc41944SHerbert Xu 			return -EFAULT;
39888dc41944SHerbert Xu 
3989fd53c297SEric Dumazet 		packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val);
39908dc41944SHerbert Xu 		return 0;
39918dc41944SHerbert Xu 	}
399280feaacbSPeter P. Waskiewicz Jr 	case PACKET_ORIGDEV:
399380feaacbSPeter P. Waskiewicz Jr 	{
399480feaacbSPeter P. Waskiewicz Jr 		int val;
399580feaacbSPeter P. Waskiewicz Jr 
399680feaacbSPeter P. Waskiewicz Jr 		if (optlen < sizeof(val))
399780feaacbSPeter P. Waskiewicz Jr 			return -EINVAL;
3998a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&val, optval, sizeof(val)))
399980feaacbSPeter P. Waskiewicz Jr 			return -EFAULT;
400080feaacbSPeter P. Waskiewicz Jr 
4001ee5675ecSEric Dumazet 		packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val);
400280feaacbSPeter P. Waskiewicz Jr 		return 0;
400380feaacbSPeter P. Waskiewicz Jr 	}
4004bfd5f4a3SSridhar Samudrala 	case PACKET_VNET_HDR:
4005dfc39d40SJianfeng Tan 	case PACKET_VNET_HDR_SZ:
4006bfd5f4a3SSridhar Samudrala 	{
4007dfc39d40SJianfeng Tan 		int val, hdr_len;
4008bfd5f4a3SSridhar Samudrala 
4009bfd5f4a3SSridhar Samudrala 		if (sock->type != SOCK_RAW)
4010bfd5f4a3SSridhar Samudrala 			return -EINVAL;
4011bfd5f4a3SSridhar Samudrala 		if (optlen < sizeof(val))
4012bfd5f4a3SSridhar Samudrala 			return -EINVAL;
4013a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&val, optval, sizeof(val)))
4014bfd5f4a3SSridhar Samudrala 			return -EFAULT;
4015bfd5f4a3SSridhar Samudrala 
4016dfc39d40SJianfeng Tan 		if (optname == PACKET_VNET_HDR_SZ) {
4017dfc39d40SJianfeng Tan 			if (val && val != sizeof(struct virtio_net_hdr) &&
4018dfc39d40SJianfeng Tan 			    val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
4019dfc39d40SJianfeng Tan 				return -EINVAL;
4020dfc39d40SJianfeng Tan 			hdr_len = val;
4021dfc39d40SJianfeng Tan 		} else {
4022dfc39d40SJianfeng Tan 			hdr_len = val ? sizeof(struct virtio_net_hdr) : 0;
4023dfc39d40SJianfeng Tan 		}
4024a6361f0cSWillem de Bruijn 		lock_sock(sk);
4025a6361f0cSWillem de Bruijn 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
4026a6361f0cSWillem de Bruijn 			ret = -EBUSY;
4027a6361f0cSWillem de Bruijn 		} else {
4028dfc39d40SJianfeng Tan 			WRITE_ONCE(po->vnet_hdr_sz, hdr_len);
4029a6361f0cSWillem de Bruijn 			ret = 0;
4030a6361f0cSWillem de Bruijn 		}
4031a6361f0cSWillem de Bruijn 		release_sock(sk);
4032a6361f0cSWillem de Bruijn 		return ret;
4033bfd5f4a3SSridhar Samudrala 	}
4034614f60faSScott McMillan 	case PACKET_TIMESTAMP:
4035614f60faSScott McMillan 	{
4036614f60faSScott McMillan 		int val;
4037614f60faSScott McMillan 
4038614f60faSScott McMillan 		if (optlen != sizeof(val))
4039614f60faSScott McMillan 			return -EINVAL;
4040a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&val, optval, sizeof(val)))
4041614f60faSScott McMillan 			return -EFAULT;
4042614f60faSScott McMillan 
40431051ce4aSEric Dumazet 		WRITE_ONCE(po->tp_tstamp, val);
4044614f60faSScott McMillan 		return 0;
4045614f60faSScott McMillan 	}
4046dc99f600SDavid S. Miller 	case PACKET_FANOUT:
4047dc99f600SDavid S. Miller 	{
40489c661b0bSTanner Love 		struct fanout_args args = { 0 };
4049dc99f600SDavid S. Miller 
40509c661b0bSTanner Love 		if (optlen != sizeof(int) && optlen != sizeof(args))
4051dc99f600SDavid S. Miller 			return -EINVAL;
40529c661b0bSTanner Love 		if (copy_from_sockptr(&args, optval, optlen))
4053dc99f600SDavid S. Miller 			return -EFAULT;
4054dc99f600SDavid S. Miller 
40559c661b0bSTanner Love 		return fanout_add(sk, &args);
4056dc99f600SDavid S. Miller 	}
405747dceb8eSWillem de Bruijn 	case PACKET_FANOUT_DATA:
405847dceb8eSWillem de Bruijn 	{
4059e42e70adSEric Dumazet 		/* Paired with the WRITE_ONCE() in fanout_add() */
4060e42e70adSEric Dumazet 		if (!READ_ONCE(po->fanout))
406147dceb8eSWillem de Bruijn 			return -EINVAL;
406247dceb8eSWillem de Bruijn 
406347dceb8eSWillem de Bruijn 		return fanout_set_data(po, optval, optlen);
406447dceb8eSWillem de Bruijn 	}
4065fa788d98SVincent Whitchurch 	case PACKET_IGNORE_OUTGOING:
4066fa788d98SVincent Whitchurch 	{
4067fa788d98SVincent Whitchurch 		int val;
4068fa788d98SVincent Whitchurch 
4069fa788d98SVincent Whitchurch 		if (optlen != sizeof(val))
4070fa788d98SVincent Whitchurch 			return -EINVAL;
4071a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&val, optval, sizeof(val)))
4072fa788d98SVincent Whitchurch 			return -EFAULT;
4073fa788d98SVincent Whitchurch 		if (val < 0 || val > 1)
4074fa788d98SVincent Whitchurch 			return -EINVAL;
4075fa788d98SVincent Whitchurch 
40762c02c505SEric Dumazet 		WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val);
4077fa788d98SVincent Whitchurch 		return 0;
4078fa788d98SVincent Whitchurch 	}
40795920cd3aSPaul Chavent 	case PACKET_TX_HAS_OFF:
40805920cd3aSPaul Chavent 	{
40815920cd3aSPaul Chavent 		unsigned int val;
40825920cd3aSPaul Chavent 
40835920cd3aSPaul Chavent 		if (optlen != sizeof(val))
40845920cd3aSPaul Chavent 			return -EINVAL;
4085a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&val, optval, sizeof(val)))
40865920cd3aSPaul Chavent 			return -EFAULT;
4087a6361f0cSWillem de Bruijn 
4088a6361f0cSWillem de Bruijn 		lock_sock(sk);
408925c55b38SJiapeng Chong 		if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec)
409074383446SEric Dumazet 			packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val);
409125c55b38SJiapeng Chong 
4092a6361f0cSWillem de Bruijn 		release_sock(sk);
40935920cd3aSPaul Chavent 		return 0;
40945920cd3aSPaul Chavent 	}
4095d346a3faSDaniel Borkmann 	case PACKET_QDISC_BYPASS:
4096d346a3faSDaniel Borkmann 	{
4097d346a3faSDaniel Borkmann 		int val;
4098d346a3faSDaniel Borkmann 
4099d346a3faSDaniel Borkmann 		if (optlen != sizeof(val))
4100d346a3faSDaniel Borkmann 			return -EINVAL;
4101a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&val, optval, sizeof(val)))
4102d346a3faSDaniel Borkmann 			return -EFAULT;
4103d346a3faSDaniel Borkmann 
4104105a201eSEric Dumazet 		packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val);
4105d346a3faSDaniel Borkmann 		return 0;
4106d346a3faSDaniel Borkmann 	}
41071da177e4SLinus Torvalds 	default:
41081da177e4SLinus Torvalds 		return -ENOPROTOOPT;
41091da177e4SLinus Torvalds 	}
41101da177e4SLinus Torvalds }
41111da177e4SLinus Torvalds 
/*
 * packet_getsockopt - read one SOL_PACKET option into user space.
 * @sock:    socket being queried
 * @level:   must be SOL_PACKET, anything else yields -ENOPROTOOPT
 * @optname: PACKET_* option selector
 * @optval:  user buffer receiving the value (also read for PACKET_HDRLEN)
 * @optlen:  user in/out length; on success it is rewritten with the number
 *           of bytes actually copied
 *
 * Most options produce a single int (@val).  PACKET_STATISTICS and
 * PACKET_ROLLOVER_STATS instead point @data/@lv at a structure.  The copy
 * to user space is truncated to the caller-supplied length.
 *
 * Returns 0 on success, -ENOPROTOOPT for a foreign level or unknown option,
 * -EFAULT when a user access fails, and -EINVAL for a negative length, a
 * short or unknown PACKET_HDRLEN request, or PACKET_ROLLOVER_STATS on a
 * socket without rollover state.
 */
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	struct tpacket_rollover_stats rstats;
	union tpacket_stats_u st;
	int val, lv = sizeof(val);
	void *data = &val;
	int drops;
	int len;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		/* Reading the statistics also resets them: snapshot and clear
		 * under the receive queue lock, then drain the drop counter.
		 */
		spin_lock_bh(&sk->sk_receive_queue.lock);
		memcpy(&st, &po->stats, sizeof(st));
		memset(&po->stats, 0, sizeof(po->stats));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		drops = atomic_xchg(&po->tp_drops, 0);

		/* Dropped packets are reported and also added to tp_packets. */
		if (po->tp_version == TPACKET_V3) {
			st.stats3.tp_drops = drops;
			st.stats3.tp_packets += drops;
			data = &st.stats3;
			lv = sizeof(struct tpacket_stats_v3);
		} else {
			st.stats1.tp_drops = drops;
			st.stats1.tp_packets += drops;
			data = &st.stats1;
			lv = sizeof(struct tpacket_stats);
		}
		break;

	case PACKET_AUXDATA:
		val = packet_sock_flag(po, PACKET_SOCK_AUXDATA);
		break;

	case PACKET_ORIGDEV:
		val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
		break;

	case PACKET_VNET_HDR:
		/* Boolean view of the configured vnet header size. */
		val = !!READ_ONCE(po->vnet_hdr_sz);
		break;

	case PACKET_VNET_HDR_SZ:
		val = READ_ONCE(po->vnet_hdr_sz);
		break;

	case PACKET_VERSION:
		val = po->tp_version;
		break;

	case PACKET_HDRLEN:
		/* The caller passes a TPACKET version in and gets the matching
		 * header size back.  len is known to be non-negative here, so
		 * anything shorter than an int is rejected and anything longer
		 * is clamped to exactly one int.
		 */
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
		if (copy_from_user(&val, optval, len))
			return -EFAULT;

		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		case TPACKET_V3:
			val = sizeof(struct tpacket3_hdr);
			break;
		default:
			return -EINVAL;
		}
		break;

	case PACKET_RESERVE:
		val = po->tp_reserve;
		break;

	case PACKET_LOSS:
		val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS);
		break;

	case PACKET_TIMESTAMP:
		val = READ_ONCE(po->tp_tstamp);
		break;

	case PACKET_FANOUT:
		/* id in the low bits, type from bit 16, flags from bit 24;
		 * zero when the socket is not part of a fanout group.
		 */
		val = 0;
		if (po->fanout)
			val = (u32)po->fanout->id |
			      ((u32)po->fanout->type << 16) |
			      ((u32)po->fanout->flags << 24);
		break;

	case PACKET_IGNORE_OUTGOING:
		val = READ_ONCE(po->prot_hook.ignore_outgoing);
		break;

	case PACKET_ROLLOVER_STATS:
		if (!po->rollover)
			return -EINVAL;
		rstats.tp_all = atomic_long_read(&po->rollover->num);
		rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
		rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
		data = &rstats;
		lv = sizeof(rstats);
		break;

	case PACKET_TX_HAS_OFF:
		val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF);
		break;

	case PACKET_QDISC_BYPASS:
		val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS);
		break;

	default:
		return -ENOPROTOOPT;
	}

	/* Never copy more than the option actually provides. */
	if (len > lv)
		len = lv;
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
42361da177e4SLinus Torvalds 
packet_notifier(struct notifier_block * this,unsigned long msg,void * ptr)4237351638e7SJiri Pirko static int packet_notifier(struct notifier_block *this,
4238351638e7SJiri Pirko 			   unsigned long msg, void *ptr)
42391da177e4SLinus Torvalds {
42401da177e4SLinus Torvalds 	struct sock *sk;
4241351638e7SJiri Pirko 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4242c346dca1SYOSHIFUJI Hideaki 	struct net *net = dev_net(dev);
42431da177e4SLinus Torvalds 
4244808f5114Sstephen hemminger 	rcu_read_lock();
4245b67bfe0dSSasha Levin 	sk_for_each_rcu(sk, &net->packet.sklist) {
42461da177e4SLinus Torvalds 		struct packet_sock *po = pkt_sk(sk);
42471da177e4SLinus Torvalds 
42481da177e4SLinus Torvalds 		switch (msg) {
42491da177e4SLinus Torvalds 		case NETDEV_UNREGISTER:
42501da177e4SLinus Torvalds 			if (po->mclist)
425182f17091SFrancesco Ruggeri 				packet_dev_mclist_delete(dev, &po->mclist);
4252df561f66SGustavo A. R. Silva 			fallthrough;
4253a2efcfa0SDavid S. Miller 
42541da177e4SLinus Torvalds 		case NETDEV_DOWN:
42551da177e4SLinus Torvalds 			if (dev->ifindex == po->ifindex) {
42561da177e4SLinus Torvalds 				spin_lock(&po->bind_lock);
425761edf479SEric Dumazet 				if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
4258ce06b03eSDavid S. Miller 					__unregister_prot_hook(sk, false);
42591da177e4SLinus Torvalds 					sk->sk_err = ENETDOWN;
42601da177e4SLinus Torvalds 					if (!sock_flag(sk, SOCK_DEAD))
4261e3ae2365SAlexander Aring 						sk_error_report(sk);
42621da177e4SLinus Torvalds 				}
42631da177e4SLinus Torvalds 				if (msg == NETDEV_UNREGISTER) {
426466e56cd4SDaniel Borkmann 					packet_cached_dev_reset(po);
4265e032f7c9SEric Dumazet 					WRITE_ONCE(po->ifindex, -1);
4266d62607c3SJakub Kicinski 					netdev_put(po->prot_hook.dev,
4267f1d9268eSEric Dumazet 						   &po->prot_hook.dev_tracker);
42681da177e4SLinus Torvalds 					po->prot_hook.dev = NULL;
42691da177e4SLinus Torvalds 				}
42701da177e4SLinus Torvalds 				spin_unlock(&po->bind_lock);
42711da177e4SLinus Torvalds 			}
42721da177e4SLinus Torvalds 			break;
42731da177e4SLinus Torvalds 		case NETDEV_UP:
4274808f5114Sstephen hemminger 			if (dev->ifindex == po->ifindex) {
42751da177e4SLinus Torvalds 				spin_lock(&po->bind_lock);
4276ce06b03eSDavid S. Miller 				if (po->num)
4277ce06b03eSDavid S. Miller 					register_prot_hook(sk);
42781da177e4SLinus Torvalds 				spin_unlock(&po->bind_lock);
4279808f5114Sstephen hemminger 			}
42801da177e4SLinus Torvalds 			break;
42811da177e4SLinus Torvalds 		}
42821da177e4SLinus Torvalds 	}
4283808f5114Sstephen hemminger 	rcu_read_unlock();
42841da177e4SLinus Torvalds 	return NOTIFY_DONE;
42851da177e4SLinus Torvalds }
42861da177e4SLinus Torvalds 
42871da177e4SLinus Torvalds 
packet_ioctl(struct socket * sock,unsigned int cmd,unsigned long arg)42881da177e4SLinus Torvalds static int packet_ioctl(struct socket *sock, unsigned int cmd,
42891da177e4SLinus Torvalds 			unsigned long arg)
42901da177e4SLinus Torvalds {
42911da177e4SLinus Torvalds 	struct sock *sk = sock->sk;
42921da177e4SLinus Torvalds 
42931da177e4SLinus Torvalds 	switch (cmd) {
42941da177e4SLinus Torvalds 	case SIOCOUTQ:
42951da177e4SLinus Torvalds 	{
429631e6d363SEric Dumazet 		int amount = sk_wmem_alloc_get(sk);
429731e6d363SEric Dumazet 
42981da177e4SLinus Torvalds 		return put_user(amount, (int __user *)arg);
42991da177e4SLinus Torvalds 	}
43001da177e4SLinus Torvalds 	case SIOCINQ:
43011da177e4SLinus Torvalds 	{
43021da177e4SLinus Torvalds 		struct sk_buff *skb;
43031da177e4SLinus Torvalds 		int amount = 0;
43041da177e4SLinus Torvalds 
43051da177e4SLinus Torvalds 		spin_lock_bh(&sk->sk_receive_queue.lock);
43061da177e4SLinus Torvalds 		skb = skb_peek(&sk->sk_receive_queue);
43071da177e4SLinus Torvalds 		if (skb)
43081da177e4SLinus Torvalds 			amount = skb->len;
43091da177e4SLinus Torvalds 		spin_unlock_bh(&sk->sk_receive_queue.lock);
43101da177e4SLinus Torvalds 		return put_user(amount, (int __user *)arg);
43111da177e4SLinus Torvalds 	}
43121da177e4SLinus Torvalds #ifdef CONFIG_INET
43131da177e4SLinus Torvalds 	case SIOCADDRT:
43141da177e4SLinus Torvalds 	case SIOCDELRT:
43151da177e4SLinus Torvalds 	case SIOCDARP:
43161da177e4SLinus Torvalds 	case SIOCGARP:
43171da177e4SLinus Torvalds 	case SIOCSARP:
43181da177e4SLinus Torvalds 	case SIOCGIFADDR:
43191da177e4SLinus Torvalds 	case SIOCSIFADDR:
43201da177e4SLinus Torvalds 	case SIOCGIFBRDADDR:
43211da177e4SLinus Torvalds 	case SIOCSIFBRDADDR:
43221da177e4SLinus Torvalds 	case SIOCGIFNETMASK:
43231da177e4SLinus Torvalds 	case SIOCSIFNETMASK:
43241da177e4SLinus Torvalds 	case SIOCGIFDSTADDR:
43251da177e4SLinus Torvalds 	case SIOCSIFDSTADDR:
43261da177e4SLinus Torvalds 	case SIOCSIFFLAGS:
43271da177e4SLinus Torvalds 		return inet_dgram_ops.ioctl(sock, cmd, arg);
43281da177e4SLinus Torvalds #endif
43291da177e4SLinus Torvalds 
43301da177e4SLinus Torvalds 	default:
4331b5e5fa5eSChristoph Hellwig 		return -ENOIOCTLCMD;
43321da177e4SLinus Torvalds 	}
43331da177e4SLinus Torvalds 	return 0;
43341da177e4SLinus Torvalds }
43351da177e4SLinus Torvalds 
packet_poll(struct file * file,struct socket * sock,poll_table * wait)4336a11e1d43SLinus Torvalds static __poll_t packet_poll(struct file *file, struct socket *sock,
4337a11e1d43SLinus Torvalds 				poll_table *wait)
43381da177e4SLinus Torvalds {
43391da177e4SLinus Torvalds 	struct sock *sk = sock->sk;
43401da177e4SLinus Torvalds 	struct packet_sock *po = pkt_sk(sk);
4341a11e1d43SLinus Torvalds 	__poll_t mask = datagram_poll(file, sock, wait);
43421da177e4SLinus Torvalds 
43431da177e4SLinus Torvalds 	spin_lock_bh(&sk->sk_receive_queue.lock);
434469e3c75fSJohann Baudy 	if (po->rx_ring.pg_vec) {
4345f6fb8f10Schetan loke 		if (!packet_previous_rx_frame(po, &po->rx_ring,
4346f6fb8f10Schetan loke 			TP_STATUS_KERNEL))
4347a9a08845SLinus Torvalds 			mask |= EPOLLIN | EPOLLRDNORM;
43481da177e4SLinus Torvalds 	}
43499bb6cd65SEric Dumazet 	packet_rcv_try_clear_pressure(po);
43501da177e4SLinus Torvalds 	spin_unlock_bh(&sk->sk_receive_queue.lock);
435169e3c75fSJohann Baudy 	spin_lock_bh(&sk->sk_write_queue.lock);
435269e3c75fSJohann Baudy 	if (po->tx_ring.pg_vec) {
435369e3c75fSJohann Baudy 		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4354a9a08845SLinus Torvalds 			mask |= EPOLLOUT | EPOLLWRNORM;
435569e3c75fSJohann Baudy 	}
435669e3c75fSJohann Baudy 	spin_unlock_bh(&sk->sk_write_queue.lock);
43571da177e4SLinus Torvalds 	return mask;
43581da177e4SLinus Torvalds }
43591da177e4SLinus Torvalds 
43601da177e4SLinus Torvalds 
/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */
43641da177e4SLinus Torvalds 
packet_mm_open(struct vm_area_struct * vma)43651da177e4SLinus Torvalds static void packet_mm_open(struct vm_area_struct *vma)
43661da177e4SLinus Torvalds {
43671da177e4SLinus Torvalds 	struct file *file = vma->vm_file;
4368b69aee04SEric Dumazet 	struct socket *sock = file->private_data;
43691da177e4SLinus Torvalds 	struct sock *sk = sock->sk;
43701da177e4SLinus Torvalds 
43711da177e4SLinus Torvalds 	if (sk)
4372865b7157SDaniel Borkmann 		atomic_long_inc(&pkt_sk(sk)->mapped);
43731da177e4SLinus Torvalds }
43741da177e4SLinus Torvalds 
packet_mm_close(struct vm_area_struct * vma)43751da177e4SLinus Torvalds static void packet_mm_close(struct vm_area_struct *vma)
43761da177e4SLinus Torvalds {
43771da177e4SLinus Torvalds 	struct file *file = vma->vm_file;
4378b69aee04SEric Dumazet 	struct socket *sock = file->private_data;
43791da177e4SLinus Torvalds 	struct sock *sk = sock->sk;
43801da177e4SLinus Torvalds 
43811da177e4SLinus Torvalds 	if (sk)
4382865b7157SDaniel Borkmann 		atomic_long_dec(&pkt_sk(sk)->mapped);
43831da177e4SLinus Torvalds }
43841da177e4SLinus Torvalds 
4385f0f37e2fSAlexey Dobriyan static const struct vm_operations_struct packet_mmap_ops = {
43861da177e4SLinus Torvalds 	.open	=	packet_mm_open,
43871da177e4SLinus Torvalds 	.close	=	packet_mm_close,
43881da177e4SLinus Torvalds };
43891da177e4SLinus Torvalds 
free_pg_vec(struct pgv * pg_vec,unsigned int order,unsigned int len)43903a7ad063SEric Dumazet static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
43913a7ad063SEric Dumazet 			unsigned int len)
43921da177e4SLinus Torvalds {
43931da177e4SLinus Torvalds 	int i;
43941da177e4SLinus Torvalds 
43951da177e4SLinus Torvalds 	for (i = 0; i < len; i++) {
43960e3125c7SNeil Horman 		if (likely(pg_vec[i].buffer)) {
43973a7ad063SEric Dumazet 			if (is_vmalloc_addr(pg_vec[i].buffer))
43983a7ad063SEric Dumazet 				vfree(pg_vec[i].buffer);
43993a7ad063SEric Dumazet 			else
44003a7ad063SEric Dumazet 				free_pages((unsigned long)pg_vec[i].buffer,
44013a7ad063SEric Dumazet 					   order);
44020e3125c7SNeil Horman 			pg_vec[i].buffer = NULL;
44030e3125c7SNeil Horman 		}
44041da177e4SLinus Torvalds 	}
44051da177e4SLinus Torvalds 	kfree(pg_vec);
44061da177e4SLinus Torvalds }
44071da177e4SLinus Torvalds 
alloc_one_pg_vec_page(unsigned long order)44083a7ad063SEric Dumazet static char *alloc_one_pg_vec_page(unsigned long order)
44094ebf0ae2SDavid S. Miller {
4410f0d4eb29SDaniel Borkmann 	char *buffer;
44113a7ad063SEric Dumazet 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
44123a7ad063SEric Dumazet 			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4413719bfeaaSEric Dumazet 
44143a7ad063SEric Dumazet 	buffer = (char *) __get_free_pages(gfp_flags, order);
44150e3125c7SNeil Horman 	if (buffer)
44160e3125c7SNeil Horman 		return buffer;
44170e3125c7SNeil Horman 
44183a7ad063SEric Dumazet 	/* __get_free_pages failed, fall back to vmalloc */
44193a7ad063SEric Dumazet 	buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
44203a7ad063SEric Dumazet 	if (buffer)
44210e3125c7SNeil Horman 		return buffer;
44223a7ad063SEric Dumazet 
44233a7ad063SEric Dumazet 	/* vmalloc failed, lets dig into swap here */
44243a7ad063SEric Dumazet 	gfp_flags &= ~__GFP_NORETRY;
44253a7ad063SEric Dumazet 	buffer = (char *) __get_free_pages(gfp_flags, order);
44263a7ad063SEric Dumazet 	if (buffer)
44273a7ad063SEric Dumazet 		return buffer;
44283a7ad063SEric Dumazet 
44293a7ad063SEric Dumazet 	/* complete and utter failure */
44303a7ad063SEric Dumazet 	return NULL;
44314ebf0ae2SDavid S. Miller }
44324ebf0ae2SDavid S. Miller 
alloc_pg_vec(struct tpacket_req * req,int order)44333a7ad063SEric Dumazet static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
44344ebf0ae2SDavid S. Miller {
44354ebf0ae2SDavid S. Miller 	unsigned int block_nr = req->tp_block_nr;
44360e3125c7SNeil Horman 	struct pgv *pg_vec;
44374ebf0ae2SDavid S. Miller 	int i;
44384ebf0ae2SDavid S. Miller 
4439398f0132SChristoph Paasch 	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
44404ebf0ae2SDavid S. Miller 	if (unlikely(!pg_vec))
44414ebf0ae2SDavid S. Miller 		goto out;
44424ebf0ae2SDavid S. Miller 
44434ebf0ae2SDavid S. Miller 	for (i = 0; i < block_nr; i++) {
44443a7ad063SEric Dumazet 		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
44450e3125c7SNeil Horman 		if (unlikely(!pg_vec[i].buffer))
44464ebf0ae2SDavid S. Miller 			goto out_free_pgvec;
44474ebf0ae2SDavid S. Miller 	}
44484ebf0ae2SDavid S. Miller 
44494ebf0ae2SDavid S. Miller out:
44504ebf0ae2SDavid S. Miller 	return pg_vec;
44514ebf0ae2SDavid S. Miller 
44524ebf0ae2SDavid S. Miller out_free_pgvec:
44533a7ad063SEric Dumazet 	free_pg_vec(pg_vec, order, block_nr);
44544ebf0ae2SDavid S. Miller 	pg_vec = NULL;
44554ebf0ae2SDavid S. Miller 	goto out;
44564ebf0ae2SDavid S. Miller }
44571da177e4SLinus Torvalds 
packet_set_ring(struct sock * sk,union tpacket_req_u * req_u,int closing,int tx_ring)4458f6fb8f10Schetan loke static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
445969e3c75fSJohann Baudy 		int closing, int tx_ring)
44601da177e4SLinus Torvalds {
44610e3125c7SNeil Horman 	struct pgv *pg_vec = NULL;
44621da177e4SLinus Torvalds 	struct packet_sock *po = pkt_sk(sk);
446361fad681SWillem de Bruijn 	unsigned long *rx_owner_map = NULL;
44643a7ad063SEric Dumazet 	int was_running, order = 0;
446569e3c75fSJohann Baudy 	struct packet_ring_buffer *rb;
446669e3c75fSJohann Baudy 	struct sk_buff_head *rb_queue;
44670e11c91eSAl Viro 	__be16 num;
44682a6d6c31SColin Ian King 	int err;
4469f6fb8f10Schetan loke 	/* Added to avoid minimal code churn */
4470f6fb8f10Schetan loke 	struct tpacket_req *req = &req_u->req;
4471f6fb8f10Schetan loke 
447269e3c75fSJohann Baudy 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
447369e3c75fSJohann Baudy 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
447469e3c75fSJohann Baudy 
447569e3c75fSJohann Baudy 	err = -EBUSY;
447669e3c75fSJohann Baudy 	if (!closing) {
4477865b7157SDaniel Borkmann 		if (atomic_long_read(&po->mapped))
447869e3c75fSJohann Baudy 			goto out;
4479b0138408SDaniel Borkmann 		if (packet_read_pending(rb))
448069e3c75fSJohann Baudy 			goto out;
448169e3c75fSJohann Baudy 	}
44821da177e4SLinus Torvalds 
44831da177e4SLinus Torvalds 	if (req->tp_block_nr) {
44844576cd46SWillem de Bruijn 		unsigned int min_frame_size;
44854576cd46SWillem de Bruijn 
44861da177e4SLinus Torvalds 		/* Sanity tests and some calculations */
448769e3c75fSJohann Baudy 		err = -EBUSY;
448869e3c75fSJohann Baudy 		if (unlikely(rb->pg_vec))
448969e3c75fSJohann Baudy 			goto out;
44901da177e4SLinus Torvalds 
4491bbd6ef87SPatrick McHardy 		switch (po->tp_version) {
4492bbd6ef87SPatrick McHardy 		case TPACKET_V1:
4493bbd6ef87SPatrick McHardy 			po->tp_hdrlen = TPACKET_HDRLEN;
4494bbd6ef87SPatrick McHardy 			break;
4495bbd6ef87SPatrick McHardy 		case TPACKET_V2:
4496bbd6ef87SPatrick McHardy 			po->tp_hdrlen = TPACKET2_HDRLEN;
4497bbd6ef87SPatrick McHardy 			break;
4498f6fb8f10Schetan loke 		case TPACKET_V3:
4499f6fb8f10Schetan loke 			po->tp_hdrlen = TPACKET3_HDRLEN;
4500f6fb8f10Schetan loke 			break;
4501bbd6ef87SPatrick McHardy 		}
4502bbd6ef87SPatrick McHardy 
450369e3c75fSJohann Baudy 		err = -EINVAL;
45044ebf0ae2SDavid S. Miller 		if (unlikely((int)req->tp_block_size <= 0))
450569e3c75fSJohann Baudy 			goto out;
450690836b67STobias Klauser 		if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
450769e3c75fSJohann Baudy 			goto out;
45084576cd46SWillem de Bruijn 		min_frame_size = po->tp_hdrlen + po->tp_reserve;
4509dc808110SEric Dumazet 		if (po->tp_version >= TPACKET_V3 &&
45104576cd46SWillem de Bruijn 		    req->tp_block_size <
45114576cd46SWillem de Bruijn 		    BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
4512dc808110SEric Dumazet 			goto out;
45134576cd46SWillem de Bruijn 		if (unlikely(req->tp_frame_size < min_frame_size))
451469e3c75fSJohann Baudy 			goto out;
45154ebf0ae2SDavid S. Miller 		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
451669e3c75fSJohann Baudy 			goto out;
45171da177e4SLinus Torvalds 
451869e3c75fSJohann Baudy 		rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
45194194b491STobias Klauser 		if (unlikely(rb->frames_per_block == 0))
452069e3c75fSJohann Baudy 			goto out;
4521fc62814dSKal Conley 		if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
45228f8d28e4SAndrey Konovalov 			goto out;
452369e3c75fSJohann Baudy 		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
45244ebf0ae2SDavid S. Miller 					req->tp_frame_nr))
452569e3c75fSJohann Baudy 			goto out;
45261da177e4SLinus Torvalds 
45271da177e4SLinus Torvalds 		err = -ENOMEM;
45283a7ad063SEric Dumazet 		order = get_order(req->tp_block_size);
45293a7ad063SEric Dumazet 		pg_vec = alloc_pg_vec(req, order);
45304ebf0ae2SDavid S. Miller 		if (unlikely(!pg_vec))
45311da177e4SLinus Torvalds 			goto out;
4532f6fb8f10Schetan loke 		switch (po->tp_version) {
4533f6fb8f10Schetan loke 		case TPACKET_V3:
45347f953ab2SSowmini Varadhan 			/* Block transmit is not supported yet */
45357f953ab2SSowmini Varadhan 			if (!tx_ring) {
4536e8e85cc5SManinder Singh 				init_prb_bdqc(po, rb, pg_vec, req_u);
45377f953ab2SSowmini Varadhan 			} else {
45387f953ab2SSowmini Varadhan 				struct tpacket_req3 *req3 = &req_u->req3;
45397f953ab2SSowmini Varadhan 
45407f953ab2SSowmini Varadhan 				if (req3->tp_retire_blk_tov ||
45417f953ab2SSowmini Varadhan 				    req3->tp_sizeof_priv ||
45427f953ab2SSowmini Varadhan 				    req3->tp_feature_req_word) {
45437f953ab2SSowmini Varadhan 					err = -EINVAL;
454455655e3dSEric Dumazet 					goto out_free_pg_vec;
45457f953ab2SSowmini Varadhan 				}
45467f953ab2SSowmini Varadhan 			}
4547f6fb8f10Schetan loke 			break;
4548f6fb8f10Schetan loke 		default:
454961fad681SWillem de Bruijn 			if (!tx_ring) {
455061fad681SWillem de Bruijn 				rx_owner_map = bitmap_alloc(req->tp_frame_nr,
455161fad681SWillem de Bruijn 					GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
455261fad681SWillem de Bruijn 				if (!rx_owner_map)
455361fad681SWillem de Bruijn 					goto out_free_pg_vec;
455461fad681SWillem de Bruijn 			}
4555f6fb8f10Schetan loke 			break;
4556f6fb8f10Schetan loke 		}
45571da177e4SLinus Torvalds 	}
45581da177e4SLinus Torvalds 	/* Done */
455969e3c75fSJohann Baudy 	else {
456069e3c75fSJohann Baudy 		err = -EINVAL;
45614ebf0ae2SDavid S. Miller 		if (unlikely(req->tp_frame_nr))
456269e3c75fSJohann Baudy 			goto out;
45631da177e4SLinus Torvalds 	}
45641da177e4SLinus Torvalds 
45651da177e4SLinus Torvalds 
45661da177e4SLinus Torvalds 	/* Detach socket from network */
45671da177e4SLinus Torvalds 	spin_lock(&po->bind_lock);
456861edf479SEric Dumazet 	was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING);
45691da177e4SLinus Torvalds 	num = po->num;
45701da177e4SLinus Torvalds 	if (was_running) {
4571c7d2ef5dSEric Dumazet 		WRITE_ONCE(po->num, 0);
4572ce06b03eSDavid S. Miller 		__unregister_prot_hook(sk, false);
45731da177e4SLinus Torvalds 	}
45741da177e4SLinus Torvalds 	spin_unlock(&po->bind_lock);
45751da177e4SLinus Torvalds 
45761da177e4SLinus Torvalds 	synchronize_net();
45771da177e4SLinus Torvalds 
45781da177e4SLinus Torvalds 	err = -EBUSY;
4579905db440SHerbert Xu 	mutex_lock(&po->pg_vec_lock);
4580865b7157SDaniel Borkmann 	if (closing || atomic_long_read(&po->mapped) == 0) {
45811da177e4SLinus Torvalds 		err = 0;
458269e3c75fSJohann Baudy 		spin_lock_bh(&rb_queue->lock);
4583c053fd96SChangli Gao 		swap(rb->pg_vec, pg_vec);
458461fad681SWillem de Bruijn 		if (po->tp_version <= TPACKET_V2)
458561fad681SWillem de Bruijn 			swap(rb->rx_owner_map, rx_owner_map);
458669e3c75fSJohann Baudy 		rb->frame_max = (req->tp_frame_nr - 1);
458769e3c75fSJohann Baudy 		rb->head = 0;
458869e3c75fSJohann Baudy 		rb->frame_size = req->tp_frame_size;
458969e3c75fSJohann Baudy 		spin_unlock_bh(&rb_queue->lock);
45901da177e4SLinus Torvalds 
45913a7ad063SEric Dumazet 		swap(rb->pg_vec_order, order);
4592c053fd96SChangli Gao 		swap(rb->pg_vec_len, req->tp_block_nr);
45931da177e4SLinus Torvalds 
459469e3c75fSJohann Baudy 		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
459569e3c75fSJohann Baudy 		po->prot_hook.func = (po->rx_ring.pg_vec) ?
459669e3c75fSJohann Baudy 						tpacket_rcv : packet_rcv;
459769e3c75fSJohann Baudy 		skb_queue_purge(rb_queue);
4598865b7157SDaniel Borkmann 		if (atomic_long_read(&po->mapped))
4599865b7157SDaniel Borkmann 			pr_err("packet_mmap: vma is busy: %ld\n",
4600865b7157SDaniel Borkmann 			       atomic_long_read(&po->mapped));
46011da177e4SLinus Torvalds 	}
4602905db440SHerbert Xu 	mutex_unlock(&po->pg_vec_lock);
46031da177e4SLinus Torvalds 
46041da177e4SLinus Torvalds 	spin_lock(&po->bind_lock);
4605ce06b03eSDavid S. Miller 	if (was_running) {
4606c7d2ef5dSEric Dumazet 		WRITE_ONCE(po->num, num);
4607ce06b03eSDavid S. Miller 		register_prot_hook(sk);
46081da177e4SLinus Torvalds 	}
46091da177e4SLinus Torvalds 	spin_unlock(&po->bind_lock);
4610c800aaf8SWANG Cong 	if (pg_vec && (po->tp_version > TPACKET_V2)) {
4611f6fb8f10Schetan loke 		/* Because we don't support block-based V3 on tx-ring */
4612f6fb8f10Schetan loke 		if (!tx_ring)
461373d0fcf2STobias Klauser 			prb_shutdown_retire_blk_timer(po, rb_queue);
4614f6fb8f10Schetan loke 	}
46151da177e4SLinus Torvalds 
461655655e3dSEric Dumazet out_free_pg_vec:
4617ec6af094SWillem de Bruijn 	if (pg_vec) {
461861fad681SWillem de Bruijn 		bitmap_free(rx_owner_map);
46193a7ad063SEric Dumazet 		free_pg_vec(pg_vec, order, req->tp_block_nr);
4620ec6af094SWillem de Bruijn 	}
46211da177e4SLinus Torvalds out:
46221da177e4SLinus Torvalds 	return err;
46231da177e4SLinus Torvalds }
46241da177e4SLinus Torvalds 
packet_mmap(struct file * file,struct socket * sock,struct vm_area_struct * vma)462569e3c75fSJohann Baudy static int packet_mmap(struct file *file, struct socket *sock,
462669e3c75fSJohann Baudy 		struct vm_area_struct *vma)
46271da177e4SLinus Torvalds {
46281da177e4SLinus Torvalds 	struct sock *sk = sock->sk;
46291da177e4SLinus Torvalds 	struct packet_sock *po = pkt_sk(sk);
463069e3c75fSJohann Baudy 	unsigned long size, expected_size;
463169e3c75fSJohann Baudy 	struct packet_ring_buffer *rb;
46321da177e4SLinus Torvalds 	unsigned long start;
46331da177e4SLinus Torvalds 	int err = -EINVAL;
46341da177e4SLinus Torvalds 	int i;
46351da177e4SLinus Torvalds 
46361da177e4SLinus Torvalds 	if (vma->vm_pgoff)
46371da177e4SLinus Torvalds 		return -EINVAL;
46381da177e4SLinus Torvalds 
4639905db440SHerbert Xu 	mutex_lock(&po->pg_vec_lock);
464069e3c75fSJohann Baudy 
464169e3c75fSJohann Baudy 	expected_size = 0;
464269e3c75fSJohann Baudy 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
464369e3c75fSJohann Baudy 		if (rb->pg_vec) {
464469e3c75fSJohann Baudy 			expected_size += rb->pg_vec_len
464569e3c75fSJohann Baudy 						* rb->pg_vec_pages
464669e3c75fSJohann Baudy 						* PAGE_SIZE;
464769e3c75fSJohann Baudy 		}
464869e3c75fSJohann Baudy 	}
464969e3c75fSJohann Baudy 
465069e3c75fSJohann Baudy 	if (expected_size == 0)
46511da177e4SLinus Torvalds 		goto out;
465269e3c75fSJohann Baudy 
465369e3c75fSJohann Baudy 	size = vma->vm_end - vma->vm_start;
465469e3c75fSJohann Baudy 	if (size != expected_size)
46551da177e4SLinus Torvalds 		goto out;
46561da177e4SLinus Torvalds 
46571da177e4SLinus Torvalds 	start = vma->vm_start;
465869e3c75fSJohann Baudy 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
465969e3c75fSJohann Baudy 		if (rb->pg_vec == NULL)
466069e3c75fSJohann Baudy 			continue;
466169e3c75fSJohann Baudy 
466269e3c75fSJohann Baudy 		for (i = 0; i < rb->pg_vec_len; i++) {
46630e3125c7SNeil Horman 			struct page *page;
46640e3125c7SNeil Horman 			void *kaddr = rb->pg_vec[i].buffer;
46654ebf0ae2SDavid S. Miller 			int pg_num;
46664ebf0ae2SDavid S. Miller 
4667c56b4d90SChangli Gao 			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4668c56b4d90SChangli Gao 				page = pgv_to_page(kaddr);
46694ebf0ae2SDavid S. Miller 				err = vm_insert_page(vma, start, page);
46704ebf0ae2SDavid S. Miller 				if (unlikely(err))
46711da177e4SLinus Torvalds 					goto out;
46724ebf0ae2SDavid S. Miller 				start += PAGE_SIZE;
46730e3125c7SNeil Horman 				kaddr += PAGE_SIZE;
46741da177e4SLinus Torvalds 			}
46754ebf0ae2SDavid S. Miller 		}
467669e3c75fSJohann Baudy 	}
467769e3c75fSJohann Baudy 
4678865b7157SDaniel Borkmann 	atomic_long_inc(&po->mapped);
46791da177e4SLinus Torvalds 	vma->vm_ops = &packet_mmap_ops;
46801da177e4SLinus Torvalds 	err = 0;
46811da177e4SLinus Torvalds 
46821da177e4SLinus Torvalds out:
4683905db440SHerbert Xu 	mutex_unlock(&po->pg_vec_lock);
46841da177e4SLinus Torvalds 	return err;
46851da177e4SLinus Torvalds }
46861da177e4SLinus Torvalds 
468790ddc4f0SEric Dumazet static const struct proto_ops packet_ops_spkt = {
46881da177e4SLinus Torvalds 	.family =	PF_PACKET,
46891da177e4SLinus Torvalds 	.owner =	THIS_MODULE,
46901da177e4SLinus Torvalds 	.release =	packet_release,
46911da177e4SLinus Torvalds 	.bind =		packet_bind_spkt,
46921da177e4SLinus Torvalds 	.connect =	sock_no_connect,
46931da177e4SLinus Torvalds 	.socketpair =	sock_no_socketpair,
46941da177e4SLinus Torvalds 	.accept =	sock_no_accept,
46951da177e4SLinus Torvalds 	.getname =	packet_getname_spkt,
4696a11e1d43SLinus Torvalds 	.poll =		datagram_poll,
46971da177e4SLinus Torvalds 	.ioctl =	packet_ioctl,
4698c7cbdbf2SArnd Bergmann 	.gettstamp =	sock_gettstamp,
46991da177e4SLinus Torvalds 	.listen =	sock_no_listen,
47001da177e4SLinus Torvalds 	.shutdown =	sock_no_shutdown,
47011da177e4SLinus Torvalds 	.sendmsg =	packet_sendmsg_spkt,
47021da177e4SLinus Torvalds 	.recvmsg =	packet_recvmsg,
47031da177e4SLinus Torvalds 	.mmap =		sock_no_mmap,
47041da177e4SLinus Torvalds };
47051da177e4SLinus Torvalds 
470690ddc4f0SEric Dumazet static const struct proto_ops packet_ops = {
47071da177e4SLinus Torvalds 	.family =	PF_PACKET,
47081da177e4SLinus Torvalds 	.owner =	THIS_MODULE,
47091da177e4SLinus Torvalds 	.release =	packet_release,
47101da177e4SLinus Torvalds 	.bind =		packet_bind,
47111da177e4SLinus Torvalds 	.connect =	sock_no_connect,
47121da177e4SLinus Torvalds 	.socketpair =	sock_no_socketpair,
47131da177e4SLinus Torvalds 	.accept =	sock_no_accept,
47141da177e4SLinus Torvalds 	.getname =	packet_getname,
4715a11e1d43SLinus Torvalds 	.poll =		packet_poll,
47161da177e4SLinus Torvalds 	.ioctl =	packet_ioctl,
4717c7cbdbf2SArnd Bergmann 	.gettstamp =	sock_gettstamp,
47181da177e4SLinus Torvalds 	.listen =	sock_no_listen,
47191da177e4SLinus Torvalds 	.shutdown =	sock_no_shutdown,
47201da177e4SLinus Torvalds 	.setsockopt =	packet_setsockopt,
47211da177e4SLinus Torvalds 	.getsockopt =	packet_getsockopt,
47221da177e4SLinus Torvalds 	.sendmsg =	packet_sendmsg,
47231da177e4SLinus Torvalds 	.recvmsg =	packet_recvmsg,
47241da177e4SLinus Torvalds 	.mmap =		packet_mmap,
47251da177e4SLinus Torvalds };
47261da177e4SLinus Torvalds 
4727ec1b4cf7SStephen Hemminger static const struct net_proto_family packet_family_ops = {
47281da177e4SLinus Torvalds 	.family =	PF_PACKET,
47291da177e4SLinus Torvalds 	.create =	packet_create,
47301da177e4SLinus Torvalds 	.owner	=	THIS_MODULE,
47311da177e4SLinus Torvalds };
47321da177e4SLinus Torvalds 
47331da177e4SLinus Torvalds static struct notifier_block packet_netdev_notifier = {
47341da177e4SLinus Torvalds 	.notifier_call =	packet_notifier,
47351da177e4SLinus Torvalds };
47361da177e4SLinus Torvalds 
47371da177e4SLinus Torvalds #ifdef CONFIG_PROC_FS
47381da177e4SLinus Torvalds 
/*
 * packet_seq_start - seq_file .start: begin walking the namespace's
 * packet socket list at position *@pos.
 *
 * Takes the RCU read lock, which packet_seq_stop() releases.
 */
static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net;

	rcu_read_lock();
	net = seq_file_net(seq);
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}
47471da177e4SLinus Torvalds 
/*
 * packet_seq_next - seq_file .next: advance from entry @v to the following
 * entry of the namespace's packet socket list, updating *@pos.
 */
static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct hlist_head *sklist = &seq_file_net(seq)->packet.sklist;

	return seq_hlist_next_rcu(v, sklist, pos);
}
47531da177e4SLinus Torvalds 
packet_seq_stop(struct seq_file * seq,void * v)47541da177e4SLinus Torvalds static void packet_seq_stop(struct seq_file *seq, void *v)
4755808f5114Sstephen hemminger 	__releases(RCU)
47561da177e4SLinus Torvalds {
4757808f5114Sstephen hemminger 	rcu_read_unlock();
47581da177e4SLinus Torvalds }
47591da177e4SLinus Torvalds 
packet_seq_show(struct seq_file * seq,void * v)47601da177e4SLinus Torvalds static int packet_seq_show(struct seq_file *seq, void *v)
47611da177e4SLinus Torvalds {
47621da177e4SLinus Torvalds 	if (v == SEQ_START_TOKEN)
4763abdcd06cSBaruch Siach 		seq_printf(seq,
4764abdcd06cSBaruch Siach 			   "%*sRefCnt Type Proto  Iface R Rmem   User   Inode\n",
4765abdcd06cSBaruch Siach 			   IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk");
47661da177e4SLinus Torvalds 	else {
4767b7ceabd9SLi Zefan 		struct sock *s = sk_entry(v);
47681da177e4SLinus Torvalds 		const struct packet_sock *po = pkt_sk(s);
47691da177e4SLinus Torvalds 
47701da177e4SLinus Torvalds 		seq_printf(seq,
477171338aa7SDan Rosenberg 			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
47721da177e4SLinus Torvalds 			   s,
477341c6d650SReshetova, Elena 			   refcount_read(&s->sk_refcnt),
47741da177e4SLinus Torvalds 			   s->sk_type,
4775c7d2ef5dSEric Dumazet 			   ntohs(READ_ONCE(po->num)),
4776e032f7c9SEric Dumazet 			   READ_ONCE(po->ifindex),
477761edf479SEric Dumazet 			   packet_sock_flag(po, PACKET_SOCK_RUNNING),
47781da177e4SLinus Torvalds 			   atomic_read(&s->sk_rmem_alloc),
4779a7cb5a49SEric W. Biederman 			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
47801da177e4SLinus Torvalds 			   sock_i_ino(s));
47811da177e4SLinus Torvalds 	}
47821da177e4SLinus Torvalds 
47831da177e4SLinus Torvalds 	return 0;
47841da177e4SLinus Torvalds }
47851da177e4SLinus Torvalds 
478656b3d975SPhilippe De Muyter static const struct seq_operations packet_seq_ops = {
47871da177e4SLinus Torvalds 	.start	= packet_seq_start,
47881da177e4SLinus Torvalds 	.next	= packet_seq_next,
47891da177e4SLinus Torvalds 	.stop	= packet_seq_stop,
47901da177e4SLinus Torvalds 	.show	= packet_seq_show,
47911da177e4SLinus Torvalds };
47921da177e4SLinus Torvalds #endif
47931da177e4SLinus Torvalds 
packet_net_init(struct net * net)47942c8c1e72SAlexey Dobriyan static int __net_init packet_net_init(struct net *net)
4795d12d01d6SDenis V. Lunev {
47960fa7fa98SPavel Emelyanov 	mutex_init(&net->packet.sklist_lock);
47972aaef4e4SDenis V. Lunev 	INIT_HLIST_HEAD(&net->packet.sklist);
4798d12d01d6SDenis V. Lunev 
4799a268e0f2SYonatan Linik #ifdef CONFIG_PROC_FS
4800c3506372SChristoph Hellwig 	if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4801c3506372SChristoph Hellwig 			sizeof(struct seq_net_private)))
4802d12d01d6SDenis V. Lunev 		return -ENOMEM;
4803a268e0f2SYonatan Linik #endif /* CONFIG_PROC_FS */
4804d12d01d6SDenis V. Lunev 
4805d12d01d6SDenis V. Lunev 	return 0;
4806d12d01d6SDenis V. Lunev }
4807d12d01d6SDenis V. Lunev 
packet_net_exit(struct net * net)48082c8c1e72SAlexey Dobriyan static void __net_exit packet_net_exit(struct net *net)
4809d12d01d6SDenis V. Lunev {
4810ece31ffdSGao feng 	remove_proc_entry("packet", net->proc_net);
4811669f8f1aSVasily Averin 	WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
4812d12d01d6SDenis V. Lunev }
4813d12d01d6SDenis V. Lunev 
4814d12d01d6SDenis V. Lunev static struct pernet_operations packet_net_ops = {
4815d12d01d6SDenis V. Lunev 	.init = packet_net_init,
4816d12d01d6SDenis V. Lunev 	.exit = packet_net_exit,
4817d12d01d6SDenis V. Lunev };
4818d12d01d6SDenis V. Lunev 
4819d12d01d6SDenis V. Lunev 
packet_exit(void)48201da177e4SLinus Torvalds static void __exit packet_exit(void)
48211da177e4SLinus Torvalds {
48221da177e4SLinus Torvalds 	sock_unregister(PF_PACKET);
48231da177e4SLinus Torvalds 	proto_unregister(&packet_proto);
482463b7c2ebSZiyang Xuan 	unregister_netdevice_notifier(&packet_netdev_notifier);
482563b7c2ebSZiyang Xuan 	unregister_pernet_subsys(&packet_net_ops);
48261da177e4SLinus Torvalds }
48271da177e4SLinus Torvalds 
packet_init(void)48281da177e4SLinus Torvalds static int __init packet_init(void)
48291da177e4SLinus Torvalds {
483036096f2fSYueHaibing 	int rc;
48311da177e4SLinus Torvalds 
483236096f2fSYueHaibing 	rc = register_pernet_subsys(&packet_net_ops);
483336096f2fSYueHaibing 	if (rc)
483463b7c2ebSZiyang Xuan 		goto out;
483536096f2fSYueHaibing 	rc = register_netdevice_notifier(&packet_netdev_notifier);
483636096f2fSYueHaibing 	if (rc)
483736096f2fSYueHaibing 		goto out_pernet;
483863b7c2ebSZiyang Xuan 	rc = proto_register(&packet_proto, 0);
483963b7c2ebSZiyang Xuan 	if (rc)
484063b7c2ebSZiyang Xuan 		goto out_notifier;
484163b7c2ebSZiyang Xuan 	rc = sock_register(&packet_family_ops);
484263b7c2ebSZiyang Xuan 	if (rc)
484363b7c2ebSZiyang Xuan 		goto out_proto;
48441da177e4SLinus Torvalds 
484536096f2fSYueHaibing 	return 0;
484636096f2fSYueHaibing 
484736096f2fSYueHaibing out_proto:
484836096f2fSYueHaibing 	proto_unregister(&packet_proto);
484963b7c2ebSZiyang Xuan out_notifier:
485063b7c2ebSZiyang Xuan 	unregister_netdevice_notifier(&packet_netdev_notifier);
485163b7c2ebSZiyang Xuan out_pernet:
485263b7c2ebSZiyang Xuan 	unregister_pernet_subsys(&packet_net_ops);
48531da177e4SLinus Torvalds out:
48541da177e4SLinus Torvalds 	return rc;
48551da177e4SLinus Torvalds }
48561da177e4SLinus Torvalds 
48571da177e4SLinus Torvalds module_init(packet_init);
48581da177e4SLinus Torvalds module_exit(packet_exit);
48591da177e4SLinus Torvalds MODULE_LICENSE("GPL");
48601da177e4SLinus Torvalds MODULE_ALIAS_NETPROTO(PF_PACKET);
4861