xref: /openbmc/linux/net/packet/af_packet.c (revision a09d2831)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		PACKET - implements raw packet sockets.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *		Alan Cox	:	verify_area() now used correctly
14  *		Alan Cox	:	new skbuff lists, look ma no backlogs!
15  *		Alan Cox	:	tidied skbuff lists.
16  *		Alan Cox	:	Now uses generic datagram routines I
17  *					added. Also fixed the peek/read crash
18  *					from all old Linux datagram code.
19  *		Alan Cox	:	Uses the improved datagram code.
20  *		Alan Cox	:	Added NULL's for socket options.
21  *		Alan Cox	:	Re-commented the code.
22  *		Alan Cox	:	Use new kernel side addressing
23  *		Rob Janssen	:	Correct MTU usage.
24  *		Dave Platt	:	Counter leaks caused by incorrect
25  *					interrupt locking and some slightly
26  *					dubious gcc output. Can you read
27  *					compiler: it said _VOLATILE_
28  *	Richard Kooijman	:	Timestamp fixes.
29  *		Alan Cox	:	New buffers. Use sk->mac.raw.
30  *		Alan Cox	:	sendmsg/recvmsg support.
31  *		Alan Cox	:	Protocol setting support
32  *	Alexey Kuznetsov	:	Untied from IPv4 stack.
33  *	Cyrus Durgin		:	Fixed kerneld for kmod.
34  *	Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
38  *					The convention is that longer addresses
39  *					will simply extend the hardware address
40  *					byte arrays at the end of sockaddr_ll
41  *					and packet_mreq.
42  *		Johann Baudy	:	Added TX RING.
43  *
44  *		This program is free software; you can redistribute it and/or
45  *		modify it under the terms of the GNU General Public License
46  *		as published by the Free Software Foundation; either version
47  *		2 of the License, or (at your option) any later version.
48  *
49  */
50 
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <net/net_namespace.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 #include <linux/mutex.h>
82 #include <linux/if_vlan.h>
83 
84 #ifdef CONFIG_INET
85 #include <net/inet_common.h>
86 #endif
87 
88 /*
89    Assumptions:
90    - if a device has no dev->hard_header routine, it adds and removes the ll
91      header itself. In this case the ll header is invisible outside the device,
92      but higher levels should still reserve dev->hard_header_len.
93      Some devices are clever enough to reallocate the skb when the header
94      does not fit into the reserved space (tunnels); others are not
95      (PPP).
96    - a packet socket receives packets with the ll header already pulled,
97      so SOCK_RAW must push it back.
98 
99 On receive:
100 -----------
101 
102 Incoming, dev->hard_header!=NULL
103    mac_header -> ll header
104    data       -> data
105 
106 Outgoing, dev->hard_header!=NULL
107    mac_header -> ll header
108    data       -> ll header
109 
110 Incoming, dev->hard_header==NULL
111    mac_header -> UNKNOWN position. It very likely points to the ll
112 		 header.  PPP does this, which is wrong, because it introduces
113 		 asymmetry between the rx and tx paths.
114    data       -> data
115 
116 Outgoing, dev->hard_header==NULL
117    mac_header -> data. ll header is still not built!
118    data       -> data
119 
120 Summary
121   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
122 
123 
124 On transmit:
125 ------------
126 
127 dev->hard_header != NULL
128    mac_header -> ll header
129    data       -> ll header
130 
131 dev->hard_header == NULL (ll header is added by device, we cannot control it)
132    mac_header -> data
133    data       -> data
134 
135    We should set nh.raw on output to the correct position;
136    the packet classifier depends on it.
137  */
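/*
 * Example (userspace sketch, illustration only): the header rules above are
 * what a packet socket user observes.  With SOCK_RAW the ll header is part of
 * the returned data; with SOCK_DGRAM it is pulled and only described through
 * sockaddr_ll.  Roughly:
 *
 *	int raw = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgm = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	char buf[2048];
 *
 *	recv(raw, buf, sizeof(buf), 0);   - buf[0] is the first byte of the
 *	                                    ll (e.g. Ethernet) header
 *	recv(dgm, buf, sizeof(buf), 0);   - buf[0] is the first payload byte;
 *	                                    the ll header was pulled
 */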
138 
139 /* Private packet socket structures. */
140 
141 struct packet_mclist {
142 	struct packet_mclist	*next;
143 	int			ifindex;
144 	int			count;
145 	unsigned short		type;
146 	unsigned short		alen;
147 	unsigned char		addr[MAX_ADDR_LEN];
148 };
149 /* identical to struct packet_mreq except it has
150  * a longer address field.
151  */
152 struct packet_mreq_max {
153 	int		mr_ifindex;
154 	unsigned short	mr_type;
155 	unsigned short	mr_alen;
156 	unsigned char	mr_address[MAX_ADDR_LEN];
157 };
158 
159 #ifdef CONFIG_PACKET_MMAP
160 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
161 		int closing, int tx_ring);
162 
163 struct packet_ring_buffer {
164 	char			**pg_vec;
165 	unsigned int		head;
166 	unsigned int		frames_per_block;
167 	unsigned int		frame_size;
168 	unsigned int		frame_max;
169 
170 	unsigned int		pg_vec_order;
171 	unsigned int		pg_vec_pages;
172 	unsigned int		pg_vec_len;
173 
174 	atomic_t		pending;
175 };
176 
177 struct packet_sock;
178 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
179 #endif
180 
181 static void packet_flush_mclist(struct sock *sk);
182 
183 struct packet_sock {
184 	/* struct sock has to be the first member of packet_sock */
185 	struct sock		sk;
186 	struct tpacket_stats	stats;
187 #ifdef CONFIG_PACKET_MMAP
188 	struct packet_ring_buffer	rx_ring;
189 	struct packet_ring_buffer	tx_ring;
190 	int			copy_thresh;
191 #endif
192 	spinlock_t		bind_lock;
193 	struct mutex		pg_vec_lock;
194 	unsigned int		running:1,	/* prot_hook is attached*/
195 				auxdata:1,
196 				origdev:1;
197 	int			ifindex;	/* bound device		*/
198 	__be16			num;
199 	struct packet_mclist	*mclist;
200 #ifdef CONFIG_PACKET_MMAP
201 	atomic_t		mapped;
202 	enum tpacket_versions	tp_version;
203 	unsigned int		tp_hdrlen;
204 	unsigned int		tp_reserve;
205 	unsigned int		tp_loss:1;
206 #endif
207 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
208 };
209 
210 struct packet_skb_cb {
211 	unsigned int origlen;
212 	union {
213 		struct sockaddr_pkt pkt;
214 		struct sockaddr_ll ll;
215 	} sa;
216 };
217 
218 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
219 
220 #ifdef CONFIG_PACKET_MMAP
221 
222 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
223 {
224 	union {
225 		struct tpacket_hdr *h1;
226 		struct tpacket2_hdr *h2;
227 		void *raw;
228 	} h;
229 
230 	h.raw = frame;
231 	switch (po->tp_version) {
232 	case TPACKET_V1:
233 		h.h1->tp_status = status;
234 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
235 		break;
236 	case TPACKET_V2:
237 		h.h2->tp_status = status;
238 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
239 		break;
240 	default:
241 		pr_err("TPACKET version not supported\n");
242 		BUG();
243 	}
244 
245 	smp_wmb();
246 }
247 
248 static int __packet_get_status(struct packet_sock *po, void *frame)
249 {
250 	union {
251 		struct tpacket_hdr *h1;
252 		struct tpacket2_hdr *h2;
253 		void *raw;
254 	} h;
255 
256 	smp_rmb();
257 
258 	h.raw = frame;
259 	switch (po->tp_version) {
260 	case TPACKET_V1:
261 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
262 		return h.h1->tp_status;
263 	case TPACKET_V2:
264 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
265 		return h.h2->tp_status;
266 	default:
267 		pr_err("TPACKET version not supported\n");
268 		BUG();
269 		return 0;
270 	}
271 }
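/*
 * The tp_status word accessed above is the ownership handshake over the
 * shared mmap()ed ring: the kernel fills a frame and then publishes it by
 * setting TP_STATUS_USER; the consumer hands it back by writing
 * TP_STATUS_KERNEL.  The barriers here and in tpacket_rcv() keep the status
 * change ordered against the frame contents.  The userspace side is roughly
 * (sketch only, process() being whatever the application does with the data):
 *
 *	struct tpacket_hdr *hdr = frame;	- a frame in the mmap()ed ring
 *
 *	if (hdr->tp_status & TP_STATUS_USER) {
 *		process((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *		hdr->tp_status = TP_STATUS_KERNEL;
 *	}
 */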
272 
273 static void *packet_lookup_frame(struct packet_sock *po,
274 		struct packet_ring_buffer *rb,
275 		unsigned int position,
276 		int status)
277 {
278 	unsigned int pg_vec_pos, frame_offset;
279 	union {
280 		struct tpacket_hdr *h1;
281 		struct tpacket2_hdr *h2;
282 		void *raw;
283 	} h;
284 
285 	pg_vec_pos = position / rb->frames_per_block;
286 	frame_offset = position % rb->frames_per_block;
287 
288 	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
289 
290 	if (status != __packet_get_status(po, h.raw))
291 		return NULL;
292 
293 	return h.raw;
294 }
295 
296 static inline void *packet_current_frame(struct packet_sock *po,
297 		struct packet_ring_buffer *rb,
298 		int status)
299 {
300 	return packet_lookup_frame(po, rb, rb->head, status);
301 }
302 
303 static inline void *packet_previous_frame(struct packet_sock *po,
304 		struct packet_ring_buffer *rb,
305 		int status)
306 {
307 	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
308 	return packet_lookup_frame(po, rb, previous, status);
309 }
310 
311 static inline void packet_increment_head(struct packet_ring_buffer *buff)
312 {
313 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
314 }
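/*
 * packet_lookup_frame() above maps a frame index onto the two-level ring:
 * pg_vec[] holds tp_block_nr blocks, each with frames_per_block fixed-size
 * frames.  For example, with tp_block_size 8192 and tp_frame_size 2048,
 * frames_per_block is 4, so frame index 9 lives at:
 *
 *	pg_vec_pos   = 9 / 4 = 2	(third block)
 *	frame_offset = 9 % 4 = 1	(second frame in that block)
 *	frame        = pg_vec[2] + 1 * 2048
 *
 * packet_increment_head() wraps head back to 0 after frame_max, so the ring
 * is always consumed strictly in order.
 */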
315 
316 #endif
317 
318 static inline struct packet_sock *pkt_sk(struct sock *sk)
319 {
320 	return (struct packet_sock *)sk;
321 }
322 
323 static void packet_sock_destruct(struct sock *sk)
324 {
325 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
326 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
327 
328 	if (!sock_flag(sk, SOCK_DEAD)) {
329 		pr_err("Attempt to release alive packet socket: %p\n", sk);
330 		return;
331 	}
332 
333 	sk_refcnt_debug_dec(sk);
334 }
335 
336 
337 static const struct proto_ops packet_ops;
338 
339 static const struct proto_ops packet_ops_spkt;
340 
341 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
342 			   struct packet_type *pt, struct net_device *orig_dev)
343 {
344 	struct sock *sk;
345 	struct sockaddr_pkt *spkt;
346 
347 	/*
348 	 *	When we registered the protocol we saved the socket in the data
349 	 *	field for just this event.
350 	 */
351 
352 	sk = pt->af_packet_priv;
353 
354 	/*
355 	 *	Yank back the headers [hope the device set this
356 	 *	right or kerboom...]
357 	 *
358 	 *	Incoming packets have ll header pulled,
359 	 *	push it back.
360 	 *
361 	 *	For outgoing ones skb->data == skb_mac_header(skb)
362 	 *	so that this procedure is a no-op.
363 	 */
364 
365 	if (skb->pkt_type == PACKET_LOOPBACK)
366 		goto out;
367 
368 	if (!net_eq(dev_net(dev), sock_net(sk)))
369 		goto out;
370 
371 	skb = skb_share_check(skb, GFP_ATOMIC);
372 	if (skb == NULL)
373 		goto oom;
374 
375 	/* drop any routing info */
376 	skb_dst_drop(skb);
377 
378 	/* drop conntrack reference */
379 	nf_reset(skb);
380 
381 	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
382 
383 	skb_push(skb, skb->data - skb_mac_header(skb));
384 
385 	/*
386 	 *	The SOCK_PACKET socket receives _all_ frames.
387 	 */
388 
389 	spkt->spkt_family = dev->type;
390 	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
391 	spkt->spkt_protocol = skb->protocol;
392 
393 	/*
394 	 *	Charge the memory to the socket. This is done specifically
395 	 *	to prevent sockets from using up all the memory.
396 	 */
397 
398 	if (sock_queue_rcv_skb(sk, skb) == 0)
399 		return 0;
400 
401 out:
402 	kfree_skb(skb);
403 oom:
404 	return 0;
405 }
406 
407 
408 /*
409  *	Output a raw packet to a device layer. This bypasses all the other
410  *	protocol layers and you must therefore supply it with a complete frame
411  */
412 
413 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
414 			       struct msghdr *msg, size_t len)
415 {
416 	struct sock *sk = sock->sk;
417 	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
418 	struct sk_buff *skb = NULL;
419 	struct net_device *dev;
420 	__be16 proto = 0;
421 	int err;
422 
423 	/*
424 	 *	Get and verify the address.
425 	 */
426 
427 	if (saddr) {
428 		if (msg->msg_namelen < sizeof(struct sockaddr))
429 			return -EINVAL;
430 		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
431 			proto = saddr->spkt_protocol;
432 	} else
433 		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
434 
435 	/*
436 	 *	Find the device first to size check it
437 	 */
438 
439 	saddr->spkt_device[13] = 0;
440 retry:
441 	rcu_read_lock();
442 	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
443 	err = -ENODEV;
444 	if (dev == NULL)
445 		goto out_unlock;
446 
447 	err = -ENETDOWN;
448 	if (!(dev->flags & IFF_UP))
449 		goto out_unlock;
450 
451 	/*
452 	 * You may not queue a frame bigger than the mtu. This is the lowest level
453 	 * raw protocol and you must do your own fragmentation at this level.
454 	 */
455 
456 	err = -EMSGSIZE;
457 	if (len > dev->mtu + dev->hard_header_len)
458 		goto out_unlock;
459 
460 	if (!skb) {
461 		size_t reserved = LL_RESERVED_SPACE(dev);
462 		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
463 
464 		rcu_read_unlock();
465 		skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
466 		if (skb == NULL)
467 			return -ENOBUFS;
468 		/* FIXME: Save some space for broken drivers that write a hard
469 		 * header at transmission time by themselves. PPP is the notable
470 		 * one here. This should really be fixed at the driver level.
471 		 */
472 		skb_reserve(skb, reserved);
473 		skb_reset_network_header(skb);
474 
475 		/* Try to align data part correctly */
476 		if (hhlen) {
477 			skb->data -= hhlen;
478 			skb->tail -= hhlen;
479 			if (len < hhlen)
480 				skb_reset_network_header(skb);
481 		}
482 		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
483 		if (err)
484 			goto out_free;
485 		goto retry;
486 	}
487 
488 
489 	skb->protocol = proto;
490 	skb->dev = dev;
491 	skb->priority = sk->sk_priority;
492 	skb->mark = sk->sk_mark;
493 
494 	dev_queue_xmit(skb);
495 	rcu_read_unlock();
496 	return len;
497 
498 out_unlock:
499 	rcu_read_unlock();
500 out_free:
501 	kfree_skb(skb);
502 	return err;
503 }
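/*
 * Example (userspace sketch, illustration only): a SOCK_PACKET sender must
 * always pass a sockaddr_pkt naming the device, as required above, and the
 * buffer must already contain the complete ll header:
 *
 *	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *	struct sockaddr_pkt spkt = { 0 };
 *
 *	spkt.spkt_family = AF_PACKET;
 *	strncpy(spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 *
 * frame/frame_len stand for the caller's fully built frame and "eth0" is just
 * a placeholder interface name.
 */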
504 
505 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
506 				      unsigned int res)
507 {
508 	struct sk_filter *filter;
509 
510 	rcu_read_lock_bh();
511 	filter = rcu_dereference(sk->sk_filter);
512 	if (filter != NULL)
513 		res = sk_run_filter(skb, filter->insns, filter->len);
514 	rcu_read_unlock_bh();
515 
516 	return res;
517 }
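/*
 * run_filter() applies an ordinary socket filter (classic BPF) attached with
 * SO_ATTACH_FILTER: a return of 0 drops the packet, a smaller value truncates
 * the snapshot.  Userspace sketch (illustration only) attaching a trivial
 * "accept everything" program:
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },	- accept, full length
 *	};
 *	struct sock_fprog prog = { 1, code };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */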
518 
519 /*
520    This function does lazy skb cloning in the hope that most packets
521    are discarded by BPF.
522 
523    Note tricky part: we DO mangle shared skb! skb->data, skb->len
524    and skb->cb are mangled. It works because (and until) packets
525    falling here are owned by current CPU. Output packets are cloned
526    by dev_queue_xmit_nit(), input packets are processed by net_bh
527    sequentially, so that if we return the skb to its original state on exit,
528    we will not harm anyone.
529  */
530 
531 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
532 		      struct packet_type *pt, struct net_device *orig_dev)
533 {
534 	struct sock *sk;
535 	struct sockaddr_ll *sll;
536 	struct packet_sock *po;
537 	u8 *skb_head = skb->data;
538 	int skb_len = skb->len;
539 	unsigned int snaplen, res;
540 
541 	if (skb->pkt_type == PACKET_LOOPBACK)
542 		goto drop;
543 
544 	sk = pt->af_packet_priv;
545 	po = pkt_sk(sk);
546 
547 	if (!net_eq(dev_net(dev), sock_net(sk)))
548 		goto drop;
549 
550 	skb->dev = dev;
551 
552 	if (dev->header_ops) {
553 		/* The device has an explicit notion of ll header,
554 		   exported to higher levels.
555 
556 		   Otherwise, the device hides the details of its frame
557 		   structure, so that the corresponding packet header is
558 		   never delivered to the user.
559 		 */
560 		if (sk->sk_type != SOCK_DGRAM)
561 			skb_push(skb, skb->data - skb_mac_header(skb));
562 		else if (skb->pkt_type == PACKET_OUTGOING) {
563 			/* Special case: outgoing packets have ll header at head */
564 			skb_pull(skb, skb_network_offset(skb));
565 		}
566 	}
567 
568 	snaplen = skb->len;
569 
570 	res = run_filter(skb, sk, snaplen);
571 	if (!res)
572 		goto drop_n_restore;
573 	if (snaplen > res)
574 		snaplen = res;
575 
576 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
577 	    (unsigned)sk->sk_rcvbuf)
578 		goto drop_n_acct;
579 
580 	if (skb_shared(skb)) {
581 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
582 		if (nskb == NULL)
583 			goto drop_n_acct;
584 
585 		if (skb_head != skb->data) {
586 			skb->data = skb_head;
587 			skb->len = skb_len;
588 		}
589 		kfree_skb(skb);
590 		skb = nskb;
591 	}
592 
593 	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
594 		     sizeof(skb->cb));
595 
596 	sll = &PACKET_SKB_CB(skb)->sa.ll;
597 	sll->sll_family = AF_PACKET;
598 	sll->sll_hatype = dev->type;
599 	sll->sll_protocol = skb->protocol;
600 	sll->sll_pkttype = skb->pkt_type;
601 	if (unlikely(po->origdev))
602 		sll->sll_ifindex = orig_dev->ifindex;
603 	else
604 		sll->sll_ifindex = dev->ifindex;
605 
606 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
607 
608 	PACKET_SKB_CB(skb)->origlen = skb->len;
609 
610 	if (pskb_trim(skb, snaplen))
611 		goto drop_n_acct;
612 
613 	skb_set_owner_r(skb, sk);
614 	skb->dev = NULL;
615 	skb_dst_drop(skb);
616 
617 	/* drop conntrack reference */
618 	nf_reset(skb);
619 
620 	spin_lock(&sk->sk_receive_queue.lock);
621 	po->stats.tp_packets++;
622 	skb->dropcount = atomic_read(&sk->sk_drops);
623 	__skb_queue_tail(&sk->sk_receive_queue, skb);
624 	spin_unlock(&sk->sk_receive_queue.lock);
625 	sk->sk_data_ready(sk, skb->len);
626 	return 0;
627 
628 drop_n_acct:
629 	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
630 
631 drop_n_restore:
632 	if (skb_head != skb->data && skb_shared(skb)) {
633 		skb->data = skb_head;
634 		skb->len = skb_len;
635 	}
636 drop:
637 	consume_skb(skb);
638 	return 0;
639 }
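/*
 * Example (userspace sketch, illustration only): everything packet_rcv()
 * stores in the skb's control block comes back to the receiver as the
 * sockaddr_ll source address, e.g.:
 *
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *
 *	recvfrom(fd, buf, sizeof(buf), 0,
 *		 (struct sockaddr *)&from, &fromlen);
 *
 *	- from.sll_ifindex: receiving device (or the original device when
 *	                    PACKET_ORIGDEV is set)
 *	- from.sll_pkttype: PACKET_HOST, PACKET_BROADCAST, PACKET_OUTGOING, ...
 *	- from.sll_hatype:  ARPHRD_* type of the device
 *	- from.sll_addr:    link-level source address (sll_halen bytes)
 */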
640 
641 #ifdef CONFIG_PACKET_MMAP
642 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
643 		       struct packet_type *pt, struct net_device *orig_dev)
644 {
645 	struct sock *sk;
646 	struct packet_sock *po;
647 	struct sockaddr_ll *sll;
648 	union {
649 		struct tpacket_hdr *h1;
650 		struct tpacket2_hdr *h2;
651 		void *raw;
652 	} h;
653 	u8 *skb_head = skb->data;
654 	int skb_len = skb->len;
655 	unsigned int snaplen, res;
656 	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
657 	unsigned short macoff, netoff, hdrlen;
658 	struct sk_buff *copy_skb = NULL;
659 	struct timeval tv;
660 	struct timespec ts;
661 
662 	if (skb->pkt_type == PACKET_LOOPBACK)
663 		goto drop;
664 
665 	sk = pt->af_packet_priv;
666 	po = pkt_sk(sk);
667 
668 	if (!net_eq(dev_net(dev), sock_net(sk)))
669 		goto drop;
670 
671 	if (dev->header_ops) {
672 		if (sk->sk_type != SOCK_DGRAM)
673 			skb_push(skb, skb->data - skb_mac_header(skb));
674 		else if (skb->pkt_type == PACKET_OUTGOING) {
675 			/* Special case: outgoing packets have ll header at head */
676 			skb_pull(skb, skb_network_offset(skb));
677 		}
678 	}
679 
680 	if (skb->ip_summed == CHECKSUM_PARTIAL)
681 		status |= TP_STATUS_CSUMNOTREADY;
682 
683 	snaplen = skb->len;
684 
685 	res = run_filter(skb, sk, snaplen);
686 	if (!res)
687 		goto drop_n_restore;
688 	if (snaplen > res)
689 		snaplen = res;
690 
691 	if (sk->sk_type == SOCK_DGRAM) {
692 		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
693 				  po->tp_reserve;
694 	} else {
695 		unsigned maclen = skb_network_offset(skb);
696 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
697 				       (maclen < 16 ? 16 : maclen)) +
698 			po->tp_reserve;
699 		macoff = netoff - maclen;
700 	}
701 
702 	if (macoff + snaplen > po->rx_ring.frame_size) {
703 		if (po->copy_thresh &&
704 		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
705 		    (unsigned)sk->sk_rcvbuf) {
706 			if (skb_shared(skb)) {
707 				copy_skb = skb_clone(skb, GFP_ATOMIC);
708 			} else {
709 				copy_skb = skb_get(skb);
710 				skb_head = skb->data;
711 			}
712 			if (copy_skb)
713 				skb_set_owner_r(copy_skb, sk);
714 		}
715 		snaplen = po->rx_ring.frame_size - macoff;
716 		if ((int)snaplen < 0)
717 			snaplen = 0;
718 	}
719 
720 	spin_lock(&sk->sk_receive_queue.lock);
721 	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
722 	if (!h.raw)
723 		goto ring_is_full;
724 	packet_increment_head(&po->rx_ring);
725 	po->stats.tp_packets++;
726 	if (copy_skb) {
727 		status |= TP_STATUS_COPY;
728 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
729 	}
730 	if (!po->stats.tp_drops)
731 		status &= ~TP_STATUS_LOSING;
732 	spin_unlock(&sk->sk_receive_queue.lock);
733 
734 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
735 
736 	switch (po->tp_version) {
737 	case TPACKET_V1:
738 		h.h1->tp_len = skb->len;
739 		h.h1->tp_snaplen = snaplen;
740 		h.h1->tp_mac = macoff;
741 		h.h1->tp_net = netoff;
742 		if (skb->tstamp.tv64)
743 			tv = ktime_to_timeval(skb->tstamp);
744 		else
745 			do_gettimeofday(&tv);
746 		h.h1->tp_sec = tv.tv_sec;
747 		h.h1->tp_usec = tv.tv_usec;
748 		hdrlen = sizeof(*h.h1);
749 		break;
750 	case TPACKET_V2:
751 		h.h2->tp_len = skb->len;
752 		h.h2->tp_snaplen = snaplen;
753 		h.h2->tp_mac = macoff;
754 		h.h2->tp_net = netoff;
755 		if (skb->tstamp.tv64)
756 			ts = ktime_to_timespec(skb->tstamp);
757 		else
758 			getnstimeofday(&ts);
759 		h.h2->tp_sec = ts.tv_sec;
760 		h.h2->tp_nsec = ts.tv_nsec;
761 		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
762 		hdrlen = sizeof(*h.h2);
763 		break;
764 	default:
765 		BUG();
766 	}
767 
768 	sll = h.raw + TPACKET_ALIGN(hdrlen);
769 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
770 	sll->sll_family = AF_PACKET;
771 	sll->sll_hatype = dev->type;
772 	sll->sll_protocol = skb->protocol;
773 	sll->sll_pkttype = skb->pkt_type;
774 	if (unlikely(po->origdev))
775 		sll->sll_ifindex = orig_dev->ifindex;
776 	else
777 		sll->sll_ifindex = dev->ifindex;
778 
779 	__packet_set_status(po, h.raw, status);
780 	smp_mb();
781 	{
782 		struct page *p_start, *p_end;
783 		u8 *h_end = h.raw + macoff + snaplen - 1;
784 
785 		p_start = virt_to_page(h.raw);
786 		p_end = virt_to_page(h_end);
787 		while (p_start <= p_end) {
788 			flush_dcache_page(p_start);
789 			p_start++;
790 		}
791 	}
792 
793 	sk->sk_data_ready(sk, 0);
794 
795 drop_n_restore:
796 	if (skb_head != skb->data && skb_shared(skb)) {
797 		skb->data = skb_head;
798 		skb->len = skb_len;
799 	}
800 drop:
801 	kfree_skb(skb);
802 	return 0;
803 
804 ring_is_full:
805 	po->stats.tp_drops++;
806 	spin_unlock(&sk->sk_receive_queue.lock);
807 
808 	sk->sk_data_ready(sk, 0);
809 	kfree_skb(copy_skb);
810 	goto drop_n_restore;
811 }
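/*
 * Example (userspace sketch, illustration only): a consumer of the mmap()ed
 * RX ring filled by tpacket_rcv() typically walks the frames in order and
 * sleeps in poll() while the next frame is still TP_STATUS_KERNEL:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	unsigned int i = 0;		- frame index, 0 .. tp_frame_nr - 1
 *
 *	for (;;) {
 *		struct tpacket_hdr *hdr = frame_ptr(ring, i);
 *
 *		if (!(hdr->tp_status & TP_STATUS_USER)) {
 *			poll(&pfd, 1, -1);
 *			continue;
 *		}
 *		handle((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *		hdr->tp_status = TP_STATUS_KERNEL;
 *		i = (i + 1) % tp_frame_nr;
 *	}
 *
 * frame_ptr() and handle() are placeholders for the application's own frame
 * indexing (see packet_lookup_frame() above) and processing.
 */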
812 
813 static void tpacket_destruct_skb(struct sk_buff *skb)
814 {
815 	struct packet_sock *po = pkt_sk(skb->sk);
816 	void *ph;
817 
818 	BUG_ON(skb == NULL);
819 
820 	if (likely(po->tx_ring.pg_vec)) {
821 		ph = skb_shinfo(skb)->destructor_arg;
822 		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
823 		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
824 		atomic_dec(&po->tx_ring.pending);
825 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
826 	}
827 
828 	sock_wfree(skb);
829 }
830 
831 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
832 		void *frame, struct net_device *dev, int size_max,
833 		__be16 proto, unsigned char *addr)
834 {
835 	union {
836 		struct tpacket_hdr *h1;
837 		struct tpacket2_hdr *h2;
838 		void *raw;
839 	} ph;
840 	int to_write, offset, len, tp_len, nr_frags, len_max;
841 	struct socket *sock = po->sk.sk_socket;
842 	struct page *page;
843 	void *data;
844 	int err;
845 
846 	ph.raw = frame;
847 
848 	skb->protocol = proto;
849 	skb->dev = dev;
850 	skb->priority = po->sk.sk_priority;
851 	skb->mark = po->sk.sk_mark;
852 	skb_shinfo(skb)->destructor_arg = ph.raw;
853 
854 	switch (po->tp_version) {
855 	case TPACKET_V2:
856 		tp_len = ph.h2->tp_len;
857 		break;
858 	default:
859 		tp_len = ph.h1->tp_len;
860 		break;
861 	}
862 	if (unlikely(tp_len > size_max)) {
863 		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
864 		return -EMSGSIZE;
865 	}
866 
867 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
868 	skb_reset_network_header(skb);
869 
870 	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
871 	to_write = tp_len;
872 
873 	if (sock->type == SOCK_DGRAM) {
874 		err = dev_hard_header(skb, dev, ntohs(proto), addr,
875 				NULL, tp_len);
876 		if (unlikely(err < 0))
877 			return -EINVAL;
878 	} else if (dev->hard_header_len) {
879 		/* net device doesn't like empty head */
880 		if (unlikely(tp_len <= dev->hard_header_len)) {
881 			pr_err("packet size is too short (%d < %d)\n",
882 			       tp_len, dev->hard_header_len);
883 			return -EINVAL;
884 		}
885 
886 		skb_push(skb, dev->hard_header_len);
887 		err = skb_store_bits(skb, 0, data,
888 				dev->hard_header_len);
889 		if (unlikely(err))
890 			return err;
891 
892 		data += dev->hard_header_len;
893 		to_write -= dev->hard_header_len;
894 	}
895 
896 	err = -EFAULT;
897 	page = virt_to_page(data);
898 	offset = offset_in_page(data);
899 	len_max = PAGE_SIZE - offset;
900 	len = ((to_write > len_max) ? len_max : to_write);
901 
902 	skb->data_len = to_write;
903 	skb->len += to_write;
904 	skb->truesize += to_write;
905 	atomic_add(to_write, &po->sk.sk_wmem_alloc);
906 
907 	while (likely(to_write)) {
908 		nr_frags = skb_shinfo(skb)->nr_frags;
909 
910 		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
911 			pr_err("Packet exceed the number of skb frags(%lu)\n",
912 			       MAX_SKB_FRAGS);
913 			return -EFAULT;
914 		}
915 
916 		flush_dcache_page(page);
917 		get_page(page);
918 		skb_fill_page_desc(skb,
919 				nr_frags,
920 				page++, offset, len);
921 		to_write -= len;
922 		offset = 0;
923 		len_max = PAGE_SIZE;
924 		len = ((to_write > len_max) ? len_max : to_write);
925 	}
926 
927 	return tp_len;
928 }
929 
930 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
931 {
932 	struct socket *sock;
933 	struct sk_buff *skb;
934 	struct net_device *dev;
935 	__be16 proto;
936 	int ifindex, err, reserve = 0;
937 	void *ph;
938 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
939 	int tp_len, size_max;
940 	unsigned char *addr;
941 	int len_sum = 0;
942 	int status = 0;
943 
944 	sock = po->sk.sk_socket;
945 
946 	mutex_lock(&po->pg_vec_lock);
947 
948 	err = -EBUSY;
949 	if (saddr == NULL) {
950 		ifindex	= po->ifindex;
951 		proto	= po->num;
952 		addr	= NULL;
953 	} else {
954 		err = -EINVAL;
955 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
956 			goto out;
957 		if (msg->msg_namelen < (saddr->sll_halen
958 					+ offsetof(struct sockaddr_ll,
959 						sll_addr)))
960 			goto out;
961 		ifindex	= saddr->sll_ifindex;
962 		proto	= saddr->sll_protocol;
963 		addr	= saddr->sll_addr;
964 	}
965 
966 	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
967 	err = -ENXIO;
968 	if (unlikely(dev == NULL))
969 		goto out;
970 
971 	reserve = dev->hard_header_len;
972 
973 	err = -ENETDOWN;
974 	if (unlikely(!(dev->flags & IFF_UP)))
975 		goto out_put;
976 
977 	size_max = po->tx_ring.frame_size
978 		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
979 
980 	if (size_max > dev->mtu + reserve)
981 		size_max = dev->mtu + reserve;
982 
983 	do {
984 		ph = packet_current_frame(po, &po->tx_ring,
985 				TP_STATUS_SEND_REQUEST);
986 
987 		if (unlikely(ph == NULL)) {
988 			schedule();
989 			continue;
990 		}
991 
992 		status = TP_STATUS_SEND_REQUEST;
993 		skb = sock_alloc_send_skb(&po->sk,
994 				LL_ALLOCATED_SPACE(dev)
995 				+ sizeof(struct sockaddr_ll),
996 				0, &err);
997 
998 		if (unlikely(skb == NULL))
999 			goto out_status;
1000 
1001 		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1002 				addr);
1003 
1004 		if (unlikely(tp_len < 0)) {
1005 			if (po->tp_loss) {
1006 				__packet_set_status(po, ph,
1007 						TP_STATUS_AVAILABLE);
1008 				packet_increment_head(&po->tx_ring);
1009 				kfree_skb(skb);
1010 				continue;
1011 			} else {
1012 				status = TP_STATUS_WRONG_FORMAT;
1013 				err = tp_len;
1014 				goto out_status;
1015 			}
1016 		}
1017 
1018 		skb->destructor = tpacket_destruct_skb;
1019 		__packet_set_status(po, ph, TP_STATUS_SENDING);
1020 		atomic_inc(&po->tx_ring.pending);
1021 
1022 		status = TP_STATUS_SEND_REQUEST;
1023 		err = dev_queue_xmit(skb);
1024 		if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1025 			goto out_xmit;
1026 		packet_increment_head(&po->tx_ring);
1027 		len_sum += tp_len;
1028 	} while (likely((ph != NULL) ||
1029 			((!(msg->msg_flags & MSG_DONTWAIT)) &&
1030 			 (atomic_read(&po->tx_ring.pending))))
1031 		);
1032 
1033 	err = len_sum;
1034 	goto out_put;
1035 
1036 out_xmit:
1037 	skb->destructor = sock_wfree;
1038 	atomic_dec(&po->tx_ring.pending);
1039 out_status:
1040 	__packet_set_status(po, ph, status);
1041 	kfree_skb(skb);
1042 out_put:
1043 	dev_put(dev);
1044 out:
1045 	mutex_unlock(&po->pg_vec_lock);
1046 	return err;
1047 }
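/*
 * Example (userspace sketch, illustration only): the producer side of the TX
 * ring drained by tpacket_snd() above.  The application claims an AVAILABLE
 * frame, fills it, marks it SEND_REQUEST and kicks the socket with send():
 *
 *	struct tpacket_hdr *hdr = frame_ptr(ring, i);	- placeholder helper
 *	unsigned int off = tp_hdrlen - sizeof(struct sockaddr_ll);
 *						- data offset used by
 *						  tpacket_fill_skb()
 *
 *	if (hdr->tp_status == TP_STATUS_AVAILABLE) {
 *		memcpy((char *)hdr + off, pkt, pkt_len);
 *		hdr->tp_len = pkt_len;
 *		hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *		send(fd, NULL, 0, 0);		- flushes all queued frames
 *	}
 *
 * tpacket_destruct_skb() flips the frame back to TP_STATUS_AVAILABLE once the
 * device has consumed it.
 */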
1048 #endif
1049 
1050 static int packet_snd(struct socket *sock,
1051 			  struct msghdr *msg, size_t len)
1052 {
1053 	struct sock *sk = sock->sk;
1054 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1055 	struct sk_buff *skb;
1056 	struct net_device *dev;
1057 	__be16 proto;
1058 	unsigned char *addr;
1059 	int ifindex, err, reserve = 0;
1060 
1061 	/*
1062 	 *	Get and verify the address.
1063 	 */
1064 
1065 	if (saddr == NULL) {
1066 		struct packet_sock *po = pkt_sk(sk);
1067 
1068 		ifindex	= po->ifindex;
1069 		proto	= po->num;
1070 		addr	= NULL;
1071 	} else {
1072 		err = -EINVAL;
1073 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1074 			goto out;
1075 		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1076 			goto out;
1077 		ifindex	= saddr->sll_ifindex;
1078 		proto	= saddr->sll_protocol;
1079 		addr	= saddr->sll_addr;
1080 	}
1081 
1082 
1083 	dev = dev_get_by_index(sock_net(sk), ifindex);
1084 	err = -ENXIO;
1085 	if (dev == NULL)
1086 		goto out_unlock;
1087 	if (sock->type == SOCK_RAW)
1088 		reserve = dev->hard_header_len;
1089 
1090 	err = -ENETDOWN;
1091 	if (!(dev->flags & IFF_UP))
1092 		goto out_unlock;
1093 
1094 	err = -EMSGSIZE;
1095 	if (len > dev->mtu+reserve)
1096 		goto out_unlock;
1097 
1098 	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
1099 				msg->msg_flags & MSG_DONTWAIT, &err);
1100 	if (skb == NULL)
1101 		goto out_unlock;
1102 
1103 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
1104 	skb_reset_network_header(skb);
1105 
1106 	err = -EINVAL;
1107 	if (sock->type == SOCK_DGRAM &&
1108 	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
1109 		goto out_free;
1110 
1111 	/* Returns -EFAULT on error */
1112 	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1113 	if (err)
1114 		goto out_free;
1115 
1116 	skb->protocol = proto;
1117 	skb->dev = dev;
1118 	skb->priority = sk->sk_priority;
1119 	skb->mark = sk->sk_mark;
1120 
1121 	/*
1122 	 *	Now send it
1123 	 */
1124 
1125 	err = dev_queue_xmit(skb);
1126 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
1127 		goto out_unlock;
1128 
1129 	dev_put(dev);
1130 
1131 	return len;
1132 
1133 out_free:
1134 	kfree_skb(skb);
1135 out_unlock:
1136 	if (dev)
1137 		dev_put(dev);
1138 out:
1139 	return err;
1140 }
1141 
1142 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1143 		struct msghdr *msg, size_t len)
1144 {
1145 #ifdef CONFIG_PACKET_MMAP
1146 	struct sock *sk = sock->sk;
1147 	struct packet_sock *po = pkt_sk(sk);
1148 	if (po->tx_ring.pg_vec)
1149 		return tpacket_snd(po, msg);
1150 	else
1151 #endif
1152 		return packet_snd(sock, msg, len);
1153 }
1154 
1155 /*
1156  *	Close a PACKET socket. This is fairly simple. We immediately go
1157  *	to 'closed' state and remove our protocol entry in the device list.
1158  */
1159 
1160 static int packet_release(struct socket *sock)
1161 {
1162 	struct sock *sk = sock->sk;
1163 	struct packet_sock *po;
1164 	struct net *net;
1165 #ifdef CONFIG_PACKET_MMAP
1166 	struct tpacket_req req;
1167 #endif
1168 
1169 	if (!sk)
1170 		return 0;
1171 
1172 	net = sock_net(sk);
1173 	po = pkt_sk(sk);
1174 
1175 	write_lock_bh(&net->packet.sklist_lock);
1176 	sk_del_node_init(sk);
1177 	sock_prot_inuse_add(net, sk->sk_prot, -1);
1178 	write_unlock_bh(&net->packet.sklist_lock);
1179 
1180 	/*
1181 	 *	Unhook packet receive handler.
1182 	 */
1183 
1184 	if (po->running) {
1185 		/*
1186 		 *	Remove the protocol hook
1187 		 */
1188 		dev_remove_pack(&po->prot_hook);
1189 		po->running = 0;
1190 		po->num = 0;
1191 		__sock_put(sk);
1192 	}
1193 
1194 	packet_flush_mclist(sk);
1195 
1196 #ifdef CONFIG_PACKET_MMAP
1197 	memset(&req, 0, sizeof(req));
1198 
1199 	if (po->rx_ring.pg_vec)
1200 		packet_set_ring(sk, &req, 1, 0);
1201 
1202 	if (po->tx_ring.pg_vec)
1203 		packet_set_ring(sk, &req, 1, 1);
1204 #endif
1205 
1206 	/*
1207 	 *	Now the socket is dead. No more input will appear.
1208 	 */
1209 
1210 	sock_orphan(sk);
1211 	sock->sk = NULL;
1212 
1213 	/* Purge queues */
1214 
1215 	skb_queue_purge(&sk->sk_receive_queue);
1216 	sk_refcnt_debug_release(sk);
1217 
1218 	sock_put(sk);
1219 	return 0;
1220 }
1221 
1222 /*
1223  *	Attach a packet hook.
1224  */
1225 
1226 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1227 {
1228 	struct packet_sock *po = pkt_sk(sk);
1229 	/*
1230 	 *	Detach an existing hook if present.
1231 	 */
1232 
1233 	lock_sock(sk);
1234 
1235 	spin_lock(&po->bind_lock);
1236 	if (po->running) {
1237 		__sock_put(sk);
1238 		po->running = 0;
1239 		po->num = 0;
1240 		spin_unlock(&po->bind_lock);
1241 		dev_remove_pack(&po->prot_hook);
1242 		spin_lock(&po->bind_lock);
1243 	}
1244 
1245 	po->num = protocol;
1246 	po->prot_hook.type = protocol;
1247 	po->prot_hook.dev = dev;
1248 
1249 	po->ifindex = dev ? dev->ifindex : 0;
1250 
1251 	if (protocol == 0)
1252 		goto out_unlock;
1253 
1254 	if (!dev || (dev->flags & IFF_UP)) {
1255 		dev_add_pack(&po->prot_hook);
1256 		sock_hold(sk);
1257 		po->running = 1;
1258 	} else {
1259 		sk->sk_err = ENETDOWN;
1260 		if (!sock_flag(sk, SOCK_DEAD))
1261 			sk->sk_error_report(sk);
1262 	}
1263 
1264 out_unlock:
1265 	spin_unlock(&po->bind_lock);
1266 	release_sock(sk);
1267 	return 0;
1268 }
1269 
1270 /*
1271  *	Bind a packet socket to a device
1272  */
1273 
1274 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1275 			    int addr_len)
1276 {
1277 	struct sock *sk = sock->sk;
1278 	char name[15];
1279 	struct net_device *dev;
1280 	int err = -ENODEV;
1281 
1282 	/*
1283 	 *	Check legality
1284 	 */
1285 
1286 	if (addr_len != sizeof(struct sockaddr))
1287 		return -EINVAL;
1288 	strlcpy(name, uaddr->sa_data, sizeof(name));
1289 
1290 	dev = dev_get_by_name(sock_net(sk), name);
1291 	if (dev) {
1292 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1293 		dev_put(dev);
1294 	}
1295 	return err;
1296 }
1297 
1298 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1299 {
1300 	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1301 	struct sock *sk = sock->sk;
1302 	struct net_device *dev = NULL;
1303 	int err;
1304 
1305 
1306 	/*
1307 	 *	Check legality
1308 	 */
1309 
1310 	if (addr_len < sizeof(struct sockaddr_ll))
1311 		return -EINVAL;
1312 	if (sll->sll_family != AF_PACKET)
1313 		return -EINVAL;
1314 
1315 	if (sll->sll_ifindex) {
1316 		err = -ENODEV;
1317 		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1318 		if (dev == NULL)
1319 			goto out;
1320 	}
1321 	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1322 	if (dev)
1323 		dev_put(dev);
1324 
1325 out:
1326 	return err;
1327 }
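/*
 * Example (userspace sketch, illustration only): binding an AF_PACKET socket
 * to one interface and one protocol, which is what packet_do_bind() wires up
 * through the prot_hook:
 *
 *	struct sockaddr_ll sll = { 0 };
 *
 *	sll.sll_family   = AF_PACKET;
 *	sll.sll_protocol = htons(ETH_P_IP);
 *	sll.sll_ifindex  = if_nametoindex("eth0");	- placeholder name
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *
 * A zero sll_ifindex keeps the socket listening on all devices.  A zero
 * sll_protocol keeps the socket's current protocol; if that too is zero,
 * packet_do_bind() leaves the hook detached.
 */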
1328 
1329 static struct proto packet_proto = {
1330 	.name	  = "PACKET",
1331 	.owner	  = THIS_MODULE,
1332 	.obj_size = sizeof(struct packet_sock),
1333 };
1334 
1335 /*
1336  *	Create a packet of type SOCK_PACKET.
1337  */
1338 
1339 static int packet_create(struct net *net, struct socket *sock, int protocol,
1340 			 int kern)
1341 {
1342 	struct sock *sk;
1343 	struct packet_sock *po;
1344 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1345 	int err;
1346 
1347 	if (!capable(CAP_NET_RAW))
1348 		return -EPERM;
1349 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1350 	    sock->type != SOCK_PACKET)
1351 		return -ESOCKTNOSUPPORT;
1352 
1353 	sock->state = SS_UNCONNECTED;
1354 
1355 	err = -ENOBUFS;
1356 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1357 	if (sk == NULL)
1358 		goto out;
1359 
1360 	sock->ops = &packet_ops;
1361 	if (sock->type == SOCK_PACKET)
1362 		sock->ops = &packet_ops_spkt;
1363 
1364 	sock_init_data(sock, sk);
1365 
1366 	po = pkt_sk(sk);
1367 	sk->sk_family = PF_PACKET;
1368 	po->num = proto;
1369 
1370 	sk->sk_destruct = packet_sock_destruct;
1371 	sk_refcnt_debug_inc(sk);
1372 
1373 	/*
1374 	 *	Attach a protocol block
1375 	 */
1376 
1377 	spin_lock_init(&po->bind_lock);
1378 	mutex_init(&po->pg_vec_lock);
1379 	po->prot_hook.func = packet_rcv;
1380 
1381 	if (sock->type == SOCK_PACKET)
1382 		po->prot_hook.func = packet_rcv_spkt;
1383 
1384 	po->prot_hook.af_packet_priv = sk;
1385 
1386 	if (proto) {
1387 		po->prot_hook.type = proto;
1388 		dev_add_pack(&po->prot_hook);
1389 		sock_hold(sk);
1390 		po->running = 1;
1391 	}
1392 
1393 	write_lock_bh(&net->packet.sklist_lock);
1394 	sk_add_node(sk, &net->packet.sklist);
1395 	sock_prot_inuse_add(net, &packet_proto, 1);
1396 	write_unlock_bh(&net->packet.sklist_lock);
1397 	return 0;
1398 out:
1399 	return err;
1400 }
1401 
1402 /*
1403  *	Pull a packet from our receive queue and hand it to the user.
1404  *	If necessary we block.
1405  */
1406 
1407 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1408 			  struct msghdr *msg, size_t len, int flags)
1409 {
1410 	struct sock *sk = sock->sk;
1411 	struct sk_buff *skb;
1412 	int copied, err;
1413 	struct sockaddr_ll *sll;
1414 
1415 	err = -EINVAL;
1416 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1417 		goto out;
1418 
1419 #if 0
1420 	/* What error should we return now? EUNATTACH? */
1421 	if (pkt_sk(sk)->ifindex < 0)
1422 		return -ENODEV;
1423 #endif
1424 
1425 	/*
1426 	 *	Call the generic datagram receiver. This handles all sorts
1427 	 *	of horrible races and re-entrancy so we can forget about it
1428 	 *	in the protocol layers.
1429 	 *
1430 	 *	Now it will return ENETDOWN if the device has just gone down,
1431 	 *	but then it will block.
1432 	 */
1433 
1434 	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1435 
1436 	/*
1437 	 *	An error occurred so return it. Because skb_recv_datagram()
1438 	 *	handles the blocking, we don't need to see or worry about blocking
1439 	 *	retries.
1440 	 */
1441 
1442 	if (skb == NULL)
1443 		goto out;
1444 
1445 	/*
1446 	 *	If the address length field is there to be filled in, we fill
1447 	 *	it in now.
1448 	 */
1449 
1450 	sll = &PACKET_SKB_CB(skb)->sa.ll;
1451 	if (sock->type == SOCK_PACKET)
1452 		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1453 	else
1454 		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1455 
1456 	/*
1457 	 *	You lose any data beyond the buffer you gave. If it worries a
1458 	 *	user program they can ask the device for its MTU anyway.
1459 	 */
1460 
1461 	copied = skb->len;
1462 	if (copied > len) {
1463 		copied = len;
1464 		msg->msg_flags |= MSG_TRUNC;
1465 	}
1466 
1467 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1468 	if (err)
1469 		goto out_free;
1470 
1471 	sock_recv_ts_and_drops(msg, sk, skb);
1472 
1473 	if (msg->msg_name)
1474 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1475 		       msg->msg_namelen);
1476 
1477 	if (pkt_sk(sk)->auxdata) {
1478 		struct tpacket_auxdata aux;
1479 
1480 		aux.tp_status = TP_STATUS_USER;
1481 		if (skb->ip_summed == CHECKSUM_PARTIAL)
1482 			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1483 		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1484 		aux.tp_snaplen = skb->len;
1485 		aux.tp_mac = 0;
1486 		aux.tp_net = skb_network_offset(skb);
1487 		aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1488 
1489 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1490 	}
1491 
1492 	/*
1493 	 *	Free or return the buffer as appropriate. Again this
1494 	 *	hides all the races and re-entrancy issues from us.
1495 	 */
1496 	err = (flags&MSG_TRUNC) ? skb->len : copied;
1497 
1498 out_free:
1499 	skb_free_datagram(sk, skb);
1500 out:
1501 	return err;
1502 }
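/*
 * Example (userspace sketch, illustration only): with PACKET_AUXDATA enabled,
 * the metadata packet_recvmsg() builds above arrives as a control message:
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *	...
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			- aux->tp_len is the original frame length,
 *			  aux->tp_snaplen what was actually delivered
 *		}
 *	}
 *
 * msg here is the struct msghdr passed to recvmsg(), with msg_control
 * pointing at a buffer of at least CMSG_SPACE(sizeof(struct tpacket_auxdata))
 * bytes.
 */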
1503 
1504 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1505 			       int *uaddr_len, int peer)
1506 {
1507 	struct net_device *dev;
1508 	struct sock *sk	= sock->sk;
1509 
1510 	if (peer)
1511 		return -EOPNOTSUPP;
1512 
1513 	uaddr->sa_family = AF_PACKET;
1514 	rcu_read_lock();
1515 	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1516 	if (dev)
1517 		strlcpy(uaddr->sa_data, dev->name, 15);
1518 	else
1519 		memset(uaddr->sa_data, 0, 14);
1520 	rcu_read_unlock();
1521 	*uaddr_len = sizeof(*uaddr);
1522 
1523 	return 0;
1524 }
1525 
1526 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1527 			  int *uaddr_len, int peer)
1528 {
1529 	struct net_device *dev;
1530 	struct sock *sk = sock->sk;
1531 	struct packet_sock *po = pkt_sk(sk);
1532 	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1533 
1534 	if (peer)
1535 		return -EOPNOTSUPP;
1536 
1537 	sll->sll_family = AF_PACKET;
1538 	sll->sll_ifindex = po->ifindex;
1539 	sll->sll_protocol = po->num;
1540 	rcu_read_lock();
1541 	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1542 	if (dev) {
1543 		sll->sll_hatype = dev->type;
1544 		sll->sll_halen = dev->addr_len;
1545 		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1546 	} else {
1547 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1548 		sll->sll_halen = 0;
1549 	}
1550 	rcu_read_unlock();
1551 	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1552 
1553 	return 0;
1554 }
1555 
1556 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1557 			 int what)
1558 {
1559 	switch (i->type) {
1560 	case PACKET_MR_MULTICAST:
1561 		if (what > 0)
1562 			return dev_mc_add(dev, i->addr, i->alen, 0);
1563 		else
1564 			return dev_mc_delete(dev, i->addr, i->alen, 0);
1565 		break;
1566 	case PACKET_MR_PROMISC:
1567 		return dev_set_promiscuity(dev, what);
1568 		break;
1569 	case PACKET_MR_ALLMULTI:
1570 		return dev_set_allmulti(dev, what);
1571 		break;
1572 	case PACKET_MR_UNICAST:
1573 		if (what > 0)
1574 			return dev_unicast_add(dev, i->addr);
1575 		else
1576 			return dev_unicast_delete(dev, i->addr);
1577 		break;
1578 	default:
1579 		break;
1580 	}
1581 	return 0;
1582 }
1583 
1584 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1585 {
1586 	for ( ; i; i = i->next) {
1587 		if (i->ifindex == dev->ifindex)
1588 			packet_dev_mc(dev, i, what);
1589 	}
1590 }
1591 
1592 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1593 {
1594 	struct packet_sock *po = pkt_sk(sk);
1595 	struct packet_mclist *ml, *i;
1596 	struct net_device *dev;
1597 	int err;
1598 
1599 	rtnl_lock();
1600 
1601 	err = -ENODEV;
1602 	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1603 	if (!dev)
1604 		goto done;
1605 
1606 	err = -EINVAL;
1607 	if (mreq->mr_alen > dev->addr_len)
1608 		goto done;
1609 
1610 	err = -ENOBUFS;
1611 	i = kmalloc(sizeof(*i), GFP_KERNEL);
1612 	if (i == NULL)
1613 		goto done;
1614 
1615 	err = 0;
1616 	for (ml = po->mclist; ml; ml = ml->next) {
1617 		if (ml->ifindex == mreq->mr_ifindex &&
1618 		    ml->type == mreq->mr_type &&
1619 		    ml->alen == mreq->mr_alen &&
1620 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1621 			ml->count++;
1622 			/* Free the new element ... */
1623 			kfree(i);
1624 			goto done;
1625 		}
1626 	}
1627 
1628 	i->type = mreq->mr_type;
1629 	i->ifindex = mreq->mr_ifindex;
1630 	i->alen = mreq->mr_alen;
1631 	memcpy(i->addr, mreq->mr_address, i->alen);
1632 	i->count = 1;
1633 	i->next = po->mclist;
1634 	po->mclist = i;
1635 	err = packet_dev_mc(dev, i, 1);
1636 	if (err) {
1637 		po->mclist = i->next;
1638 		kfree(i);
1639 	}
1640 
1641 done:
1642 	rtnl_unlock();
1643 	return err;
1644 }
1645 
1646 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1647 {
1648 	struct packet_mclist *ml, **mlp;
1649 
1650 	rtnl_lock();
1651 
1652 	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1653 		if (ml->ifindex == mreq->mr_ifindex &&
1654 		    ml->type == mreq->mr_type &&
1655 		    ml->alen == mreq->mr_alen &&
1656 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1657 			if (--ml->count == 0) {
1658 				struct net_device *dev;
1659 				*mlp = ml->next;
1660 				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1661 				if (dev)
1662 					packet_dev_mc(dev, ml, -1);
1663 				kfree(ml);
1664 			}
1665 			rtnl_unlock();
1666 			return 0;
1667 		}
1668 	}
1669 	rtnl_unlock();
1670 	return -EADDRNOTAVAIL;
1671 }
1672 
1673 static void packet_flush_mclist(struct sock *sk)
1674 {
1675 	struct packet_sock *po = pkt_sk(sk);
1676 	struct packet_mclist *ml;
1677 
1678 	if (!po->mclist)
1679 		return;
1680 
1681 	rtnl_lock();
1682 	while ((ml = po->mclist) != NULL) {
1683 		struct net_device *dev;
1684 
1685 		po->mclist = ml->next;
1686 		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1687 		if (dev != NULL)
1688 			packet_dev_mc(dev, ml, -1);
1689 		kfree(ml);
1690 	}
1691 	rtnl_unlock();
1692 }
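/*
 * Example (userspace sketch, illustration only): the membership list managed
 * above is driven by setsockopt().  Putting the bound device into promiscuous
 * mode for the lifetime of the socket looks roughly like:
 *
 *	struct packet_mreq mreq = { 0 };
 *
 *	mreq.mr_ifindex = if_nametoindex("eth0");	- placeholder name
 *	mreq.mr_type    = PACKET_MR_PROMISC;
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 *
 * PACKET_MR_MULTICAST additionally takes mr_alen/mr_address; the reference
 * counting in packet_mc_add()/packet_mc_drop() lets repeated adds and drops
 * of the same entry balance out.
 */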
1693 
1694 static int
1695 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1696 {
1697 	struct sock *sk = sock->sk;
1698 	struct packet_sock *po = pkt_sk(sk);
1699 	int ret;
1700 
1701 	if (level != SOL_PACKET)
1702 		return -ENOPROTOOPT;
1703 
1704 	switch (optname) {
1705 	case PACKET_ADD_MEMBERSHIP:
1706 	case PACKET_DROP_MEMBERSHIP:
1707 	{
1708 		struct packet_mreq_max mreq;
1709 		int len = optlen;
1710 		memset(&mreq, 0, sizeof(mreq));
1711 		if (len < sizeof(struct packet_mreq))
1712 			return -EINVAL;
1713 		if (len > sizeof(mreq))
1714 			len = sizeof(mreq);
1715 		if (copy_from_user(&mreq, optval, len))
1716 			return -EFAULT;
1717 		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1718 			return -EINVAL;
1719 		if (optname == PACKET_ADD_MEMBERSHIP)
1720 			ret = packet_mc_add(sk, &mreq);
1721 		else
1722 			ret = packet_mc_drop(sk, &mreq);
1723 		return ret;
1724 	}
1725 
1726 #ifdef CONFIG_PACKET_MMAP
1727 	case PACKET_RX_RING:
1728 	case PACKET_TX_RING:
1729 	{
1730 		struct tpacket_req req;
1731 
1732 		if (optlen < sizeof(req))
1733 			return -EINVAL;
1734 		if (copy_from_user(&req, optval, sizeof(req)))
1735 			return -EFAULT;
1736 		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1737 	}
1738 	case PACKET_COPY_THRESH:
1739 	{
1740 		int val;
1741 
1742 		if (optlen != sizeof(val))
1743 			return -EINVAL;
1744 		if (copy_from_user(&val, optval, sizeof(val)))
1745 			return -EFAULT;
1746 
1747 		pkt_sk(sk)->copy_thresh = val;
1748 		return 0;
1749 	}
1750 	case PACKET_VERSION:
1751 	{
1752 		int val;
1753 
1754 		if (optlen != sizeof(val))
1755 			return -EINVAL;
1756 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1757 			return -EBUSY;
1758 		if (copy_from_user(&val, optval, sizeof(val)))
1759 			return -EFAULT;
1760 		switch (val) {
1761 		case TPACKET_V1:
1762 		case TPACKET_V2:
1763 			po->tp_version = val;
1764 			return 0;
1765 		default:
1766 			return -EINVAL;
1767 		}
1768 	}
1769 	case PACKET_RESERVE:
1770 	{
1771 		unsigned int val;
1772 
1773 		if (optlen != sizeof(val))
1774 			return -EINVAL;
1775 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1776 			return -EBUSY;
1777 		if (copy_from_user(&val, optval, sizeof(val)))
1778 			return -EFAULT;
1779 		po->tp_reserve = val;
1780 		return 0;
1781 	}
1782 	case PACKET_LOSS:
1783 	{
1784 		unsigned int val;
1785 
1786 		if (optlen != sizeof(val))
1787 			return -EINVAL;
1788 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1789 			return -EBUSY;
1790 		if (copy_from_user(&val, optval, sizeof(val)))
1791 			return -EFAULT;
1792 		po->tp_loss = !!val;
1793 		return 0;
1794 	}
1795 #endif
1796 	case PACKET_AUXDATA:
1797 	{
1798 		int val;
1799 
1800 		if (optlen < sizeof(val))
1801 			return -EINVAL;
1802 		if (copy_from_user(&val, optval, sizeof(val)))
1803 			return -EFAULT;
1804 
1805 		po->auxdata = !!val;
1806 		return 0;
1807 	}
1808 	case PACKET_ORIGDEV:
1809 	{
1810 		int val;
1811 
1812 		if (optlen < sizeof(val))
1813 			return -EINVAL;
1814 		if (copy_from_user(&val, optval, sizeof(val)))
1815 			return -EFAULT;
1816 
1817 		po->origdev = !!val;
1818 		return 0;
1819 	}
1820 	default:
1821 		return -ENOPROTOOPT;
1822 	}
1823 }
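/*
 * Example (userspace sketch, illustration only): PACKET_VERSION and
 * PACKET_RESERVE above return -EBUSY once a ring exists, so a ring is
 * normally configured in this order (the sizes assume 4K pages):
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,		- multiple of PAGE_SIZE
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,		- multiple of TPACKET_ALIGNMENT
 *		.tp_frame_nr   = 64 * 2,	- block_nr * frames_per_block
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */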
1824 
1825 static int packet_getsockopt(struct socket *sock, int level, int optname,
1826 			     char __user *optval, int __user *optlen)
1827 {
1828 	int len;
1829 	int val;
1830 	struct sock *sk = sock->sk;
1831 	struct packet_sock *po = pkt_sk(sk);
1832 	void *data;
1833 	struct tpacket_stats st;
1834 
1835 	if (level != SOL_PACKET)
1836 		return -ENOPROTOOPT;
1837 
1838 	if (get_user(len, optlen))
1839 		return -EFAULT;
1840 
1841 	if (len < 0)
1842 		return -EINVAL;
1843 
1844 	switch (optname) {
1845 	case PACKET_STATISTICS:
1846 		if (len > sizeof(struct tpacket_stats))
1847 			len = sizeof(struct tpacket_stats);
1848 		spin_lock_bh(&sk->sk_receive_queue.lock);
1849 		st = po->stats;
1850 		memset(&po->stats, 0, sizeof(st));
1851 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1852 		st.tp_packets += st.tp_drops;
1853 
1854 		data = &st;
1855 		break;
1856 	case PACKET_AUXDATA:
1857 		if (len > sizeof(int))
1858 			len = sizeof(int);
1859 		val = po->auxdata;
1860 
1861 		data = &val;
1862 		break;
1863 	case PACKET_ORIGDEV:
1864 		if (len > sizeof(int))
1865 			len = sizeof(int);
1866 		val = po->origdev;
1867 
1868 		data = &val;
1869 		break;
1870 #ifdef CONFIG_PACKET_MMAP
1871 	case PACKET_VERSION:
1872 		if (len > sizeof(int))
1873 			len = sizeof(int);
1874 		val = po->tp_version;
1875 		data = &val;
1876 		break;
1877 	case PACKET_HDRLEN:
1878 		if (len > sizeof(int))
1879 			len = sizeof(int);
1880 		if (copy_from_user(&val, optval, len))
1881 			return -EFAULT;
1882 		switch (val) {
1883 		case TPACKET_V1:
1884 			val = sizeof(struct tpacket_hdr);
1885 			break;
1886 		case TPACKET_V2:
1887 			val = sizeof(struct tpacket2_hdr);
1888 			break;
1889 		default:
1890 			return -EINVAL;
1891 		}
1892 		data = &val;
1893 		break;
1894 	case PACKET_RESERVE:
1895 		if (len > sizeof(unsigned int))
1896 			len = sizeof(unsigned int);
1897 		val = po->tp_reserve;
1898 		data = &val;
1899 		break;
1900 	case PACKET_LOSS:
1901 		if (len > sizeof(unsigned int))
1902 			len = sizeof(unsigned int);
1903 		val = po->tp_loss;
1904 		data = &val;
1905 		break;
1906 #endif
1907 	default:
1908 		return -ENOPROTOOPT;
1909 	}
1910 
1911 	if (put_user(len, optlen))
1912 		return -EFAULT;
1913 	if (copy_to_user(optval, data, len))
1914 		return -EFAULT;
1915 	return 0;
1916 }
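/*
 * Example (userspace sketch, illustration only): reading the counters
 * maintained in packet_rcv()/tpacket_rcv().  Note that PACKET_STATISTICS is
 * destructive: the counters are zeroed on every read (see above):
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	- st.tp_packets: frames handled since the last read (includes drops)
 *	- st.tp_drops:   frames dropped since the last read
 */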
1917 
1918 
1919 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1920 {
1921 	struct sock *sk;
1922 	struct hlist_node *node;
1923 	struct net_device *dev = data;
1924 	struct net *net = dev_net(dev);
1925 
1926 	read_lock(&net->packet.sklist_lock);
1927 	sk_for_each(sk, node, &net->packet.sklist) {
1928 		struct packet_sock *po = pkt_sk(sk);
1929 
1930 		switch (msg) {
1931 		case NETDEV_UNREGISTER:
1932 			if (po->mclist)
1933 				packet_dev_mclist(dev, po->mclist, -1);
1934 			/* fallthrough */
1935 
1936 		case NETDEV_DOWN:
1937 			if (dev->ifindex == po->ifindex) {
1938 				spin_lock(&po->bind_lock);
1939 				if (po->running) {
1940 					__dev_remove_pack(&po->prot_hook);
1941 					__sock_put(sk);
1942 					po->running = 0;
1943 					sk->sk_err = ENETDOWN;
1944 					if (!sock_flag(sk, SOCK_DEAD))
1945 						sk->sk_error_report(sk);
1946 				}
1947 				if (msg == NETDEV_UNREGISTER) {
1948 					po->ifindex = -1;
1949 					po->prot_hook.dev = NULL;
1950 				}
1951 				spin_unlock(&po->bind_lock);
1952 			}
1953 			break;
1954 		case NETDEV_UP:
1955 			spin_lock(&po->bind_lock);
1956 			if (dev->ifindex == po->ifindex && po->num &&
1957 			    !po->running) {
1958 				dev_add_pack(&po->prot_hook);
1959 				sock_hold(sk);
1960 				po->running = 1;
1961 			}
1962 			spin_unlock(&po->bind_lock);
1963 			break;
1964 		}
1965 	}
1966 	read_unlock(&net->packet.sklist_lock);
1967 	return NOTIFY_DONE;
1968 }
1969 
1970 
1971 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1972 			unsigned long arg)
1973 {
1974 	struct sock *sk = sock->sk;
1975 
1976 	switch (cmd) {
1977 	case SIOCOUTQ:
1978 	{
1979 		int amount = sk_wmem_alloc_get(sk);
1980 
1981 		return put_user(amount, (int __user *)arg);
1982 	}
1983 	case SIOCINQ:
1984 	{
1985 		struct sk_buff *skb;
1986 		int amount = 0;
1987 
1988 		spin_lock_bh(&sk->sk_receive_queue.lock);
1989 		skb = skb_peek(&sk->sk_receive_queue);
1990 		if (skb)
1991 			amount = skb->len;
1992 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1993 		return put_user(amount, (int __user *)arg);
1994 	}
1995 	case SIOCGSTAMP:
1996 		return sock_get_timestamp(sk, (struct timeval __user *)arg);
1997 	case SIOCGSTAMPNS:
1998 		return sock_get_timestampns(sk, (struct timespec __user *)arg);
1999 
2000 #ifdef CONFIG_INET
2001 	case SIOCADDRT:
2002 	case SIOCDELRT:
2003 	case SIOCDARP:
2004 	case SIOCGARP:
2005 	case SIOCSARP:
2006 	case SIOCGIFADDR:
2007 	case SIOCSIFADDR:
2008 	case SIOCGIFBRDADDR:
2009 	case SIOCSIFBRDADDR:
2010 	case SIOCGIFNETMASK:
2011 	case SIOCSIFNETMASK:
2012 	case SIOCGIFDSTADDR:
2013 	case SIOCSIFDSTADDR:
2014 	case SIOCSIFFLAGS:
2015 		if (!net_eq(sock_net(sk), &init_net))
2016 			return -ENOIOCTLCMD;
2017 		return inet_dgram_ops.ioctl(sock, cmd, arg);
2018 #endif
2019 
2020 	default:
2021 		return -ENOIOCTLCMD;
2022 	}
2023 	return 0;
2024 }
2025 
2026 #ifndef CONFIG_PACKET_MMAP
2027 #define packet_mmap sock_no_mmap
2028 #define packet_poll datagram_poll
2029 #else
2030 
2031 static unsigned int packet_poll(struct file *file, struct socket *sock,
2032 				poll_table *wait)
2033 {
2034 	struct sock *sk = sock->sk;
2035 	struct packet_sock *po = pkt_sk(sk);
2036 	unsigned int mask = datagram_poll(file, sock, wait);
2037 
2038 	spin_lock_bh(&sk->sk_receive_queue.lock);
2039 	if (po->rx_ring.pg_vec) {
2040 		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2041 			mask |= POLLIN | POLLRDNORM;
2042 	}
2043 	spin_unlock_bh(&sk->sk_receive_queue.lock);
2044 	spin_lock_bh(&sk->sk_write_queue.lock);
2045 	if (po->tx_ring.pg_vec) {
2046 		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2047 			mask |= POLLOUT | POLLWRNORM;
2048 	}
2049 	spin_unlock_bh(&sk->sk_write_queue.lock);
2050 	return mask;
2051 }
2052 
2053 
2054 /* Dirty? Well, I still have not learned a better way to account
2055  * for user mmaps.
2056  */
2057 
2058 static void packet_mm_open(struct vm_area_struct *vma)
2059 {
2060 	struct file *file = vma->vm_file;
2061 	struct socket *sock = file->private_data;
2062 	struct sock *sk = sock->sk;
2063 
2064 	if (sk)
2065 		atomic_inc(&pkt_sk(sk)->mapped);
2066 }
2067 
2068 static void packet_mm_close(struct vm_area_struct *vma)
2069 {
2070 	struct file *file = vma->vm_file;
2071 	struct socket *sock = file->private_data;
2072 	struct sock *sk = sock->sk;
2073 
2074 	if (sk)
2075 		atomic_dec(&pkt_sk(sk)->mapped);
2076 }
2077 
2078 static const struct vm_operations_struct packet_mmap_ops = {
2079 	.open	=	packet_mm_open,
2080 	.close	=	packet_mm_close,
2081 };
2082 
2083 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2084 {
2085 	int i;
2086 
2087 	for (i = 0; i < len; i++) {
2088 		if (likely(pg_vec[i]))
2089 			free_pages((unsigned long) pg_vec[i], order);
2090 	}
2091 	kfree(pg_vec);
2092 }
2093 
2094 static inline char *alloc_one_pg_vec_page(unsigned long order)
2095 {
2096 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2097 
2098 	return (char *) __get_free_pages(gfp_flags, order);
2099 }
2100 
2101 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2102 {
2103 	unsigned int block_nr = req->tp_block_nr;
2104 	char **pg_vec;
2105 	int i;
2106 
2107 	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2108 	if (unlikely(!pg_vec))
2109 		goto out;
2110 
2111 	for (i = 0; i < block_nr; i++) {
2112 		pg_vec[i] = alloc_one_pg_vec_page(order);
2113 		if (unlikely(!pg_vec[i]))
2114 			goto out_free_pgvec;
2115 	}
2116 
2117 out:
2118 	return pg_vec;
2119 
2120 out_free_pgvec:
2121 	free_pg_vec(pg_vec, order, block_nr);
2122 	pg_vec = NULL;
2123 	goto out;
2124 }
2125 
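/*
 * Create (tp_block_nr != 0) or tear down (tp_block_nr == 0) the rx or tx
 * ring.  The sequence is: refuse while the ring is mmap'ed or tx frames
 * are still pending, validate the block/frame geometry and allocate the
 * new pg_vec, unhook the protocol handler and synchronize_net(), swap the
 * pg_vec in under pg_vec_lock and the queue lock (purging the queue and
 * selecting tpacket_rcv vs. packet_rcv), then rehook the handler and free
 * whatever pg_vec was replaced.
 */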
2126 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2127 		int closing, int tx_ring)
2128 {
2129 	char **pg_vec = NULL;
2130 	struct packet_sock *po = pkt_sk(sk);
2131 	int was_running, order = 0;
2132 	struct packet_ring_buffer *rb;
2133 	struct sk_buff_head *rb_queue;
2134 	__be16 num;
2135 	int err;
2136 
2137 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2138 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2139 
2140 	err = -EBUSY;
2141 	if (!closing) {
2142 		if (atomic_read(&po->mapped))
2143 			goto out;
2144 		if (atomic_read(&rb->pending))
2145 			goto out;
2146 	}
2147 
2148 	if (req->tp_block_nr) {
2149 		/* Sanity tests and some calculations */
2150 		err = -EBUSY;
2151 		if (unlikely(rb->pg_vec))
2152 			goto out;
2153 
2154 		switch (po->tp_version) {
2155 		case TPACKET_V1:
2156 			po->tp_hdrlen = TPACKET_HDRLEN;
2157 			break;
2158 		case TPACKET_V2:
2159 			po->tp_hdrlen = TPACKET2_HDRLEN;
2160 			break;
2161 		}
2162 
2163 		err = -EINVAL;
2164 		if (unlikely((int)req->tp_block_size <= 0))
2165 			goto out;
2166 		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2167 			goto out;
2168 		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2169 					po->tp_reserve))
2170 			goto out;
2171 		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2172 			goto out;
2173 
2174 		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2175 		if (unlikely(rb->frames_per_block <= 0))
2176 			goto out;
2177 		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2178 					req->tp_frame_nr))
2179 			goto out;
2180 
2181 		err = -ENOMEM;
2182 		order = get_order(req->tp_block_size);
2183 		pg_vec = alloc_pg_vec(req, order);
2184 		if (unlikely(!pg_vec))
2185 			goto out;
2186 	}
2187 	/* No blocks requested: the existing ring (if any) is torn down. */
2188 	else {
2189 		err = -EINVAL;
2190 		if (unlikely(req->tp_frame_nr))
2191 			goto out;
2192 	}
2193 
2194 	lock_sock(sk);
2195 
2196 	/* Detach socket from network */
2197 	spin_lock(&po->bind_lock);
2198 	was_running = po->running;
2199 	num = po->num;
2200 	if (was_running) {
2201 		__dev_remove_pack(&po->prot_hook);
2202 		po->num = 0;
2203 		po->running = 0;
2204 		__sock_put(sk);
2205 	}
2206 	spin_unlock(&po->bind_lock);
2207 
2208 	synchronize_net();
2209 
2210 	err = -EBUSY;
2211 	mutex_lock(&po->pg_vec_lock);
2212 	if (closing || atomic_read(&po->mapped) == 0) {
2213 		err = 0;
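		/* XC(a, b): set a to b and return a's previous value. */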
2214 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2215 		spin_lock_bh(&rb_queue->lock);
2216 		pg_vec = XC(rb->pg_vec, pg_vec);
2217 		rb->frame_max = (req->tp_frame_nr - 1);
2218 		rb->head = 0;
2219 		rb->frame_size = req->tp_frame_size;
2220 		spin_unlock_bh(&rb_queue->lock);
2221 
2222 		order = XC(rb->pg_vec_order, order);
2223 		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2224 
2225 		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2226 		po->prot_hook.func = (po->rx_ring.pg_vec) ?
2227 						tpacket_rcv : packet_rcv;
2228 		skb_queue_purge(rb_queue);
2229 #undef XC
2230 		if (atomic_read(&po->mapped))
2231 			pr_err("packet_mmap: vma is busy: %d\n",
2232 			       atomic_read(&po->mapped));
2233 	}
2234 	mutex_unlock(&po->pg_vec_lock);
2235 
2236 	spin_lock(&po->bind_lock);
2237 	if (was_running && !po->running) {
2238 		sock_hold(sk);
2239 		po->running = 1;
2240 		po->num = num;
2241 		dev_add_pack(&po->prot_hook);
2242 	}
2243 	spin_unlock(&po->bind_lock);
2244 
2245 	release_sock(sk);
2246 
2247 	if (pg_vec)
2248 		free_pg_vec(pg_vec, order, req->tp_block_nr);
2249 out:
2250 	return err;
2251 }
2252 
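/*
 * Map the rx ring followed by the tx ring into a single VMA.  The mapping
 * must start at offset 0 and be exactly as large as both rings combined;
 * every page of every block is inserted with vm_insert_page(), and
 * po->mapped is bumped so the rings cannot be freed underneath us.
 */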
2253 static int packet_mmap(struct file *file, struct socket *sock,
2254 		struct vm_area_struct *vma)
2255 {
2256 	struct sock *sk = sock->sk;
2257 	struct packet_sock *po = pkt_sk(sk);
2258 	unsigned long size, expected_size;
2259 	struct packet_ring_buffer *rb;
2260 	unsigned long start;
2261 	int err = -EINVAL;
2262 	int i;
2263 
2264 	if (vma->vm_pgoff)
2265 		return -EINVAL;
2266 
2267 	mutex_lock(&po->pg_vec_lock);
2268 
2269 	expected_size = 0;
2270 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2271 		if (rb->pg_vec) {
2272 			expected_size += rb->pg_vec_len
2273 						* rb->pg_vec_pages
2274 						* PAGE_SIZE;
2275 		}
2276 	}
2277 
2278 	if (expected_size == 0)
2279 		goto out;
2280 
2281 	size = vma->vm_end - vma->vm_start;
2282 	if (size != expected_size)
2283 		goto out;
2284 
2285 	start = vma->vm_start;
2286 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2287 		if (rb->pg_vec == NULL)
2288 			continue;
2289 
2290 		for (i = 0; i < rb->pg_vec_len; i++) {
2291 			struct page *page = virt_to_page(rb->pg_vec[i]);
2292 			int pg_num;
2293 
2294 			for (pg_num = 0; pg_num < rb->pg_vec_pages;
2295 					pg_num++, page++) {
2296 				err = vm_insert_page(vma, start, page);
2297 				if (unlikely(err))
2298 					goto out;
2299 				start += PAGE_SIZE;
2300 			}
2301 		}
2302 	}
2303 
2304 	atomic_inc(&po->mapped);
2305 	vma->vm_ops = &packet_mmap_ops;
2306 	err = 0;
2307 
2308 out:
2309 	mutex_unlock(&po->pg_vec_lock);
2310 	return err;
2311 }
2312 #endif
2313 
2314 
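/* Ops for legacy SOCK_PACKET sockets: no socket options and no mmap ring. */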
2315 static const struct proto_ops packet_ops_spkt = {
2316 	.family =	PF_PACKET,
2317 	.owner =	THIS_MODULE,
2318 	.release =	packet_release,
2319 	.bind =		packet_bind_spkt,
2320 	.connect =	sock_no_connect,
2321 	.socketpair =	sock_no_socketpair,
2322 	.accept =	sock_no_accept,
2323 	.getname =	packet_getname_spkt,
2324 	.poll =		datagram_poll,
2325 	.ioctl =	packet_ioctl,
2326 	.listen =	sock_no_listen,
2327 	.shutdown =	sock_no_shutdown,
2328 	.setsockopt =	sock_no_setsockopt,
2329 	.getsockopt =	sock_no_getsockopt,
2330 	.sendmsg =	packet_sendmsg_spkt,
2331 	.recvmsg =	packet_recvmsg,
2332 	.mmap =		sock_no_mmap,
2333 	.sendpage =	sock_no_sendpage,
2334 };
2335 
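/* Ops for SOCK_RAW/SOCK_DGRAM PF_PACKET sockets, including the mmap ring. */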
2336 static const struct proto_ops packet_ops = {
2337 	.family =	PF_PACKET,
2338 	.owner =	THIS_MODULE,
2339 	.release =	packet_release,
2340 	.bind =		packet_bind,
2341 	.connect =	sock_no_connect,
2342 	.socketpair =	sock_no_socketpair,
2343 	.accept =	sock_no_accept,
2344 	.getname =	packet_getname,
2345 	.poll =		packet_poll,
2346 	.ioctl =	packet_ioctl,
2347 	.listen =	sock_no_listen,
2348 	.shutdown =	sock_no_shutdown,
2349 	.setsockopt =	packet_setsockopt,
2350 	.getsockopt =	packet_getsockopt,
2351 	.sendmsg =	packet_sendmsg,
2352 	.recvmsg =	packet_recvmsg,
2353 	.mmap =		packet_mmap,
2354 	.sendpage =	sock_no_sendpage,
2355 };
2356 
2357 static const struct net_proto_family packet_family_ops = {
2358 	.family =	PF_PACKET,
2359 	.create =	packet_create,
2360 	.owner	=	THIS_MODULE,
2361 };
2362 
2363 static struct notifier_block packet_netdev_notifier = {
2364 	.notifier_call =	packet_notifier,
2365 };
2366 
2367 #ifdef CONFIG_PROC_FS
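/*
 * /proc/net/packet: one line per packet socket in this namespace, showing
 * the socket address, refcount, type, protocol, ifindex, running flag,
 * receive queue memory, owning uid and inode.
 */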
2368 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2369 {
2370 	struct sock *s;
2371 	struct hlist_node *node;
2372 
2373 	sk_for_each(s, node, &net->packet.sklist) {
2374 		if (!off--)
2375 			return s;
2376 	}
2377 	return NULL;
2378 }
2379 
2380 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2381 	__acquires(seq_file_net(seq)->packet.sklist_lock)
2382 {
2383 	struct net *net = seq_file_net(seq);
2384 	read_lock(&net->packet.sklist_lock);
2385 	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2386 }
2387 
2388 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2389 {
2390 	struct net *net = seq_file_net(seq);
2391 	++*pos;
2392 	return (v == SEQ_START_TOKEN)
2393 		? sk_head(&net->packet.sklist)
2394 		: sk_next((struct sock *)v);
2395 }
2396 
2397 static void packet_seq_stop(struct seq_file *seq, void *v)
2398 	__releases(seq_file_net(seq)->packet.sklist_lock)
2399 {
2400 	struct net *net = seq_file_net(seq);
2401 	read_unlock(&net->packet.sklist_lock);
2402 }
2403 
2404 static int packet_seq_show(struct seq_file *seq, void *v)
2405 {
2406 	if (v == SEQ_START_TOKEN)
2407 		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2408 	else {
2409 		struct sock *s = v;
2410 		const struct packet_sock *po = pkt_sk(s);
2411 
2412 		seq_printf(seq,
2413 			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2414 			   s,
2415 			   atomic_read(&s->sk_refcnt),
2416 			   s->sk_type,
2417 			   ntohs(po->num),
2418 			   po->ifindex,
2419 			   po->running,
2420 			   atomic_read(&s->sk_rmem_alloc),
2421 			   sock_i_uid(s),
2422 			   sock_i_ino(s));
2423 	}
2424 
2425 	return 0;
2426 }
2427 
2428 static const struct seq_operations packet_seq_ops = {
2429 	.start	= packet_seq_start,
2430 	.next	= packet_seq_next,
2431 	.stop	= packet_seq_stop,
2432 	.show	= packet_seq_show,
2433 };
2434 
2435 static int packet_seq_open(struct inode *inode, struct file *file)
2436 {
2437 	return seq_open_net(inode, file, &packet_seq_ops,
2438 			    sizeof(struct seq_net_private));
2439 }
2440 
2441 static const struct file_operations packet_seq_fops = {
2442 	.owner		= THIS_MODULE,
2443 	.open		= packet_seq_open,
2444 	.read		= seq_read,
2445 	.llseek		= seq_lseek,
2446 	.release	= seq_release_net,
2447 };
2448 
2449 #endif
2450 
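/*
 * Per-namespace setup: initialise the socket list and create the
 * /proc/net/packet entry; teardown just removes the proc entry.
 */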
2451 static int packet_net_init(struct net *net)
2452 {
2453 	rwlock_init(&net->packet.sklist_lock);
2454 	INIT_HLIST_HEAD(&net->packet.sklist);
2455 
2456 	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2457 		return -ENOMEM;
2458 
2459 	return 0;
2460 }
2461 
2462 static void packet_net_exit(struct net *net)
2463 {
2464 	proc_net_remove(net, "packet");
2465 }
2466 
2467 static struct pernet_operations packet_net_ops = {
2468 	.init = packet_net_init,
2469 	.exit = packet_net_exit,
2470 };
2471 
2472 
2473 static void __exit packet_exit(void)
2474 {
2475 	unregister_netdevice_notifier(&packet_netdev_notifier);
2476 	unregister_pernet_subsys(&packet_net_ops);
2477 	sock_unregister(PF_PACKET);
2478 	proto_unregister(&packet_proto);
2479 }
2480 
2481 static int __init packet_init(void)
2482 {
2483 	int rc = proto_register(&packet_proto, 0);
2484 
2485 	if (rc != 0)
2486 		goto out;
2487 
2488 	sock_register(&packet_family_ops);
2489 	register_pernet_subsys(&packet_net_ops);
2490 	register_netdevice_notifier(&packet_netdev_notifier);
2491 out:
2492 	return rc;
2493 }
2494 
2495 module_init(packet_init);
2496 module_exit(packet_exit);
2497 MODULE_LICENSE("GPL");
2498 MODULE_ALIAS_NETPROTO(PF_PACKET);
2499