xref: /openbmc/linux/net/packet/af_packet.c (revision e8e0929d)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		PACKET - implements raw packet sockets.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *		Alan Cox	:	verify_area() now used correctly
14  *		Alan Cox	:	new skbuff lists, look ma no backlogs!
15  *		Alan Cox	:	tidied skbuff lists.
16  *		Alan Cox	:	Now uses generic datagram routines I
17  *					added. Also fixed the peek/read crash
18  *					from all old Linux datagram code.
19  *		Alan Cox	:	Uses the improved datagram code.
20  *		Alan Cox	:	Added NULL's for socket options.
21  *		Alan Cox	:	Re-commented the code.
22  *		Alan Cox	:	Use new kernel side addressing
23  *		Rob Janssen	:	Correct MTU usage.
24  *		Dave Platt	:	Counter leaks caused by incorrect
25  *					interrupt locking and some slightly
26  *					dubious gcc output. Can you read
27  *					compiler: it said _VOLATILE_
28  *	Richard Kooijman	:	Timestamp fixes.
29  *		Alan Cox	:	New buffers. Use sk->mac.raw.
30  *		Alan Cox	:	sendmsg/recvmsg support.
31  *		Alan Cox	:	Protocol setting support
32  *	Alexey Kuznetsov	:	Untied from IPv4 stack.
33  *	Cyrus Durgin		:	Fixed kerneld for kmod.
34  *	Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
38  *					The convention is that longer addresses
39  *					will simply extend the hardware address
40  *					byte arrays at the end of sockaddr_ll
41  *					and packet_mreq.
42  *		Johann Baudy	:	Added TX RING.
43  *
44  *		This program is free software; you can redistribute it and/or
45  *		modify it under the terms of the GNU General Public License
46  *		as published by the Free Software Foundation; either version
47  *		2 of the License, or (at your option) any later version.
48  *
49  */
50 
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <net/net_namespace.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 #include <linux/mutex.h>
82 
83 #ifdef CONFIG_INET
84 #include <net/inet_common.h>
85 #endif
86 
87 /*
88    Assumptions:
89    - if a device has no dev->hard_header routine, it adds and removes the ll
90      header inside itself. In this case the ll header is invisible outside of
91      the device, but higher levels should still reserve dev->hard_header_len.
92      Some devices are clever enough to reallocate the skb when the header
93      does not fit in the reserved space (tunnels); others are silly
94      (PPP).
95    - the packet socket receives packets with the ll header pulled,
96      so SOCK_RAW should push it back.
97 
98 On receive:
99 -----------
100 
101 Incoming, dev->hard_header!=NULL
102    mac_header -> ll header
103    data       -> data
104 
105 Outgoing, dev->hard_header!=NULL
106    mac_header -> ll header
107    data       -> ll header
108 
109 Incoming, dev->hard_header==NULL
110    mac_header -> UNKNOWN position. It is very likely that it points to the ll
111 		 header.  PPP does this, which is wrong, because it introduces
112 		 asymmetry between the rx and tx paths.
113    data       -> data
114 
115 Outgoing, dev->hard_header==NULL
116    mac_header -> data. ll header is still not built!
117    data       -> data
118 
119 In summary:
120   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
121 
122 
123 On transmit:
124 ------------
125 
126 dev->hard_header != NULL
127    mac_header -> ll header
128    data       -> ll header
129 
130 dev->hard_header == NULL (ll header is added by device, we cannot control it)
131    mac_header -> data
132    data       -> data
133 
134    We should set nh.raw on output to the correct position;
135    the packet classifier depends on it.
136  */
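/* Editor's note (an illustrative sketch, not part of the original file):
 * how the layout described above looks from user space on an Ethernet
 * device (assumes <sys/socket.h>, <linux/if_packet.h>, <linux/if_ether.h>
 * and <arpa/inet.h>).  With SOCK_RAW the ll header is pushed back before
 * the frame is queued, so it is part of the returned data; with SOCK_DGRAM
 * it stays pulled and is reported via sockaddr_ll instead.
 *
 *	int raw  = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgrm = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	unsigned char buf[2048];
 *
 *	recv(raw, buf, sizeof(buf), 0);
 *		// buf[0..13] = Ethernet header, buf[14..] = payload
 *	recv(dgrm, buf, sizeof(buf), 0);
 *		// buf[0..] = payload only
 */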
137 
138 /* Private packet socket structures. */
139 
140 struct packet_mclist {
141 	struct packet_mclist	*next;
142 	int			ifindex;
143 	int			count;
144 	unsigned short		type;
145 	unsigned short		alen;
146 	unsigned char		addr[MAX_ADDR_LEN];
147 };
148 /* identical to struct packet_mreq except it has
149  * a longer address field.
150  */
151 struct packet_mreq_max {
152 	int		mr_ifindex;
153 	unsigned short	mr_type;
154 	unsigned short	mr_alen;
155 	unsigned char	mr_address[MAX_ADDR_LEN];
156 };
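/* Editor's note (illustrative sketch, not part of the original file):
 * user space passes the shorter struct packet_mreq; the kernel copies it
 * into the packet_mreq_max above so that longer hardware addresses fit.
 * Putting an interface into promiscuous mode, assuming ifindex is valid:
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */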
157 
158 #ifdef CONFIG_PACKET_MMAP
159 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
160 		int closing, int tx_ring);
161 
162 struct packet_ring_buffer {
163 	char			**pg_vec;
164 	unsigned int		head;
165 	unsigned int		frames_per_block;
166 	unsigned int		frame_size;
167 	unsigned int		frame_max;
168 
169 	unsigned int		pg_vec_order;
170 	unsigned int		pg_vec_pages;
171 	unsigned int		pg_vec_len;
172 
173 	atomic_t		pending;
174 };
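/* Editor's note (illustrative sketch, not part of the original file):
 * how the fields above relate to the user-supplied struct tpacket_req in
 * packet_set_ring().  For example, tp_block_size = 4096, tp_block_nr = 64,
 * tp_frame_size = 2048, tp_frame_nr = 128 gives:
 *
 *	frames_per_block = tp_block_size / tp_frame_size;	// 2
 *	frame_size       = tp_frame_size;			// 2048
 *	frame_max        = tp_frame_nr - 1;			// 127
 *	pg_vec_len       = tp_block_nr;				// 64 blocks
 *	pg_vec_pages     = tp_block_size / PAGE_SIZE;		// 1 page per block
 *
 * head walks 0..frame_max and wraps to 0 (see packet_increment_head()).
 */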
175 
176 struct packet_sock;
177 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
178 #endif
179 
180 static void packet_flush_mclist(struct sock *sk);
181 
182 struct packet_sock {
183 	/* struct sock has to be the first member of packet_sock */
184 	struct sock		sk;
185 	struct tpacket_stats	stats;
186 #ifdef CONFIG_PACKET_MMAP
187 	struct packet_ring_buffer	rx_ring;
188 	struct packet_ring_buffer	tx_ring;
189 	int			copy_thresh;
190 #endif
191 	struct packet_type	prot_hook;
192 	spinlock_t		bind_lock;
193 	struct mutex		pg_vec_lock;
194 	unsigned int		running:1,	/* prot_hook is attached*/
195 				auxdata:1,
196 				origdev:1;
197 	int			ifindex;	/* bound device		*/
198 	__be16			num;
199 	struct packet_mclist	*mclist;
200 #ifdef CONFIG_PACKET_MMAP
201 	atomic_t		mapped;
202 	enum tpacket_versions	tp_version;
203 	unsigned int		tp_hdrlen;
204 	unsigned int		tp_reserve;
205 	unsigned int		tp_loss:1;
206 #endif
207 };
208 
209 struct packet_skb_cb {
210 	unsigned int origlen;
211 	union {
212 		struct sockaddr_pkt pkt;
213 		struct sockaddr_ll ll;
214 	} sa;
215 };
216 
217 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
218 
219 #ifdef CONFIG_PACKET_MMAP
220 
221 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
222 {
223 	union {
224 		struct tpacket_hdr *h1;
225 		struct tpacket2_hdr *h2;
226 		void *raw;
227 	} h;
228 
229 	h.raw = frame;
230 	switch (po->tp_version) {
231 	case TPACKET_V1:
232 		h.h1->tp_status = status;
233 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
234 		break;
235 	case TPACKET_V2:
236 		h.h2->tp_status = status;
237 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
238 		break;
239 	default:
240 		pr_err("TPACKET version not supported\n");
241 		BUG();
242 	}
243 
244 	smp_wmb();
245 }
246 
247 static int __packet_get_status(struct packet_sock *po, void *frame)
248 {
249 	union {
250 		struct tpacket_hdr *h1;
251 		struct tpacket2_hdr *h2;
252 		void *raw;
253 	} h;
254 
255 	smp_rmb();
256 
257 	h.raw = frame;
258 	switch (po->tp_version) {
259 	case TPACKET_V1:
260 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
261 		return h.h1->tp_status;
262 	case TPACKET_V2:
263 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
264 		return h.h2->tp_status;
265 	default:
266 		pr_err("TPACKET version not supported\n");
267 		BUG();
268 		return 0;
269 	}
270 }
271 
272 static void *packet_lookup_frame(struct packet_sock *po,
273 		struct packet_ring_buffer *rb,
274 		unsigned int position,
275 		int status)
276 {
277 	unsigned int pg_vec_pos, frame_offset;
278 	union {
279 		struct tpacket_hdr *h1;
280 		struct tpacket2_hdr *h2;
281 		void *raw;
282 	} h;
283 
284 	pg_vec_pos = position / rb->frames_per_block;
285 	frame_offset = position % rb->frames_per_block;
286 
287 	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
288 
289 	if (status != __packet_get_status(po, h.raw))
290 		return NULL;
291 
292 	return h.raw;
293 }
294 
295 static inline void *packet_current_frame(struct packet_sock *po,
296 		struct packet_ring_buffer *rb,
297 		int status)
298 {
299 	return packet_lookup_frame(po, rb, rb->head, status);
300 }
301 
302 static inline void *packet_previous_frame(struct packet_sock *po,
303 		struct packet_ring_buffer *rb,
304 		int status)
305 {
306 	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
307 	return packet_lookup_frame(po, rb, previous, status);
308 }
309 
310 static inline void packet_increment_head(struct packet_ring_buffer *buff)
311 {
312 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
313 }
314 
315 #endif
316 
317 static inline struct packet_sock *pkt_sk(struct sock *sk)
318 {
319 	return (struct packet_sock *)sk;
320 }
321 
322 static void packet_sock_destruct(struct sock *sk)
323 {
324 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
325 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
326 
327 	if (!sock_flag(sk, SOCK_DEAD)) {
328 		pr_err("Attempt to release alive packet socket: %p\n", sk);
329 		return;
330 	}
331 
332 	sk_refcnt_debug_dec(sk);
333 }
334 
335 
336 static const struct proto_ops packet_ops;
337 
338 static const struct proto_ops packet_ops_spkt;
339 
340 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
341 			   struct packet_type *pt, struct net_device *orig_dev)
342 {
343 	struct sock *sk;
344 	struct sockaddr_pkt *spkt;
345 
346 	/*
347 	 *	When we registered the protocol we saved the socket in the data
348 	 *	field for just this event.
349 	 */
350 
351 	sk = pt->af_packet_priv;
352 
353 	/*
354 	 *	Yank back the headers [hope the device set this
355 	 *	right or kerboom...]
356 	 *
357 	 *	Incoming packets have ll header pulled,
358 	 *	push it back.
359 	 *
360 	 *	For outgoing ones skb->data == skb_mac_header(skb)
361 	 *	so that this procedure is a noop.
362 	 */
363 
364 	if (skb->pkt_type == PACKET_LOOPBACK)
365 		goto out;
366 
367 	if (dev_net(dev) != sock_net(sk))
368 		goto out;
369 
370 	skb = skb_share_check(skb, GFP_ATOMIC);
371 	if (skb == NULL)
372 		goto oom;
373 
374 	/* drop any routing info */
375 	skb_dst_drop(skb);
376 
377 	/* drop conntrack reference */
378 	nf_reset(skb);
379 
380 	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
381 
382 	skb_push(skb, skb->data - skb_mac_header(skb));
383 
384 	/*
385 	 *	The SOCK_PACKET socket receives _all_ frames.
386 	 */
387 
388 	spkt->spkt_family = dev->type;
389 	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
390 	spkt->spkt_protocol = skb->protocol;
391 
392 	/*
393 	 *	Charge the memory to the socket. This is done specifically
394 	 *	to prevent sockets from using up all the memory.
395 	 */
396 
397 	if (sock_queue_rcv_skb(sk, skb) == 0)
398 		return 0;
399 
400 out:
401 	kfree_skb(skb);
402 oom:
403 	return 0;
404 }
405 
406 
407 /*
408  *	Output a raw packet to a device layer. This bypasses all the other
409  *	protocol layers and you must therefore supply it with a complete frame
410  */
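/* Editor's note (illustrative sketch, not part of the original file):
 * the legacy SOCK_PACKET transmit path handled below.  The caller supplies
 * a complete frame including the link-layer header; "eth0", frame[] and
 * frame_len are assumptions:
 *
 *	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */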
411 
412 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
413 			       struct msghdr *msg, size_t len)
414 {
415 	struct sock *sk = sock->sk;
416 	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
417 	struct sk_buff *skb;
418 	struct net_device *dev;
419 	__be16 proto = 0;
420 	int err;
421 
422 	/*
423 	 *	Get and verify the address.
424 	 */
425 
426 	if (saddr) {
427 		if (msg->msg_namelen < sizeof(struct sockaddr))
428 			return -EINVAL;
429 		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
430 			proto = saddr->spkt_protocol;
431 	} else
432 		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
433 
434 	/*
435 	 *	Find the device first to size check it
436 	 */
437 
438 	saddr->spkt_device[13] = 0;
439 	dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
440 	err = -ENODEV;
441 	if (dev == NULL)
442 		goto out_unlock;
443 
444 	err = -ENETDOWN;
445 	if (!(dev->flags & IFF_UP))
446 		goto out_unlock;
447 
448 	/*
449 	 * You may not queue a frame bigger than the mtu. This is the lowest level
450 	 * raw protocol and you must do your own fragmentation at this level.
451 	 */
452 
453 	err = -EMSGSIZE;
454 	if (len > dev->mtu + dev->hard_header_len)
455 		goto out_unlock;
456 
457 	err = -ENOBUFS;
458 	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
459 
460 	/*
461 	 * If the write buffer is full, then tough. At this level the user
462 	 * gets to deal with the problem - do your own algorithmic backoffs.
463 	 * That's far more flexible.
464 	 */
465 
466 	if (skb == NULL)
467 		goto out_unlock;
468 
469 	/*
470 	 *	Fill it in
471 	 */
472 
473 	/* FIXME: Save some space for broken drivers that write a
474 	 * hard header at transmission time by themselves. PPP is the
475 	 * notable one here. This should really be fixed at the driver level.
476 	 */
477 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
478 	skb_reset_network_header(skb);
479 
480 	/* Try to align data part correctly */
481 	if (dev->header_ops) {
482 		skb->data -= dev->hard_header_len;
483 		skb->tail -= dev->hard_header_len;
484 		if (len < dev->hard_header_len)
485 			skb_reset_network_header(skb);
486 	}
487 
488 	/* Returns -EFAULT on error */
489 	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
490 	skb->protocol = proto;
491 	skb->dev = dev;
492 	skb->priority = sk->sk_priority;
493 	if (err)
494 		goto out_free;
495 
496 	/*
497 	 *	Now send it
498 	 */
499 
500 	dev_queue_xmit(skb);
501 	dev_put(dev);
502 	return len;
503 
504 out_free:
505 	kfree_skb(skb);
506 out_unlock:
507 	if (dev)
508 		dev_put(dev);
509 	return err;
510 }
511 
512 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
513 				      unsigned int res)
514 {
515 	struct sk_filter *filter;
516 
517 	rcu_read_lock_bh();
518 	filter = rcu_dereference(sk->sk_filter);
519 	if (filter != NULL)
520 		res = sk_run_filter(skb, filter->insns, filter->len);
521 	rcu_read_unlock_bh();
522 
523 	return res;
524 }
525 
526 /*
527    This function does lazy skb cloning in the hope that most packets
528    are discarded by BPF.
529 
530    Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
531    and skb->cb are mangled. It works because (and until) packets
532    falling here are owned by the current CPU. Output packets are cloned
533    by dev_queue_xmit_nit(), input packets are processed by net_bh
534    sequentially, so if we return the skb to its original state on exit,
535    we will not harm anyone.
536  */
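/* Editor's note (illustrative sketch, not part of the original file):
 * the BPF filter mentioned above is installed from user space with
 * SO_ATTACH_FILTER.  A minimal classic-BPF program that keeps only ARP
 * frames (EtherType 0x0806) and drops the rest, assuming <linux/filter.h>:
 *
 *	struct sock_filter code[] = {
 *		{ 0x28, 0, 0, 0x0000000c },	// ldh [12]	load EtherType
 *		{ 0x15, 0, 1, 0x00000806 },	// jeq #0x0806	jt=accept, jf=drop
 *		{ 0x06, 0, 0, 0x0000ffff },	// ret #65535	accept whole frame
 *		{ 0x06, 0, 0, 0x00000000 },	// ret #0	drop
 *	};
 *	struct sock_fprog prog = { .len = 4, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */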
537 
538 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
539 		      struct packet_type *pt, struct net_device *orig_dev)
540 {
541 	struct sock *sk;
542 	struct sockaddr_ll *sll;
543 	struct packet_sock *po;
544 	u8 *skb_head = skb->data;
545 	int skb_len = skb->len;
546 	unsigned int snaplen, res;
547 
548 	if (skb->pkt_type == PACKET_LOOPBACK)
549 		goto drop;
550 
551 	sk = pt->af_packet_priv;
552 	po = pkt_sk(sk);
553 
554 	if (dev_net(dev) != sock_net(sk))
555 		goto drop;
556 
557 	skb->dev = dev;
558 
559 	if (dev->header_ops) {
560 		/* The device has an explicit notion of ll header,
561 		   exported to higher levels.
562 
563 		   Otherwise, the device hides the details of its frame
564 		   structure, so that the corresponding packet head is
565 		   never delivered to the user.
566 		 */
567 		if (sk->sk_type != SOCK_DGRAM)
568 			skb_push(skb, skb->data - skb_mac_header(skb));
569 		else if (skb->pkt_type == PACKET_OUTGOING) {
570 			/* Special case: outgoing packets have ll header at head */
571 			skb_pull(skb, skb_network_offset(skb));
572 		}
573 	}
574 
575 	snaplen = skb->len;
576 
577 	res = run_filter(skb, sk, snaplen);
578 	if (!res)
579 		goto drop_n_restore;
580 	if (snaplen > res)
581 		snaplen = res;
582 
583 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
584 	    (unsigned)sk->sk_rcvbuf)
585 		goto drop_n_acct;
586 
587 	if (skb_shared(skb)) {
588 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
589 		if (nskb == NULL)
590 			goto drop_n_acct;
591 
592 		if (skb_head != skb->data) {
593 			skb->data = skb_head;
594 			skb->len = skb_len;
595 		}
596 		kfree_skb(skb);
597 		skb = nskb;
598 	}
599 
600 	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
601 		     sizeof(skb->cb));
602 
603 	sll = &PACKET_SKB_CB(skb)->sa.ll;
604 	sll->sll_family = AF_PACKET;
605 	sll->sll_hatype = dev->type;
606 	sll->sll_protocol = skb->protocol;
607 	sll->sll_pkttype = skb->pkt_type;
608 	if (unlikely(po->origdev))
609 		sll->sll_ifindex = orig_dev->ifindex;
610 	else
611 		sll->sll_ifindex = dev->ifindex;
612 
613 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
614 
615 	PACKET_SKB_CB(skb)->origlen = skb->len;
616 
617 	if (pskb_trim(skb, snaplen))
618 		goto drop_n_acct;
619 
620 	skb_set_owner_r(skb, sk);
621 	skb->dev = NULL;
622 	skb_dst_drop(skb);
623 
624 	/* drop conntrack reference */
625 	nf_reset(skb);
626 
627 	spin_lock(&sk->sk_receive_queue.lock);
628 	po->stats.tp_packets++;
629 	__skb_queue_tail(&sk->sk_receive_queue, skb);
630 	spin_unlock(&sk->sk_receive_queue.lock);
631 	sk->sk_data_ready(sk, skb->len);
632 	return 0;
633 
634 drop_n_acct:
635 	spin_lock(&sk->sk_receive_queue.lock);
636 	po->stats.tp_drops++;
637 	spin_unlock(&sk->sk_receive_queue.lock);
638 
639 drop_n_restore:
640 	if (skb_head != skb->data && skb_shared(skb)) {
641 		skb->data = skb_head;
642 		skb->len = skb_len;
643 	}
644 drop:
645 	consume_skb(skb);
646 	return 0;
647 }
648 
649 #ifdef CONFIG_PACKET_MMAP
650 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
651 		       struct packet_type *pt, struct net_device *orig_dev)
652 {
653 	struct sock *sk;
654 	struct packet_sock *po;
655 	struct sockaddr_ll *sll;
656 	union {
657 		struct tpacket_hdr *h1;
658 		struct tpacket2_hdr *h2;
659 		void *raw;
660 	} h;
661 	u8 *skb_head = skb->data;
662 	int skb_len = skb->len;
663 	unsigned int snaplen, res;
664 	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
665 	unsigned short macoff, netoff, hdrlen;
666 	struct sk_buff *copy_skb = NULL;
667 	struct timeval tv;
668 	struct timespec ts;
669 
670 	if (skb->pkt_type == PACKET_LOOPBACK)
671 		goto drop;
672 
673 	sk = pt->af_packet_priv;
674 	po = pkt_sk(sk);
675 
676 	if (dev_net(dev) != sock_net(sk))
677 		goto drop;
678 
679 	if (dev->header_ops) {
680 		if (sk->sk_type != SOCK_DGRAM)
681 			skb_push(skb, skb->data - skb_mac_header(skb));
682 		else if (skb->pkt_type == PACKET_OUTGOING) {
683 			/* Special case: outgoing packets have ll header at head */
684 			skb_pull(skb, skb_network_offset(skb));
685 		}
686 	}
687 
688 	if (skb->ip_summed == CHECKSUM_PARTIAL)
689 		status |= TP_STATUS_CSUMNOTREADY;
690 
691 	snaplen = skb->len;
692 
693 	res = run_filter(skb, sk, snaplen);
694 	if (!res)
695 		goto drop_n_restore;
696 	if (snaplen > res)
697 		snaplen = res;
698 
699 	if (sk->sk_type == SOCK_DGRAM) {
700 		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
701 				  po->tp_reserve;
702 	} else {
703 		unsigned maclen = skb_network_offset(skb);
704 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
705 				       (maclen < 16 ? 16 : maclen)) +
706 			po->tp_reserve;
707 		macoff = netoff - maclen;
708 	}
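	/* Editor's note (illustrative, not in the original): a worked example
	 * of the SOCK_RAW branch above, assuming for illustration
	 * tp_hdrlen == 52 (a plausible TPACKET_V2 value), tp_reserve == 0
	 * and an Ethernet maclen of 14 (rounded up to 16):
	 *	netoff = TPACKET_ALIGN(52 + 16) = 80, macoff = 80 - 14 = 66,
	 * i.e. the ll header is copied to h.raw + 66 and the network header
	 * starts at h.raw + 80 inside the mmap'ed frame.
	 */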
709 
710 	if (macoff + snaplen > po->rx_ring.frame_size) {
711 		if (po->copy_thresh &&
712 		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
713 		    (unsigned)sk->sk_rcvbuf) {
714 			if (skb_shared(skb)) {
715 				copy_skb = skb_clone(skb, GFP_ATOMIC);
716 			} else {
717 				copy_skb = skb_get(skb);
718 				skb_head = skb->data;
719 			}
720 			if (copy_skb)
721 				skb_set_owner_r(copy_skb, sk);
722 		}
723 		snaplen = po->rx_ring.frame_size - macoff;
724 		if ((int)snaplen < 0)
725 			snaplen = 0;
726 	}
727 
728 	spin_lock(&sk->sk_receive_queue.lock);
729 	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
730 	if (!h.raw)
731 		goto ring_is_full;
732 	packet_increment_head(&po->rx_ring);
733 	po->stats.tp_packets++;
734 	if (copy_skb) {
735 		status |= TP_STATUS_COPY;
736 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
737 	}
738 	if (!po->stats.tp_drops)
739 		status &= ~TP_STATUS_LOSING;
740 	spin_unlock(&sk->sk_receive_queue.lock);
741 
742 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
743 
744 	switch (po->tp_version) {
745 	case TPACKET_V1:
746 		h.h1->tp_len = skb->len;
747 		h.h1->tp_snaplen = snaplen;
748 		h.h1->tp_mac = macoff;
749 		h.h1->tp_net = netoff;
750 		if (skb->tstamp.tv64)
751 			tv = ktime_to_timeval(skb->tstamp);
752 		else
753 			do_gettimeofday(&tv);
754 		h.h1->tp_sec = tv.tv_sec;
755 		h.h1->tp_usec = tv.tv_usec;
756 		hdrlen = sizeof(*h.h1);
757 		break;
758 	case TPACKET_V2:
759 		h.h2->tp_len = skb->len;
760 		h.h2->tp_snaplen = snaplen;
761 		h.h2->tp_mac = macoff;
762 		h.h2->tp_net = netoff;
763 		if (skb->tstamp.tv64)
764 			ts = ktime_to_timespec(skb->tstamp);
765 		else
766 			getnstimeofday(&ts);
767 		h.h2->tp_sec = ts.tv_sec;
768 		h.h2->tp_nsec = ts.tv_nsec;
769 		h.h2->tp_vlan_tci = skb->vlan_tci;
770 		hdrlen = sizeof(*h.h2);
771 		break;
772 	default:
773 		BUG();
774 	}
775 
776 	sll = h.raw + TPACKET_ALIGN(hdrlen);
777 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
778 	sll->sll_family = AF_PACKET;
779 	sll->sll_hatype = dev->type;
780 	sll->sll_protocol = skb->protocol;
781 	sll->sll_pkttype = skb->pkt_type;
782 	if (unlikely(po->origdev))
783 		sll->sll_ifindex = orig_dev->ifindex;
784 	else
785 		sll->sll_ifindex = dev->ifindex;
786 
787 	__packet_set_status(po, h.raw, status);
788 	smp_mb();
789 	{
790 		struct page *p_start, *p_end;
791 		u8 *h_end = h.raw + macoff + snaplen - 1;
792 
793 		p_start = virt_to_page(h.raw);
794 		p_end = virt_to_page(h_end);
795 		while (p_start <= p_end) {
796 			flush_dcache_page(p_start);
797 			p_start++;
798 		}
799 	}
800 
801 	sk->sk_data_ready(sk, 0);
802 
803 drop_n_restore:
804 	if (skb_head != skb->data && skb_shared(skb)) {
805 		skb->data = skb_head;
806 		skb->len = skb_len;
807 	}
808 drop:
809 	kfree_skb(skb);
810 	return 0;
811 
812 ring_is_full:
813 	po->stats.tp_drops++;
814 	spin_unlock(&sk->sk_receive_queue.lock);
815 
816 	sk->sk_data_ready(sk, 0);
817 	kfree_skb(copy_skb);
818 	goto drop_n_restore;
819 }
820 
821 static void tpacket_destruct_skb(struct sk_buff *skb)
822 {
823 	struct packet_sock *po = pkt_sk(skb->sk);
824 	void *ph;
825 
826 	BUG_ON(skb == NULL);
827 
828 	if (likely(po->tx_ring.pg_vec)) {
829 		ph = skb_shinfo(skb)->destructor_arg;
830 		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
831 		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
832 		atomic_dec(&po->tx_ring.pending);
833 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
834 	}
835 
836 	sock_wfree(skb);
837 }
838 
839 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
840 		void *frame, struct net_device *dev, int size_max,
841 		__be16 proto, unsigned char *addr)
842 {
843 	union {
844 		struct tpacket_hdr *h1;
845 		struct tpacket2_hdr *h2;
846 		void *raw;
847 	} ph;
848 	int to_write, offset, len, tp_len, nr_frags, len_max;
849 	struct socket *sock = po->sk.sk_socket;
850 	struct page *page;
851 	void *data;
852 	int err;
853 
854 	ph.raw = frame;
855 
856 	skb->protocol = proto;
857 	skb->dev = dev;
858 	skb->priority = po->sk.sk_priority;
859 	skb_shinfo(skb)->destructor_arg = ph.raw;
860 
861 	switch (po->tp_version) {
862 	case TPACKET_V2:
863 		tp_len = ph.h2->tp_len;
864 		break;
865 	default:
866 		tp_len = ph.h1->tp_len;
867 		break;
868 	}
869 	if (unlikely(tp_len > size_max)) {
870 		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
871 		return -EMSGSIZE;
872 	}
873 
874 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
875 	skb_reset_network_header(skb);
876 
877 	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
878 	to_write = tp_len;
879 
880 	if (sock->type == SOCK_DGRAM) {
881 		err = dev_hard_header(skb, dev, ntohs(proto), addr,
882 				NULL, tp_len);
883 		if (unlikely(err < 0))
884 			return -EINVAL;
885 	} else if (dev->hard_header_len) {
886 		/* net device doesn't like empty head */
887 		if (unlikely(tp_len <= dev->hard_header_len)) {
888 			pr_err("packet size is too short (%d < %d)\n",
889 			       tp_len, dev->hard_header_len);
890 			return -EINVAL;
891 		}
892 
893 		skb_push(skb, dev->hard_header_len);
894 		err = skb_store_bits(skb, 0, data,
895 				dev->hard_header_len);
896 		if (unlikely(err))
897 			return err;
898 
899 		data += dev->hard_header_len;
900 		to_write -= dev->hard_header_len;
901 	}
902 
903 	err = -EFAULT;
904 	page = virt_to_page(data);
905 	offset = offset_in_page(data);
906 	len_max = PAGE_SIZE - offset;
907 	len = ((to_write > len_max) ? len_max : to_write);
908 
909 	skb->data_len = to_write;
910 	skb->len += to_write;
911 	skb->truesize += to_write;
912 	atomic_add(to_write, &po->sk.sk_wmem_alloc);
913 
914 	while (likely(to_write)) {
915 		nr_frags = skb_shinfo(skb)->nr_frags;
916 
917 		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
918 			pr_err("Packet exceeds the number of skb frags (%lu)\n",
919 			       MAX_SKB_FRAGS);
920 			return -EFAULT;
921 		}
922 
923 		flush_dcache_page(page);
924 		get_page(page);
925 		skb_fill_page_desc(skb,
926 				nr_frags,
927 				page++, offset, len);
928 		to_write -= len;
929 		offset = 0;
930 		len_max = PAGE_SIZE;
931 		len = ((to_write > len_max) ? len_max : to_write);
932 	}
933 
934 	return tp_len;
935 }
936 
937 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
938 {
939 	struct socket *sock;
940 	struct sk_buff *skb;
941 	struct net_device *dev;
942 	__be16 proto;
943 	int ifindex, err, reserve = 0;
944 	void *ph;
945 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
946 	int tp_len, size_max;
947 	unsigned char *addr;
948 	int len_sum = 0;
949 	int status = 0;
950 
951 	sock = po->sk.sk_socket;
952 
953 	mutex_lock(&po->pg_vec_lock);
954 
955 	err = -EBUSY;
956 	if (saddr == NULL) {
957 		ifindex	= po->ifindex;
958 		proto	= po->num;
959 		addr	= NULL;
960 	} else {
961 		err = -EINVAL;
962 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
963 			goto out;
964 		if (msg->msg_namelen < (saddr->sll_halen
965 					+ offsetof(struct sockaddr_ll,
966 						sll_addr)))
967 			goto out;
968 		ifindex	= saddr->sll_ifindex;
969 		proto	= saddr->sll_protocol;
970 		addr	= saddr->sll_addr;
971 	}
972 
973 	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
974 	err = -ENXIO;
975 	if (unlikely(dev == NULL))
976 		goto out;
977 
978 	reserve = dev->hard_header_len;
979 
980 	err = -ENETDOWN;
981 	if (unlikely(!(dev->flags & IFF_UP)))
982 		goto out_put;
983 
984 	size_max = po->tx_ring.frame_size
985 		- sizeof(struct skb_shared_info)
986 		- po->tp_hdrlen
987 		- LL_ALLOCATED_SPACE(dev)
988 		- sizeof(struct sockaddr_ll);
989 
990 	if (size_max > dev->mtu + reserve)
991 		size_max = dev->mtu + reserve;
992 
993 	do {
994 		ph = packet_current_frame(po, &po->tx_ring,
995 				TP_STATUS_SEND_REQUEST);
996 
997 		if (unlikely(ph == NULL)) {
998 			schedule();
999 			continue;
1000 		}
1001 
1002 		status = TP_STATUS_SEND_REQUEST;
1003 		skb = sock_alloc_send_skb(&po->sk,
1004 				LL_ALLOCATED_SPACE(dev)
1005 				+ sizeof(struct sockaddr_ll),
1006 				0, &err);
1007 
1008 		if (unlikely(skb == NULL))
1009 			goto out_status;
1010 
1011 		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1012 				addr);
1013 
1014 		if (unlikely(tp_len < 0)) {
1015 			if (po->tp_loss) {
1016 				__packet_set_status(po, ph,
1017 						TP_STATUS_AVAILABLE);
1018 				packet_increment_head(&po->tx_ring);
1019 				kfree_skb(skb);
1020 				continue;
1021 			} else {
1022 				status = TP_STATUS_WRONG_FORMAT;
1023 				err = tp_len;
1024 				goto out_status;
1025 			}
1026 		}
1027 
1028 		skb->destructor = tpacket_destruct_skb;
1029 		__packet_set_status(po, ph, TP_STATUS_SENDING);
1030 		atomic_inc(&po->tx_ring.pending);
1031 
1032 		status = TP_STATUS_SEND_REQUEST;
1033 		err = dev_queue_xmit(skb);
1034 		if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1035 			goto out_xmit;
1036 		packet_increment_head(&po->tx_ring);
1037 		len_sum += tp_len;
1038 	} while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
1039 					&& (atomic_read(&po->tx_ring.pending))))
1040 	      );
1041 
1042 	err = len_sum;
1043 	goto out_put;
1044 
1045 out_xmit:
1046 	skb->destructor = sock_wfree;
1047 	atomic_dec(&po->tx_ring.pending);
1048 out_status:
1049 	__packet_set_status(po, ph, status);
1050 	kfree_skb(skb);
1051 out_put:
1052 	dev_put(dev);
1053 out:
1054 	mutex_unlock(&po->pg_vec_lock);
1055 	return err;
1056 }
1057 #endif
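/* Editor's note (illustrative sketch, not part of the original file):
 * the user-space side of tpacket_snd() above.  Frames are written into
 * the mmap'ed TX ring and flushed with a zero-length send(); the bound
 * socket fd, the mapping "ring", the slot index and struct tpacket_req req
 * come from an earlier PACKET_TX_RING setup and are assumptions here
 * (TPACKET_V1 layout):
 *
 *	struct tpacket_hdr *hdr =
 *		(void *)((char *)ring + slot * req.tp_frame_size);
 *	void *data = (char *)hdr + TPACKET_ALIGN(sizeof(*hdr));
 *
 *	while (hdr->tp_status != TP_STATUS_AVAILABLE)
 *		;				// or poll() for POLLOUT
 *	memcpy(data, frame, frame_len);		// complete l2 frame for SOCK_RAW
 *	hdr->tp_len = frame_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);			// kernel walks the ring (tpacket_snd)
 */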
1058 
1059 static int packet_snd(struct socket *sock,
1060 			  struct msghdr *msg, size_t len)
1061 {
1062 	struct sock *sk = sock->sk;
1063 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1064 	struct sk_buff *skb;
1065 	struct net_device *dev;
1066 	__be16 proto;
1067 	unsigned char *addr;
1068 	int ifindex, err, reserve = 0;
1069 
1070 	/*
1071 	 *	Get and verify the address.
1072 	 */
1073 
1074 	if (saddr == NULL) {
1075 		struct packet_sock *po = pkt_sk(sk);
1076 
1077 		ifindex	= po->ifindex;
1078 		proto	= po->num;
1079 		addr	= NULL;
1080 	} else {
1081 		err = -EINVAL;
1082 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1083 			goto out;
1084 		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1085 			goto out;
1086 		ifindex	= saddr->sll_ifindex;
1087 		proto	= saddr->sll_protocol;
1088 		addr	= saddr->sll_addr;
1089 	}
1090 
1091 
1092 	dev = dev_get_by_index(sock_net(sk), ifindex);
1093 	err = -ENXIO;
1094 	if (dev == NULL)
1095 		goto out_unlock;
1096 	if (sock->type == SOCK_RAW)
1097 		reserve = dev->hard_header_len;
1098 
1099 	err = -ENETDOWN;
1100 	if (!(dev->flags & IFF_UP))
1101 		goto out_unlock;
1102 
1103 	err = -EMSGSIZE;
1104 	if (len > dev->mtu+reserve)
1105 		goto out_unlock;
1106 
1107 	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
1108 				msg->msg_flags & MSG_DONTWAIT, &err);
1109 	if (skb == NULL)
1110 		goto out_unlock;
1111 
1112 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
1113 	skb_reset_network_header(skb);
1114 
1115 	err = -EINVAL;
1116 	if (sock->type == SOCK_DGRAM &&
1117 	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
1118 		goto out_free;
1119 
1120 	/* Returns -EFAULT on error */
1121 	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1122 	if (err)
1123 		goto out_free;
1124 
1125 	skb->protocol = proto;
1126 	skb->dev = dev;
1127 	skb->priority = sk->sk_priority;
1128 
1129 	/*
1130 	 *	Now send it
1131 	 */
1132 
1133 	err = dev_queue_xmit(skb);
1134 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
1135 		goto out_unlock;
1136 
1137 	dev_put(dev);
1138 
1139 	return len;
1140 
1141 out_free:
1142 	kfree_skb(skb);
1143 out_unlock:
1144 	if (dev)
1145 		dev_put(dev);
1146 out:
1147 	return err;
1148 }
1149 
1150 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1151 		struct msghdr *msg, size_t len)
1152 {
1153 #ifdef CONFIG_PACKET_MMAP
1154 	struct sock *sk = sock->sk;
1155 	struct packet_sock *po = pkt_sk(sk);
1156 	if (po->tx_ring.pg_vec)
1157 		return tpacket_snd(po, msg);
1158 	else
1159 #endif
1160 		return packet_snd(sock, msg, len);
1161 }
1162 
1163 /*
1164  *	Close a PACKET socket. This is fairly simple. We immediately go
1165  *	to 'closed' state and remove our protocol entry in the device list.
1166  */
1167 
1168 static int packet_release(struct socket *sock)
1169 {
1170 	struct sock *sk = sock->sk;
1171 	struct packet_sock *po;
1172 	struct net *net;
1173 #ifdef CONFIG_PACKET_MMAP
1174 	struct tpacket_req req;
1175 #endif
1176 
1177 	if (!sk)
1178 		return 0;
1179 
1180 	net = sock_net(sk);
1181 	po = pkt_sk(sk);
1182 
1183 	write_lock_bh(&net->packet.sklist_lock);
1184 	sk_del_node_init(sk);
1185 	sock_prot_inuse_add(net, sk->sk_prot, -1);
1186 	write_unlock_bh(&net->packet.sklist_lock);
1187 
1188 	/*
1189 	 *	Unhook packet receive handler.
1190 	 */
1191 
1192 	if (po->running) {
1193 		/*
1194 		 *	Remove the protocol hook
1195 		 */
1196 		dev_remove_pack(&po->prot_hook);
1197 		po->running = 0;
1198 		po->num = 0;
1199 		__sock_put(sk);
1200 	}
1201 
1202 	packet_flush_mclist(sk);
1203 
1204 #ifdef CONFIG_PACKET_MMAP
1205 	memset(&req, 0, sizeof(req));
1206 
1207 	if (po->rx_ring.pg_vec)
1208 		packet_set_ring(sk, &req, 1, 0);
1209 
1210 	if (po->tx_ring.pg_vec)
1211 		packet_set_ring(sk, &req, 1, 1);
1212 #endif
1213 
1214 	/*
1215 	 *	Now the socket is dead. No more input will appear.
1216 	 */
1217 
1218 	sock_orphan(sk);
1219 	sock->sk = NULL;
1220 
1221 	/* Purge queues */
1222 
1223 	skb_queue_purge(&sk->sk_receive_queue);
1224 	sk_refcnt_debug_release(sk);
1225 
1226 	sock_put(sk);
1227 	return 0;
1228 }
1229 
1230 /*
1231  *	Attach a packet hook.
1232  */
1233 
1234 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1235 {
1236 	struct packet_sock *po = pkt_sk(sk);
1237 	/*
1238 	 *	Detach an existing hook if present.
1239 	 */
1240 
1241 	lock_sock(sk);
1242 
1243 	spin_lock(&po->bind_lock);
1244 	if (po->running) {
1245 		__sock_put(sk);
1246 		po->running = 0;
1247 		po->num = 0;
1248 		spin_unlock(&po->bind_lock);
1249 		dev_remove_pack(&po->prot_hook);
1250 		spin_lock(&po->bind_lock);
1251 	}
1252 
1253 	po->num = protocol;
1254 	po->prot_hook.type = protocol;
1255 	po->prot_hook.dev = dev;
1256 
1257 	po->ifindex = dev ? dev->ifindex : 0;
1258 
1259 	if (protocol == 0)
1260 		goto out_unlock;
1261 
1262 	if (!dev || (dev->flags & IFF_UP)) {
1263 		dev_add_pack(&po->prot_hook);
1264 		sock_hold(sk);
1265 		po->running = 1;
1266 	} else {
1267 		sk->sk_err = ENETDOWN;
1268 		if (!sock_flag(sk, SOCK_DEAD))
1269 			sk->sk_error_report(sk);
1270 	}
1271 
1272 out_unlock:
1273 	spin_unlock(&po->bind_lock);
1274 	release_sock(sk);
1275 	return 0;
1276 }
1277 
1278 /*
1279  *	Bind a packet socket to a device
1280  */
1281 
1282 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1283 			    int addr_len)
1284 {
1285 	struct sock *sk = sock->sk;
1286 	char name[15];
1287 	struct net_device *dev;
1288 	int err = -ENODEV;
1289 
1290 	/*
1291 	 *	Check legality
1292 	 */
1293 
1294 	if (addr_len != sizeof(struct sockaddr))
1295 		return -EINVAL;
1296 	strlcpy(name, uaddr->sa_data, sizeof(name));
1297 
1298 	dev = dev_get_by_name(sock_net(sk), name);
1299 	if (dev) {
1300 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1301 		dev_put(dev);
1302 	}
1303 	return err;
1304 }
1305 
1306 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1307 {
1308 	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1309 	struct sock *sk = sock->sk;
1310 	struct net_device *dev = NULL;
1311 	int err;
1312 
1313 
1314 	/*
1315 	 *	Check legality
1316 	 */
1317 
1318 	if (addr_len < sizeof(struct sockaddr_ll))
1319 		return -EINVAL;
1320 	if (sll->sll_family != AF_PACKET)
1321 		return -EINVAL;
1322 
1323 	if (sll->sll_ifindex) {
1324 		err = -ENODEV;
1325 		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1326 		if (dev == NULL)
1327 			goto out;
1328 	}
1329 	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1330 	if (dev)
1331 		dev_put(dev);
1332 
1333 out:
1334 	return err;
1335 }
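/* Editor's note (illustrative sketch, not part of the original file):
 * binding a packet socket to one interface and protocol through the
 * handler above, assuming ifindex = if_nametoindex("eth0"):
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_IP),
 *		.sll_ifindex  = ifindex,
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */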
1336 
1337 static struct proto packet_proto = {
1338 	.name	  = "PACKET",
1339 	.owner	  = THIS_MODULE,
1340 	.obj_size = sizeof(struct packet_sock),
1341 };
1342 
1343 /*
1344  *	Create a packet of type SOCK_PACKET.
1345  */
1346 
1347 static int packet_create(struct net *net, struct socket *sock, int protocol)
1348 {
1349 	struct sock *sk;
1350 	struct packet_sock *po;
1351 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1352 	int err;
1353 
1354 	if (!capable(CAP_NET_RAW))
1355 		return -EPERM;
1356 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1357 	    sock->type != SOCK_PACKET)
1358 		return -ESOCKTNOSUPPORT;
1359 
1360 	sock->state = SS_UNCONNECTED;
1361 
1362 	err = -ENOBUFS;
1363 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1364 	if (sk == NULL)
1365 		goto out;
1366 
1367 	sock->ops = &packet_ops;
1368 	if (sock->type == SOCK_PACKET)
1369 		sock->ops = &packet_ops_spkt;
1370 
1371 	sock_init_data(sock, sk);
1372 
1373 	po = pkt_sk(sk);
1374 	sk->sk_family = PF_PACKET;
1375 	po->num = proto;
1376 
1377 	sk->sk_destruct = packet_sock_destruct;
1378 	sk_refcnt_debug_inc(sk);
1379 
1380 	/*
1381 	 *	Attach a protocol block
1382 	 */
1383 
1384 	spin_lock_init(&po->bind_lock);
1385 	mutex_init(&po->pg_vec_lock);
1386 	po->prot_hook.func = packet_rcv;
1387 
1388 	if (sock->type == SOCK_PACKET)
1389 		po->prot_hook.func = packet_rcv_spkt;
1390 
1391 	po->prot_hook.af_packet_priv = sk;
1392 
1393 	if (proto) {
1394 		po->prot_hook.type = proto;
1395 		dev_add_pack(&po->prot_hook);
1396 		sock_hold(sk);
1397 		po->running = 1;
1398 	}
1399 
1400 	write_lock_bh(&net->packet.sklist_lock);
1401 	sk_add_node(sk, &net->packet.sklist);
1402 	sock_prot_inuse_add(net, &packet_proto, 1);
1403 	write_unlock_bh(&net->packet.sklist_lock);
1404 	return 0;
1405 out:
1406 	return err;
1407 }
1408 
1409 /*
1410  *	Pull a packet from our receive queue and hand it to the user.
1411  *	If necessary we block.
1412  */
1413 
1414 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1415 			  struct msghdr *msg, size_t len, int flags)
1416 {
1417 	struct sock *sk = sock->sk;
1418 	struct sk_buff *skb;
1419 	int copied, err;
1420 	struct sockaddr_ll *sll;
1421 
1422 	err = -EINVAL;
1423 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1424 		goto out;
1425 
1426 #if 0
1427 	/* What error should we return now? EUNATTACH? */
1428 	if (pkt_sk(sk)->ifindex < 0)
1429 		return -ENODEV;
1430 #endif
1431 
1432 	/*
1433 	 *	Call the generic datagram receiver. This handles all sorts
1434 	 *	of horrible races and re-entrancy so we can forget about it
1435 	 *	in the protocol layers.
1436 	 *
1437 	 *	Now it will return ENETDOWN if the device has just gone down,
1438 	 *	but then it will block.
1439 	 */
1440 
1441 	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1442 
1443 	/*
1444 	 *	An error occurred so return it. Because skb_recv_datagram()
1445 	 *	handles the blocking, we don't need to see or worry about
1446 	 *	blocking retries.
1447 	 */
1448 
1449 	if (skb == NULL)
1450 		goto out;
1451 
1452 	/*
1453 	 *	If the address length field is there to be filled in, we fill
1454 	 *	it in now.
1455 	 */
1456 
1457 	sll = &PACKET_SKB_CB(skb)->sa.ll;
1458 	if (sock->type == SOCK_PACKET)
1459 		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1460 	else
1461 		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1462 
1463 	/*
1464 	 *	You lose any data beyond the buffer you gave. If it worries a
1465 	 *	You lose any data beyond the buffer you gave. If this worries a
1466 	 *	user program, it can ask the device for its MTU anyway.
1467 
1468 	copied = skb->len;
1469 	if (copied > len) {
1470 		copied = len;
1471 		msg->msg_flags |= MSG_TRUNC;
1472 	}
1473 
1474 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1475 	if (err)
1476 		goto out_free;
1477 
1478 	sock_recv_timestamp(msg, sk, skb);
1479 
1480 	if (msg->msg_name)
1481 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1482 		       msg->msg_namelen);
1483 
1484 	if (pkt_sk(sk)->auxdata) {
1485 		struct tpacket_auxdata aux;
1486 
1487 		aux.tp_status = TP_STATUS_USER;
1488 		if (skb->ip_summed == CHECKSUM_PARTIAL)
1489 			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1490 		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1491 		aux.tp_snaplen = skb->len;
1492 		aux.tp_mac = 0;
1493 		aux.tp_net = skb_network_offset(skb);
1494 		aux.tp_vlan_tci = skb->vlan_tci;
1495 
1496 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1497 	}
1498 
1499 	/*
1500 	 *	Free or return the buffer as appropriate. Again this
1501 	 *	hides all the races and re-entrancy issues from us.
1502 	 */
1503 	err = (flags&MSG_TRUNC) ? skb->len : copied;
1504 
1505 out_free:
1506 	skb_free_datagram(sk, skb);
1507 out:
1508 	return err;
1509 }
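/* Editor's note (illustrative sketch, not part of the original file):
 * reading the PACKET_AUXDATA control message emitted by packet_recvmsg()
 * above, assuming the PACKET_AUXDATA socket option was enabled and msg
 * was just filled in by recvmsg():
 *
 *	struct cmsghdr *cmsg;
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			// aux->tp_len: original length, aux->tp_snaplen: copied,
 *			// aux->tp_vlan_tci: VLAN tag, aux->tp_status: checksum flags
 *		}
 *	}
 */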
1510 
1511 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1512 			       int *uaddr_len, int peer)
1513 {
1514 	struct net_device *dev;
1515 	struct sock *sk	= sock->sk;
1516 
1517 	if (peer)
1518 		return -EOPNOTSUPP;
1519 
1520 	uaddr->sa_family = AF_PACKET;
1521 	dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1522 	if (dev) {
1523 		strlcpy(uaddr->sa_data, dev->name, 15);
1524 		dev_put(dev);
1525 	} else
1526 		memset(uaddr->sa_data, 0, 14);
1527 	*uaddr_len = sizeof(*uaddr);
1528 
1529 	return 0;
1530 }
1531 
1532 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1533 			  int *uaddr_len, int peer)
1534 {
1535 	struct net_device *dev;
1536 	struct sock *sk = sock->sk;
1537 	struct packet_sock *po = pkt_sk(sk);
1538 	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1539 
1540 	if (peer)
1541 		return -EOPNOTSUPP;
1542 
1543 	sll->sll_family = AF_PACKET;
1544 	sll->sll_ifindex = po->ifindex;
1545 	sll->sll_protocol = po->num;
1546 	dev = dev_get_by_index(sock_net(sk), po->ifindex);
1547 	if (dev) {
1548 		sll->sll_hatype = dev->type;
1549 		sll->sll_halen = dev->addr_len;
1550 		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1551 		dev_put(dev);
1552 	} else {
1553 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1554 		sll->sll_halen = 0;
1555 	}
1556 	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1557 
1558 	return 0;
1559 }
1560 
1561 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1562 			 int what)
1563 {
1564 	switch (i->type) {
1565 	case PACKET_MR_MULTICAST:
1566 		if (what > 0)
1567 			return dev_mc_add(dev, i->addr, i->alen, 0);
1568 		else
1569 			return dev_mc_delete(dev, i->addr, i->alen, 0);
1570 		break;
1571 	case PACKET_MR_PROMISC:
1572 		return dev_set_promiscuity(dev, what);
1573 		break;
1574 	case PACKET_MR_ALLMULTI:
1575 		return dev_set_allmulti(dev, what);
1576 		break;
1577 	case PACKET_MR_UNICAST:
1578 		if (what > 0)
1579 			return dev_unicast_add(dev, i->addr);
1580 		else
1581 			return dev_unicast_delete(dev, i->addr);
1582 		break;
1583 	default:
1584 		break;
1585 	}
1586 	return 0;
1587 }
1588 
1589 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1590 {
1591 	for ( ; i; i = i->next) {
1592 		if (i->ifindex == dev->ifindex)
1593 			packet_dev_mc(dev, i, what);
1594 	}
1595 }
1596 
1597 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1598 {
1599 	struct packet_sock *po = pkt_sk(sk);
1600 	struct packet_mclist *ml, *i;
1601 	struct net_device *dev;
1602 	int err;
1603 
1604 	rtnl_lock();
1605 
1606 	err = -ENODEV;
1607 	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1608 	if (!dev)
1609 		goto done;
1610 
1611 	err = -EINVAL;
1612 	if (mreq->mr_alen > dev->addr_len)
1613 		goto done;
1614 
1615 	err = -ENOBUFS;
1616 	i = kmalloc(sizeof(*i), GFP_KERNEL);
1617 	if (i == NULL)
1618 		goto done;
1619 
1620 	err = 0;
1621 	for (ml = po->mclist; ml; ml = ml->next) {
1622 		if (ml->ifindex == mreq->mr_ifindex &&
1623 		    ml->type == mreq->mr_type &&
1624 		    ml->alen == mreq->mr_alen &&
1625 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1626 			ml->count++;
1627 			/* Free the new element ... */
1628 			kfree(i);
1629 			goto done;
1630 		}
1631 	}
1632 
1633 	i->type = mreq->mr_type;
1634 	i->ifindex = mreq->mr_ifindex;
1635 	i->alen = mreq->mr_alen;
1636 	memcpy(i->addr, mreq->mr_address, i->alen);
1637 	i->count = 1;
1638 	i->next = po->mclist;
1639 	po->mclist = i;
1640 	err = packet_dev_mc(dev, i, 1);
1641 	if (err) {
1642 		po->mclist = i->next;
1643 		kfree(i);
1644 	}
1645 
1646 done:
1647 	rtnl_unlock();
1648 	return err;
1649 }
1650 
1651 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1652 {
1653 	struct packet_mclist *ml, **mlp;
1654 
1655 	rtnl_lock();
1656 
1657 	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1658 		if (ml->ifindex == mreq->mr_ifindex &&
1659 		    ml->type == mreq->mr_type &&
1660 		    ml->alen == mreq->mr_alen &&
1661 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1662 			if (--ml->count == 0) {
1663 				struct net_device *dev;
1664 				*mlp = ml->next;
1665 				dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1666 				if (dev) {
1667 					packet_dev_mc(dev, ml, -1);
1668 					dev_put(dev);
1669 				}
1670 				kfree(ml);
1671 			}
1672 			rtnl_unlock();
1673 			return 0;
1674 		}
1675 	}
1676 	rtnl_unlock();
1677 	return -EADDRNOTAVAIL;
1678 }
1679 
1680 static void packet_flush_mclist(struct sock *sk)
1681 {
1682 	struct packet_sock *po = pkt_sk(sk);
1683 	struct packet_mclist *ml;
1684 
1685 	if (!po->mclist)
1686 		return;
1687 
1688 	rtnl_lock();
1689 	while ((ml = po->mclist) != NULL) {
1690 		struct net_device *dev;
1691 
1692 		po->mclist = ml->next;
1693 		dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1694 		if (dev != NULL) {
1695 			packet_dev_mc(dev, ml, -1);
1696 			dev_put(dev);
1697 		}
1698 		kfree(ml);
1699 	}
1700 	rtnl_unlock();
1701 }
1702 
1703 static int
1704 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1705 {
1706 	struct sock *sk = sock->sk;
1707 	struct packet_sock *po = pkt_sk(sk);
1708 	int ret;
1709 
1710 	if (level != SOL_PACKET)
1711 		return -ENOPROTOOPT;
1712 
1713 	switch (optname) {
1714 	case PACKET_ADD_MEMBERSHIP:
1715 	case PACKET_DROP_MEMBERSHIP:
1716 	{
1717 		struct packet_mreq_max mreq;
1718 		int len = optlen;
1719 		memset(&mreq, 0, sizeof(mreq));
1720 		if (len < sizeof(struct packet_mreq))
1721 			return -EINVAL;
1722 		if (len > sizeof(mreq))
1723 			len = sizeof(mreq);
1724 		if (copy_from_user(&mreq, optval, len))
1725 			return -EFAULT;
1726 		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1727 			return -EINVAL;
1728 		if (optname == PACKET_ADD_MEMBERSHIP)
1729 			ret = packet_mc_add(sk, &mreq);
1730 		else
1731 			ret = packet_mc_drop(sk, &mreq);
1732 		return ret;
1733 	}
1734 
1735 #ifdef CONFIG_PACKET_MMAP
1736 	case PACKET_RX_RING:
1737 	case PACKET_TX_RING:
1738 	{
1739 		struct tpacket_req req;
1740 
1741 		if (optlen < sizeof(req))
1742 			return -EINVAL;
1743 		if (copy_from_user(&req, optval, sizeof(req)))
1744 			return -EFAULT;
1745 		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1746 	}
1747 	case PACKET_COPY_THRESH:
1748 	{
1749 		int val;
1750 
1751 		if (optlen != sizeof(val))
1752 			return -EINVAL;
1753 		if (copy_from_user(&val, optval, sizeof(val)))
1754 			return -EFAULT;
1755 
1756 		pkt_sk(sk)->copy_thresh = val;
1757 		return 0;
1758 	}
1759 	case PACKET_VERSION:
1760 	{
1761 		int val;
1762 
1763 		if (optlen != sizeof(val))
1764 			return -EINVAL;
1765 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1766 			return -EBUSY;
1767 		if (copy_from_user(&val, optval, sizeof(val)))
1768 			return -EFAULT;
1769 		switch (val) {
1770 		case TPACKET_V1:
1771 		case TPACKET_V2:
1772 			po->tp_version = val;
1773 			return 0;
1774 		default:
1775 			return -EINVAL;
1776 		}
1777 	}
1778 	case PACKET_RESERVE:
1779 	{
1780 		unsigned int val;
1781 
1782 		if (optlen != sizeof(val))
1783 			return -EINVAL;
1784 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1785 			return -EBUSY;
1786 		if (copy_from_user(&val, optval, sizeof(val)))
1787 			return -EFAULT;
1788 		po->tp_reserve = val;
1789 		return 0;
1790 	}
1791 	case PACKET_LOSS:
1792 	{
1793 		unsigned int val;
1794 
1795 		if (optlen != sizeof(val))
1796 			return -EINVAL;
1797 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1798 			return -EBUSY;
1799 		if (copy_from_user(&val, optval, sizeof(val)))
1800 			return -EFAULT;
1801 		po->tp_loss = !!val;
1802 		return 0;
1803 	}
1804 #endif
1805 	case PACKET_AUXDATA:
1806 	{
1807 		int val;
1808 
1809 		if (optlen < sizeof(val))
1810 			return -EINVAL;
1811 		if (copy_from_user(&val, optval, sizeof(val)))
1812 			return -EFAULT;
1813 
1814 		po->auxdata = !!val;
1815 		return 0;
1816 	}
1817 	case PACKET_ORIGDEV:
1818 	{
1819 		int val;
1820 
1821 		if (optlen < sizeof(val))
1822 			return -EINVAL;
1823 		if (copy_from_user(&val, optval, sizeof(val)))
1824 			return -EFAULT;
1825 
1826 		po->origdev = !!val;
1827 		return 0;
1828 	}
1829 	default:
1830 		return -ENOPROTOOPT;
1831 	}
1832 }
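/* Editor's note (illustrative sketch, not part of the original file):
 * the usual PACKET_VERSION + PACKET_RX_RING + mmap() sequence served by
 * the setsockopt handler above.  Sizes are only an example; they must
 * satisfy the checks in packet_set_ring():
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 128,	// frames_per_block * block_nr
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */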
1833 
1834 static int packet_getsockopt(struct socket *sock, int level, int optname,
1835 			     char __user *optval, int __user *optlen)
1836 {
1837 	int len;
1838 	int val;
1839 	struct sock *sk = sock->sk;
1840 	struct packet_sock *po = pkt_sk(sk);
1841 	void *data;
1842 	struct tpacket_stats st;
1843 
1844 	if (level != SOL_PACKET)
1845 		return -ENOPROTOOPT;
1846 
1847 	if (get_user(len, optlen))
1848 		return -EFAULT;
1849 
1850 	if (len < 0)
1851 		return -EINVAL;
1852 
1853 	switch (optname) {
1854 	case PACKET_STATISTICS:
1855 		if (len > sizeof(struct tpacket_stats))
1856 			len = sizeof(struct tpacket_stats);
1857 		spin_lock_bh(&sk->sk_receive_queue.lock);
1858 		st = po->stats;
1859 		memset(&po->stats, 0, sizeof(st));
1860 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1861 		st.tp_packets += st.tp_drops;
1862 
1863 		data = &st;
1864 		break;
1865 	case PACKET_AUXDATA:
1866 		if (len > sizeof(int))
1867 			len = sizeof(int);
1868 		val = po->auxdata;
1869 
1870 		data = &val;
1871 		break;
1872 	case PACKET_ORIGDEV:
1873 		if (len > sizeof(int))
1874 			len = sizeof(int);
1875 		val = po->origdev;
1876 
1877 		data = &val;
1878 		break;
1879 #ifdef CONFIG_PACKET_MMAP
1880 	case PACKET_VERSION:
1881 		if (len > sizeof(int))
1882 			len = sizeof(int);
1883 		val = po->tp_version;
1884 		data = &val;
1885 		break;
1886 	case PACKET_HDRLEN:
1887 		if (len > sizeof(int))
1888 			len = sizeof(int);
1889 		if (copy_from_user(&val, optval, len))
1890 			return -EFAULT;
1891 		switch (val) {
1892 		case TPACKET_V1:
1893 			val = sizeof(struct tpacket_hdr);
1894 			break;
1895 		case TPACKET_V2:
1896 			val = sizeof(struct tpacket2_hdr);
1897 			break;
1898 		default:
1899 			return -EINVAL;
1900 		}
1901 		data = &val;
1902 		break;
1903 	case PACKET_RESERVE:
1904 		if (len > sizeof(unsigned int))
1905 			len = sizeof(unsigned int);
1906 		val = po->tp_reserve;
1907 		data = &val;
1908 		break;
1909 	case PACKET_LOSS:
1910 		if (len > sizeof(unsigned int))
1911 			len = sizeof(unsigned int);
1912 		val = po->tp_loss;
1913 		data = &val;
1914 		break;
1915 #endif
1916 	default:
1917 		return -ENOPROTOOPT;
1918 	}
1919 
1920 	if (put_user(len, optlen))
1921 		return -EFAULT;
1922 	if (copy_to_user(optval, data, len))
1923 		return -EFAULT;
1924 	return 0;
1925 }
1926 
1927 
1928 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1929 {
1930 	struct sock *sk;
1931 	struct hlist_node *node;
1932 	struct net_device *dev = data;
1933 	struct net *net = dev_net(dev);
1934 
1935 	read_lock(&net->packet.sklist_lock);
1936 	sk_for_each(sk, node, &net->packet.sklist) {
1937 		struct packet_sock *po = pkt_sk(sk);
1938 
1939 		switch (msg) {
1940 		case NETDEV_UNREGISTER:
1941 			if (po->mclist)
1942 				packet_dev_mclist(dev, po->mclist, -1);
1943 			/* fallthrough */
1944 
1945 		case NETDEV_DOWN:
1946 			if (dev->ifindex == po->ifindex) {
1947 				spin_lock(&po->bind_lock);
1948 				if (po->running) {
1949 					__dev_remove_pack(&po->prot_hook);
1950 					__sock_put(sk);
1951 					po->running = 0;
1952 					sk->sk_err = ENETDOWN;
1953 					if (!sock_flag(sk, SOCK_DEAD))
1954 						sk->sk_error_report(sk);
1955 				}
1956 				if (msg == NETDEV_UNREGISTER) {
1957 					po->ifindex = -1;
1958 					po->prot_hook.dev = NULL;
1959 				}
1960 				spin_unlock(&po->bind_lock);
1961 			}
1962 			break;
1963 		case NETDEV_UP:
1964 			spin_lock(&po->bind_lock);
1965 			if (dev->ifindex == po->ifindex && po->num &&
1966 			    !po->running) {
1967 				dev_add_pack(&po->prot_hook);
1968 				sock_hold(sk);
1969 				po->running = 1;
1970 			}
1971 			spin_unlock(&po->bind_lock);
1972 			break;
1973 		}
1974 	}
1975 	read_unlock(&net->packet.sklist_lock);
1976 	return NOTIFY_DONE;
1977 }
1978 
1979 
1980 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1981 			unsigned long arg)
1982 {
1983 	struct sock *sk = sock->sk;
1984 
1985 	switch (cmd) {
1986 	case SIOCOUTQ:
1987 	{
1988 		int amount = sk_wmem_alloc_get(sk);
1989 
1990 		return put_user(amount, (int __user *)arg);
1991 	}
1992 	case SIOCINQ:
1993 	{
1994 		struct sk_buff *skb;
1995 		int amount = 0;
1996 
1997 		spin_lock_bh(&sk->sk_receive_queue.lock);
1998 		skb = skb_peek(&sk->sk_receive_queue);
1999 		if (skb)
2000 			amount = skb->len;
2001 		spin_unlock_bh(&sk->sk_receive_queue.lock);
2002 		return put_user(amount, (int __user *)arg);
2003 	}
2004 	case SIOCGSTAMP:
2005 		return sock_get_timestamp(sk, (struct timeval __user *)arg);
2006 	case SIOCGSTAMPNS:
2007 		return sock_get_timestampns(sk, (struct timespec __user *)arg);
2008 
2009 #ifdef CONFIG_INET
2010 	case SIOCADDRT:
2011 	case SIOCDELRT:
2012 	case SIOCDARP:
2013 	case SIOCGARP:
2014 	case SIOCSARP:
2015 	case SIOCGIFADDR:
2016 	case SIOCSIFADDR:
2017 	case SIOCGIFBRDADDR:
2018 	case SIOCSIFBRDADDR:
2019 	case SIOCGIFNETMASK:
2020 	case SIOCSIFNETMASK:
2021 	case SIOCGIFDSTADDR:
2022 	case SIOCSIFDSTADDR:
2023 	case SIOCSIFFLAGS:
2024 		if (!net_eq(sock_net(sk), &init_net))
2025 			return -ENOIOCTLCMD;
2026 		return inet_dgram_ops.ioctl(sock, cmd, arg);
2027 #endif
2028 
2029 	default:
2030 		return -ENOIOCTLCMD;
2031 	}
2032 	return 0;
2033 }
2034 
2035 #ifndef CONFIG_PACKET_MMAP
2036 #define packet_mmap sock_no_mmap
2037 #define packet_poll datagram_poll
2038 #else
2039 
2040 static unsigned int packet_poll(struct file *file, struct socket *sock,
2041 				poll_table *wait)
2042 {
2043 	struct sock *sk = sock->sk;
2044 	struct packet_sock *po = pkt_sk(sk);
2045 	unsigned int mask = datagram_poll(file, sock, wait);
2046 
2047 	spin_lock_bh(&sk->sk_receive_queue.lock);
2048 	if (po->rx_ring.pg_vec) {
2049 		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2050 			mask |= POLLIN | POLLRDNORM;
2051 	}
2052 	spin_unlock_bh(&sk->sk_receive_queue.lock);
2053 	spin_lock_bh(&sk->sk_write_queue.lock);
2054 	if (po->tx_ring.pg_vec) {
2055 		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2056 			mask |= POLLOUT | POLLWRNORM;
2057 	}
2058 	spin_unlock_bh(&sk->sk_write_queue.lock);
2059 	return mask;
2060 }
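/* Editor's note (illustrative sketch, not part of the original file):
 * how user space typically pairs with packet_poll() above once the RX
 * ring is mapped.  Assumes TPACKET_V2 and the req/ring values from the
 * earlier sketch, where frames tile the blocks exactly so a flat index
 * works:
 *
 *	unsigned int idx = 0;
 *	for (;;) {
 *		struct tpacket2_hdr *hdr =
 *			(void *)((char *)ring + idx * req.tp_frame_size);
 *		if (!(hdr->tp_status & TP_STATUS_USER)) {
 *			struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *			poll(&pfd, 1, -1);
 *			continue;
 *		}
 *		// frame data starts at (char *)hdr + hdr->tp_mac
 *		hdr->tp_status = TP_STATUS_KERNEL;	// give the frame back
 *		idx = (idx + 1) % req.tp_frame_nr;
 *	}
 */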
2061 
2062 
2063 /* Dirty? Well, I still have not learned a better way to account
2064  * for user mmaps.
2065  */
2066 
2067 static void packet_mm_open(struct vm_area_struct *vma)
2068 {
2069 	struct file *file = vma->vm_file;
2070 	struct socket *sock = file->private_data;
2071 	struct sock *sk = sock->sk;
2072 
2073 	if (sk)
2074 		atomic_inc(&pkt_sk(sk)->mapped);
2075 }
2076 
2077 static void packet_mm_close(struct vm_area_struct *vma)
2078 {
2079 	struct file *file = vma->vm_file;
2080 	struct socket *sock = file->private_data;
2081 	struct sock *sk = sock->sk;
2082 
2083 	if (sk)
2084 		atomic_dec(&pkt_sk(sk)->mapped);
2085 }
2086 
2087 static const struct vm_operations_struct packet_mmap_ops = {
2088 	.open	=	packet_mm_open,
2089 	.close	=	packet_mm_close,
2090 };
2091 
2092 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2093 {
2094 	int i;
2095 
2096 	for (i = 0; i < len; i++) {
2097 		if (likely(pg_vec[i]))
2098 			free_pages((unsigned long) pg_vec[i], order);
2099 	}
2100 	kfree(pg_vec);
2101 }
2102 
2103 static inline char *alloc_one_pg_vec_page(unsigned long order)
2104 {
2105 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2106 
2107 	return (char *) __get_free_pages(gfp_flags, order);
2108 }
2109 
2110 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2111 {
2112 	unsigned int block_nr = req->tp_block_nr;
2113 	char **pg_vec;
2114 	int i;
2115 
2116 	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2117 	if (unlikely(!pg_vec))
2118 		goto out;
2119 
2120 	for (i = 0; i < block_nr; i++) {
2121 		pg_vec[i] = alloc_one_pg_vec_page(order);
2122 		if (unlikely(!pg_vec[i]))
2123 			goto out_free_pgvec;
2124 	}
2125 
2126 out:
2127 	return pg_vec;
2128 
2129 out_free_pgvec:
2130 	free_pg_vec(pg_vec, order, block_nr);
2131 	pg_vec = NULL;
2132 	goto out;
2133 }
2134 
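/*
 * Install (or, when called with closing and tp_block_nr == 0, tear down)
 * an rx or tx ring for this socket.  The requested geometry is validated
 * (block size must be a positive multiple of PAGE_SIZE, frames must be
 * TPACKET_ALIGNMENT-aligned, large enough for the header plus reserve,
 * and the total frame count must equal frames-per-block times the block
 * count), a new pg_vec is allocated, the protocol hook is detached, and
 * the old and new rings are swapped under pg_vec_lock while no user
 * mappings exist.
 *
 * Illustrative, hypothetical userspace setup for an rx ring; the values
 * are only one example of a geometry that passes the checks below and
 * assume 4K pages:
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,	(one page per block)
 *		.tp_frame_size = 2048,	(two frames per block)
 *		.tp_block_nr   = 4,
 *		.tp_frame_nr   = 8,	(frames_per_block * block_nr)
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	mmap(NULL, 4 * 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */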
2135 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2136 		int closing, int tx_ring)
2137 {
2138 	char **pg_vec = NULL;
2139 	struct packet_sock *po = pkt_sk(sk);
2140 	int was_running, order = 0;
2141 	struct packet_ring_buffer *rb;
2142 	struct sk_buff_head *rb_queue;
2143 	__be16 num;
2144 	int err;
2145 
2146 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2147 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2148 
2149 	err = -EBUSY;
2150 	if (!closing) {
2151 		if (atomic_read(&po->mapped))
2152 			goto out;
2153 		if (atomic_read(&rb->pending))
2154 			goto out;
2155 	}
2156 
2157 	if (req->tp_block_nr) {
2158 		/* Sanity tests and some calculations */
2159 		err = -EBUSY;
2160 		if (unlikely(rb->pg_vec))
2161 			goto out;
2162 
2163 		switch (po->tp_version) {
2164 		case TPACKET_V1:
2165 			po->tp_hdrlen = TPACKET_HDRLEN;
2166 			break;
2167 		case TPACKET_V2:
2168 			po->tp_hdrlen = TPACKET2_HDRLEN;
2169 			break;
2170 		}
2171 
2172 		err = -EINVAL;
2173 		if (unlikely((int)req->tp_block_size <= 0))
2174 			goto out;
2175 		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2176 			goto out;
2177 		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2178 					po->tp_reserve))
2179 			goto out;
2180 		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2181 			goto out;
2182 
2183 		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2184 		if (unlikely(rb->frames_per_block <= 0))
2185 			goto out;
2186 		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2187 					req->tp_frame_nr))
2188 			goto out;
2189 
2190 		err = -ENOMEM;
2191 		order = get_order(req->tp_block_size);
2192 		pg_vec = alloc_pg_vec(req, order);
2193 		if (unlikely(!pg_vec))
2194 			goto out;
2195 	}
2196 	/* Done */
2197 	else {
2198 		err = -EINVAL;
2199 		if (unlikely(req->tp_frame_nr))
2200 			goto out;
2201 	}
2202 
2203 	lock_sock(sk);
2204 
2205 	/* Detach socket from network */
2206 	spin_lock(&po->bind_lock);
2207 	was_running = po->running;
2208 	num = po->num;
2209 	if (was_running) {
2210 		__dev_remove_pack(&po->prot_hook);
2211 		po->num = 0;
2212 		po->running = 0;
2213 		__sock_put(sk);
2214 	}
2215 	spin_unlock(&po->bind_lock);
2216 
2217 	synchronize_net();
2218 
2219 	err = -EBUSY;
2220 	mutex_lock(&po->pg_vec_lock);
2221 	if (closing || atomic_read(&po->mapped) == 0) {
2222 		err = 0;
2223 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2224 		spin_lock_bh(&rb_queue->lock);
2225 		pg_vec = XC(rb->pg_vec, pg_vec);
2226 		rb->frame_max = (req->tp_frame_nr - 1);
2227 		rb->head = 0;
2228 		rb->frame_size = req->tp_frame_size;
2229 		spin_unlock_bh(&rb_queue->lock);
2230 
2231 		order = XC(rb->pg_vec_order, order);
2232 		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2233 
2234 		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2235 		po->prot_hook.func = (po->rx_ring.pg_vec) ?
2236 						tpacket_rcv : packet_rcv;
2237 		skb_queue_purge(rb_queue);
2238 #undef XC
2239 		if (atomic_read(&po->mapped))
2240 			pr_err("packet_mmap: vma is busy: %d\n",
2241 			       atomic_read(&po->mapped));
2242 	}
2243 	mutex_unlock(&po->pg_vec_lock);
2244 
2245 	spin_lock(&po->bind_lock);
2246 	if (was_running && !po->running) {
2247 		sock_hold(sk);
2248 		po->running = 1;
2249 		po->num = num;
2250 		dev_add_pack(&po->prot_hook);
2251 	}
2252 	spin_unlock(&po->bind_lock);
2253 
2254 	release_sock(sk);
2255 
2256 	if (pg_vec)
2257 		free_pg_vec(pg_vec, order, req->tp_block_nr);
2258 out:
2259 	return err;
2260 }
2261 
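/*
 * Map the rx ring followed by the tx ring into a single VMA.  The caller
 * must map from offset 0 and the VMA length must equal the combined size
 * of all configured blocks; every page of every block is then inserted
 * with vm_insert_page() and po->mapped is bumped so the rings cannot be
 * resized or freed while the mapping exists.
 */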
2262 static int packet_mmap(struct file *file, struct socket *sock,
2263 		struct vm_area_struct *vma)
2264 {
2265 	struct sock *sk = sock->sk;
2266 	struct packet_sock *po = pkt_sk(sk);
2267 	unsigned long size, expected_size;
2268 	struct packet_ring_buffer *rb;
2269 	unsigned long start;
2270 	int err = -EINVAL;
2271 	int i;
2272 
2273 	if (vma->vm_pgoff)
2274 		return -EINVAL;
2275 
2276 	mutex_lock(&po->pg_vec_lock);
2277 
2278 	expected_size = 0;
2279 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2280 		if (rb->pg_vec) {
2281 			expected_size += rb->pg_vec_len
2282 						* rb->pg_vec_pages
2283 						* PAGE_SIZE;
2284 		}
2285 	}
2286 
2287 	if (expected_size == 0)
2288 		goto out;
2289 
2290 	size = vma->vm_end - vma->vm_start;
2291 	if (size != expected_size)
2292 		goto out;
2293 
2294 	start = vma->vm_start;
2295 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2296 		if (rb->pg_vec == NULL)
2297 			continue;
2298 
2299 		for (i = 0; i < rb->pg_vec_len; i++) {
2300 			struct page *page = virt_to_page(rb->pg_vec[i]);
2301 			int pg_num;
2302 
2303 			for (pg_num = 0; pg_num < rb->pg_vec_pages;
2304 					pg_num++, page++) {
2305 				err = vm_insert_page(vma, start, page);
2306 				if (unlikely(err))
2307 					goto out;
2308 				start += PAGE_SIZE;
2309 			}
2310 		}
2311 	}
2312 
2313 	atomic_inc(&po->mapped);
2314 	vma->vm_ops = &packet_mmap_ops;
2315 	err = 0;
2316 
2317 out:
2318 	mutex_unlock(&po->pg_vec_lock);
2319 	return err;
2320 }
2321 #endif
2322 
2323 
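/*
 * Two socket operation tables: packet_ops_spkt serves the legacy
 * SOCK_PACKET interface (no mmap and no packet socket options), while
 * packet_ops serves SOCK_RAW/SOCK_DGRAM PF_PACKET sockets, including
 * the mmap'ed ring support above.
 */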
2324 static const struct proto_ops packet_ops_spkt = {
2325 	.family =	PF_PACKET,
2326 	.owner =	THIS_MODULE,
2327 	.release =	packet_release,
2328 	.bind =		packet_bind_spkt,
2329 	.connect =	sock_no_connect,
2330 	.socketpair =	sock_no_socketpair,
2331 	.accept =	sock_no_accept,
2332 	.getname =	packet_getname_spkt,
2333 	.poll =		datagram_poll,
2334 	.ioctl =	packet_ioctl,
2335 	.listen =	sock_no_listen,
2336 	.shutdown =	sock_no_shutdown,
2337 	.setsockopt =	sock_no_setsockopt,
2338 	.getsockopt =	sock_no_getsockopt,
2339 	.sendmsg =	packet_sendmsg_spkt,
2340 	.recvmsg =	packet_recvmsg,
2341 	.mmap =		sock_no_mmap,
2342 	.sendpage =	sock_no_sendpage,
2343 };
2344 
2345 static const struct proto_ops packet_ops = {
2346 	.family =	PF_PACKET,
2347 	.owner =	THIS_MODULE,
2348 	.release =	packet_release,
2349 	.bind =		packet_bind,
2350 	.connect =	sock_no_connect,
2351 	.socketpair =	sock_no_socketpair,
2352 	.accept =	sock_no_accept,
2353 	.getname =	packet_getname,
2354 	.poll =		packet_poll,
2355 	.ioctl =	packet_ioctl,
2356 	.listen =	sock_no_listen,
2357 	.shutdown =	sock_no_shutdown,
2358 	.setsockopt =	packet_setsockopt,
2359 	.getsockopt =	packet_getsockopt,
2360 	.sendmsg =	packet_sendmsg,
2361 	.recvmsg =	packet_recvmsg,
2362 	.mmap =		packet_mmap,
2363 	.sendpage =	sock_no_sendpage,
2364 };
2365 
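/*
 * Glue registered at module init: the family ops route socket(PF_PACKET,
 * ...) calls to packet_create(), and the notifier block lets the
 * protocol react to network device events.
 */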
2366 static struct net_proto_family packet_family_ops = {
2367 	.family =	PF_PACKET,
2368 	.create =	packet_create,
2369 	.owner	=	THIS_MODULE,
2370 };
2371 
2372 static struct notifier_block packet_netdev_notifier = {
2373 	.notifier_call =	packet_notifier,
2374 };
2375 
2376 #ifdef CONFIG_PROC_FS
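/*
 * /proc/net/packet: a seq_file walk of the per-namespace packet socket
 * list, taken under sklist_lock, printing one line per socket in the
 * format announced by the header row in packet_seq_show().
 */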
2377 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2378 {
2379 	struct sock *s;
2380 	struct hlist_node *node;
2381 
2382 	sk_for_each(s, node, &net->packet.sklist) {
2383 		if (!off--)
2384 			return s;
2385 	}
2386 	return NULL;
2387 }
2388 
2389 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2390 	__acquires(seq_file_net(seq)->packet.sklist_lock)
2391 {
2392 	struct net *net = seq_file_net(seq);
2393 	read_lock(&net->packet.sklist_lock);
2394 	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2395 }
2396 
2397 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2398 {
2399 	struct net *net = seq_file_net(seq);
2400 	++*pos;
2401 	return  (v == SEQ_START_TOKEN)
2402 		? sk_head(&net->packet.sklist)
2403 		: sk_next((struct sock *)v);
2404 }
2405 
2406 static void packet_seq_stop(struct seq_file *seq, void *v)
2407 	__releases(seq_file_net(seq)->packet.sklist_lock)
2408 {
2409 	struct net *net = seq_file_net(seq);
2410 	read_unlock(&net->packet.sklist_lock);
2411 }
2412 
2413 static int packet_seq_show(struct seq_file *seq, void *v)
2414 {
2415 	if (v == SEQ_START_TOKEN)
2416 		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2417 	else {
2418 		struct sock *s = v;
2419 		const struct packet_sock *po = pkt_sk(s);
2420 
2421 		seq_printf(seq,
2422 			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2423 			   s,
2424 			   atomic_read(&s->sk_refcnt),
2425 			   s->sk_type,
2426 			   ntohs(po->num),
2427 			   po->ifindex,
2428 			   po->running,
2429 			   atomic_read(&s->sk_rmem_alloc),
2430 			   sock_i_uid(s),
2431 			   sock_i_ino(s));
2432 	}
2433 
2434 	return 0;
2435 }
2436 
2437 static const struct seq_operations packet_seq_ops = {
2438 	.start	= packet_seq_start,
2439 	.next	= packet_seq_next,
2440 	.stop	= packet_seq_stop,
2441 	.show	= packet_seq_show,
2442 };
2443 
2444 static int packet_seq_open(struct inode *inode, struct file *file)
2445 {
2446 	return seq_open_net(inode, file, &packet_seq_ops,
2447 			    sizeof(struct seq_net_private));
2448 }
2449 
2450 static const struct file_operations packet_seq_fops = {
2451 	.owner		= THIS_MODULE,
2452 	.open		= packet_seq_open,
2453 	.read		= seq_read,
2454 	.llseek		= seq_lseek,
2455 	.release	= seq_release_net,
2456 };
2457 
2458 #endif
2459 
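/*
 * Per-network-namespace setup: initialise the namespace's packet socket
 * list and create its /proc/net/packet entry; teardown removes the proc
 * entry again.
 */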
2460 static int packet_net_init(struct net *net)
2461 {
2462 	rwlock_init(&net->packet.sklist_lock);
2463 	INIT_HLIST_HEAD(&net->packet.sklist);
2464 
2465 	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2466 		return -ENOMEM;
2467 
2468 	return 0;
2469 }
2470 
2471 static void packet_net_exit(struct net *net)
2472 {
2473 	proc_net_remove(net, "packet");
2474 }
2475 
2476 static struct pernet_operations packet_net_ops = {
2477 	.init = packet_net_init,
2478 	.exit = packet_net_exit,
2479 };
2480 
2481 
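/*
 * Module init registers the proto, then the PF_PACKET family, the pernet
 * operations and the netdevice notifier; module exit unwinds them in the
 * reverse order.
 */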
2482 static void __exit packet_exit(void)
2483 {
2484 	unregister_netdevice_notifier(&packet_netdev_notifier);
2485 	unregister_pernet_subsys(&packet_net_ops);
2486 	sock_unregister(PF_PACKET);
2487 	proto_unregister(&packet_proto);
2488 }
2489 
2490 static int __init packet_init(void)
2491 {
2492 	int rc = proto_register(&packet_proto, 0);
2493 
2494 	if (rc != 0)
2495 		goto out;
2496 
2497 	sock_register(&packet_family_ops);
2498 	register_pernet_subsys(&packet_net_ops);
2499 	register_netdevice_notifier(&packet_netdev_notifier);
2500 out:
2501 	return rc;
2502 }
2503 
2504 module_init(packet_init);
2505 module_exit(packet_exit);
2506 MODULE_LICENSE("GPL");
2507 MODULE_ALIAS_NETPROTO(PF_PACKET);
2508