xref: /openbmc/linux/net/packet/af_packet.c (revision 22246614)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		PACKET - implements raw packet sockets.
7  *
8  * Version:	$Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:
15  *		Alan Cox	:	verify_area() now used correctly
16  *		Alan Cox	:	new skbuff lists, look ma no backlogs!
17  *		Alan Cox	:	tidied skbuff lists.
18  *		Alan Cox	:	Now uses generic datagram routines I
19  *					added. Also fixed the peek/read crash
20  *					from all old Linux datagram code.
21  *		Alan Cox	:	Uses the improved datagram code.
22  *		Alan Cox	:	Added NULL's for socket options.
23  *		Alan Cox	:	Re-commented the code.
24  *		Alan Cox	:	Use new kernel side addressing
25  *		Rob Janssen	:	Correct MTU usage.
26  *		Dave Platt	:	Counter leaks caused by incorrect
27  *					interrupt locking and some slightly
28  *					dubious gcc output. Can you read
29  *					compiler: it said _VOLATILE_
30  *	Richard Kooijman	:	Timestamp fixes.
31  *		Alan Cox	:	New buffers. Use sk->mac.raw.
32  *		Alan Cox	:	sendmsg/recvmsg support.
33  *		Alan Cox	:	Protocol setting support
34  *	Alexey Kuznetsov	:	Untied from IPv4 stack.
35  *	Cyrus Durgin		:	Fixed kerneld for kmod.
36  *	Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and
38  *                                      packet_set_ring memory leak.
39  *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
40  *					The convention is that longer addresses
41  *					will simply extend the hardware address
42  *					byte arrays at the end of sockaddr_ll
43  *					and packet_mreq.
44  *
45  *		This program is free software; you can redistribute it and/or
46  *		modify it under the terms of the GNU General Public License
47  *		as published by the Free Software Foundation; either version
48  *		2 of the License, or (at your option) any later version.
49  *
50  */
51 
52 #include <linux/types.h>
53 #include <linux/mm.h>
54 #include <linux/capability.h>
55 #include <linux/fcntl.h>
56 #include <linux/socket.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/if_packet.h>
61 #include <linux/wireless.h>
62 #include <linux/kernel.h>
63 #include <linux/kmod.h>
64 #include <net/net_namespace.h>
65 #include <net/ip.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
68 #include <net/sock.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
74 #include <asm/page.h>
75 #include <asm/cacheflush.h>
76 #include <asm/io.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/poll.h>
80 #include <linux/module.h>
81 #include <linux/init.h>
82 
83 #ifdef CONFIG_INET
84 #include <net/inet_common.h>
85 #endif
86 
87 /*
88    Assumptions:
89    - if a device has no dev->hard_header routine, it adds and removes the ll
90      header itself. In this case the ll header is invisible outside the device,
91      but higher levels should still reserve dev->hard_header_len.
92      Some devices are clever enough to reallocate the skb when the header
93      will not fit in the reserved space (tunnel); others are silly
94      (PPP).
95    - packet sockets receive packets with the ll header already pulled,
96      so SOCK_RAW must push it back.
97 
98 On receive:
99 -----------
100 
101 Incoming, dev->hard_header!=NULL
102    mac_header -> ll header
103    data       -> data
104 
105 Outgoing, dev->hard_header!=NULL
106    mac_header -> ll header
107    data       -> ll header
108 
109 Incoming, dev->hard_header==NULL
110    mac_header -> UNKNOWN position. It very likely points to the ll
111 		 header.  PPP does this, which is wrong, because it introduces
112 		 asymmetry between the rx and tx paths.
113    data       -> data
114 
115 Outgoing, dev->hard_header==NULL
116    mac_header -> data. ll header is still not built!
117    data       -> data
118 
119 In summary:
120   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
121 
122 
123 On transmit:
124 ------------
125 
126 dev->hard_header != NULL
127    mac_header -> ll header
128    data       -> ll header
129 
130 dev->hard_header == NULL (ll header is added by device, we cannot control it)
131    mac_header -> data
132    data       -> data
133 
134    We should set nh.raw on output to the correct position;
135    the packet classifier depends on it.
136  */
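/*
 * The header-position rules above are easiest to see from userspace.
 * Below is a minimal sketch (illustrative only, not part of this file;
 * error handling omitted): with SOCK_RAW the receive buffer starts at
 * the ll header, while SOCK_DGRAM strips it and reports link-layer
 * information via sockaddr_ll instead.
 */
#if 0	/* illustrative userspace sketch, not kernel code */
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

int main(void)
{
	/* SOCK_RAW: frames arrive with the link-layer header pushed
	 * back, so buf[0] is the first byte of the Ethernet header
	 * (cf. packet_rcv() for non-SOCK_DGRAM sockets).
	 */
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	unsigned char buf[2048];
	struct sockaddr_ll from;
	socklen_t fromlen = sizeof(from);
	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
			     (struct sockaddr *)&from, &fromlen);

	if (n >= (ssize_t)sizeof(struct ethhdr))
		printf("ifindex %d proto 0x%04x\n",
		       from.sll_ifindex, ntohs(from.sll_protocol));
	return 0;
}
#endif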
137 
138 /* Private packet socket structures. */
139 
140 struct packet_mclist
141 {
142 	struct packet_mclist	*next;
143 	int			ifindex;
144 	int			count;
145 	unsigned short		type;
146 	unsigned short		alen;
147 	unsigned char		addr[MAX_ADDR_LEN];
148 };
149 /* identical to struct packet_mreq except it has
150  * a longer address field.
151  */
152 struct packet_mreq_max
153 {
154 	int		mr_ifindex;
155 	unsigned short	mr_type;
156 	unsigned short	mr_alen;
157 	unsigned char	mr_address[MAX_ADDR_LEN];
158 };
159 
160 #ifdef CONFIG_PACKET_MMAP
161 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
162 #endif
163 
164 static void packet_flush_mclist(struct sock *sk);
165 
166 struct packet_sock {
167 	/* struct sock has to be the first member of packet_sock */
168 	struct sock		sk;
169 	struct tpacket_stats	stats;
170 #ifdef CONFIG_PACKET_MMAP
171 	char *			*pg_vec;
172 	unsigned int		head;
173 	unsigned int            frames_per_block;
174 	unsigned int		frame_size;
175 	unsigned int		frame_max;
176 	int			copy_thresh;
177 #endif
178 	struct packet_type	prot_hook;
179 	spinlock_t		bind_lock;
180 	unsigned int		running:1,	/* prot_hook is attached*/
181 				auxdata:1,
182 				origdev:1;
183 	int			ifindex;	/* bound device		*/
184 	__be16			num;
185 	struct packet_mclist	*mclist;
186 #ifdef CONFIG_PACKET_MMAP
187 	atomic_t		mapped;
188 	unsigned int            pg_vec_order;
189 	unsigned int		pg_vec_pages;
190 	unsigned int		pg_vec_len;
191 #endif
192 };
193 
194 struct packet_skb_cb {
195 	unsigned int origlen;
196 	union {
197 		struct sockaddr_pkt pkt;
198 		struct sockaddr_ll ll;
199 	} sa;
200 };
201 
202 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
203 
204 #ifdef CONFIG_PACKET_MMAP
205 
206 static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
207 {
208 	unsigned int pg_vec_pos, frame_offset;
209 
210 	pg_vec_pos = position / po->frames_per_block;
211 	frame_offset = position % po->frames_per_block;
212 
213 	return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
214 }
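/* Worked example of the lookup above (numbers illustrative, not from
 * this file): with tp_block_size = 8192 and tp_frame_size = 2048,
 * frames_per_block = 8192 / 2048 = 4, so frame 5 lives at
 * pg_vec[5 / 4] + (5 % 4) * 2048 = pg_vec[1] + 2048.
 */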
215 #endif
216 
217 static inline struct packet_sock *pkt_sk(struct sock *sk)
218 {
219 	return (struct packet_sock *)sk;
220 }
221 
222 static void packet_sock_destruct(struct sock *sk)
223 {
224 	BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
225 	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
226 
227 	if (!sock_flag(sk, SOCK_DEAD)) {
228 		printk(KERN_ERR "Attempt to release alive packet socket: %p\n", sk);
229 		return;
230 	}
231 
232 	sk_refcnt_debug_dec(sk);
233 }
234 
235 
236 static const struct proto_ops packet_ops;
237 
238 static const struct proto_ops packet_ops_spkt;
239 
240 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
241 {
242 	struct sock *sk;
243 	struct sockaddr_pkt *spkt;
244 
245 	/*
246 	 *	When we registered the protocol we saved the socket in the data
247 	 *	field for just this event.
248 	 */
249 
250 	sk = pt->af_packet_priv;
251 
252 	/*
253 	 *	Yank back the headers [hope the device set this
254 	 *	right or kerboom...]
255 	 *
256 	 *	Incoming packets have the ll header pulled;
257 	 *	push it back.
258 	 *
259 	 *	For outgoing ones skb->data == skb_mac_header(skb),
260 	 *	so this procedure is a no-op.
261 	 */
262 
263 	if (skb->pkt_type == PACKET_LOOPBACK)
264 		goto out;
265 
266 	if (dev_net(dev) != sock_net(sk))
267 		goto out;
268 
269 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
270 		goto oom;
271 
272 	/* drop any routing info */
273 	dst_release(skb->dst);
274 	skb->dst = NULL;
275 
276 	/* drop conntrack reference */
277 	nf_reset(skb);
278 
279 	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
280 
281 	skb_push(skb, skb->data - skb_mac_header(skb));
282 
283 	/*
284 	 *	The SOCK_PACKET socket receives _all_ frames.
285 	 */
286 
287 	spkt->spkt_family = dev->type;
288 	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
289 	spkt->spkt_protocol = skb->protocol;
290 
291 	/*
292 	 *	Charge the memory to the socket. This is done specifically
293 	 *	to prevent sockets using all the memory up.
294 	 */
295 
296 	if (sock_queue_rcv_skb(sk,skb) == 0)
297 		return 0;
298 
299 out:
300 	kfree_skb(skb);
301 oom:
302 	return 0;
303 }
304 
305 
306 /*
307  *	Output a raw packet to a device layer. This bypasses all the other
308  *	protocol layers and you must therefore supply it with a complete frame.
309  */
310 
311 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
312 			       struct msghdr *msg, size_t len)
313 {
314 	struct sock *sk = sock->sk;
315 	struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
316 	struct sk_buff *skb;
317 	struct net_device *dev;
318 	__be16 proto=0;
319 	int err;
320 
321 	/*
322 	 *	Get and verify the address.
323 	 */
324 
325 	if (saddr)
326 	{
327 		if (msg->msg_namelen < sizeof(struct sockaddr))
328 			return(-EINVAL);
329 		if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
330 			proto=saddr->spkt_protocol;
331 	}
332 	else
333 		return(-ENOTCONN);	/* SOCK_PACKET must be sent giving an address */
334 
335 	/*
336 	 *	Find the device first to size check it
337 	 */
338 
339 	saddr->spkt_device[13] = 0;
340 	dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
341 	err = -ENODEV;
342 	if (dev == NULL)
343 		goto out_unlock;
344 
345 	err = -ENETDOWN;
346 	if (!(dev->flags & IFF_UP))
347 		goto out_unlock;
348 
349 	/*
350 	 *	You may not queue a frame bigger than the mtu. This is the lowest level
351 	 *	raw protocol and you must do your own fragmentation at this level.
352 	 */
353 
354 	err = -EMSGSIZE;
355 	if (len > dev->mtu + dev->hard_header_len)
356 		goto out_unlock;
357 
358 	err = -ENOBUFS;
359 	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
360 
361 	/*
362 	 *	If the write buffer is full, then tough. At this level the user gets to
363 	 *	deal with the problem - do your own algorithmic backoffs. That's far
364 	 *	more flexible.
365 	 */
366 
367 	if (skb == NULL)
368 		goto out_unlock;
369 
370 	/*
371 	 *	Fill it in
372 	 */
373 
374 	/* FIXME: Save some space for broken drivers that write a
375 	 * hard header at transmission time by themselves. PPP is the
376 	 * notable one here. This should really be fixed at the driver level.
377 	 */
378 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
379 	skb_reset_network_header(skb);
380 
381 	/* Try to align data part correctly */
382 	if (dev->header_ops) {
383 		skb->data -= dev->hard_header_len;
384 		skb->tail -= dev->hard_header_len;
385 		if (len < dev->hard_header_len)
386 			skb_reset_network_header(skb);
387 	}
388 
389 	/* Returns -EFAULT on error */
390 	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
391 	skb->protocol = proto;
392 	skb->dev = dev;
393 	skb->priority = sk->sk_priority;
394 	if (err)
395 		goto out_free;
396 
397 	/*
398 	 *	Now send it
399 	 */
400 
401 	dev_queue_xmit(skb);
402 	dev_put(dev);
403 	return(len);
404 
405 out_free:
406 	kfree_skb(skb);
407 out_unlock:
408 	if (dev)
409 		dev_put(dev);
410 	return err;
411 }
412 
413 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
414 				      unsigned int res)
415 {
416 	struct sk_filter *filter;
417 
418 	rcu_read_lock_bh();
419 	filter = rcu_dereference(sk->sk_filter);
420 	if (filter != NULL)
421 		res = sk_run_filter(skb, filter->insns, filter->len);
422 	rcu_read_unlock_bh();
423 
424 	return res;
425 }
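/*
 * The filter run above is a classic BPF program attached with
 * SO_ATTACH_FILTER; its return value becomes the snap length, so a
 * filter can truncate packets or drop them by returning 0. A hedged
 * userspace sketch (illustrative only):
 */
#if 0	/* illustrative userspace sketch, not kernel code */
#include <sys/socket.h>
#include <linux/filter.h>

/* Accept every packet but truncate it to 96 bytes; a filter that
 * returns 0 would make run_filter() drop the packet instead.
 */
static struct sock_filter code[] = {
	{ 0x06, 0, 0, 96 },	/* BPF_RET | BPF_K, 96 */
};

static int attach_filter(int fd)
{
	struct sock_fprog prog = {
		.len	= sizeof(code) / sizeof(code[0]),
		.filter	= code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}
#endif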
426 
427 /*
428    This function does lazy skb cloning in the hope that most packets
429    are discarded by BPF.
430 
431    Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
432    and skb->cb are mangled. It works because (and until) packets
433    falling here are owned by the current CPU. Output packets are cloned
434    by dev_queue_xmit_nit(), and input packets are processed by net_bh
435    sequentially, so if we return the skb to its original state on exit,
436    we will not harm anyone.
437  */
438 
439 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
440 {
441 	struct sock *sk;
442 	struct sockaddr_ll *sll;
443 	struct packet_sock *po;
444 	u8 * skb_head = skb->data;
445 	int skb_len = skb->len;
446 	unsigned int snaplen, res;
447 
448 	if (skb->pkt_type == PACKET_LOOPBACK)
449 		goto drop;
450 
451 	sk = pt->af_packet_priv;
452 	po = pkt_sk(sk);
453 
454 	if (dev_net(dev) != sock_net(sk))
455 		goto drop;
456 
457 	skb->dev = dev;
458 
459 	if (dev->header_ops) {
460 		/* The device has an explicit notion of ll header,
461 		   exported to higher levels.
462 
463 		   Otherwise, the device hides the details of its frame
464 		   structure, so the corresponding packet header is
465 		   never delivered to the user.
466 		 */
467 		if (sk->sk_type != SOCK_DGRAM)
468 			skb_push(skb, skb->data - skb_mac_header(skb));
469 		else if (skb->pkt_type == PACKET_OUTGOING) {
470 			/* Special case: outgoing packets have ll header at head */
471 			skb_pull(skb, skb_network_offset(skb));
472 		}
473 	}
474 
475 	snaplen = skb->len;
476 
477 	res = run_filter(skb, sk, snaplen);
478 	if (!res)
479 		goto drop_n_restore;
480 	if (snaplen > res)
481 		snaplen = res;
482 
483 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
484 	    (unsigned)sk->sk_rcvbuf)
485 		goto drop_n_acct;
486 
487 	if (skb_shared(skb)) {
488 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
489 		if (nskb == NULL)
490 			goto drop_n_acct;
491 
492 		if (skb_head != skb->data) {
493 			skb->data = skb_head;
494 			skb->len = skb_len;
495 		}
496 		kfree_skb(skb);
497 		skb = nskb;
498 	}
499 
500 	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
501 		     sizeof(skb->cb));
502 
503 	sll = &PACKET_SKB_CB(skb)->sa.ll;
504 	sll->sll_family = AF_PACKET;
505 	sll->sll_hatype = dev->type;
506 	sll->sll_protocol = skb->protocol;
507 	sll->sll_pkttype = skb->pkt_type;
508 	if (unlikely(po->origdev))
509 		sll->sll_ifindex = orig_dev->ifindex;
510 	else
511 		sll->sll_ifindex = dev->ifindex;
512 
513 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
514 
515 	PACKET_SKB_CB(skb)->origlen = skb->len;
516 
517 	if (pskb_trim(skb, snaplen))
518 		goto drop_n_acct;
519 
520 	skb_set_owner_r(skb, sk);
521 	skb->dev = NULL;
522 	dst_release(skb->dst);
523 	skb->dst = NULL;
524 
525 	/* drop conntrack reference */
526 	nf_reset(skb);
527 
528 	spin_lock(&sk->sk_receive_queue.lock);
529 	po->stats.tp_packets++;
530 	__skb_queue_tail(&sk->sk_receive_queue, skb);
531 	spin_unlock(&sk->sk_receive_queue.lock);
532 	sk->sk_data_ready(sk, skb->len);
533 	return 0;
534 
535 drop_n_acct:
536 	spin_lock(&sk->sk_receive_queue.lock);
537 	po->stats.tp_drops++;
538 	spin_unlock(&sk->sk_receive_queue.lock);
539 
540 drop_n_restore:
541 	if (skb_head != skb->data && skb_shared(skb)) {
542 		skb->data = skb_head;
543 		skb->len = skb_len;
544 	}
545 drop:
546 	kfree_skb(skb);
547 	return 0;
548 }
549 
550 #ifdef CONFIG_PACKET_MMAP
551 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
552 {
553 	struct sock *sk;
554 	struct packet_sock *po;
555 	struct sockaddr_ll *sll;
556 	struct tpacket_hdr *h;
557 	u8 * skb_head = skb->data;
558 	int skb_len = skb->len;
559 	unsigned int snaplen, res;
560 	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
561 	unsigned short macoff, netoff;
562 	struct sk_buff *copy_skb = NULL;
563 	struct timeval tv;
564 
565 	if (skb->pkt_type == PACKET_LOOPBACK)
566 		goto drop;
567 
568 	sk = pt->af_packet_priv;
569 	po = pkt_sk(sk);
570 
571 	if (dev_net(dev) != sock_net(sk))
572 		goto drop;
573 
574 	if (dev->header_ops) {
575 		if (sk->sk_type != SOCK_DGRAM)
576 			skb_push(skb, skb->data - skb_mac_header(skb));
577 		else if (skb->pkt_type == PACKET_OUTGOING) {
578 			/* Special case: outgoing packets have ll header at head */
579 			skb_pull(skb, skb_network_offset(skb));
580 		}
581 	}
582 
583 	if (skb->ip_summed == CHECKSUM_PARTIAL)
584 		status |= TP_STATUS_CSUMNOTREADY;
585 
586 	snaplen = skb->len;
587 
588 	res = run_filter(skb, sk, snaplen);
589 	if (!res)
590 		goto drop_n_restore;
591 	if (snaplen > res)
592 		snaplen = res;
593 
594 	if (sk->sk_type == SOCK_DGRAM) {
595 		macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
596 	} else {
597 		unsigned maclen = skb_network_offset(skb);
598 		netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
599 		macoff = netoff - maclen;
600 	}
601 
602 	if (macoff + snaplen > po->frame_size) {
603 		if (po->copy_thresh &&
604 		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
605 		    (unsigned)sk->sk_rcvbuf) {
606 			if (skb_shared(skb)) {
607 				copy_skb = skb_clone(skb, GFP_ATOMIC);
608 			} else {
609 				copy_skb = skb_get(skb);
610 				skb_head = skb->data;
611 			}
612 			if (copy_skb)
613 				skb_set_owner_r(copy_skb, sk);
614 		}
615 		snaplen = po->frame_size - macoff;
616 		if ((int)snaplen < 0)
617 			snaplen = 0;
618 	}
619 
620 	spin_lock(&sk->sk_receive_queue.lock);
621 	h = packet_lookup_frame(po, po->head);
622 
623 	if (h->tp_status)
624 		goto ring_is_full;
625 	po->head = po->head != po->frame_max ? po->head+1 : 0;
626 	po->stats.tp_packets++;
627 	if (copy_skb) {
628 		status |= TP_STATUS_COPY;
629 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
630 	}
631 	if (!po->stats.tp_drops)
632 		status &= ~TP_STATUS_LOSING;
633 	spin_unlock(&sk->sk_receive_queue.lock);
634 
635 	skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
636 
637 	h->tp_len = skb->len;
638 	h->tp_snaplen = snaplen;
639 	h->tp_mac = macoff;
640 	h->tp_net = netoff;
641 	if (skb->tstamp.tv64)
642 		tv = ktime_to_timeval(skb->tstamp);
643 	else
644 		do_gettimeofday(&tv);
645 	h->tp_sec = tv.tv_sec;
646 	h->tp_usec = tv.tv_usec;
647 
648 	sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
649 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
650 	sll->sll_family = AF_PACKET;
651 	sll->sll_hatype = dev->type;
652 	sll->sll_protocol = skb->protocol;
653 	sll->sll_pkttype = skb->pkt_type;
654 	if (unlikely(po->origdev))
655 		sll->sll_ifindex = orig_dev->ifindex;
656 	else
657 		sll->sll_ifindex = dev->ifindex;
658 
659 	h->tp_status = status;
660 	smp_mb();
661 
662 	{
663 		struct page *p_start, *p_end;
664 		u8 *h_end = (u8 *)h + macoff + snaplen - 1;
665 
666 		p_start = virt_to_page(h);
667 		p_end = virt_to_page(h_end);
668 		while (p_start <= p_end) {
669 			flush_dcache_page(p_start);
670 			p_start++;
671 		}
672 	}
673 
674 	sk->sk_data_ready(sk, 0);
675 
676 drop_n_restore:
677 	if (skb_head != skb->data && skb_shared(skb)) {
678 		skb->data = skb_head;
679 		skb->len = skb_len;
680 	}
681 drop:
682 	kfree_skb(skb);
683 	return 0;
684 
685 ring_is_full:
686 	po->stats.tp_drops++;
687 	spin_unlock(&sk->sk_receive_queue.lock);
688 
689 	sk->sk_data_ready(sk, 0);
690 	if (copy_skb)
691 		kfree_skb(copy_skb);
692 	goto drop_n_restore;
693 }
694 
695 #endif
696 
697 
698 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
699 			  struct msghdr *msg, size_t len)
700 {
701 	struct sock *sk = sock->sk;
702 	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
703 	struct sk_buff *skb;
704 	struct net_device *dev;
705 	__be16 proto;
706 	unsigned char *addr;
707 	int ifindex, err, reserve = 0;
708 
709 	/*
710 	 *	Get and verify the address.
711 	 */
712 
713 	if (saddr == NULL) {
714 		struct packet_sock *po = pkt_sk(sk);
715 
716 		ifindex	= po->ifindex;
717 		proto	= po->num;
718 		addr	= NULL;
719 	} else {
720 		err = -EINVAL;
721 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
722 			goto out;
723 		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
724 			goto out;
725 		ifindex	= saddr->sll_ifindex;
726 		proto	= saddr->sll_protocol;
727 		addr	= saddr->sll_addr;
728 	}
729 
730 
731 	dev = dev_get_by_index(sock_net(sk), ifindex);
732 	err = -ENXIO;
733 	if (dev == NULL)
734 		goto out_unlock;
735 	if (sock->type == SOCK_RAW)
736 		reserve = dev->hard_header_len;
737 
738 	err = -ENETDOWN;
739 	if (!(dev->flags & IFF_UP))
740 		goto out_unlock;
741 
742 	err = -EMSGSIZE;
743 	if (len > dev->mtu+reserve)
744 		goto out_unlock;
745 
746 	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
747 				msg->msg_flags & MSG_DONTWAIT, &err);
748 	if (skb==NULL)
749 		goto out_unlock;
750 
751 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
752 	skb_reset_network_header(skb);
753 
754 	err = -EINVAL;
755 	if (sock->type == SOCK_DGRAM &&
756 	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
757 		goto out_free;
758 
759 	/* Returns -EFAULT on error */
760 	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
761 	if (err)
762 		goto out_free;
763 
764 	skb->protocol = proto;
765 	skb->dev = dev;
766 	skb->priority = sk->sk_priority;
767 
768 	/*
769 	 *	Now send it
770 	 */
771 
772 	err = dev_queue_xmit(skb);
773 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
774 		goto out_unlock;
775 
776 	dev_put(dev);
777 
778 	return(len);
779 
780 out_free:
781 	kfree_skb(skb);
782 out_unlock:
783 	if (dev)
784 		dev_put(dev);
785 out:
786 	return err;
787 }
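/*
 * A minimal userspace sketch of the sendmsg() path above (illustrative
 * only; "eth0" and the destination address are made up). With
 * SOCK_DGRAM the kernel builds the ll header via dev_hard_header();
 * with SOCK_RAW the caller must supply the complete frame itself.
 */
#if 0	/* illustrative userspace sketch, not kernel code */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <arpa/inet.h>

static int send_dgram(const void *payload, size_t len)
{
	int fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
	unsigned char dst[ETH_ALEN] = { 0x02, 0, 0, 0, 0, 0x01 }; /* made up */
	struct sockaddr_ll to;

	memset(&to, 0, sizeof(to));
	to.sll_family	= AF_PACKET;
	to.sll_protocol	= htons(ETH_P_IP);
	to.sll_ifindex	= if_nametoindex("eth0");	/* assumed name */
	to.sll_halen	= ETH_ALEN;
	memcpy(to.sll_addr, dst, ETH_ALEN);

	return sendto(fd, payload, len, 0,
		      (struct sockaddr *)&to, sizeof(to));
}
#endif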
788 
789 /*
790  *	Close a PACKET socket. This is fairly simple. We immediately go
791  *	to 'closed' state and remove our protocol entry in the device list.
792  */
793 
794 static int packet_release(struct socket *sock)
795 {
796 	struct sock *sk = sock->sk;
797 	struct packet_sock *po;
798 	struct net *net;
799 
800 	if (!sk)
801 		return 0;
802 
803 	net = sock_net(sk);
804 	po = pkt_sk(sk);
805 
806 	write_lock_bh(&net->packet.sklist_lock);
807 	sk_del_node_init(sk);
808 	write_unlock_bh(&net->packet.sklist_lock);
809 
810 	/*
811 	 *	Unhook packet receive handler.
812 	 */
813 
814 	if (po->running) {
815 		/*
816 		 *	Remove the protocol hook
817 		 */
818 		dev_remove_pack(&po->prot_hook);
819 		po->running = 0;
820 		po->num = 0;
821 		__sock_put(sk);
822 	}
823 
824 	packet_flush_mclist(sk);
825 
826 #ifdef CONFIG_PACKET_MMAP
827 	if (po->pg_vec) {
828 		struct tpacket_req req;
829 		memset(&req, 0, sizeof(req));
830 		packet_set_ring(sk, &req, 1);
831 	}
832 #endif
833 
834 	/*
835 	 *	Now the socket is dead. No more input will appear.
836 	 */
837 
838 	sock_orphan(sk);
839 	sock->sk = NULL;
840 
841 	/* Purge queues */
842 
843 	skb_queue_purge(&sk->sk_receive_queue);
844 	sk_refcnt_debug_release(sk);
845 
846 	sock_put(sk);
847 	return 0;
848 }
849 
850 /*
851  *	Attach a packet hook.
852  */
853 
854 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
855 {
856 	struct packet_sock *po = pkt_sk(sk);
857 	/*
858 	 *	Detach an existing hook if present.
859 	 */
860 
861 	lock_sock(sk);
862 
863 	spin_lock(&po->bind_lock);
864 	if (po->running) {
865 		__sock_put(sk);
866 		po->running = 0;
867 		po->num = 0;
868 		spin_unlock(&po->bind_lock);
869 		dev_remove_pack(&po->prot_hook);
870 		spin_lock(&po->bind_lock);
871 	}
872 
873 	po->num = protocol;
874 	po->prot_hook.type = protocol;
875 	po->prot_hook.dev = dev;
876 
877 	po->ifindex = dev ? dev->ifindex : 0;
878 
879 	if (protocol == 0)
880 		goto out_unlock;
881 
882 	if (!dev || (dev->flags & IFF_UP)) {
883 		dev_add_pack(&po->prot_hook);
884 		sock_hold(sk);
885 		po->running = 1;
886 	} else {
887 		sk->sk_err = ENETDOWN;
888 		if (!sock_flag(sk, SOCK_DEAD))
889 			sk->sk_error_report(sk);
890 	}
891 
892 out_unlock:
893 	spin_unlock(&po->bind_lock);
894 	release_sock(sk);
895 	return 0;
896 }
897 
898 /*
899  *	Bind a packet socket to a device
900  */
901 
902 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
903 {
904 	struct sock *sk=sock->sk;
905 	char name[15];
906 	struct net_device *dev;
907 	int err = -ENODEV;
908 
909 	/*
910 	 *	Check legality
911 	 */
912 
913 	if (addr_len != sizeof(struct sockaddr))
914 		return -EINVAL;
915 	strlcpy(name,uaddr->sa_data,sizeof(name));
916 
917 	dev = dev_get_by_name(sock_net(sk), name);
918 	if (dev) {
919 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
920 		dev_put(dev);
921 	}
922 	return err;
923 }
924 
925 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
926 {
927 	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
928 	struct sock *sk=sock->sk;
929 	struct net_device *dev = NULL;
930 	int err;
931 
932 
933 	/*
934 	 *	Check legality
935 	 */
936 
937 	if (addr_len < sizeof(struct sockaddr_ll))
938 		return -EINVAL;
939 	if (sll->sll_family != AF_PACKET)
940 		return -EINVAL;
941 
942 	if (sll->sll_ifindex) {
943 		err = -ENODEV;
944 		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
945 		if (dev == NULL)
946 			goto out;
947 	}
948 	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
949 	if (dev)
950 		dev_put(dev);
951 
952 out:
953 	return err;
954 }
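/*
 * A hedged userspace sketch of the bind() path above (illustrative
 * only; "eth0" is an assumed interface name). Only sll_family,
 * sll_protocol and sll_ifindex are consumed by packet_bind().
 */
#if 0	/* illustrative userspace sketch, not kernel code */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <arpa/inet.h>

static int bind_to_eth0(int fd)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family	 = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex	 = if_nametoindex("eth0");	/* assumed name */

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}
#endif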
955 
956 static struct proto packet_proto = {
957 	.name	  = "PACKET",
958 	.owner	  = THIS_MODULE,
959 	.obj_size = sizeof(struct packet_sock),
960 };
961 
962 /*
963  *	Create a packet socket (SOCK_PACKET, SOCK_RAW or SOCK_DGRAM).
964  */
965 
966 static int packet_create(struct net *net, struct socket *sock, int protocol)
967 {
968 	struct sock *sk;
969 	struct packet_sock *po;
970 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
971 	int err;
972 
973 	if (!capable(CAP_NET_RAW))
974 		return -EPERM;
975 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
976 	    sock->type != SOCK_PACKET)
977 		return -ESOCKTNOSUPPORT;
978 
979 	sock->state = SS_UNCONNECTED;
980 
981 	err = -ENOBUFS;
982 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
983 	if (sk == NULL)
984 		goto out;
985 
986 	sock->ops = &packet_ops;
987 	if (sock->type == SOCK_PACKET)
988 		sock->ops = &packet_ops_spkt;
989 
990 	sock_init_data(sock, sk);
991 
992 	po = pkt_sk(sk);
993 	sk->sk_family = PF_PACKET;
994 	po->num = proto;
995 
996 	sk->sk_destruct = packet_sock_destruct;
997 	sk_refcnt_debug_inc(sk);
998 
999 	/*
1000 	 *	Attach a protocol block
1001 	 */
1002 
1003 	spin_lock_init(&po->bind_lock);
1004 	po->prot_hook.func = packet_rcv;
1005 
1006 	if (sock->type == SOCK_PACKET)
1007 		po->prot_hook.func = packet_rcv_spkt;
1008 
1009 	po->prot_hook.af_packet_priv = sk;
1010 
1011 	if (proto) {
1012 		po->prot_hook.type = proto;
1013 		dev_add_pack(&po->prot_hook);
1014 		sock_hold(sk);
1015 		po->running = 1;
1016 	}
1017 
1018 	write_lock_bh(&net->packet.sklist_lock);
1019 	sk_add_node(sk, &net->packet.sklist);
1020 	write_unlock_bh(&net->packet.sklist_lock);
1021 	return(0);
1022 out:
1023 	return err;
1024 }
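/* The protocol argument to socket(2) is carried here in network byte
 * order ("weird, but documented" above), so a userspace caller writes,
 * for example:
 *
 *	int fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
 *
 * Passing 0 creates an unbound socket that receives nothing until a
 * protocol is bound.
 */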
1025 
1026 /*
1027  *	Pull a packet from our receive queue and hand it to the user.
1028  *	If necessary we block.
1029  */
1030 
1031 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1032 			  struct msghdr *msg, size_t len, int flags)
1033 {
1034 	struct sock *sk = sock->sk;
1035 	struct sk_buff *skb;
1036 	int copied, err;
1037 	struct sockaddr_ll *sll;
1038 
1039 	err = -EINVAL;
1040 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1041 		goto out;
1042 
1043 #if 0
1044 	/* What error should we return now? EUNATTACH? */
1045 	if (pkt_sk(sk)->ifindex < 0)
1046 		return -ENODEV;
1047 #endif
1048 
1049 	/*
1050 	 *	Call the generic datagram receiver. This handles all sorts
1051 	 *	of horrible races and re-entrancy so we can forget about it
1052 	 *	in the protocol layers.
1053 	 *
1054 	 *	Now it will return ENETDOWN if the device has just gone down,
1055 	 *	but then it will block.
1056 	 */
1057 
1058 	skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1059 
1060 	/*
1061 	 *	An error occurred so return it. Because skb_recv_datagram()
1062 	 *	handles the blocking, we don't need to see or worry about
1063 	 *	blocking retries.
1064 	 */
1065 
1066 	if (skb == NULL)
1067 		goto out;
1068 
1069 	/*
1070 	 *	If the address length field is there to be filled in, we fill
1071 	 *	it in now.
1072 	 */
1073 
1074 	sll = &PACKET_SKB_CB(skb)->sa.ll;
1075 	if (sock->type == SOCK_PACKET)
1076 		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1077 	else
1078 		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1079 
1080 	/*
1081 	 *	You lose any data beyond the buffer you gave. If this worries
1082 	 *	a user program, it can ask the device for its MTU anyway.
1083 	 */
1084 
1085 	copied = skb->len;
1086 	if (copied > len)
1087 	{
1088 		copied=len;
1089 		msg->msg_flags|=MSG_TRUNC;
1090 	}
1091 
1092 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1093 	if (err)
1094 		goto out_free;
1095 
1096 	sock_recv_timestamp(msg, sk, skb);
1097 
1098 	if (msg->msg_name)
1099 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1100 		       msg->msg_namelen);
1101 
1102 	if (pkt_sk(sk)->auxdata) {
1103 		struct tpacket_auxdata aux;
1104 
1105 		aux.tp_status = TP_STATUS_USER;
1106 		if (skb->ip_summed == CHECKSUM_PARTIAL)
1107 			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1108 		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1109 		aux.tp_snaplen = skb->len;
1110 		aux.tp_mac = 0;
1111 		aux.tp_net = skb_network_offset(skb);
1112 
1113 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1114 	}
1115 
1116 	/*
1117 	 *	Free or return the buffer as appropriate. Again this
1118 	 *	hides all the races and re-entrancy issues from us.
1119 	 */
1120 	err = (flags&MSG_TRUNC) ? skb->len : copied;
1121 
1122 out_free:
1123 	skb_free_datagram(sk, skb);
1124 out:
1125 	return err;
1126 }
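/*
 * When PACKET_AUXDATA is enabled, the original (pre-truncation) length
 * and offsets computed above arrive as a control message. A hedged
 * userspace sketch of reading it (illustrative only):
 */
#if 0	/* illustrative userspace sketch, not kernel code */
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>

static void read_auxdata(int fd)
{
	unsigned char buf[2048];
	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= cbuf,
		.msg_controllen	= sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	int one = 1;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
	if (recvmsg(fd, &msg, 0) < 0)
		return;
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata *aux =
				(struct tpacket_auxdata *)CMSG_DATA(cmsg);

			printf("wire len %u, captured %u\n",
			       aux->tp_len, aux->tp_snaplen);
		}
}
#endif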
1127 
1128 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1129 			       int *uaddr_len, int peer)
1130 {
1131 	struct net_device *dev;
1132 	struct sock *sk	= sock->sk;
1133 
1134 	if (peer)
1135 		return -EOPNOTSUPP;
1136 
1137 	uaddr->sa_family = AF_PACKET;
1138 	dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1139 	if (dev) {
1140 		strlcpy(uaddr->sa_data, dev->name, 15);
1141 		dev_put(dev);
1142 	} else
1143 		memset(uaddr->sa_data, 0, 14);
1144 	*uaddr_len = sizeof(*uaddr);
1145 
1146 	return 0;
1147 }
1148 
1149 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1150 			  int *uaddr_len, int peer)
1151 {
1152 	struct net_device *dev;
1153 	struct sock *sk = sock->sk;
1154 	struct packet_sock *po = pkt_sk(sk);
1155 	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1156 
1157 	if (peer)
1158 		return -EOPNOTSUPP;
1159 
1160 	sll->sll_family = AF_PACKET;
1161 	sll->sll_ifindex = po->ifindex;
1162 	sll->sll_protocol = po->num;
1163 	dev = dev_get_by_index(sock_net(sk), po->ifindex);
1164 	if (dev) {
1165 		sll->sll_hatype = dev->type;
1166 		sll->sll_halen = dev->addr_len;
1167 		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1168 		dev_put(dev);
1169 	} else {
1170 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1171 		sll->sll_halen = 0;
1172 	}
1173 	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1174 
1175 	return 0;
1176 }
1177 
1178 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1179 {
1180 	switch (i->type) {
1181 	case PACKET_MR_MULTICAST:
1182 		if (what > 0)
1183 			dev_mc_add(dev, i->addr, i->alen, 0);
1184 		else
1185 			dev_mc_delete(dev, i->addr, i->alen, 0);
1186 		break;
1187 	case PACKET_MR_PROMISC:
1188 		dev_set_promiscuity(dev, what);
1189 		break;
1190 	case PACKET_MR_ALLMULTI:
1191 		dev_set_allmulti(dev, what);
1192 		break;
1193 	default:;
1194 	}
1195 }
1196 
1197 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1198 {
1199 	for ( ; i; i=i->next) {
1200 		if (i->ifindex == dev->ifindex)
1201 			packet_dev_mc(dev, i, what);
1202 	}
1203 }
1204 
1205 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1206 {
1207 	struct packet_sock *po = pkt_sk(sk);
1208 	struct packet_mclist *ml, *i;
1209 	struct net_device *dev;
1210 	int err;
1211 
1212 	rtnl_lock();
1213 
1214 	err = -ENODEV;
1215 	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1216 	if (!dev)
1217 		goto done;
1218 
1219 	err = -EINVAL;
1220 	if (mreq->mr_alen > dev->addr_len)
1221 		goto done;
1222 
1223 	err = -ENOBUFS;
1224 	i = kmalloc(sizeof(*i), GFP_KERNEL);
1225 	if (i == NULL)
1226 		goto done;
1227 
1228 	err = 0;
1229 	for (ml = po->mclist; ml; ml = ml->next) {
1230 		if (ml->ifindex == mreq->mr_ifindex &&
1231 		    ml->type == mreq->mr_type &&
1232 		    ml->alen == mreq->mr_alen &&
1233 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1234 			ml->count++;
1235 			/* Free the new element ... */
1236 			kfree(i);
1237 			goto done;
1238 		}
1239 	}
1240 
1241 	i->type = mreq->mr_type;
1242 	i->ifindex = mreq->mr_ifindex;
1243 	i->alen = mreq->mr_alen;
1244 	memcpy(i->addr, mreq->mr_address, i->alen);
1245 	i->count = 1;
1246 	i->next = po->mclist;
1247 	po->mclist = i;
1248 	packet_dev_mc(dev, i, +1);
1249 
1250 done:
1251 	rtnl_unlock();
1252 	return err;
1253 }
1254 
1255 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1256 {
1257 	struct packet_mclist *ml, **mlp;
1258 
1259 	rtnl_lock();
1260 
1261 	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1262 		if (ml->ifindex == mreq->mr_ifindex &&
1263 		    ml->type == mreq->mr_type &&
1264 		    ml->alen == mreq->mr_alen &&
1265 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1266 			if (--ml->count == 0) {
1267 				struct net_device *dev;
1268 				*mlp = ml->next;
1269 				dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1270 				if (dev) {
1271 					packet_dev_mc(dev, ml, -1);
1272 					dev_put(dev);
1273 				}
1274 				kfree(ml);
1275 			}
1276 			rtnl_unlock();
1277 			return 0;
1278 		}
1279 	}
1280 	rtnl_unlock();
1281 	return -EADDRNOTAVAIL;
1282 }
1283 
1284 static void packet_flush_mclist(struct sock *sk)
1285 {
1286 	struct packet_sock *po = pkt_sk(sk);
1287 	struct packet_mclist *ml;
1288 
1289 	if (!po->mclist)
1290 		return;
1291 
1292 	rtnl_lock();
1293 	while ((ml = po->mclist) != NULL) {
1294 		struct net_device *dev;
1295 
1296 		po->mclist = ml->next;
1297 		if ((dev = dev_get_by_index(sock_net(sk), ml->ifindex)) != NULL) {
1298 			packet_dev_mc(dev, ml, -1);
1299 			dev_put(dev);
1300 		}
1301 		kfree(ml);
1302 	}
1303 	rtnl_unlock();
1304 }
1305 
1306 static int
1307 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1308 {
1309 	struct sock *sk = sock->sk;
1310 	struct packet_sock *po = pkt_sk(sk);
1311 	int ret;
1312 
1313 	if (level != SOL_PACKET)
1314 		return -ENOPROTOOPT;
1315 
1316 	switch(optname)	{
1317 	case PACKET_ADD_MEMBERSHIP:
1318 	case PACKET_DROP_MEMBERSHIP:
1319 	{
1320 		struct packet_mreq_max mreq;
1321 		int len = optlen;
1322 		memset(&mreq, 0, sizeof(mreq));
1323 		if (len < sizeof(struct packet_mreq))
1324 			return -EINVAL;
1325 		if (len > sizeof(mreq))
1326 			len = sizeof(mreq);
1327 		if (copy_from_user(&mreq,optval,len))
1328 			return -EFAULT;
1329 		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1330 			return -EINVAL;
1331 		if (optname == PACKET_ADD_MEMBERSHIP)
1332 			ret = packet_mc_add(sk, &mreq);
1333 		else
1334 			ret = packet_mc_drop(sk, &mreq);
1335 		return ret;
1336 	}
1337 
1338 #ifdef CONFIG_PACKET_MMAP
1339 	case PACKET_RX_RING:
1340 	{
1341 		struct tpacket_req req;
1342 
1343 		if (optlen<sizeof(req))
1344 			return -EINVAL;
1345 		if (copy_from_user(&req,optval,sizeof(req)))
1346 			return -EFAULT;
1347 		return packet_set_ring(sk, &req, 0);
1348 	}
1349 	case PACKET_COPY_THRESH:
1350 	{
1351 		int val;
1352 
1353 		if (optlen!=sizeof(val))
1354 			return -EINVAL;
1355 		if (copy_from_user(&val,optval,sizeof(val)))
1356 			return -EFAULT;
1357 
1358 		pkt_sk(sk)->copy_thresh = val;
1359 		return 0;
1360 	}
1361 #endif
1362 	case PACKET_AUXDATA:
1363 	{
1364 		int val;
1365 
1366 		if (optlen < sizeof(val))
1367 			return -EINVAL;
1368 		if (copy_from_user(&val, optval, sizeof(val)))
1369 			return -EFAULT;
1370 
1371 		po->auxdata = !!val;
1372 		return 0;
1373 	}
1374 	case PACKET_ORIGDEV:
1375 	{
1376 		int val;
1377 
1378 		if (optlen < sizeof(val))
1379 			return -EINVAL;
1380 		if (copy_from_user(&val, optval, sizeof(val)))
1381 			return -EFAULT;
1382 
1383 		po->origdev = !!val;
1384 		return 0;
1385 	}
1386 	default:
1387 		return -ENOPROTOOPT;
1388 	}
1389 }
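/*
 * PACKET_ADD_MEMBERSHIP above accepts the short struct packet_mreq;
 * the kernel-side packet_mreq_max merely allows longer hardware
 * addresses. A hedged sketch of enabling promiscuous mode on one
 * interface (illustrative only; "eth0" is an assumed name):
 */
#if 0	/* illustrative userspace sketch, not kernel code */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <net/if.h>

static int enable_promisc(int fd)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex("eth0");	/* assumed name */
	mreq.mr_type	= PACKET_MR_PROMISC;
	/* mr_alen/mr_address are unused for PACKET_MR_PROMISC */

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}
#endif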
1390 
1391 static int packet_getsockopt(struct socket *sock, int level, int optname,
1392 			     char __user *optval, int __user *optlen)
1393 {
1394 	int len;
1395 	int val;
1396 	struct sock *sk = sock->sk;
1397 	struct packet_sock *po = pkt_sk(sk);
1398 	void *data;
1399 	struct tpacket_stats st;
1400 
1401 	if (level != SOL_PACKET)
1402 		return -ENOPROTOOPT;
1403 
1404 	if (get_user(len, optlen))
1405 		return -EFAULT;
1406 
1407 	if (len < 0)
1408 		return -EINVAL;
1409 
1410 	switch(optname)	{
1411 	case PACKET_STATISTICS:
1412 		if (len > sizeof(struct tpacket_stats))
1413 			len = sizeof(struct tpacket_stats);
1414 		spin_lock_bh(&sk->sk_receive_queue.lock);
1415 		st = po->stats;
1416 		memset(&po->stats, 0, sizeof(st));
1417 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1418 		st.tp_packets += st.tp_drops;
1419 
1420 		data = &st;
1421 		break;
1422 	case PACKET_AUXDATA:
1423 		if (len > sizeof(int))
1424 			len = sizeof(int);
1425 		val = po->auxdata;
1426 
1427 		data = &val;
1428 		break;
1429 	case PACKET_ORIGDEV:
1430 		if (len > sizeof(int))
1431 			len = sizeof(int);
1432 		val = po->origdev;
1433 
1434 		data = &val;
1435 		break;
1436 	default:
1437 		return -ENOPROTOOPT;
1438 	}
1439 
1440 	if (put_user(len, optlen))
1441 		return -EFAULT;
1442 	if (copy_to_user(optval, data, len))
1443 		return -EFAULT;
1444 	return 0;
1445 }
1446 
1447 
1448 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1449 {
1450 	struct sock *sk;
1451 	struct hlist_node *node;
1452 	struct net_device *dev = data;
1453 	struct net *net = dev_net(dev);
1454 
1455 	read_lock(&net->packet.sklist_lock);
1456 	sk_for_each(sk, node, &net->packet.sklist) {
1457 		struct packet_sock *po = pkt_sk(sk);
1458 
1459 		switch (msg) {
1460 		case NETDEV_UNREGISTER:
1461 			if (po->mclist)
1462 				packet_dev_mclist(dev, po->mclist, -1);
1463 			/* fallthrough */
1464 
1465 		case NETDEV_DOWN:
1466 			if (dev->ifindex == po->ifindex) {
1467 				spin_lock(&po->bind_lock);
1468 				if (po->running) {
1469 					__dev_remove_pack(&po->prot_hook);
1470 					__sock_put(sk);
1471 					po->running = 0;
1472 					sk->sk_err = ENETDOWN;
1473 					if (!sock_flag(sk, SOCK_DEAD))
1474 						sk->sk_error_report(sk);
1475 				}
1476 				if (msg == NETDEV_UNREGISTER) {
1477 					po->ifindex = -1;
1478 					po->prot_hook.dev = NULL;
1479 				}
1480 				spin_unlock(&po->bind_lock);
1481 			}
1482 			break;
1483 		case NETDEV_UP:
1484 			spin_lock(&po->bind_lock);
1485 			if (dev->ifindex == po->ifindex && po->num &&
1486 			    !po->running) {
1487 				dev_add_pack(&po->prot_hook);
1488 				sock_hold(sk);
1489 				po->running = 1;
1490 			}
1491 			spin_unlock(&po->bind_lock);
1492 			break;
1493 		}
1494 	}
1495 	read_unlock(&net->packet.sklist_lock);
1496 	return NOTIFY_DONE;
1497 }
1498 
1499 
1500 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1501 			unsigned long arg)
1502 {
1503 	struct sock *sk = sock->sk;
1504 
1505 	switch(cmd) {
1506 		case SIOCOUTQ:
1507 		{
1508 			int amount = atomic_read(&sk->sk_wmem_alloc);
1509 			return put_user(amount, (int __user *)arg);
1510 		}
1511 		case SIOCINQ:
1512 		{
1513 			struct sk_buff *skb;
1514 			int amount = 0;
1515 
1516 			spin_lock_bh(&sk->sk_receive_queue.lock);
1517 			skb = skb_peek(&sk->sk_receive_queue);
1518 			if (skb)
1519 				amount = skb->len;
1520 			spin_unlock_bh(&sk->sk_receive_queue.lock);
1521 			return put_user(amount, (int __user *)arg);
1522 		}
1523 		case SIOCGSTAMP:
1524 			return sock_get_timestamp(sk, (struct timeval __user *)arg);
1525 		case SIOCGSTAMPNS:
1526 			return sock_get_timestampns(sk, (struct timespec __user *)arg);
1527 
1528 #ifdef CONFIG_INET
1529 		case SIOCADDRT:
1530 		case SIOCDELRT:
1531 		case SIOCDARP:
1532 		case SIOCGARP:
1533 		case SIOCSARP:
1534 		case SIOCGIFADDR:
1535 		case SIOCSIFADDR:
1536 		case SIOCGIFBRDADDR:
1537 		case SIOCSIFBRDADDR:
1538 		case SIOCGIFNETMASK:
1539 		case SIOCSIFNETMASK:
1540 		case SIOCGIFDSTADDR:
1541 		case SIOCSIFDSTADDR:
1542 		case SIOCSIFFLAGS:
1543 			if (sock_net(sk) != &init_net)
1544 				return -ENOIOCTLCMD;
1545 			return inet_dgram_ops.ioctl(sock, cmd, arg);
1546 #endif
1547 
1548 		default:
1549 			return -ENOIOCTLCMD;
1550 	}
1551 	return 0;
1552 }
1553 
1554 #ifndef CONFIG_PACKET_MMAP
1555 #define packet_mmap sock_no_mmap
1556 #define packet_poll datagram_poll
1557 #else
1558 
1559 static unsigned int packet_poll(struct file * file, struct socket *sock,
1560 				poll_table *wait)
1561 {
1562 	struct sock *sk = sock->sk;
1563 	struct packet_sock *po = pkt_sk(sk);
1564 	unsigned int mask = datagram_poll(file, sock, wait);
1565 
1566 	spin_lock_bh(&sk->sk_receive_queue.lock);
1567 	if (po->pg_vec) {
1568 		unsigned last = po->head ? po->head-1 : po->frame_max;
1569 		struct tpacket_hdr *h;
1570 
1571 		h = packet_lookup_frame(po, last);
1572 
1573 		if (h->tp_status)
1574 			mask |= POLLIN | POLLRDNORM;
1575 	}
1576 	spin_unlock_bh(&sk->sk_receive_queue.lock);
1577 	return mask;
1578 }
1579 
1580 
1581 /* Dirty? Well, I still have not learned a better way to account
1582  * for user mmaps.
1583  */
1584 
1585 static void packet_mm_open(struct vm_area_struct *vma)
1586 {
1587 	struct file *file = vma->vm_file;
1588 	struct socket * sock = file->private_data;
1589 	struct sock *sk = sock->sk;
1590 
1591 	if (sk)
1592 		atomic_inc(&pkt_sk(sk)->mapped);
1593 }
1594 
1595 static void packet_mm_close(struct vm_area_struct *vma)
1596 {
1597 	struct file *file = vma->vm_file;
1598 	struct socket * sock = file->private_data;
1599 	struct sock *sk = sock->sk;
1600 
1601 	if (sk)
1602 		atomic_dec(&pkt_sk(sk)->mapped);
1603 }
1604 
1605 static struct vm_operations_struct packet_mmap_ops = {
1606 	.open =	packet_mm_open,
1607 	.close =	packet_mm_close,
1608 };
1609 
1610 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
1611 {
1612 	int i;
1613 
1614 	for (i = 0; i < len; i++) {
1615 		if (likely(pg_vec[i]))
1616 			free_pages((unsigned long) pg_vec[i], order);
1617 	}
1618 	kfree(pg_vec);
1619 }
1620 
1621 static inline char *alloc_one_pg_vec_page(unsigned long order)
1622 {
1623 	return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1624 					 order);
1625 }
1626 
1627 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1628 {
1629 	unsigned int block_nr = req->tp_block_nr;
1630 	char **pg_vec;
1631 	int i;
1632 
1633 	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1634 	if (unlikely(!pg_vec))
1635 		goto out;
1636 
1637 	for (i = 0; i < block_nr; i++) {
1638 		pg_vec[i] = alloc_one_pg_vec_page(order);
1639 		if (unlikely(!pg_vec[i]))
1640 			goto out_free_pgvec;
1641 	}
1642 
1643 out:
1644 	return pg_vec;
1645 
1646 out_free_pgvec:
1647 	free_pg_vec(pg_vec, order, block_nr);
1648 	pg_vec = NULL;
1649 	goto out;
1650 }
1651 
1652 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1653 {
1654 	char **pg_vec = NULL;
1655 	struct packet_sock *po = pkt_sk(sk);
1656 	int was_running, order = 0;
1657 	__be16 num;
1658 	int err = 0;
1659 
1660 	if (req->tp_block_nr) {
1661 		int i;
1662 
1663 		/* Sanity tests and some calculations */
1664 
1665 		if (unlikely(po->pg_vec))
1666 			return -EBUSY;
1667 
1668 		if (unlikely((int)req->tp_block_size <= 0))
1669 			return -EINVAL;
1670 		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1671 			return -EINVAL;
1672 		if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
1673 			return -EINVAL;
1674 		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1675 			return -EINVAL;
1676 
1677 		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1678 		if (unlikely(po->frames_per_block <= 0))
1679 			return -EINVAL;
1680 		if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1681 			     req->tp_frame_nr))
1682 			return -EINVAL;
1683 
1684 		err = -ENOMEM;
1685 		order = get_order(req->tp_block_size);
1686 		pg_vec = alloc_pg_vec(req, order);
1687 		if (unlikely(!pg_vec))
1688 			goto out;
1689 
1690 		for (i = 0; i < req->tp_block_nr; i++) {
1691 			char *ptr = pg_vec[i];
1692 			struct tpacket_hdr *header;
1693 			int k;
1694 
1695 			for (k = 0; k < po->frames_per_block; k++) {
1696 				header = (struct tpacket_hdr *) ptr;
1697 				header->tp_status = TP_STATUS_KERNEL;
1698 				ptr += req->tp_frame_size;
1699 			}
1700 		}
1701 		/* Done */
1702 	} else {
1703 		if (unlikely(req->tp_frame_nr))
1704 			return -EINVAL;
1705 	}
1706 
1707 	lock_sock(sk);
1708 
1709 	/* Detach socket from network */
1710 	spin_lock(&po->bind_lock);
1711 	was_running = po->running;
1712 	num = po->num;
1713 	if (was_running) {
1714 		__dev_remove_pack(&po->prot_hook);
1715 		po->num = 0;
1716 		po->running = 0;
1717 		__sock_put(sk);
1718 	}
1719 	spin_unlock(&po->bind_lock);
1720 
1721 	synchronize_net();
1722 
1723 	err = -EBUSY;
1724 	if (closing || atomic_read(&po->mapped) == 0) {
1725 		err = 0;
1726 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1727 
1728 		spin_lock_bh(&sk->sk_receive_queue.lock);
1729 		pg_vec = XC(po->pg_vec, pg_vec);
1730 		po->frame_max = (req->tp_frame_nr - 1);
1731 		po->head = 0;
1732 		po->frame_size = req->tp_frame_size;
1733 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1734 
1735 		order = XC(po->pg_vec_order, order);
1736 		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1737 
1738 		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1739 		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1740 		skb_queue_purge(&sk->sk_receive_queue);
1741 #undef XC
1742 		if (atomic_read(&po->mapped))
1743 			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1744 	}
1745 
1746 	spin_lock(&po->bind_lock);
1747 	if (was_running && !po->running) {
1748 		sock_hold(sk);
1749 		po->running = 1;
1750 		po->num = num;
1751 		dev_add_pack(&po->prot_hook);
1752 	}
1753 	spin_unlock(&po->bind_lock);
1754 
1755 	release_sock(sk);
1756 
1757 	if (pg_vec)
1758 		free_pg_vec(pg_vec, order, req->tp_block_nr);
1759 out:
1760 	return err;
1761 }
1762 
1763 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1764 {
1765 	struct sock *sk = sock->sk;
1766 	struct packet_sock *po = pkt_sk(sk);
1767 	unsigned long size;
1768 	unsigned long start;
1769 	int err = -EINVAL;
1770 	int i;
1771 
1772 	if (vma->vm_pgoff)
1773 		return -EINVAL;
1774 
1775 	size = vma->vm_end - vma->vm_start;
1776 
1777 	lock_sock(sk);
1778 	if (po->pg_vec == NULL)
1779 		goto out;
1780 	if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1781 		goto out;
1782 
1783 	start = vma->vm_start;
1784 	for (i = 0; i < po->pg_vec_len; i++) {
1785 		struct page *page = virt_to_page(po->pg_vec[i]);
1786 		int pg_num;
1787 
1788 		for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1789 			err = vm_insert_page(vma, start, page);
1790 			if (unlikely(err))
1791 				goto out;
1792 			start += PAGE_SIZE;
1793 		}
1794 	}
1795 	atomic_inc(&po->mapped);
1796 	vma->vm_ops = &packet_mmap_ops;
1797 	err = 0;
1798 
1799 out:
1800 	release_sock(sk);
1801 	return err;
1802 }
1803 #endif
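/*
 * Putting packet_set_ring(), packet_mmap() and tpacket_rcv() together:
 * a userspace consumer maps the ring once and then flips tp_status per
 * frame. A hedged sketch under the sizing assumptions shown
 * (illustrative only; error handling omitted):
 */
#if 0	/* illustrative userspace sketch, not kernel code */
#include <poll.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	struct tpacket_req req = {
		.tp_block_size	= 4096,	/* must be a multiple of PAGE_SIZE */
		.tp_block_nr	= 64,
		.tp_frame_size	= 2048,	/* TPACKET_ALIGNMENT-aligned */
		.tp_frame_nr	= 64 * (4096 / 2048),
	};
	unsigned int frame = 0;
	unsigned char *ring;

	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	for (;;) {
		struct tpacket_hdr *h = (struct tpacket_hdr *)
			(ring + (size_t)frame * req.tp_frame_size);

		if (!(h->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);	/* cf. packet_poll() */
			continue;
		}
		/* the captured frame starts tp_mac bytes into the slot */
		h->tp_status = TP_STATUS_KERNEL;	/* hand slot back */
		frame = (frame + 1) % req.tp_frame_nr;
	}
}
#endif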
1804 
1805 
1806 static const struct proto_ops packet_ops_spkt = {
1807 	.family =	PF_PACKET,
1808 	.owner =	THIS_MODULE,
1809 	.release =	packet_release,
1810 	.bind =		packet_bind_spkt,
1811 	.connect =	sock_no_connect,
1812 	.socketpair =	sock_no_socketpair,
1813 	.accept =	sock_no_accept,
1814 	.getname =	packet_getname_spkt,
1815 	.poll =		datagram_poll,
1816 	.ioctl =	packet_ioctl,
1817 	.listen =	sock_no_listen,
1818 	.shutdown =	sock_no_shutdown,
1819 	.setsockopt =	sock_no_setsockopt,
1820 	.getsockopt =	sock_no_getsockopt,
1821 	.sendmsg =	packet_sendmsg_spkt,
1822 	.recvmsg =	packet_recvmsg,
1823 	.mmap =		sock_no_mmap,
1824 	.sendpage =	sock_no_sendpage,
1825 };
1826 
1827 static const struct proto_ops packet_ops = {
1828 	.family =	PF_PACKET,
1829 	.owner =	THIS_MODULE,
1830 	.release =	packet_release,
1831 	.bind =		packet_bind,
1832 	.connect =	sock_no_connect,
1833 	.socketpair =	sock_no_socketpair,
1834 	.accept =	sock_no_accept,
1835 	.getname =	packet_getname,
1836 	.poll =		packet_poll,
1837 	.ioctl =	packet_ioctl,
1838 	.listen =	sock_no_listen,
1839 	.shutdown =	sock_no_shutdown,
1840 	.setsockopt =	packet_setsockopt,
1841 	.getsockopt =	packet_getsockopt,
1842 	.sendmsg =	packet_sendmsg,
1843 	.recvmsg =	packet_recvmsg,
1844 	.mmap =		packet_mmap,
1845 	.sendpage =	sock_no_sendpage,
1846 };
1847 
1848 static struct net_proto_family packet_family_ops = {
1849 	.family =	PF_PACKET,
1850 	.create =	packet_create,
1851 	.owner	=	THIS_MODULE,
1852 };
1853 
1854 static struct notifier_block packet_netdev_notifier = {
1855 	.notifier_call = packet_notifier,
1856 };
1857 
1858 #ifdef CONFIG_PROC_FS
1859 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
1860 {
1861 	struct sock *s;
1862 	struct hlist_node *node;
1863 
1864 	sk_for_each(s, node, &net->packet.sklist) {
1865 		if (!off--)
1866 			return s;
1867 	}
1868 	return NULL;
1869 }
1870 
1871 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1872 	__acquires(seq_file_net(seq)->packet.sklist_lock)
1873 {
1874 	struct net *net = seq_file_net(seq);
1875 	read_lock(&net->packet.sklist_lock);
1876 	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
1877 }
1878 
1879 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1880 {
1881 	struct net *net = seq_file_net(seq);
1882 	++*pos;
1883 	return  (v == SEQ_START_TOKEN)
1884 		? sk_head(&net->packet.sklist)
1885 		: sk_next((struct sock *)v);
1886 }
1887 
1888 static void packet_seq_stop(struct seq_file *seq, void *v)
1889 	__releases(seq_file_net(seq)->packet.sklist_lock)
1890 {
1891 	struct net *net = seq_file_net(seq);
1892 	read_unlock(&net->packet.sklist_lock);
1893 }
1894 
1895 static int packet_seq_show(struct seq_file *seq, void *v)
1896 {
1897 	if (v == SEQ_START_TOKEN)
1898 		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1899 	else {
1900 		struct sock *s = v;
1901 		const struct packet_sock *po = pkt_sk(s);
1902 
1903 		seq_printf(seq,
1904 			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1905 			   s,
1906 			   atomic_read(&s->sk_refcnt),
1907 			   s->sk_type,
1908 			   ntohs(po->num),
1909 			   po->ifindex,
1910 			   po->running,
1911 			   atomic_read(&s->sk_rmem_alloc),
1912 			   sock_i_uid(s),
1913 			   sock_i_ino(s) );
1914 	}
1915 
1916 	return 0;
1917 }
1918 
1919 static const struct seq_operations packet_seq_ops = {
1920 	.start	= packet_seq_start,
1921 	.next	= packet_seq_next,
1922 	.stop	= packet_seq_stop,
1923 	.show	= packet_seq_show,
1924 };
1925 
1926 static int packet_seq_open(struct inode *inode, struct file *file)
1927 {
1928 	return seq_open_net(inode, file, &packet_seq_ops,
1929 			    sizeof(struct seq_net_private));
1930 }
1931 
1932 static const struct file_operations packet_seq_fops = {
1933 	.owner		= THIS_MODULE,
1934 	.open		= packet_seq_open,
1935 	.read		= seq_read,
1936 	.llseek		= seq_lseek,
1937 	.release	= seq_release_net,
1938 };
1939 
1940 #endif
1941 
1942 static int packet_net_init(struct net *net)
1943 {
1944 	rwlock_init(&net->packet.sklist_lock);
1945 	INIT_HLIST_HEAD(&net->packet.sklist);
1946 
1947 	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
1948 		return -ENOMEM;
1949 
1950 	return 0;
1951 }
1952 
1953 static void packet_net_exit(struct net *net)
1954 {
1955 	proc_net_remove(net, "packet");
1956 }
1957 
1958 static struct pernet_operations packet_net_ops = {
1959 	.init = packet_net_init,
1960 	.exit = packet_net_exit,
1961 };
1962 
1963 
1964 static void __exit packet_exit(void)
1965 {
1966 	unregister_netdevice_notifier(&packet_netdev_notifier);
1967 	unregister_pernet_subsys(&packet_net_ops);
1968 	sock_unregister(PF_PACKET);
1969 	proto_unregister(&packet_proto);
1970 }
1971 
1972 static int __init packet_init(void)
1973 {
1974 	int rc = proto_register(&packet_proto, 0);
1975 
1976 	if (rc != 0)
1977 		goto out;
1978 
1979 	sock_register(&packet_family_ops);
1980 	register_pernet_subsys(&packet_net_ops);
1981 	register_netdevice_notifier(&packet_netdev_notifier);
1982 out:
1983 	return rc;
1984 }
1985 
1986 module_init(packet_init);
1987 module_exit(packet_exit);
1988 MODULE_LICENSE("GPL");
1989 MODULE_ALIAS_NETPROTO(PF_PACKET);
1990