/*
 *	IP multicast routing support for mrouted 3.6/3.8
 *
 *		(c) 1995 Alan Cox, <alan@redhat.com>
 *	  Linux Consultancy and Custom Driver Development
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
 *
 *	Fixes:
 *	Michael Chastain	:	Incorrect size of copying.
 *	Alan Cox		:	Added the cache manager code
 *	Alan Cox		:	Fixed the clone/copy bug and device race.
 *	Mike McLagan		:	Routing by source
 *	Malcolm Beattie		:	Buffer handling fixes.
 *	Alexey Kuznetsov	:	Double buffer free and other fixes.
 *	SVR Anand		:	Fixed several multicast bugs and problems.
 *	Alexey Kuznetsov	:	Status, optimisations and more.
 *	Brad Parker		:	Better behaviour on mrouted upcall
 *					overflow.
 *	Carlos Picoto		:	PIMv1 Support
 *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
 *					Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>

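/*
 * CONFIG_IP_PIMSM is a convenience symbol: the shared PIM machinery
 * below (the pimreg device and the register VIF) is compiled in
 * whenever either PIM-SM version is enabled.
 */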
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM	1
#endif

static struct sock *mroute_socket;


/* Big lock, protecting the VIF table, MFC cache and mroute socket state.
   Note that changes are serialized via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *	Multicast router control variables
 */

static struct vif_device vif_table[MAXVIFS];		/* Devices 		*/
static int maxvif;

#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)

static int mroute_do_assert;				/* Set in PIM assert	*/
static int mroute_do_pim;

static struct mfc_cache *mfc_cache_array[MFC_LINES];	/* Forwarding cache	*/

static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;		/* Size of unresolved	*/

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and protected
   with the weak lock mrt_lock. The queue of unresolved entries is
   protected with the strong spinlock mfc_unres_lock.

   Thus the data path is entirely free of exclusive locks.
 */

static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
	struct net_device  *dev;

	dev = __dev_get_by_name("tunl0");

	if (dev) {
		int err;
		struct ifreq ifr;
		mm_segment_t	oldfs;
		struct ip_tunnel_parm p;
		struct in_device  *in_dev;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (void*)&p;

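		/* The tunnel driver's do_ioctl expects a user-space
		 * pointer; widen the address limit for the duration of
		 * the call so it can safely take this on-stack kernel
		 * buffer instead.
		 */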
		oldfs = get_fs(); set_fs(KERNEL_DS);
		err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
		set_fs(oldfs);

		dev = NULL;

		if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
			dev->flags |= IFF_MULTICAST;

			in_dev = __in_dev_get_rtnl(dev);
			if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
				goto failure;
			in_dev->cnf.rp_filter = 0;

			if (dev_open(dev))
				goto failure;
		}
	}
	return dev;

failure:
	/* allow the register to be completed before unregistering. */
	rtnl_unlock();
	rtnl_lock();

	unregister_netdevice(dev);
	return NULL;
}

#ifdef CONFIG_IP_PIMSM

static int reg_vif_num = -1;

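/*
 * The pimreg device never transmits anything on the wire: every packet
 * queued to it is bounced straight up to the PIM daemon as an
 * IGMPMSG_WHOLEPKT upcall and then dropped.
 */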
static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
	read_lock(&mrt_lock);
	((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
	((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
	ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
	read_unlock(&mrt_lock);
	kfree_skb(skb);
	return 0;
}

static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
	return (struct net_device_stats*)netdev_priv(dev);
}

static void reg_vif_setup(struct net_device *dev)
{
	dev->type		= ARPHRD_PIMREG;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
	dev->flags		= IFF_NOARP;
	dev->hard_start_xmit	= reg_vif_xmit;
	dev->get_stats		= reg_vif_get_stats;
	dev->destructor		= free_netdev;
}

static struct net_device *ipmr_reg_vif(void)
{
	struct net_device *dev;
	struct in_device *in_dev;

	dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
			   reg_vif_setup);

	if (dev == NULL)
		return NULL;

	if (register_netdevice(dev)) {
		free_netdev(dev);
		return NULL;
	}
	dev->iflink = 0;

	if ((in_dev = inetdev_init(dev)) == NULL)
		goto failure;

	in_dev->cnf.rp_filter = 0;

	if (dev_open(dev))
		goto failure;

	return dev;

failure:
	/* allow the register to be completed before unregistering. */
	rtnl_unlock();
	rtnl_lock();

	unregister_netdevice(dev);
	return NULL;
}
#endif

/*
 *	Delete a VIF entry
 */

static int vif_delete(int vifi)
{
	struct vif_device *v;
	struct net_device *dev;
	struct in_device *in_dev;

	if (vifi < 0 || vifi >= maxvif)
		return -EADDRNOTAVAIL;

	v = &vif_table[vifi];

	write_lock_bh(&mrt_lock);
	dev = v->dev;
	v->dev = NULL;

	if (!dev) {
		write_unlock_bh(&mrt_lock);
		return -EADDRNOTAVAIL;
	}

#ifdef CONFIG_IP_PIMSM
	if (vifi == reg_vif_num)
		reg_vif_num = -1;
#endif

	if (vifi+1 == maxvif) {
		int tmp;
		for (tmp=vifi-1; tmp>=0; tmp--) {
			if (VIF_EXISTS(tmp))
				break;
		}
		maxvif = tmp+1;
	}

	write_unlock_bh(&mrt_lock);

	dev_set_allmulti(dev, -1);

	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
		in_dev->cnf.mc_forwarding--;
		ip_rt_multicast_event(in_dev);
	}

	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
		unregister_netdevice(dev);

	dev_put(dev);
	return 0;
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting an error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;

	atomic_dec(&cache_resolve_queue_len);

	while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
		if (ip_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
			nlh->nlmsg_type = NLMSG_ERROR;
			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
			skb_trim(skb, nlh->nlmsg_len);
			e = NLMSG_DATA(nlh);
			e->error = -ETIMEDOUT;
			memset(&e->msg, 0, sizeof(e->msg));

			rtnl_unicast(skb, NETLINK_CB(skb).pid);
		} else
			kfree_skb(skb);
	}

	kmem_cache_free(mrt_cachep, c);
}


/* A single timer process services the whole unresolved queue. */

static void ipmr_expire_process(unsigned long dummy)
{
	unsigned long now;
	unsigned long expires;
	struct mfc_cache *c, **cp;

	if (!spin_trylock(&mfc_unres_lock)) {
		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
		return;
	}

	if (atomic_read(&cache_resolve_queue_len) == 0)
		goto out;

	now = jiffies;
	expires = 10*HZ;
	cp = &mfc_unres_queue;

	while ((c=*cp) != NULL) {
		if (time_after(c->mfc_un.unres.expires, now)) {
			unsigned long interval = c->mfc_un.unres.expires - now;
			if (interval < expires)
				expires = interval;
			cp = &c->next;
			continue;
		}

		*cp = c->next;

		ipmr_destroy_unres(c);
	}

	if (atomic_read(&cache_resolve_queue_len))
		mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
	spin_unlock(&mfc_unres_lock);
}

/* Fill the oif list. Called with mrt_lock write-locked. */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
	int vifi;

	cache->mfc_un.res.minvif = MAXVIFS;
	cache->mfc_un.res.maxvif = 0;
	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

	for (vifi=0; vifi<maxvif; vifi++) {
		if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
			if (cache->mfc_un.res.minvif > vifi)
				cache->mfc_un.res.minvif = vifi;
			if (cache->mfc_un.res.maxvif <= vifi)
				cache->mfc_un.res.maxvif = vifi + 1;
		}
	}
}

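/*
 * Register a new virtual interface described by @vifc: either a PIM
 * register VIF, a DVMRP tunnel, or a plain physical device looked up
 * by its local address. Called under rtnl_lock.
 */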
static int vif_add(struct vifctl *vifc, int mrtsock)
{
	int vifi = vifc->vifc_vifi;
	struct vif_device *v = &vif_table[vifi];
	struct net_device *dev;
	struct in_device *in_dev;

	/* Is vif busy ? */
	if (VIF_EXISTS(vifi))
		return -EADDRINUSE;

	switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
	case VIFF_REGISTER:
		/*
		 * Special Purpose VIF in PIM
		 * All the packets will be sent to the daemon
		 */
		if (reg_vif_num >= 0)
			return -EADDRINUSE;
		dev = ipmr_reg_vif();
		if (!dev)
			return -ENOBUFS;
		break;
#endif
	case VIFF_TUNNEL:
		dev = ipmr_new_tunnel(vifc);
		if (!dev)
			return -ENOBUFS;
		break;
	case 0:
		dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
		if (!dev)
			return -EADDRNOTAVAIL;
		dev_put(dev);
		break;
	default:
		return -EINVAL;
	}

	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
		return -EADDRNOTAVAIL;
	in_dev->cnf.mc_forwarding++;
	dev_set_allmulti(dev, +1);
	ip_rt_multicast_event(in_dev);

	/*
	 *	Fill in the VIF structures
	 */
	v->rate_limit=vifc->vifc_rate_limit;
	v->local=vifc->vifc_lcl_addr.s_addr;
	v->remote=vifc->vifc_rmt_addr.s_addr;
	v->flags=vifc->vifc_flags;
	if (!mrtsock)
		v->flags |= VIFF_STATIC;
	v->threshold=vifc->vifc_threshold;
	v->bytes_in = 0;
	v->bytes_out = 0;
	v->pkt_in = 0;
	v->pkt_out = 0;
	v->link = dev->ifindex;
	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
		v->link = dev->iflink;

	/* And finish update writing critical data */
	write_lock_bh(&mrt_lock);
	dev_hold(dev);
	v->dev=dev;
#ifdef CONFIG_IP_PIMSM
	if (v->flags&VIFF_REGISTER)
		reg_vif_num = vifi;
#endif
	if (vifi+1 > maxvif)
		maxvif = vifi+1;
	write_unlock_bh(&mrt_lock);
	return 0;
}

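/* Hash lookup of a resolved (S,G) entry; the caller must hold mrt_lock. */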
static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
{
	int line=MFC_HASH(mcastgrp,origin);
	struct mfc_cache *c;

	for (c=mfc_cache_array[line]; c; c = c->next) {
		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
			break;
	}
	return c;
}

/*
 *	Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
	struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
	if (c==NULL)
		return NULL;
	c->mfc_un.res.minvif = MAXVIFS;
	return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
	struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
	if (c==NULL)
		return NULL;
	skb_queue_head_init(&c->mfc_un.unres.unresolved);
	c->mfc_un.unres.expires = jiffies + 10*HZ;
	return c;
}

/*
 *	A cache entry has gone into a resolved state from queued
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;

	/*
	 *	Play the pending entries through our router
	 */

	while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
		if (ip_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
				nlh->nlmsg_len = (skb_tail_pointer(skb) -
						  (u8 *)nlh);
			} else {
				nlh->nlmsg_type = NLMSG_ERROR;
				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
				skb_trim(skb, nlh->nlmsg_len);
				e = NLMSG_DATA(nlh);
				e->error = -EMSGSIZE;
				memset(&e->msg, 0, sizeof(e->msg));
			}

			rtnl_unicast(skb, NETLINK_CB(skb).pid);
		} else
			ip_mr_forward(skb, c, 0);
	}
}

/*
 *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *	expects the following bizarre scheme.
 *
 *	Called under mrt_lock.
 */

static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
	struct sk_buff *skb;
	const int ihl = ip_hdrlen(pkt);
	struct igmphdr *igmp;
	struct igmpmsg *msg;
	int ret;

#ifdef CONFIG_IP_PIMSM
	if (assert == IGMPMSG_WHOLEPKT)
		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
	else
#endif
		skb = alloc_skb(128, GFP_ATOMIC);

	if (!skb)
		return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
	if (assert == IGMPMSG_WHOLEPKT) {
		/* Ugly, but we have no choice with this interface.
		   Duplicate old header, fix ihl, length etc.
		   And all this only to mangle msg->im_msgtype and
		   to set msg->im_mbz to "mbz" :-)
		 */
		skb_push(skb, sizeof(struct iphdr));
		skb_reset_network_header(skb);
		skb_reset_transport_header(skb);
		msg = (struct igmpmsg *)skb_network_header(skb);
		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
		msg->im_msgtype = IGMPMSG_WHOLEPKT;
		msg->im_mbz = 0;
		msg->im_vif = reg_vif_num;
		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
					     sizeof(struct iphdr));
	} else
#endif
	{

	/*
	 *	Copy the IP header
	 */

	skb->network_header = skb->tail;
	skb_put(skb, ihl);
	skb_copy_to_linear_data(skb, pkt->data, ihl);
	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
	msg = (struct igmpmsg *)skb_network_header(skb);
	msg->im_vif = vifi;
	skb->dst = dst_clone(pkt->dst);

	/*
	 *	Add our header
	 */

	igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
	igmp->type	=
	msg->im_msgtype = assert;
	igmp->code 	=	0;
	ip_hdr(skb)->tot_len = htons(skb->len);			/* Fix the length */
	skb->transport_header = skb->network_header;
	}

	if (mroute_socket == NULL) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/*
	 *	Deliver to mrouted
	 */
	if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
		if (net_ratelimit())
			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
		kfree_skb(skb);
	}

	return ret;
}

/*
 *	Queue a packet for resolution, creating a locked unresolved
 *	cache entry if one does not already exist.
 */

static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
	int err;
	struct mfc_cache *c;
	const struct iphdr *iph = ip_hdr(skb);

	spin_lock_bh(&mfc_unres_lock);
	for (c=mfc_unres_queue; c; c=c->next) {
		if (c->mfc_mcastgrp == iph->daddr &&
		    c->mfc_origin == iph->saddr)
			break;
	}

	if (c == NULL) {
		/*
		 *	Create a new entry if allowable
		 */

		if (atomic_read(&cache_resolve_queue_len)>=10 ||
		    (c=ipmr_cache_alloc_unres())==NULL) {
			spin_unlock_bh(&mfc_unres_lock);

			kfree_skb(skb);
			return -ENOBUFS;
		}

		/*
		 *	Fill in the new cache entry
		 */
		c->mfc_parent	= -1;
		c->mfc_origin	= iph->saddr;
		c->mfc_mcastgrp	= iph->daddr;

		/*
		 *	Reflect first query at mrouted.
		 */
		if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
			/* If the report failed throw the cache entry
			   out - Brad Parker
			 */
			spin_unlock_bh(&mfc_unres_lock);

			kmem_cache_free(mrt_cachep, c);
			kfree_skb(skb);
			return err;
		}

		atomic_inc(&cache_resolve_queue_len);
		c->next = mfc_unres_queue;
		mfc_unres_queue = c;

		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
	}

	/*
	 *	See if we can append the packet
	 */
	if (c->mfc_un.unres.unresolved.qlen>3) {
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
		skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
		err = 0;
	}

	spin_unlock_bh(&mfc_unres_lock);
	return err;
}

/*
 *	MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mfcctl *mfc)
{
	int line;
	struct mfc_cache *c, **cp;

	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			kmem_cache_free(mrt_cachep, c);
			return 0;
		}
	}
	return -ENOENT;
}

static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
	int line;
	struct mfc_cache *uc, *c, **cp;

	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
			break;
	}

	if (c != NULL) {
		write_lock_bh(&mrt_lock);
		c->mfc_parent = mfc->mfcc_parent;
		ipmr_update_thresholds(c, mfc->mfcc_ttls);
		if (!mrtsock)
			c->mfc_flags |= MFC_STATIC;
		write_unlock_bh(&mrt_lock);
		return 0;
	}

	if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
		return -EINVAL;

	c=ipmr_cache_alloc();
	if (c==NULL)
		return -ENOMEM;

	c->mfc_origin=mfc->mfcc_origin.s_addr;
	c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
	c->mfc_parent=mfc->mfcc_parent;
	ipmr_update_thresholds(c, mfc->mfcc_ttls);
	if (!mrtsock)
		c->mfc_flags |= MFC_STATIC;

	write_lock_bh(&mrt_lock);
	c->next = mfc_cache_array[line];
	mfc_cache_array[line] = c;
	write_unlock_bh(&mrt_lock);

	/*
	 *	Check to see if we resolved a queued list. If so we
	 *	need to send on the frames and tidy up.
	 */
	spin_lock_bh(&mfc_unres_lock);
	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
	     cp = &uc->next) {
		if (uc->mfc_origin == c->mfc_origin &&
		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
			*cp = uc->next;
			if (atomic_dec_and_test(&cache_resolve_queue_len))
				del_timer(&ipmr_expire_timer);
			break;
		}
	}
	spin_unlock_bh(&mfc_unres_lock);

	if (uc) {
		ipmr_cache_resolve(uc, c);
		kmem_cache_free(mrt_cachep, uc);
	}
	return 0;
}

/*
 *	Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
	int i;

	/*
	 *	Shut down all active vif entries
	 */
	for (i=0; i<maxvif; i++) {
		if (!(vif_table[i].flags&VIFF_STATIC))
			vif_delete(i);
	}

	/*
	 *	Wipe the cache
	 */
	for (i=0;i<MFC_LINES;i++) {
		struct mfc_cache *c, **cp;

		cp = &mfc_cache_array[i];
		while ((c = *cp) != NULL) {
			if (c->mfc_flags&MFC_STATIC) {
				cp = &c->next;
				continue;
			}
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			kmem_cache_free(mrt_cachep, c);
		}
	}

	if (atomic_read(&cache_resolve_queue_len) != 0) {
		struct mfc_cache *c;

		spin_lock_bh(&mfc_unres_lock);
		while (mfc_unres_queue != NULL) {
			c = mfc_unres_queue;
			mfc_unres_queue = c->next;
			spin_unlock_bh(&mfc_unres_lock);

			ipmr_destroy_unres(c);

			spin_lock_bh(&mfc_unres_lock);
		}
		spin_unlock_bh(&mfc_unres_lock);
	}
}

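/*
 * Destructor run when the mroute control socket goes away: stop
 * multicast forwarding and flush all non-static VIFs and cache entries.
 */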
static void mrtsock_destruct(struct sock *sk)
{
	rtnl_lock();
	if (sk == mroute_socket) {
		ipv4_devconf.mc_forwarding--;

		write_lock_bh(&mrt_lock);
		mroute_socket=NULL;
		write_unlock_bh(&mrt_lock);

		mroute_clean_tables(sk);
	}
	rtnl_unlock();
}

/*
 *	Socket options and virtual interface manipulation. The whole
 *	virtual interface system is a complete heap, but unfortunately
 *	that's how BSD mrouted happens to think. Maybe one day with a proper
 *	MOSPF/PIM router set up we can clean this up.
 */

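/*
 * Illustrative user-space usage (not kernel code; a minimal sketch of
 * how a routing daemon such as mrouted drives this interface):
 *
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	int on = 1;
 *	setsockopt(fd, IPPROTO_IP, MRT_INIT, &on, sizeof(on));
 *	... MRT_ADD_VIF / MRT_ADD_MFC with struct vifctl / mfcctl ...
 *	setsockopt(fd, IPPROTO_IP, MRT_DONE, NULL, 0);
 */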
int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
{
	int ret;
	struct vifctl vif;
	struct mfcctl mfc;

	if (optname != MRT_INIT) {
		if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
			return -EACCES;
	}

	switch (optname) {
	case MRT_INIT:
		if (sk->sk_type != SOCK_RAW ||
		    inet_sk(sk)->num != IPPROTO_IGMP)
			return -EOPNOTSUPP;
		if (optlen!=sizeof(int))
			return -ENOPROTOOPT;

		rtnl_lock();
		if (mroute_socket) {
			rtnl_unlock();
			return -EADDRINUSE;
		}

		ret = ip_ra_control(sk, 1, mrtsock_destruct);
		if (ret == 0) {
			write_lock_bh(&mrt_lock);
			mroute_socket=sk;
			write_unlock_bh(&mrt_lock);

			ipv4_devconf.mc_forwarding++;
		}
		rtnl_unlock();
		return ret;
	case MRT_DONE:
		if (sk!=mroute_socket)
			return -EACCES;
		return ip_ra_control(sk, 0, NULL);
	case MRT_ADD_VIF:
	case MRT_DEL_VIF:
		if (optlen!=sizeof(vif))
			return -EINVAL;
		if (copy_from_user(&vif,optval,sizeof(vif)))
			return -EFAULT;
		if (vif.vifc_vifi >= MAXVIFS)
			return -ENFILE;
		rtnl_lock();
		if (optname==MRT_ADD_VIF) {
			ret = vif_add(&vif, sk==mroute_socket);
		} else {
			ret = vif_delete(vif.vifc_vifi);
		}
		rtnl_unlock();
		return ret;

		/*
		 *	Manipulate the forwarding caches. These live
		 *	in a sort of kernel/user symbiosis.
		 */
	case MRT_ADD_MFC:
	case MRT_DEL_MFC:
		if (optlen!=sizeof(mfc))
			return -EINVAL;
		if (copy_from_user(&mfc,optval, sizeof(mfc)))
			return -EFAULT;
		rtnl_lock();
		if (optname==MRT_DEL_MFC)
			ret = ipmr_mfc_delete(&mfc);
		else
			ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
		rtnl_unlock();
		return ret;
		/*
		 *	Control PIM assert.
		 */
	case MRT_ASSERT:
	{
		int v;
		if (get_user(v,(int __user *)optval))
			return -EFAULT;
		mroute_do_assert=(v)?1:0;
		return 0;
	}
#ifdef CONFIG_IP_PIMSM
	case MRT_PIM:
	{
		int v, ret;
		if (get_user(v,(int __user *)optval))
			return -EFAULT;
		v = (v)?1:0;
		rtnl_lock();
		ret = 0;
		if (v != mroute_do_pim) {
			mroute_do_pim = v;
			mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
			if (mroute_do_pim)
				ret = inet_add_protocol(&pim_protocol,
							IPPROTO_PIM);
			else
				ret = inet_del_protocol(&pim_protocol,
							IPPROTO_PIM);
			if (ret < 0)
				ret = -EAGAIN;
#endif
		}
		rtnl_unlock();
		return ret;
	}
#endif
	/*
	 *	Spurious command, or MRT_VERSION which you cannot
	 *	set.
	 */
	default:
		return -ENOPROTOOPT;
	}
}

/*
 *	Getsockopt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
{
	int olr;
	int val;

	if (optname!=MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
	   optname!=MRT_PIM &&
#endif
	   optname!=MRT_ASSERT)
		return -ENOPROTOOPT;

	if (get_user(olr, optlen))
		return -EFAULT;

	olr = min_t(unsigned int, olr, sizeof(int));
	if (olr < 0)
		return -EINVAL;

	if (put_user(olr,optlen))
		return -EFAULT;
	if (optname==MRT_VERSION)
		val=0x0305;
#ifdef CONFIG_IP_PIMSM
	else if (optname==MRT_PIM)
		val=mroute_do_pim;
#endif
	else
		val=mroute_do_assert;
	if (copy_to_user(optval,&val,olr))
		return -EFAULT;
	return 0;
}

/*
 *	The IP multicast ioctl support routines.
 */

int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
	struct sioc_sg_req sr;
	struct sioc_vif_req vr;
	struct vif_device *vif;
	struct mfc_cache *c;

	switch (cmd) {
	case SIOCGETVIFCNT:
		if (copy_from_user(&vr,arg,sizeof(vr)))
			return -EFAULT;
		if (vr.vifi>=maxvif)
			return -EINVAL;
		read_lock(&mrt_lock);
		vif=&vif_table[vr.vifi];
		if (VIF_EXISTS(vr.vifi))	{
			vr.icount=vif->pkt_in;
			vr.ocount=vif->pkt_out;
			vr.ibytes=vif->bytes_in;
			vr.obytes=vif->bytes_out;
			read_unlock(&mrt_lock);

			if (copy_to_user(arg,&vr,sizeof(vr)))
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	case SIOCGETSGCNT:
		if (copy_from_user(&sr,arg,sizeof(sr)))
			return -EFAULT;

		read_lock(&mrt_lock);
		c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
		if (c) {
			sr.pktcnt = c->mfc_un.res.pkt;
			sr.bytecnt = c->mfc_un.res.bytes;
			sr.wrong_if = c->mfc_un.res.wrong_if;
			read_unlock(&mrt_lock);

			if (copy_to_user(arg,&sr,sizeof(sr)))
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	default:
		return -ENOIOCTLCMD;
	}
}


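/* Tear down VIFs whose underlying device is being unregistered. */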
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct vif_device *v;
	int ct;
	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;
	v=&vif_table[0];
	for (ct=0;ct<maxvif;ct++,v++) {
		if (v->dev==ptr)
			vif_delete(ct);
	}
	return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier={
	.notifier_call = ipmr_device_event,
};

/*
 * 	Encapsulate a packet by attaching a valid IPIP header to it.
 *	This avoids tunnel drivers and other mess and gives us the speed so
 *	important for multicast video.
 */

static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct iphdr *iph;
	struct iphdr *old_iph = ip_hdr(skb);

	skb_push(skb, sizeof(struct iphdr));
	skb->transport_header = skb->network_header;
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);

	iph->version	= 	4;
	iph->tos	=	old_iph->tos;
	iph->ttl	=	old_iph->ttl;
	iph->frag_off	=	0;
	iph->daddr	=	daddr;
	iph->saddr	=	saddr;
	iph->protocol	=	IPPROTO_IPIP;
	iph->ihl	=	5;
	iph->tot_len	=	htons(skb->len);
	ip_select_ident(iph, skb->dst, NULL);
	ip_send_check(iph);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	nf_reset(skb);
}

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
	struct ip_options * opt	= &(IPCB(skb)->opt);

	IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

	if (unlikely(opt->optlen))
		ip_forward_options(skb);

	return dst_output(skb);
}

/*
 *	Processing handlers for ipmr_forward
 */

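/*
 * Transmit a forwarded packet on the given VIF: bounce it to the PIM
 * daemon for register VIFs, IPIP-encapsulate it for tunnel VIFs, and
 * route it out of the corresponding device otherwise.
 */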
static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct vif_device *vif = &vif_table[vifi];
	struct net_device *dev;
	struct rtable *rt;
	int    encap = 0;

	if (vif->dev == NULL)
		goto out_free;

#ifdef CONFIG_IP_PIMSM
	if (vif->flags & VIFF_REGISTER) {
		vif->pkt_out++;
		vif->bytes_out+=skb->len;
		((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
		((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
		ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
		kfree_skb(skb);
		return;
	}
#endif

	if (vif->flags&VIFF_TUNNEL) {
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = vif->remote,
						.saddr = vif->local,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(&rt, &fl))
			goto out_free;
		encap = sizeof(struct iphdr);
	} else {
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(&rt, &fl))
			goto out_free;
	}

	dev = rt->u.dst.dev;

	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
		/* Do not fragment multicasts. Alas, IPv4 does not
		   allow us to send an ICMP error here, so oversized
		   packets simply vanish into a black hole.
		 */

		IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		ip_rt_put(rt);
		goto out_free;
	}

	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

	if (skb_cow(skb, encap)) {
		ip_rt_put(rt);
		goto out_free;
	}

	vif->pkt_out++;
	vif->bytes_out+=skb->len;

	dst_release(skb->dst);
	skb->dst = &rt->u.dst;
	ip_decrease_ttl(ip_hdr(skb));

	/* FIXME: forward and output firewalls used to be called here.
	 * What do we do with netfilter? -- RR */
	if (vif->flags & VIFF_TUNNEL) {
		ip_encap(skb, vif->local, vif->remote);
		/* FIXME: extra output firewall step used to be here. --RR */
		((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
		((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
	}

	IPCB(skb)->flags |= IPSKB_FORWARDED;

	/*
	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
	 * locally not only before forwarding, but also after forwarding on
	 * all output interfaces: if the mrouter runs a multicast
	 * application, that application should receive packets regardless
	 * of which interface it joined on.
	 * Without this, the application would have to join on every
	 * interface. A multihomed host (or a router that is not an
	 * mrouter), on the other hand, cannot join on more than one
	 * interface, as that would result in receiving duplicate packets.
	 */
	NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
		ipmr_forward_finish);
	return;

out_free:
	kfree_skb(skb);
	return;
}

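/* Map a device back to its VIF index, or -1 if it is not a VIF.
 * Called under mrt_lock.
 */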
static int ipmr_find_vif(struct net_device *dev)
{
	int ct;
	for (ct=maxvif-1; ct>=0; ct--) {
		if (vif_table[ct].dev == dev)
			break;
	}
	return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
	int psend = -1;
	int vif, ct;

	vif = cache->mfc_parent;
	cache->mfc_un.res.pkt++;
	cache->mfc_un.res.bytes += skb->len;

	/*
	 * Wrong interface: drop packet and (maybe) send PIM assert.
	 */
	if (vif_table[vif].dev != skb->dev) {
		int true_vifi;

		if (((struct rtable*)skb->dst)->fl.iif == 0) {
			/* It is our own packet, looped back.
			   Very complicated situation...

			   The best workaround until the routing daemons are
			   fixed is not to redistribute a packet if it was
			   sent through the wrong interface. It means that
			   multicast applications WILL NOT work for (S,G)
			   entries whose default multicast route points to
			   the wrong oif. In any case, running multicast
			   applications on a router is not a good idea.
			 */
			goto dont_forward;
		}

		cache->mfc_un.res.wrong_if++;
		true_vifi = ipmr_find_vif(skb->dev);

		if (true_vifi >= 0 && mroute_do_assert &&
		    /* PIM-SM uses asserts when switching from the RPT to
		       the SPT, so we cannot check that the packet arrived
		       on an oif. That is unfortunate, but otherwise we
		       would have to move a pretty large chunk of pimd into
		       the kernel. Ough... --ANK
		     */
		    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
		    time_after(jiffies,
			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
			cache->mfc_un.res.last_assert = jiffies;
			ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
		}
		goto dont_forward;
	}

	vif_table[vif].pkt_in++;
	vif_table[vif].bytes_in+=skb->len;

	/*
	 *	Forward the frame
	 */
	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
		if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
			if (psend != -1) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					ipmr_queue_xmit(skb2, cache, psend);
			}
			psend=ct;
		}
	}
	if (psend != -1) {
		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (skb2)
				ipmr_queue_xmit(skb2, cache, psend);
		} else {
			ipmr_queue_xmit(skb, cache, psend);
			return 0;
		}
	}

dont_forward:
	if (!local)
		kfree_skb(skb);
	return 0;
}


/*
 *	Multicast packets for forwarding arrive here
 */

int ip_mr_input(struct sk_buff *skb)
{
	struct mfc_cache *cache;
	int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;

	/* The packet was looped back after forwarding; it must not be
	   forwarded a second time, but it can still be delivered locally.
	 */
	if (IPCB(skb)->flags&IPSKB_FORWARDED)
		goto dont_forward;

	if (!local) {
		    if (IPCB(skb)->opt.router_alert) {
			    if (ip_call_ra_chain(skb))
				    return 0;
		    } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
			    /* IGMPv1 (and broken IGMPv2 implementations such
			       as Cisco IOS <= 11.2(8)) do not put the router
			       alert option into IGMP packets destined to
			       routable groups. This is very bad, because it
			       means we can forward NO IGMP messages at all.
			     */
			    read_lock(&mrt_lock);
			    if (mroute_socket) {
				    nf_reset(skb);
				    raw_rcv(mroute_socket, skb);
				    read_unlock(&mrt_lock);
				    return 0;
			    }
			    read_unlock(&mrt_lock);
		    }
	}

	read_lock(&mrt_lock);
	cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);

	/*
	 *	No usable cache entry
	 */
	if (cache==NULL) {
		int vif;

		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			ip_local_deliver(skb);
			if (skb2 == NULL) {
				read_unlock(&mrt_lock);
				return -ENOBUFS;
			}
			skb = skb2;
		}

		vif = ipmr_find_vif(skb->dev);
		if (vif >= 0) {
			int err = ipmr_cache_unresolved(vif, skb);
			read_unlock(&mrt_lock);

			return err;
		}
		read_unlock(&mrt_lock);
		kfree_skb(skb);
		return -ENODEV;
	}

	ip_mr_forward(skb, cache, local);

	read_unlock(&mrt_lock);

	if (local)
		return ip_local_deliver(skb);

	return 0;

dont_forward:
	if (local)
		return ip_local_deliver(skb);
	kfree_skb(skb);
	return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/*
 * Handle IGMP messages of PIMv1
 */

int pim_rcv_v1(struct sk_buff * skb)
{
	struct igmphdr *pim;
	struct iphdr   *encap;
	struct net_device  *reg_dev = NULL;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
		goto drop;

	pim = igmp_hdr(skb);

	if (!mroute_do_pim ||
	    skb->len < sizeof(*pim) + sizeof(*encap) ||
	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
		goto drop;

	encap = (struct iphdr *)(skb_transport_header(skb) +
				 sizeof(struct igmphdr));
	/*
	   Check that:
	   a. the packet is really destined to a multicast group
	   b. the packet is not a NULL-REGISTER
	   c. the packet is not truncated
	 */
	if (!MULTICAST(encap->daddr) ||
	    encap->tot_len == 0 ||
	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
		goto drop;

	read_lock(&mrt_lock);
	if (reg_vif_num >= 0)
		reg_dev = vif_table[reg_vif_num].dev;
	if (reg_dev)
		dev_hold(reg_dev);
	read_unlock(&mrt_lock);

	if (reg_dev == NULL)
		goto drop;

	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8*)encap - skb->data);
	skb_reset_network_header(skb);
	skb->dev = reg_dev;
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = 0;
	skb->pkt_type = PACKET_HOST;
	dst_release(skb->dst);
	skb->dst = NULL;
	((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
	((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
	nf_reset(skb);
	netif_rx(skb);
	dev_put(reg_dev);
	return 0;
 drop:
	kfree_skb(skb);
	return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
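/*
 * Decapsulate a PIMv2 Register message: validate the header and
 * checksum, then feed the inner multicast packet back into the stack
 * as if it had arrived on the pimreg device.
 */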
static int pim_rcv(struct sk_buff * skb)
{
	struct pimreghdr *pim;
	struct iphdr   *encap;
	struct net_device  *reg_dev = NULL;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
		goto drop;

	pim = (struct pimreghdr *)skb_transport_header(skb);
	if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
	    (pim->flags&PIM_NULL_REGISTER) ||
	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
		goto drop;

	/* check if the inner packet is destined to mcast group */
	encap = (struct iphdr *)(skb_transport_header(skb) +
				 sizeof(struct pimreghdr));
	if (!MULTICAST(encap->daddr) ||
	    encap->tot_len == 0 ||
	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
		goto drop;

	read_lock(&mrt_lock);
	if (reg_vif_num >= 0)
		reg_dev = vif_table[reg_vif_num].dev;
	if (reg_dev)
		dev_hold(reg_dev);
	read_unlock(&mrt_lock);

	if (reg_dev == NULL)
		goto drop;

	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8*)encap - skb->data);
	skb_reset_network_header(skb);
	skb->dev = reg_dev;
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = 0;
	skb->pkt_type = PACKET_HOST;
	dst_release(skb->dst);
	((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
	((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
	skb->dst = NULL;
	nf_reset(skb);
	netif_rx(skb);
	dev_put(reg_dev);
	return 0;
 drop:
	kfree_skb(skb);
	return 0;
}
#endif

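/*
 * Fill an rtnetlink message with the forwarding state of one cache
 * entry: the input interface plus an RTA_MULTIPATH list of output
 * interfaces and their TTL thresholds.
 */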
static int
ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
	int ct;
	struct rtnexthop *nhp;
	struct net_device *dev = vif_table[c->mfc_parent].dev;
	u8 *b = skb_tail_pointer(skb);
	struct rtattr *mp_head;

	if (dev)
		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

	mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));

	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
		if (c->mfc_un.res.ttls[ct] < 255) {
			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
				goto rtattr_failure;
			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
			nhp->rtnh_flags = 0;
			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
			nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
			nhp->rtnh_len = sizeof(*nhp);
		}
	}
	mp_head->rta_type = RTA_MULTIPATH;
	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
	rtm->rtm_type = RTN_MULTICAST;
	return 1;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -EMSGSIZE;
}

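/*
 * Resolve routing-cache queries about multicast routes. If no cache
 * entry exists yet, queue a dummy packet (IP header version 0) on the
 * unresolved queue so that mrouted is asked to resolve it.
 */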
int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
	int err;
	struct mfc_cache *cache;
	struct rtable *rt = (struct rtable*)skb->dst;

	read_lock(&mrt_lock);
	cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);

	if (cache==NULL) {
		struct sk_buff *skb2;
		struct iphdr *iph;
		struct net_device *dev;
		int vif;

		if (nowait) {
			read_unlock(&mrt_lock);
			return -EAGAIN;
		}

		dev = skb->dev;
		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
			read_unlock(&mrt_lock);
			return -ENODEV;
		}
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2) {
			read_unlock(&mrt_lock);
			return -ENOMEM;
		}

		skb_push(skb2, sizeof(struct iphdr));
		skb_reset_network_header(skb2);
		iph = ip_hdr(skb2);
		iph->ihl = sizeof(struct iphdr) >> 2;
		iph->saddr = rt->rt_src;
		iph->daddr = rt->rt_dst;
		iph->version = 0;
		err = ipmr_cache_unresolved(vif, skb2);
		read_unlock(&mrt_lock);
		return err;
	}

	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
		cache->mfc_flags |= MFC_NOTIFY;
	err = ipmr_fill_mroute(skb, cache, rtm);
	read_unlock(&mrt_lock);
	return err;
}

#ifdef CONFIG_PROC_FS
/*
 *	The /proc interfaces to multicast routing:
 *	/proc/net/ip_mr_cache and /proc/net/ip_mr_vif
 */
struct ipmr_vif_iter {
	int ct;
};

static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
					   loff_t pos)
{
	for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
		if (!VIF_EXISTS(iter->ct))
			continue;
		if (pos-- == 0)
			return &vif_table[iter->ct];
	}
	return NULL;
}

static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&mrt_lock);
	return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}

static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ipmr_vif_iter *iter = seq->private;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ipmr_vif_seq_idx(iter, 0);

	while (++iter->ct < maxvif) {
		if (!VIF_EXISTS(iter->ct))
			continue;
		return &vif_table[iter->ct];
	}
	return NULL;
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&mrt_lock);
}

static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
	} else {
		const struct vif_device *vif = v;
		const char *name =  vif->dev ? vif->dev->name : "none";

		seq_printf(seq,
			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
			   vif - vif_table,
			   name, vif->bytes_in, vif->pkt_in,
			   vif->bytes_out, vif->pkt_out,
			   vif->flags, vif->local, vif->remote);
	}
	return 0;
}

static const struct seq_operations ipmr_vif_seq_ops = {
	.start = ipmr_vif_seq_start,
	.next  = ipmr_vif_seq_next,
	.stop  = ipmr_vif_seq_stop,
	.show  = ipmr_vif_seq_show,
};

static int ipmr_vif_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;

	rc = seq_open(file, &ipmr_vif_seq_ops);
	if (rc)
		goto out_kfree;

	s->ct = 0;
	seq = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

static const struct file_operations ipmr_vif_fops = {
	.owner	 = THIS_MODULE,
	.open    = ipmr_vif_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};

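/*
 * The MFC /proc iterator walks two lists: first the resolved hash
 * table under mrt_lock, then the unresolved queue under
 * mfc_unres_lock. it->cache records which list (and hence which lock)
 * is current so that ipmr_mfc_seq_stop() can release the right one.
 */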
struct ipmr_mfc_iter {
	struct mfc_cache **cache;
	int ct;
};


static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
{
	struct mfc_cache *mfc;

	it->cache = mfc_cache_array;
	read_lock(&mrt_lock);
	for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
		for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
			if (pos-- == 0)
				return mfc;
	read_unlock(&mrt_lock);

	it->cache = &mfc_unres_queue;
	spin_lock_bh(&mfc_unres_lock);
	for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
		if (pos-- == 0)
			return mfc;
	spin_unlock_bh(&mfc_unres_lock);

	it->cache = NULL;
	return NULL;
}


static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct ipmr_mfc_iter *it = seq->private;
	it->cache = NULL;
	it->ct = 0;
	return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}

static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct mfc_cache *mfc = v;
	struct ipmr_mfc_iter *it = seq->private;

	++*pos;

	if (v == SEQ_START_TOKEN)
		return ipmr_mfc_seq_idx(seq->private, 0);

	if (mfc->next)
		return mfc->next;

	if (it->cache == &mfc_unres_queue)
		goto end_of_list;

	BUG_ON(it->cache != mfc_cache_array);

	while (++it->ct < MFC_LINES) {
		mfc = mfc_cache_array[it->ct];
		if (mfc)
			return mfc;
	}

	/* exhausted cache_array, show unresolved */
	read_unlock(&mrt_lock);
	it->cache = &mfc_unres_queue;
	it->ct = 0;

	spin_lock_bh(&mfc_unres_lock);
	mfc = mfc_unres_queue;
	if (mfc)
		return mfc;

 end_of_list:
	spin_unlock_bh(&mfc_unres_lock);
	it->cache = NULL;

	return NULL;
}

static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
	struct ipmr_mfc_iter *it = seq->private;

	if (it->cache == &mfc_unres_queue)
		spin_unlock_bh(&mfc_unres_lock);
	else if (it->cache == mfc_cache_array)
		read_unlock(&mrt_lock);
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
	int n;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
	} else {
		const struct mfc_cache *mfc = v;
		const struct ipmr_mfc_iter *it = seq->private;

		seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
			   (unsigned long) mfc->mfc_mcastgrp,
			   (unsigned long) mfc->mfc_origin,
			   mfc->mfc_parent,
			   mfc->mfc_un.res.pkt,
			   mfc->mfc_un.res.bytes,
			   mfc->mfc_un.res.wrong_if);

		if (it->cache != &mfc_unres_queue) {
			for (n = mfc->mfc_un.res.minvif;
			     n < mfc->mfc_un.res.maxvif; n++ ) {
				if (VIF_EXISTS(n)
				   && mfc->mfc_un.res.ttls[n] < 255)
				seq_printf(seq,
					   " %2d:%-3d",
					   n, mfc->mfc_un.res.ttls[n]);
			}
		}
		seq_putc(seq, '\n');
	}
	return 0;
}

static const struct seq_operations ipmr_mfc_seq_ops = {
	.start = ipmr_mfc_seq_start,
	.next  = ipmr_mfc_seq_next,
	.stop  = ipmr_mfc_seq_stop,
	.show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;

	rc = seq_open(file, &ipmr_mfc_seq_ops);
	if (rc)
		goto out_kfree;

	seq = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

static const struct file_operations ipmr_mfc_fops = {
	.owner	 = THIS_MODULE,
	.open    = ipmr_mfc_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};
#endif

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol = {
	.handler	=	pim_rcv,
};
#endif


/*
 *	Setup for IP multicast routing
 */

void __init ip_mr_init(void)
{
	mrt_cachep = kmem_cache_create("ip_mrt_cache",
				       sizeof(struct mfc_cache),
				       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
				       NULL, NULL);
	init_timer(&ipmr_expire_timer);
	ipmr_expire_timer.function=ipmr_expire_process;
	register_netdevice_notifier(&ip_mr_notifier);
#ifdef CONFIG_PROC_FS
	proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
	proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
#endif
}