xref: /openbmc/linux/net/ipv4/ipmr.c (revision d5cb9783536a41df9f9cba5b0a1d78047ed787f7)
1 /*
2  *	IP multicast routing support for mrouted 3.6/3.8
3  *
4  *		(c) 1995 Alan Cox, <alan@redhat.com>
5  *	  Linux Consultancy and Custom Driver Development
6  *
7  *	This program is free software; you can redistribute it and/or
8  *	modify it under the terms of the GNU General Public License
9  *	as published by the Free Software Foundation; either version
10  *	2 of the License, or (at your option) any later version.
11  *
12  *	Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *	Fixes:
15  *	Michael Chastain	:	Incorrect size of copying.
16  *	Alan Cox		:	Added the cache manager code
17  *	Alan Cox		:	Fixed the clone/copy bug and device race.
18  *	Mike McLagan		:	Routing by source
19  *	Malcolm Beattie		:	Buffer handling fixes.
20  *	Alexey Kuznetsov	:	Double buffer free and other fixes.
21  *	SVR Anand		:	Fixed several multicast bugs and problems.
22  *	Alexey Kuznetsov	:	Status, optimisations and more.
23  *	Brad Parker		:	Better behaviour on mrouted upcall
24  *					overflow.
25  *      Carlos Picoto           :       PIMv1 Support
26  *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
27  *					Relax this requirement to work with older peers.
28  *
29  */
30 
31 #include <linux/config.h>
32 #include <asm/system.h>
33 #include <asm/uaccess.h>
34 #include <linux/types.h>
35 #include <linux/sched.h>
36 #include <linux/errno.h>
37 #include <linux/timer.h>
38 #include <linux/mm.h>
39 #include <linux/kernel.h>
40 #include <linux/fcntl.h>
41 #include <linux/stat.h>
42 #include <linux/socket.h>
43 #include <linux/in.h>
44 #include <linux/inet.h>
45 #include <linux/netdevice.h>
46 #include <linux/inetdevice.h>
47 #include <linux/igmp.h>
48 #include <linux/proc_fs.h>
49 #include <linux/seq_file.h>
50 #include <linux/mroute.h>
51 #include <linux/init.h>
52 #include <net/ip.h>
53 #include <net/protocol.h>
54 #include <linux/skbuff.h>
55 #include <net/sock.h>
56 #include <net/icmp.h>
57 #include <net/udp.h>
58 #include <net/raw.h>
59 #include <linux/notifier.h>
60 #include <linux/if_arp.h>
61 #include <linux/netfilter_ipv4.h>
62 #include <net/ipip.h>
63 #include <net/checksum.h>
64 
65 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
66 #define CONFIG_IP_PIMSM	1
67 #endif
68 
69 static struct sock *mroute_socket;
70 
71 
72 /* Big lock, protecting the vif table, the mrt cache and the mroute socket state.
73    Note that changes are additionally serialized via rtnl_lock.
74  */
75 
76 static DEFINE_RWLOCK(mrt_lock);
77 
78 /*
79  *	Multicast router control variables
80  */
81 
82 static struct vif_device vif_table[MAXVIFS];		/* Devices 		*/
83 static int maxvif;
84 
85 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
86 
87 static int mroute_do_assert;				/* Set in PIM assert	*/
88 static int mroute_do_pim;
89 
90 static struct mfc_cache *mfc_cache_array[MFC_LINES];	/* Forwarding cache	*/
91 
92 static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */
93 static atomic_t cache_resolve_queue_len;		/* Size of unresolved	*/
94 
95 /* Special spinlock for queue of unresolved entries */
96 static DEFINE_SPINLOCK(mfc_unres_lock);
97 
98 /* We have returned to Alan's original scheme. The hash table of resolved
99    entries is changed only in process context and is protected by the
100    weak (reader/writer) lock mrt_lock. The queue of unresolved entries is
101    protected by the strong spinlock mfc_unres_lock.
102 
103    This way the data path is entirely free of exclusive locks.
104  */
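
/*
 * A minimal sketch (not part of the original code) of the discipline
 * described above, using hypothetical helper functions: updaters run
 * in process context and take the writer side of mrt_lock, while the
 * packet path only ever takes the reader side and uses the entry
 * while the lock is held, so forwarding never waits on an exclusive
 * lock.
 */
#if 0	/* illustrative only, not compiled */
static void example_mfc_insert(struct mfc_cache *c, int line)
{
	write_lock_bh(&mrt_lock);	/* exclude softirq readers */
	c->next = mfc_cache_array[line];
	mfc_cache_array[line] = c;
	write_unlock_bh(&mrt_lock);
}

static void example_data_path(struct sk_buff *skb, int line,
			      __u32 origin, __u32 mcastgrp)
{
	struct mfc_cache *c;

	read_lock(&mrt_lock);		/* data path: shared access only */
	for (c = mfc_cache_array[line]; c; c = c->next)
		if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
			break;
	if (c)
		ip_mr_forward(skb, c, 0);	/* entry used under the lock */
	read_unlock(&mrt_lock);
}
#endif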
105 
106 static kmem_cache_t *mrt_cachep __read_mostly;
107 
108 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
109 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
110 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
111 
112 #ifdef CONFIG_IP_PIMSM_V2
113 static struct net_protocol pim_protocol;
114 #endif
115 
116 static struct timer_list ipmr_expire_timer;
117 
118 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
119 
120 static
121 struct net_device *ipmr_new_tunnel(struct vifctl *v)
122 {
123 	struct net_device  *dev;
124 
125 	dev = __dev_get_by_name("tunl0");
126 
127 	if (dev) {
128 		int err;
129 		struct ifreq ifr;
130 		mm_segment_t	oldfs;
131 		struct ip_tunnel_parm p;
132 		struct in_device  *in_dev;
133 
134 		memset(&p, 0, sizeof(p));
135 		p.iph.daddr = v->vifc_rmt_addr.s_addr;
136 		p.iph.saddr = v->vifc_lcl_addr.s_addr;
137 		p.iph.version = 4;
138 		p.iph.ihl = 5;
139 		p.iph.protocol = IPPROTO_IPIP;
140 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
141 		ifr.ifr_ifru.ifru_data = (void*)&p;
142 
143 		oldfs = get_fs(); set_fs(KERNEL_DS);
144 		err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
145 		set_fs(oldfs);
146 
147 		dev = NULL;
148 
149 		if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
150 			dev->flags |= IFF_MULTICAST;
151 
152 			in_dev = __in_dev_get_rtnl(dev);
153 			if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
154 				goto failure;
155 			in_dev->cnf.rp_filter = 0;
156 
157 			if (dev_open(dev))
158 				goto failure;
159 		}
160 	}
161 	return dev;
162 
163 failure:
164 	/* allow the register to be completed before unregistering. */
165 	rtnl_unlock();
166 	rtnl_lock();
167 
168 	unregister_netdevice(dev);
169 	return NULL;
170 }
171 
172 #ifdef CONFIG_IP_PIMSM
173 
174 static int reg_vif_num = -1;
175 
176 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
177 {
178 	read_lock(&mrt_lock);
179 	((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
180 	((struct net_device_stats*)dev->priv)->tx_packets++;
181 	ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
182 	read_unlock(&mrt_lock);
183 	kfree_skb(skb);
184 	return 0;
185 }
186 
187 static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
188 {
189 	return (struct net_device_stats*)dev->priv;
190 }
191 
192 static void reg_vif_setup(struct net_device *dev)
193 {
194 	dev->type		= ARPHRD_PIMREG;
195 	dev->mtu		= 1500 - sizeof(struct iphdr) - 8;
196 	dev->flags		= IFF_NOARP;
197 	dev->hard_start_xmit	= reg_vif_xmit;
198 	dev->get_stats		= reg_vif_get_stats;
199 	dev->destructor		= free_netdev;
200 }
201 
202 static struct net_device *ipmr_reg_vif(void)
203 {
204 	struct net_device *dev;
205 	struct in_device *in_dev;
206 
207 	dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
208 			   reg_vif_setup);
209 
210 	if (dev == NULL)
211 		return NULL;
212 
213 	if (register_netdevice(dev)) {
214 		free_netdev(dev);
215 		return NULL;
216 	}
217 	dev->iflink = 0;
218 
219 	if ((in_dev = inetdev_init(dev)) == NULL)
220 		goto failure;
221 
222 	in_dev->cnf.rp_filter = 0;
223 
224 	if (dev_open(dev))
225 		goto failure;
226 
227 	return dev;
228 
229 failure:
230 	/* allow the register to be completed before unregistering. */
231 	rtnl_unlock();
232 	rtnl_lock();
233 
234 	unregister_netdevice(dev);
235 	return NULL;
236 }
237 #endif
238 
239 /*
240  *	Delete a VIF entry
241  */
242 
243 static int vif_delete(int vifi)
244 {
245 	struct vif_device *v;
246 	struct net_device *dev;
247 	struct in_device *in_dev;
248 
249 	if (vifi < 0 || vifi >= maxvif)
250 		return -EADDRNOTAVAIL;
251 
252 	v = &vif_table[vifi];
253 
254 	write_lock_bh(&mrt_lock);
255 	dev = v->dev;
256 	v->dev = NULL;
257 
258 	if (!dev) {
259 		write_unlock_bh(&mrt_lock);
260 		return -EADDRNOTAVAIL;
261 	}
262 
263 #ifdef CONFIG_IP_PIMSM
264 	if (vifi == reg_vif_num)
265 		reg_vif_num = -1;
266 #endif
267 
268 	if (vifi+1 == maxvif) {
269 		int tmp;
270 		for (tmp=vifi-1; tmp>=0; tmp--) {
271 			if (VIF_EXISTS(tmp))
272 				break;
273 		}
274 		maxvif = tmp+1;
275 	}
276 
277 	write_unlock_bh(&mrt_lock);
278 
279 	dev_set_allmulti(dev, -1);
280 
281 	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
282 		in_dev->cnf.mc_forwarding--;
283 		ip_rt_multicast_event(in_dev);
284 	}
285 
286 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
287 		unregister_netdevice(dev);
288 
289 	dev_put(dev);
290 	return 0;
291 }
292 
293 /* Destroy an unresolved cache entry, killing the queued skbs
294    and reporting an error to netlink readers.
295  */
296 
297 static void ipmr_destroy_unres(struct mfc_cache *c)
298 {
299 	struct sk_buff *skb;
300 	struct nlmsgerr *e;
301 
302 	atomic_dec(&cache_resolve_queue_len);
303 
304 	while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
305 		if (skb->nh.iph->version == 0) {
306 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
307 			nlh->nlmsg_type = NLMSG_ERROR;
308 			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
309 			skb_trim(skb, nlh->nlmsg_len);
310 			e = NLMSG_DATA(nlh);
311 			e->error = -ETIMEDOUT;
312 			memset(&e->msg, 0, sizeof(e->msg));
313 			netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
314 		} else
315 			kfree_skb(skb);
316 	}
317 
318 	kmem_cache_free(mrt_cachep, c);
319 }
320 
321 
322 /* Single timer process for the whole unresolved queue. */
323 
324 static void ipmr_expire_process(unsigned long dummy)
325 {
326 	unsigned long now;
327 	unsigned long expires;
328 	struct mfc_cache *c, **cp;
329 
330 	if (!spin_trylock(&mfc_unres_lock)) {
331 		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
332 		return;
333 	}
334 
335 	if (atomic_read(&cache_resolve_queue_len) == 0)
336 		goto out;
337 
338 	now = jiffies;
339 	expires = 10*HZ;
340 	cp = &mfc_unres_queue;
341 
342 	while ((c=*cp) != NULL) {
343 		if (time_after(c->mfc_un.unres.expires, now)) {
344 			unsigned long interval = c->mfc_un.unres.expires - now;
345 			if (interval < expires)
346 				expires = interval;
347 			cp = &c->next;
348 			continue;
349 		}
350 
351 		*cp = c->next;
352 
353 		ipmr_destroy_unres(c);
354 	}
355 
356 	if (atomic_read(&cache_resolve_queue_len))
357 		mod_timer(&ipmr_expire_timer, jiffies + expires);
358 
359 out:
360 	spin_unlock(&mfc_unres_lock);
361 }
362 
363 /* Fill the oif list and compute minvif/maxvif. Called with mrt_lock held for writing. (A worked example follows the function.) */
364 
365 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
366 {
367 	int vifi;
368 
369 	cache->mfc_un.res.minvif = MAXVIFS;
370 	cache->mfc_un.res.maxvif = 0;
371 	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
372 
373 	for (vifi=0; vifi<maxvif; vifi++) {
374 		if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
375 			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
376 			if (cache->mfc_un.res.minvif > vifi)
377 				cache->mfc_un.res.minvif = vifi;
378 			if (cache->mfc_un.res.maxvif <= vifi)
379 				cache->mfc_un.res.maxvif = vifi + 1;
380 		}
381 	}
382 }
383 
384 static int vif_add(struct vifctl *vifc, int mrtsock)
385 {
386 	int vifi = vifc->vifc_vifi;
387 	struct vif_device *v = &vif_table[vifi];
388 	struct net_device *dev;
389 	struct in_device *in_dev;
390 
391 	/* Is vif busy ? */
392 	if (VIF_EXISTS(vifi))
393 		return -EADDRINUSE;
394 
395 	switch (vifc->vifc_flags) {
396 #ifdef CONFIG_IP_PIMSM
397 	case VIFF_REGISTER:
398 		/*
399 		 * Special Purpose VIF in PIM
400 		 * All the packets will be sent to the daemon
401 		 */
402 		if (reg_vif_num >= 0)
403 			return -EADDRINUSE;
404 		dev = ipmr_reg_vif();
405 		if (!dev)
406 			return -ENOBUFS;
407 		break;
408 #endif
409 	case VIFF_TUNNEL:
410 		dev = ipmr_new_tunnel(vifc);
411 		if (!dev)
412 			return -ENOBUFS;
413 		break;
414 	case 0:
415 		dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr);
416 		if (!dev)
417 			return -EADDRNOTAVAIL;
418 		__dev_put(dev);
419 		break;
420 	default:
421 		return -EINVAL;
422 	}
423 
424 	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
425 		return -EADDRNOTAVAIL;
426 	in_dev->cnf.mc_forwarding++;
427 	dev_set_allmulti(dev, +1);
428 	ip_rt_multicast_event(in_dev);
429 
430 	/*
431 	 *	Fill in the VIF structures
432 	 */
433 	v->rate_limit=vifc->vifc_rate_limit;
434 	v->local=vifc->vifc_lcl_addr.s_addr;
435 	v->remote=vifc->vifc_rmt_addr.s_addr;
436 	v->flags=vifc->vifc_flags;
437 	if (!mrtsock)
438 		v->flags |= VIFF_STATIC;
439 	v->threshold=vifc->vifc_threshold;
440 	v->bytes_in = 0;
441 	v->bytes_out = 0;
442 	v->pkt_in = 0;
443 	v->pkt_out = 0;
444 	v->link = dev->ifindex;
445 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
446 		v->link = dev->iflink;
447 
448 	/* And finish update writing critical data */
449 	write_lock_bh(&mrt_lock);
450 	dev_hold(dev);
451 	v->dev=dev;
452 #ifdef CONFIG_IP_PIMSM
453 	if (v->flags&VIFF_REGISTER)
454 		reg_vif_num = vifi;
455 #endif
456 	if (vifi+1 > maxvif)
457 		maxvif = vifi+1;
458 	write_unlock_bh(&mrt_lock);
459 	return 0;
460 }
461 
462 static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
463 {
464 	int line=MFC_HASH(mcastgrp,origin);
465 	struct mfc_cache *c;
466 
467 	for (c=mfc_cache_array[line]; c; c = c->next) {
468 		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
469 			break;
470 	}
471 	return c;
472 }
473 
474 /*
475  *	Allocate a multicast cache entry
476  */
477 static struct mfc_cache *ipmr_cache_alloc(void)
478 {
479 	struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
480 	if(c==NULL)
481 		return NULL;
482 	memset(c, 0, sizeof(*c));
483 	c->mfc_un.res.minvif = MAXVIFS;
484 	return c;
485 }
486 
487 static struct mfc_cache *ipmr_cache_alloc_unres(void)
488 {
489 	struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
490 	if(c==NULL)
491 		return NULL;
492 	memset(c, 0, sizeof(*c));
493 	skb_queue_head_init(&c->mfc_un.unres.unresolved);
494 	c->mfc_un.unres.expires = jiffies + 10*HZ;
495 	return c;
496 }
497 
498 /*
499  *	A cache entry has gone from the unresolved queue into a resolved state
500  */
501 
502 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
503 {
504 	struct sk_buff *skb;
505 	struct nlmsgerr *e;
506 
507 	/*
508 	 *	Play the pending entries through our router
509 	 */
510 
511 	while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
512 		if (skb->nh.iph->version == 0) {
513 			int err;
514 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
515 
516 			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
517 				nlh->nlmsg_len = skb->tail - (u8*)nlh;
518 			} else {
519 				nlh->nlmsg_type = NLMSG_ERROR;
520 				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
521 				skb_trim(skb, nlh->nlmsg_len);
522 				e = NLMSG_DATA(nlh);
523 				e->error = -EMSGSIZE;
524 				memset(&e->msg, 0, sizeof(e->msg));
525 			}
526 			err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
527 		} else
528 			ip_mr_forward(skb, c, 0);
529 	}
530 }
531 
532 /*
533  *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
534  *	expects the following bizarre scheme.
535  *
536  *	Called under mrt_lock.
537  */
538 
539 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
540 {
541 	struct sk_buff *skb;
542 	int ihl = pkt->nh.iph->ihl<<2;
543 	struct igmphdr *igmp;
544 	struct igmpmsg *msg;
545 	int ret;
546 
547 #ifdef CONFIG_IP_PIMSM
548 	if (assert == IGMPMSG_WHOLEPKT)
549 		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
550 	else
551 #endif
552 		skb = alloc_skb(128, GFP_ATOMIC);
553 
554 	if(!skb)
555 		return -ENOBUFS;
556 
557 #ifdef CONFIG_IP_PIMSM
558 	if (assert == IGMPMSG_WHOLEPKT) {
559 		/* Ugly, but we have no choice with this interface.
560 		   Duplicate old header, fix ihl, length etc.
561 		   And all this only to mangle msg->im_msgtype and
562 		   to set msg->im_mbz to "mbz" :-)
563 		 */
564 		msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
565 		skb->nh.raw = skb->h.raw = (u8*)msg;
566 		memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
567 		msg->im_msgtype = IGMPMSG_WHOLEPKT;
568 		msg->im_mbz = 0;
569  		msg->im_vif = reg_vif_num;
570 		skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
571 		skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
572 	} else
573 #endif
574 	{
575 
576 	/*
577 	 *	Copy the IP header
578 	 */
579 
580 	skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
581 	memcpy(skb->data,pkt->data,ihl);
582 	skb->nh.iph->protocol = 0;			/* Flag to the kernel this is a route add */
583 	msg = (struct igmpmsg*)skb->nh.iph;
584 	msg->im_vif = vifi;
585 	skb->dst = dst_clone(pkt->dst);
586 
587 	/*
588 	 *	Add our header
589 	 */
590 
591 	igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
592 	igmp->type	=
593 	msg->im_msgtype = assert;
594 	igmp->code 	=	0;
595 	skb->nh.iph->tot_len=htons(skb->len);			/* Fix the length */
596 	skb->h.raw = skb->nh.raw;
597         }
598 
599 	if (mroute_socket == NULL) {
600 		kfree_skb(skb);
601 		return -EINVAL;
602 	}
603 
604 	/*
605 	 *	Deliver to mrouted
606 	 */
607 	if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
608 		if (net_ratelimit())
609 			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
610 		kfree_skb(skb);
611 	}
612 
613 	return ret;
614 }
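
/*
 * A sketch of the other end of the upcall above (assumption: the
 * typical mrouted/pimd structure, not code from this file). The
 * daemon read()s its raw IGMP socket and interprets kernel messages
 * as a struct igmpmsg, dispatching on im_msgtype.
 */
#if 0	/* illustrative only, not compiled */
static void example_daemon_upcall_loop(int mrt_fd)	/* mrt_fd: the MRT_INIT socket */
{
	char buf[1500];
	struct igmpmsg *im = (struct igmpmsg *)buf;

	while (read(mrt_fd, buf, sizeof(buf)) >= (int)sizeof(*im)) {
		switch (im->im_msgtype) {
		case IGMPMSG_NOCACHE:	/* resolve (S,G), then MRT_ADD_MFC */
			break;
		case IGMPMSG_WRONGVIF:	/* feed PIM assert processing */
			break;
		case IGMPMSG_WHOLEPKT:	/* whole packet for PIM register encap */
			break;
		}
	}
}
#endif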
615 
616 /*
617  *	Queue a packet for resolution. It is queued on an unresolved cache entry, created under mfc_unres_lock if need be.
618  */
619 
620 static int
621 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
622 {
623 	int err;
624 	struct mfc_cache *c;
625 
626 	spin_lock_bh(&mfc_unres_lock);
627 	for (c=mfc_unres_queue; c; c=c->next) {
628 		if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
629 		    c->mfc_origin == skb->nh.iph->saddr)
630 			break;
631 	}
632 
633 	if (c == NULL) {
634 		/*
635 		 *	Create a new entry if allowable
636 		 */
637 
638 		if (atomic_read(&cache_resolve_queue_len)>=10 ||
639 		    (c=ipmr_cache_alloc_unres())==NULL) {
640 			spin_unlock_bh(&mfc_unres_lock);
641 
642 			kfree_skb(skb);
643 			return -ENOBUFS;
644 		}
645 
646 		/*
647 		 *	Fill in the new cache entry
648 		 */
649 		c->mfc_parent=-1;
650 		c->mfc_origin=skb->nh.iph->saddr;
651 		c->mfc_mcastgrp=skb->nh.iph->daddr;
652 
653 		/*
654 		 *	Reflect first query at mrouted.
655 		 */
656 		if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
657 			/* If the report failed throw the cache entry
658 			   out - Brad Parker
659 			 */
660 			spin_unlock_bh(&mfc_unres_lock);
661 
662 			kmem_cache_free(mrt_cachep, c);
663 			kfree_skb(skb);
664 			return err;
665 		}
666 
667 		atomic_inc(&cache_resolve_queue_len);
668 		c->next = mfc_unres_queue;
669 		mfc_unres_queue = c;
670 
671 		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
672 	}
673 
674 	/*
675 	 *	See if we can append the packet
676 	 */
677 	if (c->mfc_un.unres.unresolved.qlen>3) {
678 		kfree_skb(skb);
679 		err = -ENOBUFS;
680 	} else {
681 		skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
682 		err = 0;
683 	}
684 
685 	spin_unlock_bh(&mfc_unres_lock);
686 	return err;
687 }
688 
689 /*
690  *	MFC cache manipulation by the user-space mroute daemon
691  */
692 
693 static int ipmr_mfc_delete(struct mfcctl *mfc)
694 {
695 	int line;
696 	struct mfc_cache *c, **cp;
697 
698 	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
699 
700 	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
701 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
702 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
703 			write_lock_bh(&mrt_lock);
704 			*cp = c->next;
705 			write_unlock_bh(&mrt_lock);
706 
707 			kmem_cache_free(mrt_cachep, c);
708 			return 0;
709 		}
710 	}
711 	return -ENOENT;
712 }
713 
714 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
715 {
716 	int line;
717 	struct mfc_cache *uc, *c, **cp;
718 
719 	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
720 
721 	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
722 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
723 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
724 			break;
725 	}
726 
727 	if (c != NULL) {
728 		write_lock_bh(&mrt_lock);
729 		c->mfc_parent = mfc->mfcc_parent;
730 		ipmr_update_thresholds(c, mfc->mfcc_ttls);
731 		if (!mrtsock)
732 			c->mfc_flags |= MFC_STATIC;
733 		write_unlock_bh(&mrt_lock);
734 		return 0;
735 	}
736 
737 	if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
738 		return -EINVAL;
739 
740 	c=ipmr_cache_alloc();
741 	if (c==NULL)
742 		return -ENOMEM;
743 
744 	c->mfc_origin=mfc->mfcc_origin.s_addr;
745 	c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
746 	c->mfc_parent=mfc->mfcc_parent;
747 	ipmr_update_thresholds(c, mfc->mfcc_ttls);
748 	if (!mrtsock)
749 		c->mfc_flags |= MFC_STATIC;
750 
751 	write_lock_bh(&mrt_lock);
752 	c->next = mfc_cache_array[line];
753 	mfc_cache_array[line] = c;
754 	write_unlock_bh(&mrt_lock);
755 
756 	/*
757 	 *	Check to see if we resolved a queued list. If so we
758 	 *	need to send on the frames and tidy up.
759 	 */
760 	spin_lock_bh(&mfc_unres_lock);
761 	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
762 	     cp = &uc->next) {
763 		if (uc->mfc_origin == c->mfc_origin &&
764 		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
765 			*cp = uc->next;
766 			if (atomic_dec_and_test(&cache_resolve_queue_len))
767 				del_timer(&ipmr_expire_timer);
768 			break;
769 		}
770 	}
771 	spin_unlock_bh(&mfc_unres_lock);
772 
773 	if (uc) {
774 		ipmr_cache_resolve(uc, c);
775 		kmem_cache_free(mrt_cachep, uc);
776 	}
777 	return 0;
778 }
779 
780 /*
781  *	Close the multicast socket, and clear the vif tables etc
782  */
783 
784 static void mroute_clean_tables(struct sock *sk)
785 {
786 	int i;
787 
788 	/*
789 	 *	Shut down all active vif entries
790 	 */
791 	for(i=0; i<maxvif; i++) {
792 		if (!(vif_table[i].flags&VIFF_STATIC))
793 			vif_delete(i);
794 	}
795 
796 	/*
797 	 *	Wipe the cache
798 	 */
799 	for (i=0;i<MFC_LINES;i++) {
800 		struct mfc_cache *c, **cp;
801 
802 		cp = &mfc_cache_array[i];
803 		while ((c = *cp) != NULL) {
804 			if (c->mfc_flags&MFC_STATIC) {
805 				cp = &c->next;
806 				continue;
807 			}
808 			write_lock_bh(&mrt_lock);
809 			*cp = c->next;
810 			write_unlock_bh(&mrt_lock);
811 
812 			kmem_cache_free(mrt_cachep, c);
813 		}
814 	}
815 
816 	if (atomic_read(&cache_resolve_queue_len) != 0) {
817 		struct mfc_cache *c;
818 
819 		spin_lock_bh(&mfc_unres_lock);
820 		while (mfc_unres_queue != NULL) {
821 			c = mfc_unres_queue;
822 			mfc_unres_queue = c->next;
823 			spin_unlock_bh(&mfc_unres_lock);
824 
825 			ipmr_destroy_unres(c);
826 
827 			spin_lock_bh(&mfc_unres_lock);
828 		}
829 		spin_unlock_bh(&mfc_unres_lock);
830 	}
831 }
832 
833 static void mrtsock_destruct(struct sock *sk)
834 {
835 	rtnl_lock();
836 	if (sk == mroute_socket) {
837 		ipv4_devconf.mc_forwarding--;
838 
839 		write_lock_bh(&mrt_lock);
840 		mroute_socket=NULL;
841 		write_unlock_bh(&mrt_lock);
842 
843 		mroute_clean_tables(sk);
844 	}
845 	rtnl_unlock();
846 }
847 
848 /*
849  *	Socket options and virtual interface manipulation. The whole
850  *	virtual interface system is a complete heap, but unfortunately
851  *	that's how BSD mrouted happens to think. Maybe one day with a proper
852  *	MOSPF/PIM router set up we can clean this up.
853  */
854 
855 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
856 {
857 	int ret;
858 	struct vifctl vif;
859 	struct mfcctl mfc;
860 
861 	if(optname!=MRT_INIT)
862 	{
863 		if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
864 			return -EACCES;
865 	}
866 
867 	switch(optname)
868 	{
869 		case MRT_INIT:
870 			if (sk->sk_type != SOCK_RAW ||
871 			    inet_sk(sk)->num != IPPROTO_IGMP)
872 				return -EOPNOTSUPP;
873 			if(optlen!=sizeof(int))
874 				return -ENOPROTOOPT;
875 
876 			rtnl_lock();
877 			if (mroute_socket) {
878 				rtnl_unlock();
879 				return -EADDRINUSE;
880 			}
881 
882 			ret = ip_ra_control(sk, 1, mrtsock_destruct);
883 			if (ret == 0) {
884 				write_lock_bh(&mrt_lock);
885 				mroute_socket=sk;
886 				write_unlock_bh(&mrt_lock);
887 
888 				ipv4_devconf.mc_forwarding++;
889 			}
890 			rtnl_unlock();
891 			return ret;
892 		case MRT_DONE:
893 			if (sk!=mroute_socket)
894 				return -EACCES;
895 			return ip_ra_control(sk, 0, NULL);
896 		case MRT_ADD_VIF:
897 		case MRT_DEL_VIF:
898 			if(optlen!=sizeof(vif))
899 				return -EINVAL;
900 			if (copy_from_user(&vif,optval,sizeof(vif)))
901 				return -EFAULT;
902 			if(vif.vifc_vifi >= MAXVIFS)
903 				return -ENFILE;
904 			rtnl_lock();
905 			if (optname==MRT_ADD_VIF) {
906 				ret = vif_add(&vif, sk==mroute_socket);
907 			} else {
908 				ret = vif_delete(vif.vifc_vifi);
909 			}
910 			rtnl_unlock();
911 			return ret;
912 
913 		/*
914 		 *	Manipulate the forwarding caches. These live
915 		 *	in a sort of kernel/user symbiosis.
916 		 */
917 		case MRT_ADD_MFC:
918 		case MRT_DEL_MFC:
919 			if(optlen!=sizeof(mfc))
920 				return -EINVAL;
921 			if (copy_from_user(&mfc,optval, sizeof(mfc)))
922 				return -EFAULT;
923 			rtnl_lock();
924 			if (optname==MRT_DEL_MFC)
925 				ret = ipmr_mfc_delete(&mfc);
926 			else
927 				ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
928 			rtnl_unlock();
929 			return ret;
930 		/*
931 		 *	Control PIM assert.
932 		 */
933 		case MRT_ASSERT:
934 		{
935 			int v;
936 			if(get_user(v,(int __user *)optval))
937 				return -EFAULT;
938 			mroute_do_assert=(v)?1:0;
939 			return 0;
940 		}
941 #ifdef CONFIG_IP_PIMSM
942 		case MRT_PIM:
943 		{
944 			int v, ret;
945 			if(get_user(v,(int __user *)optval))
946 				return -EFAULT;
947 			v = (v)?1:0;
948 			rtnl_lock();
949 			ret = 0;
950 			if (v != mroute_do_pim) {
951 				mroute_do_pim = v;
952 				mroute_do_assert = v;
953 #ifdef CONFIG_IP_PIMSM_V2
954 				if (mroute_do_pim)
955 					ret = inet_add_protocol(&pim_protocol,
956 								IPPROTO_PIM);
957 				else
958 					ret = inet_del_protocol(&pim_protocol,
959 								IPPROTO_PIM);
960 				if (ret < 0)
961 					ret = -EAGAIN;
962 #endif
963 			}
964 			rtnl_unlock();
965 			return ret;
966 		}
967 #endif
968 		/*
969 		 *	Spurious command, or MRT_VERSION which you cannot
970 		 *	set.
971 		 */
972 		default:
973 			return -ENOPROTOOPT;
974 	}
975 }
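
/*
 * A user-space sketch of the option sequence handled above
 * (assumption: typical daemon start-up, not code from this file).
 * The MRT_* options live at the IPPROTO_IP level on the raw IGMP
 * socket that becomes mroute_socket via MRT_INIT.
 */
#if 0	/* illustrative only, not compiled */
static void example_daemon_setup(void)
{
	int one = 1;
	struct vifctl vc = { .vifc_vifi = 0, .vifc_threshold = 1 };
	struct mfcctl mc;	/* would be filled from an IGMPMSG_NOCACHE upcall */
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);

	setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one));	/* become mroute_socket */
	setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));	/* vif_add() */
	setsockopt(fd, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));	/* ipmr_mfc_add() */
	setsockopt(fd, IPPROTO_IP, MRT_DONE, &one, sizeof(one));	/* mrtsock teardown */
}
#endif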
976 
977 /*
978  *	Getsockopt support for the multicast routing system.
979  */
980 
981 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
982 {
983 	int olr;
984 	int val;
985 
986 	if(optname!=MRT_VERSION &&
987 #ifdef CONFIG_IP_PIMSM
988 	   optname!=MRT_PIM &&
989 #endif
990 	   optname!=MRT_ASSERT)
991 		return -ENOPROTOOPT;
992 
993 	if (get_user(olr, optlen))
994 		return -EFAULT;
995 
996 	if (olr < 0)		/* reject negative lengths before the unsigned clamp */
997 		return -EINVAL;
998 	olr = min_t(unsigned int, olr, sizeof(int));
999 
1000 	if(put_user(olr,optlen))
1001 		return -EFAULT;
1002 	if(optname==MRT_VERSION)
1003 		val=0x0305;
1004 #ifdef CONFIG_IP_PIMSM
1005 	else if(optname==MRT_PIM)
1006 		val=mroute_do_pim;
1007 #endif
1008 	else
1009 		val=mroute_do_assert;
1010 	if(copy_to_user(optval,&val,olr))
1011 		return -EFAULT;
1012 	return 0;
1013 }
1014 
1015 /*
1016  *	The IP multicast ioctl support routines.
1017  */
1018 
1019 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1020 {
1021 	struct sioc_sg_req sr;
1022 	struct sioc_vif_req vr;
1023 	struct vif_device *vif;
1024 	struct mfc_cache *c;
1025 
1026 	switch(cmd)
1027 	{
1028 		case SIOCGETVIFCNT:
1029 			if (copy_from_user(&vr,arg,sizeof(vr)))
1030 				return -EFAULT;
1031 			if(vr.vifi>=maxvif)
1032 				return -EINVAL;
1033 			read_lock(&mrt_lock);
1034 			vif=&vif_table[vr.vifi];
1035 			if(VIF_EXISTS(vr.vifi))	{
1036 				vr.icount=vif->pkt_in;
1037 				vr.ocount=vif->pkt_out;
1038 				vr.ibytes=vif->bytes_in;
1039 				vr.obytes=vif->bytes_out;
1040 				read_unlock(&mrt_lock);
1041 
1042 				if (copy_to_user(arg,&vr,sizeof(vr)))
1043 					return -EFAULT;
1044 				return 0;
1045 			}
1046 			read_unlock(&mrt_lock);
1047 			return -EADDRNOTAVAIL;
1048 		case SIOCGETSGCNT:
1049 			if (copy_from_user(&sr,arg,sizeof(sr)))
1050 				return -EFAULT;
1051 
1052 			read_lock(&mrt_lock);
1053 			c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1054 			if (c) {
1055 				sr.pktcnt = c->mfc_un.res.pkt;
1056 				sr.bytecnt = c->mfc_un.res.bytes;
1057 				sr.wrong_if = c->mfc_un.res.wrong_if;
1058 				read_unlock(&mrt_lock);
1059 
1060 				if (copy_to_user(arg,&sr,sizeof(sr)))
1061 					return -EFAULT;
1062 				return 0;
1063 			}
1064 			read_unlock(&mrt_lock);
1065 			return -EADDRNOTAVAIL;
1066 		default:
1067 			return -ENOIOCTLCMD;
1068 	}
1069 }
1070 
1071 
1072 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1073 {
1074 	struct vif_device *v;
1075 	int ct;
1076 	if (event != NETDEV_UNREGISTER)
1077 		return NOTIFY_DONE;
1078 	v=&vif_table[0];
1079 	for(ct=0;ct<maxvif;ct++,v++) {
1080 		if (v->dev==ptr)
1081 			vif_delete(ct);
1082 	}
1083 	return NOTIFY_DONE;
1084 }
1085 
1086 
1087 static struct notifier_block ip_mr_notifier={
1088 	.notifier_call = ipmr_device_event,
1089 };
1090 
1091 /*
1092  * 	Encapsulate a packet by attaching a valid IPIP header to it.
1093  *	This avoids tunnel drivers and other mess and gives us the speed so
1094  *	important for multicast video.
1095  */
1096 
1097 static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
1098 {
1099 	struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
1100 
1101 	iph->version	= 	4;
1102 	iph->tos	=	skb->nh.iph->tos;
1103 	iph->ttl	=	skb->nh.iph->ttl;
1104 	iph->frag_off	=	0;
1105 	iph->daddr	=	daddr;
1106 	iph->saddr	=	saddr;
1107 	iph->protocol	=	IPPROTO_IPIP;
1108 	iph->ihl	=	5;
1109 	iph->tot_len	=	htons(skb->len);
1110 	ip_select_ident(iph, skb->dst, NULL);
1111 	ip_send_check(iph);
1112 
1113 	skb->h.ipiph = skb->nh.iph;
1114 	skb->nh.iph = iph;
1115 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1116 	nf_reset(skb);
1117 }
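
/*
 * Resulting packet layout after ip_encap() (illustration):
 *
 *   +------------------------------+  <- skb->nh.iph (new outer header)
 *   | iphdr: proto IPPROTO_IPIP,   |
 *   |        saddr/daddr from vif  |
 *   +------------------------------+  <- skb->h.ipiph (original header)
 *   | original iphdr               |
 *   +------------------------------+
 *   | original payload             |
 *   +------------------------------+
 */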
1118 
1119 static inline int ipmr_forward_finish(struct sk_buff *skb)
1120 {
1121 	struct ip_options * opt	= &(IPCB(skb)->opt);
1122 
1123 	IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1124 
1125 	if (unlikely(opt->optlen))
1126 		ip_forward_options(skb);
1127 
1128 	return dst_output(skb);
1129 }
1130 
1131 /*
1132  *	Processing handlers for ipmr_forward
1133  */
1134 
1135 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1136 {
1137 	struct iphdr *iph = skb->nh.iph;
1138 	struct vif_device *vif = &vif_table[vifi];
1139 	struct net_device *dev;
1140 	struct rtable *rt;
1141 	int    encap = 0;
1142 
1143 	if (vif->dev == NULL)
1144 		goto out_free;
1145 
1146 #ifdef CONFIG_IP_PIMSM
1147 	if (vif->flags & VIFF_REGISTER) {
1148 		vif->pkt_out++;
1149 		vif->bytes_out+=skb->len;
1150 		((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len;
1151 		((struct net_device_stats*)vif->dev->priv)->tx_packets++;
1152 		ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1153 		kfree_skb(skb);
1154 		return;
1155 	}
1156 #endif
1157 
1158 	if (vif->flags&VIFF_TUNNEL) {
1159 		struct flowi fl = { .oif = vif->link,
1160 				    .nl_u = { .ip4_u =
1161 					      { .daddr = vif->remote,
1162 						.saddr = vif->local,
1163 						.tos = RT_TOS(iph->tos) } },
1164 				    .proto = IPPROTO_IPIP };
1165 		if (ip_route_output_key(&rt, &fl))
1166 			goto out_free;
1167 		encap = sizeof(struct iphdr);
1168 	} else {
1169 		struct flowi fl = { .oif = vif->link,
1170 				    .nl_u = { .ip4_u =
1171 					      { .daddr = iph->daddr,
1172 						.tos = RT_TOS(iph->tos) } },
1173 				    .proto = IPPROTO_IPIP };
1174 		if (ip_route_output_key(&rt, &fl))
1175 			goto out_free;
1176 	}
1177 
1178 	dev = rt->u.dst.dev;
1179 
1180 	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1181 		/* Do not fragment multicasts. Alas, IPv4 does not
1182 		   allow us to send ICMP here, so such packets simply
1183 		   disappear into a black hole.
1184 		 */
1185 
1186 		IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1187 		ip_rt_put(rt);
1188 		goto out_free;
1189 	}
1190 
1191 	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1192 
1193 	if (skb_cow(skb, encap)) {
1194  		ip_rt_put(rt);
1195 		goto out_free;
1196 	}
1197 
1198 	vif->pkt_out++;
1199 	vif->bytes_out+=skb->len;
1200 
1201 	dst_release(skb->dst);
1202 	skb->dst = &rt->u.dst;
1203 	iph = skb->nh.iph;
1204 	ip_decrease_ttl(iph);
1205 
1206 	/* FIXME: forward and output firewalls used to be called here.
1207 	 * What do we do with netfilter? -- RR */
1208 	if (vif->flags & VIFF_TUNNEL) {
1209 		ip_encap(skb, vif->local, vif->remote);
1210 		/* FIXME: extra output firewall step used to be here. --RR */
1211 		((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
1212 		((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len;
1213 	}
1214 
1215 	IPCB(skb)->flags |= IPSKB_FORWARDED;
1216 
1217 	/*
1218 	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1219 	 * not only before forwarding, but also after forwarding on all output
1220 	 * interfaces. Clearly, if an mrouter runs a multicast program, that
1221 	 * program should receive packets regardless of the interface it is
1222 	 * joined on.
1223 	 * If we did not do this, the program would have to join on all
1224 	 * interfaces. On the other hand, a multihomed host (or a router, but
1225 	 * not an mrouter) cannot join on more than one interface - that would
1226 	 * result in receiving duplicate packets.
1227 	 */
1228 	NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
1229 		ipmr_forward_finish);
1230 	return;
1231 
1232 out_free:
1233 	kfree_skb(skb);
1234 	return;
1235 }
1236 
1237 static int ipmr_find_vif(struct net_device *dev)
1238 {
1239 	int ct;
1240 	for (ct=maxvif-1; ct>=0; ct--) {
1241 		if (vif_table[ct].dev == dev)
1242 			break;
1243 	}
1244 	return ct;
1245 }
1246 
1247 /* "local" means that we should preserve one skb (for local delivery) */
1248 
1249 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1250 {
1251 	int psend = -1;
1252 	int vif, ct;
1253 
1254 	vif = cache->mfc_parent;
1255 	cache->mfc_un.res.pkt++;
1256 	cache->mfc_un.res.bytes += skb->len;
1257 
1258 	/*
1259 	 * Wrong interface: drop packet and (maybe) send PIM assert.
1260 	 */
1261 	if (vif_table[vif].dev != skb->dev) {
1262 		int true_vifi;
1263 
1264 		if (((struct rtable*)skb->dst)->fl.iif == 0) {
1265 			/* It is our own packet, looped back.
1266 			   Very complicated situation...
1267 
1268 			   The best workaround until routing daemons are
1269 			   fixed is not to redistribute a packet if it was
1270 			   sent through the wrong interface. This means that
1271 			   multicast applications WILL NOT work for
1272 			   (S,G) entries whose default multicast route points
1273 			   to the wrong oif. In any case, it is not a good
1274 			   idea to run multicast applications on a router.
1275 			 */
1276 			goto dont_forward;
1277 		}
1278 
1279 		cache->mfc_un.res.wrong_if++;
1280 		true_vifi = ipmr_find_vif(skb->dev);
1281 
1282 		if (true_vifi >= 0 && mroute_do_assert &&
1283 		    /* PIM-SM uses asserts when switching from the RPT to
1284 		       the SPT, so we cannot check that the packet arrived
1285 		       on an oif. This is bad, but otherwise we would need
1286 		       to move a pretty large chunk of pimd into the kernel. Ough... --ANK
1287 		     */
1288 		    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1289 		    time_after(jiffies,
1290 			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1291 			cache->mfc_un.res.last_assert = jiffies;
1292 			ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1293 		}
1294 		goto dont_forward;
1295 	}
1296 
1297 	vif_table[vif].pkt_in++;
1298 	vif_table[vif].bytes_in+=skb->len;
1299 
1300 	/*
1301 	 *	Forward the frame
1302 	 */
1303 	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1304 		if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
1305 			if (psend != -1) {
1306 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1307 				if (skb2)
1308 					ipmr_queue_xmit(skb2, cache, psend);
1309 			}
1310 			psend=ct;
1311 		}
1312 	}
1313 	if (psend != -1) {
1314 		if (local) {
1315 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1316 			if (skb2)
1317 				ipmr_queue_xmit(skb2, cache, psend);
1318 		} else {
1319 			ipmr_queue_xmit(skb, cache, psend);
1320 			return 0;
1321 		}
1322 	}
1323 
1324 dont_forward:
1325 	if (!local)
1326 		kfree_skb(skb);
1327 	return 0;
1328 }
1329 
1330 
1331 /*
1332  *	Multicast packets for forwarding arrive here
1333  */
1334 
1335 int ip_mr_input(struct sk_buff *skb)
1336 {
1337 	struct mfc_cache *cache;
1338 	int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1339 
1340 	/* A packet looped back after forwarding must not be forwarded
1341 	   a second time, but it can still be delivered locally.
1342 	 */
1343 	if (IPCB(skb)->flags&IPSKB_FORWARDED)
1344 		goto dont_forward;
1345 
1346 	if (!local) {
1347 		    if (IPCB(skb)->opt.router_alert) {
1348 			    if (ip_call_ra_chain(skb))
1349 				    return 0;
1350 		    } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
1351 			    /* IGMPv1 (and broken IGMPv2 implementations sort of
1352 			       Cisco IOS <= 11.2(8)) do not put router alert
1353 			       option to IGMP packets destined to routable
1354 			       groups. It is very bad, because it means
1355 			       that we can forward NO IGMP messages.
1356 			     */
1357 			    read_lock(&mrt_lock);
1358 			    if (mroute_socket) {
1359 				    nf_reset(skb);
1360 				    raw_rcv(mroute_socket, skb);
1361 				    read_unlock(&mrt_lock);
1362 				    return 0;
1363 			    }
1364 			    read_unlock(&mrt_lock);
1365 		    }
1366 	}
1367 
1368 	read_lock(&mrt_lock);
1369 	cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
1370 
1371 	/*
1372 	 *	No usable cache entry
1373 	 */
1374 	if (cache==NULL) {
1375 		int vif;
1376 
1377 		if (local) {
1378 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1379 			ip_local_deliver(skb);
1380 			if (skb2 == NULL) {
1381 				read_unlock(&mrt_lock);
1382 				return -ENOBUFS;
1383 			}
1384 			skb = skb2;
1385 		}
1386 
1387 		vif = ipmr_find_vif(skb->dev);
1388 		if (vif >= 0) {
1389 			int err = ipmr_cache_unresolved(vif, skb);
1390 			read_unlock(&mrt_lock);
1391 
1392 			return err;
1393 		}
1394 		read_unlock(&mrt_lock);
1395 		kfree_skb(skb);
1396 		return -ENODEV;
1397 	}
1398 
1399 	ip_mr_forward(skb, cache, local);
1400 
1401 	read_unlock(&mrt_lock);
1402 
1403 	if (local)
1404 		return ip_local_deliver(skb);
1405 
1406 	return 0;
1407 
1408 dont_forward:
1409 	if (local)
1410 		return ip_local_deliver(skb);
1411 	kfree_skb(skb);
1412 	return 0;
1413 }
1414 
1415 #ifdef CONFIG_IP_PIMSM_V1
1416 /*
1417  * Handle PIMv1 messages, which arrive encapsulated in IGMP
1418  */
1419 
1420 int pim_rcv_v1(struct sk_buff * skb)
1421 {
1422 	struct igmphdr *pim;
1423 	struct iphdr   *encap;
1424 	struct net_device  *reg_dev = NULL;
1425 
1426 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1427 		goto drop;
1428 
1429 	pim = (struct igmphdr*)skb->h.raw;
1430 
1431         if (!mroute_do_pim ||
1432 	    skb->len < sizeof(*pim) + sizeof(*encap) ||
1433 	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1434 		goto drop;
1435 
1436 	encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
1437 	/*
1438 	   Check that:
1439 	   a. packet is really destined to a multicast group
1440 	   b. packet is not a NULL-REGISTER
1441 	   c. packet is not truncated
1442 	 */
1443 	if (!MULTICAST(encap->daddr) ||
1444 	    encap->tot_len == 0 ||
1445 	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1446 		goto drop;
1447 
1448 	read_lock(&mrt_lock);
1449 	if (reg_vif_num >= 0)
1450 		reg_dev = vif_table[reg_vif_num].dev;
1451 	if (reg_dev)
1452 		dev_hold(reg_dev);
1453 	read_unlock(&mrt_lock);
1454 
1455 	if (reg_dev == NULL)
1456 		goto drop;
1457 
1458 	skb->mac.raw = skb->nh.raw;
1459 	skb_pull(skb, (u8*)encap - skb->data);
1460 	skb->nh.iph = (struct iphdr *)skb->data;
1461 	skb->dev = reg_dev;
1462 	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1463 	skb->protocol = htons(ETH_P_IP);
1464 	skb->ip_summed = 0;
1465 	skb->pkt_type = PACKET_HOST;
1466 	dst_release(skb->dst);
1467 	skb->dst = NULL;
1468 	((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1469 	((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1470 	nf_reset(skb);
1471 	netif_rx(skb);
1472 	dev_put(reg_dev);
1473 	return 0;
1474  drop:
1475 	kfree_skb(skb);
1476 	return 0;
1477 }
1478 #endif
1479 
1480 #ifdef CONFIG_IP_PIMSM_V2
1481 static int pim_rcv(struct sk_buff * skb)
1482 {
1483 	struct pimreghdr *pim;
1484 	struct iphdr   *encap;
1485 	struct net_device  *reg_dev = NULL;
1486 
1487 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1488 		goto drop;
1489 
1490 	pim = (struct pimreghdr*)skb->h.raw;
1491         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1492 	    (pim->flags&PIM_NULL_REGISTER) ||
1493 	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1494 	     (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1495 		goto drop;
1496 
1497 	/* check that the inner packet is destined to a multicast group */
1498 	encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
1499 	if (!MULTICAST(encap->daddr) ||
1500 	    encap->tot_len == 0 ||
1501 	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1502 		goto drop;
1503 
1504 	read_lock(&mrt_lock);
1505 	if (reg_vif_num >= 0)
1506 		reg_dev = vif_table[reg_vif_num].dev;
1507 	if (reg_dev)
1508 		dev_hold(reg_dev);
1509 	read_unlock(&mrt_lock);
1510 
1511 	if (reg_dev == NULL)
1512 		goto drop;
1513 
1514 	skb->mac.raw = skb->nh.raw;
1515 	skb_pull(skb, (u8*)encap - skb->data);
1516 	skb->nh.iph = (struct iphdr *)skb->data;
1517 	skb->dev = reg_dev;
1518 	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1519 	skb->protocol = htons(ETH_P_IP);
1520 	skb->ip_summed = 0;
1521 	skb->pkt_type = PACKET_HOST;
1522 	dst_release(skb->dst);
1523 	((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1524 	((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1525 	skb->dst = NULL;
1526 	nf_reset(skb);
1527 	netif_rx(skb);
1528 	dev_put(reg_dev);
1529 	return 0;
1530  drop:
1531 	kfree_skb(skb);
1532 	return 0;
1533 }
1534 #endif
1535 
1536 static int
1537 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1538 {
1539 	int ct;
1540 	struct rtnexthop *nhp;
1541 	struct net_device *dev = vif_table[c->mfc_parent].dev;
1542 	u8 *b = skb->tail;
1543 	struct rtattr *mp_head;
1544 
1545 	if (dev)
1546 		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1547 
1548 	mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1549 
1550 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1551 		if (c->mfc_un.res.ttls[ct] < 255) {
1552 			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1553 				goto rtattr_failure;
1554 			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1555 			nhp->rtnh_flags = 0;
1556 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1557 			nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1558 			nhp->rtnh_len = sizeof(*nhp);
1559 		}
1560 	}
1561 	mp_head->rta_type = RTA_MULTIPATH;
1562 	mp_head->rta_len = skb->tail - (u8*)mp_head;
1563 	rtm->rtm_type = RTN_MULTICAST;
1564 	return 1;
1565 
1566 rtattr_failure:
1567 	skb_trim(skb, b - skb->data);
1568 	return -EMSGSIZE;
1569 }
1570 
1571 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1572 {
1573 	int err;
1574 	struct mfc_cache *cache;
1575 	struct rtable *rt = (struct rtable*)skb->dst;
1576 
1577 	read_lock(&mrt_lock);
1578 	cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1579 
1580 	if (cache==NULL) {
1581 		struct net_device *dev;
1582 		int vif;
1583 
1584 		if (nowait) {
1585 			read_unlock(&mrt_lock);
1586 			return -EAGAIN;
1587 		}
1588 
1589 		dev = skb->dev;
1590 		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1591 			read_unlock(&mrt_lock);
1592 			return -ENODEV;
1593 		}
1594 		skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
1595 		skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
1596 		skb->nh.iph->saddr = rt->rt_src;
1597 		skb->nh.iph->daddr = rt->rt_dst;
1598 		skb->nh.iph->version = 0;
1599 		err = ipmr_cache_unresolved(vif, skb);
1600 		read_unlock(&mrt_lock);
1601 		return err;
1602 	}
1603 
1604 	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1605 		cache->mfc_flags |= MFC_NOTIFY;
1606 	err = ipmr_fill_mroute(skb, cache, rtm);
1607 	read_unlock(&mrt_lock);
1608 	return err;
1609 }
1610 
1611 #ifdef CONFIG_PROC_FS
1612 /*
1613  *	The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1614  */
1615 struct ipmr_vif_iter {
1616 	int ct;
1617 };
1618 
1619 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1620 					   loff_t pos)
1621 {
1622 	for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1623 		if(!VIF_EXISTS(iter->ct))
1624 			continue;
1625 		if (pos-- == 0)
1626 			return &vif_table[iter->ct];
1627 	}
1628 	return NULL;
1629 }
1630 
1631 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1632 {
1633 	read_lock(&mrt_lock);
1634 	return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1635 		: SEQ_START_TOKEN;
1636 }
1637 
1638 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1639 {
1640 	struct ipmr_vif_iter *iter = seq->private;
1641 
1642 	++*pos;
1643 	if (v == SEQ_START_TOKEN)
1644 		return ipmr_vif_seq_idx(iter, 0);
1645 
1646 	while (++iter->ct < maxvif) {
1647 		if(!VIF_EXISTS(iter->ct))
1648 			continue;
1649 		return &vif_table[iter->ct];
1650 	}
1651 	return NULL;
1652 }
1653 
1654 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1655 {
1656 	read_unlock(&mrt_lock);
1657 }
1658 
1659 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1660 {
1661 	if (v == SEQ_START_TOKEN) {
1662 		seq_puts(seq,
1663 			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1664 	} else {
1665 		const struct vif_device *vif = v;
1666 		const char *name =  vif->dev ? vif->dev->name : "none";
1667 
1668 		seq_printf(seq,
1669 			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1670 			   vif - vif_table,
1671 			   name, vif->bytes_in, vif->pkt_in,
1672 			   vif->bytes_out, vif->pkt_out,
1673 			   vif->flags, vif->local, vif->remote);
1674 	}
1675 	return 0;
1676 }
1677 
1678 static struct seq_operations ipmr_vif_seq_ops = {
1679 	.start = ipmr_vif_seq_start,
1680 	.next  = ipmr_vif_seq_next,
1681 	.stop  = ipmr_vif_seq_stop,
1682 	.show  = ipmr_vif_seq_show,
1683 };
1684 
1685 static int ipmr_vif_open(struct inode *inode, struct file *file)
1686 {
1687 	struct seq_file *seq;
1688 	int rc = -ENOMEM;
1689 	struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1690 
1691 	if (!s)
1692 		goto out;
1693 
1694 	rc = seq_open(file, &ipmr_vif_seq_ops);
1695 	if (rc)
1696 		goto out_kfree;
1697 
1698 	s->ct = 0;
1699 	seq = file->private_data;
1700 	seq->private = s;
1701 out:
1702 	return rc;
1703 out_kfree:
1704 	kfree(s);
1705 	goto out;
1706 
1707 }
1708 
1709 static struct file_operations ipmr_vif_fops = {
1710 	.owner	 = THIS_MODULE,
1711 	.open    = ipmr_vif_open,
1712 	.read    = seq_read,
1713 	.llseek  = seq_lseek,
1714 	.release = seq_release_private,
1715 };
1716 
1717 struct ipmr_mfc_iter {
1718 	struct mfc_cache **cache;
1719 	int ct;
1720 };
1721 
1722 
1723 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1724 {
1725 	struct mfc_cache *mfc;
1726 
1727 	it->cache = mfc_cache_array;
1728 	read_lock(&mrt_lock);
1729 	for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1730 		for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1731 			if (pos-- == 0)
1732 				return mfc;
1733 	read_unlock(&mrt_lock);
1734 
1735 	it->cache = &mfc_unres_queue;
1736 	spin_lock_bh(&mfc_unres_lock);
1737 	for(mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1738 		if (pos-- == 0)
1739 			return mfc;
1740 	spin_unlock_bh(&mfc_unres_lock);
1741 
1742 	it->cache = NULL;
1743 	return NULL;
1744 }
1745 
1746 
1747 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1748 {
1749 	struct ipmr_mfc_iter *it = seq->private;
1750 	it->cache = NULL;
1751 	it->ct = 0;
1752 	return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1753 		: SEQ_START_TOKEN;
1754 }
1755 
1756 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1757 {
1758 	struct mfc_cache *mfc = v;
1759 	struct ipmr_mfc_iter *it = seq->private;
1760 
1761 	++*pos;
1762 
1763 	if (v == SEQ_START_TOKEN)
1764 		return ipmr_mfc_seq_idx(seq->private, 0);
1765 
1766 	if (mfc->next)
1767 		return mfc->next;
1768 
1769 	if (it->cache == &mfc_unres_queue)
1770 		goto end_of_list;
1771 
1772 	BUG_ON(it->cache != mfc_cache_array);
1773 
1774 	while (++it->ct < MFC_LINES) {
1775 		mfc = mfc_cache_array[it->ct];
1776 		if (mfc)
1777 			return mfc;
1778 	}
1779 
1780 	/* exhausted cache_array, show unresolved */
1781 	read_unlock(&mrt_lock);
1782 	it->cache = &mfc_unres_queue;
1783 	it->ct = 0;
1784 
1785 	spin_lock_bh(&mfc_unres_lock);
1786 	mfc = mfc_unres_queue;
1787 	if (mfc)
1788 		return mfc;
1789 
1790  end_of_list:
1791 	spin_unlock_bh(&mfc_unres_lock);
1792 	it->cache = NULL;
1793 
1794 	return NULL;
1795 }
1796 
1797 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1798 {
1799 	struct ipmr_mfc_iter *it = seq->private;
1800 
1801 	if (it->cache == &mfc_unres_queue)
1802 		spin_unlock_bh(&mfc_unres_lock);
1803 	else if (it->cache == mfc_cache_array)
1804 		read_unlock(&mrt_lock);
1805 }
1806 
1807 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1808 {
1809 	int n;
1810 
1811 	if (v == SEQ_START_TOKEN) {
1812 		seq_puts(seq,
1813 		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1814 	} else {
1815 		const struct mfc_cache *mfc = v;
1816 		const struct ipmr_mfc_iter *it = seq->private;
1817 
1818 		seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1819 			   (unsigned long) mfc->mfc_mcastgrp,
1820 			   (unsigned long) mfc->mfc_origin,
1821 			   mfc->mfc_parent,
1822 			   mfc->mfc_un.res.pkt,
1823 			   mfc->mfc_un.res.bytes,
1824 			   mfc->mfc_un.res.wrong_if);
1825 
1826 		if (it->cache != &mfc_unres_queue) {
1827 			for(n = mfc->mfc_un.res.minvif;
1828 			    n < mfc->mfc_un.res.maxvif; n++ ) {
1829 				if(VIF_EXISTS(n)
1830 				   && mfc->mfc_un.res.ttls[n] < 255)
1831 				seq_printf(seq,
1832 					   " %2d:%-3d",
1833 					   n, mfc->mfc_un.res.ttls[n]);
1834 			}
1835 		}
1836 		seq_putc(seq, '\n');
1837 	}
1838 	return 0;
1839 }
1840 
1841 static struct seq_operations ipmr_mfc_seq_ops = {
1842 	.start = ipmr_mfc_seq_start,
1843 	.next  = ipmr_mfc_seq_next,
1844 	.stop  = ipmr_mfc_seq_stop,
1845 	.show  = ipmr_mfc_seq_show,
1846 };
1847 
1848 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1849 {
1850 	struct seq_file *seq;
1851 	int rc = -ENOMEM;
1852 	struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1853 
1854 	if (!s)
1855 		goto out;
1856 
1857 	rc = seq_open(file, &ipmr_mfc_seq_ops);
1858 	if (rc)
1859 		goto out_kfree;
1860 
1861 	seq = file->private_data;
1862 	seq->private = s;
1863 out:
1864 	return rc;
1865 out_kfree:
1866 	kfree(s);
1867 	goto out;
1868 
1869 }
1870 
1871 static struct file_operations ipmr_mfc_fops = {
1872 	.owner	 = THIS_MODULE,
1873 	.open    = ipmr_mfc_open,
1874 	.read    = seq_read,
1875 	.llseek  = seq_lseek,
1876 	.release = seq_release_private,
1877 };
1878 #endif
1879 
1880 #ifdef CONFIG_IP_PIMSM_V2
1881 static struct net_protocol pim_protocol = {
1882 	.handler	=	pim_rcv,
1883 };
1884 #endif
1885 
1886 
1887 /*
1888  *	Setup for IP multicast routing
1889  */
1890 
1891 void __init ip_mr_init(void)
1892 {
1893 	mrt_cachep = kmem_cache_create("ip_mrt_cache",
1894 				       sizeof(struct mfc_cache),
1895 				       0, SLAB_HWCACHE_ALIGN,
1896 				       NULL, NULL);
1897 	if (!mrt_cachep)
1898 		panic("cannot allocate ip_mrt_cache");
1899 
1900 	init_timer(&ipmr_expire_timer);
1901 	ipmr_expire_timer.function=ipmr_expire_process;
1902 	register_netdevice_notifier(&ip_mr_notifier);
1903 #ifdef CONFIG_PROC_FS
1904 	proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1905 	proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
1906 #endif
1907 }
1908