xref: /openbmc/linux/net/ipv4/ipmr.c (revision f42b3800)
1 /*
2  *	IP multicast routing support for mrouted 3.6/3.8
3  *
4  *		(c) 1995 Alan Cox, <alan@redhat.com>
5  *	  Linux Consultancy and Custom Driver Development
6  *
7  *	This program is free software; you can redistribute it and/or
8  *	modify it under the terms of the GNU General Public License
9  *	as published by the Free Software Foundation; either version
10  *	2 of the License, or (at your option) any later version.
11  *
12  *	Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *	Fixes:
15  *	Michael Chastain	:	Incorrect size of copying.
16  *	Alan Cox		:	Added the cache manager code
17  *	Alan Cox		:	Fixed the clone/copy bug and device race.
18  *	Mike McLagan		:	Routing by source
19  *	Malcolm Beattie		:	Buffer handling fixes.
20  *	Alexey Kuznetsov	:	Double buffer free and other fixes.
21  *	SVR Anand		:	Fixed several multicast bugs and problems.
22  *	Alexey Kuznetsov	:	Status, optimisations and more.
23  *	Brad Parker		:	Better behaviour on mrouted upcall
24  *					overflow.
25  *	Carlos Picoto		:	PIMv1 Support
26  *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
27  *					Relax this requirement to work with older peers.
28  *
29  */
30 
31 #include <asm/system.h>
32 #include <asm/uaccess.h>
33 #include <linux/types.h>
34 #include <linux/capability.h>
35 #include <linux/errno.h>
36 #include <linux/timer.h>
37 #include <linux/mm.h>
38 #include <linux/kernel.h>
39 #include <linux/fcntl.h>
40 #include <linux/stat.h>
41 #include <linux/socket.h>
42 #include <linux/in.h>
43 #include <linux/inet.h>
44 #include <linux/netdevice.h>
45 #include <linux/inetdevice.h>
46 #include <linux/igmp.h>
47 #include <linux/proc_fs.h>
48 #include <linux/seq_file.h>
49 #include <linux/mroute.h>
50 #include <linux/init.h>
51 #include <linux/if_ether.h>
52 #include <net/net_namespace.h>
53 #include <net/ip.h>
54 #include <net/protocol.h>
55 #include <linux/skbuff.h>
56 #include <net/route.h>
57 #include <net/sock.h>
58 #include <net/icmp.h>
59 #include <net/udp.h>
60 #include <net/raw.h>
61 #include <linux/notifier.h>
62 #include <linux/if_arp.h>
63 #include <linux/netfilter_ipv4.h>
64 #include <net/ipip.h>
65 #include <net/checksum.h>
66 #include <net/netlink.h>
67 
68 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
69 #define CONFIG_IP_PIMSM	1
70 #endif
71 
72 static struct sock *mroute_socket;
73 
74 
75 /* Big lock, protecting the vif table, mrt cache and mroute socket state.
76    Note that changes are serialized via rtnl_lock.
77  */
78 
79 static DEFINE_RWLOCK(mrt_lock);
80 
81 /*
82  *	Multicast router control variables
83  */
84 
85 static struct vif_device vif_table[MAXVIFS];		/* Devices 		*/
86 static int maxvif;
87 
88 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
89 
90 static int mroute_do_assert;				/* Set in PIM assert	*/
91 static int mroute_do_pim;
92 
93 static struct mfc_cache *mfc_cache_array[MFC_LINES];	/* Forwarding cache	*/
94 
95 static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */
96 static atomic_t cache_resolve_queue_len;		/* Size of unresolved	*/
97 
98 /* Special spinlock for queue of unresolved entries */
99 static DEFINE_SPINLOCK(mfc_unres_lock);
100 
101 /* We return to Alan's original scheme. The hash table of resolved
102    entries is changed only in process context and is protected by the
103    weak lock mrt_lock. The queue of unresolved entries is protected by
104    the strong spinlock mfc_unres_lock.
105 
106    As a result, the data path is entirely free of exclusive locks.
107  */
108 
109 static struct kmem_cache *mrt_cachep __read_mostly;
110 
111 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
112 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
113 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
114 
115 #ifdef CONFIG_IP_PIMSM_V2
116 static struct net_protocol pim_protocol;
117 #endif
118 
119 static struct timer_list ipmr_expire_timer;
120 
121 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
122 
123 static
124 struct net_device *ipmr_new_tunnel(struct vifctl *v)
125 {
126 	struct net_device  *dev;
127 
128 	dev = __dev_get_by_name(&init_net, "tunl0");
129 
130 	if (dev) {
131 		int err;
132 		struct ifreq ifr;
133 		mm_segment_t	oldfs;
134 		struct ip_tunnel_parm p;
135 		struct in_device  *in_dev;
136 
137 		memset(&p, 0, sizeof(p));
138 		p.iph.daddr = v->vifc_rmt_addr.s_addr;
139 		p.iph.saddr = v->vifc_lcl_addr.s_addr;
140 		p.iph.version = 4;
141 		p.iph.ihl = 5;
142 		p.iph.protocol = IPPROTO_IPIP;
143 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
144 		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
145 
146 		oldfs = get_fs(); set_fs(KERNEL_DS);
147 		err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
148 		set_fs(oldfs);
149 
150 		dev = NULL;
151 
152 		if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
153 			dev->flags |= IFF_MULTICAST;
154 
155 			in_dev = __in_dev_get_rtnl(dev);
156 			if (in_dev == NULL)
157 				goto failure;
158 
159 			ipv4_devconf_setall(in_dev);
160 			IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
161 
162 			if (dev_open(dev))
163 				goto failure;
164 		}
165 	}
166 	return dev;
167 
168 failure:
169 	/* allow the register to be completed before unregistering. */
170 	rtnl_unlock();
171 	rtnl_lock();
172 
173 	unregister_netdevice(dev);
174 	return NULL;
175 }
176 
177 #ifdef CONFIG_IP_PIMSM
178 
179 static int reg_vif_num = -1;
180 
181 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
182 {
183 	read_lock(&mrt_lock);
184 	((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
185 	((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
186 	ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
187 	read_unlock(&mrt_lock);
188 	kfree_skb(skb);
189 	return 0;
190 }
191 
192 static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
193 {
194 	return (struct net_device_stats*)netdev_priv(dev);
195 }
196 
197 static void reg_vif_setup(struct net_device *dev)
198 {
199 	dev->type		= ARPHRD_PIMREG;
200 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
201 	dev->flags		= IFF_NOARP;
202 	dev->hard_start_xmit	= reg_vif_xmit;
203 	dev->get_stats		= reg_vif_get_stats;
204 	dev->destructor		= free_netdev;
205 }
206 
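/*
 *	Allocate and register the "pimreg" pseudo-device that backs the PIM
 *	register VIF: packets routed out of it are reported to the daemon as
 *	IGMPMSG_WHOLEPKT upcalls, and decapsulated PIM REGISTER payloads
 *	re-enter the stack through it.
 */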
207 static struct net_device *ipmr_reg_vif(void)
208 {
209 	struct net_device *dev;
210 	struct in_device *in_dev;
211 
212 	dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
213 			   reg_vif_setup);
214 
215 	if (dev == NULL)
216 		return NULL;
217 
218 	if (register_netdevice(dev)) {
219 		free_netdev(dev);
220 		return NULL;
221 	}
222 	dev->iflink = 0;
223 
224 	rcu_read_lock();
225 	if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
226 		rcu_read_unlock();
227 		goto failure;
228 	}
229 
230 	ipv4_devconf_setall(in_dev);
231 	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
232 	rcu_read_unlock();
233 
234 	if (dev_open(dev))
235 		goto failure;
236 
237 	return dev;
238 
239 failure:
240 	/* allow the register to be completed before unregistering. */
241 	rtnl_unlock();
242 	rtnl_lock();
243 
244 	unregister_netdevice(dev);
245 	return NULL;
246 }
247 #endif
248 
249 /*
250  *	Delete a VIF entry
251  */
252 
253 static int vif_delete(int vifi)
254 {
255 	struct vif_device *v;
256 	struct net_device *dev;
257 	struct in_device *in_dev;
258 
259 	if (vifi < 0 || vifi >= maxvif)
260 		return -EADDRNOTAVAIL;
261 
262 	v = &vif_table[vifi];
263 
264 	write_lock_bh(&mrt_lock);
265 	dev = v->dev;
266 	v->dev = NULL;
267 
268 	if (!dev) {
269 		write_unlock_bh(&mrt_lock);
270 		return -EADDRNOTAVAIL;
271 	}
272 
273 #ifdef CONFIG_IP_PIMSM
274 	if (vifi == reg_vif_num)
275 		reg_vif_num = -1;
276 #endif
277 
278 	if (vifi+1 == maxvif) {
279 		int tmp;
280 		for (tmp=vifi-1; tmp>=0; tmp--) {
281 			if (VIF_EXISTS(tmp))
282 				break;
283 		}
284 		maxvif = tmp+1;
285 	}
286 
287 	write_unlock_bh(&mrt_lock);
288 
289 	dev_set_allmulti(dev, -1);
290 
291 	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
292 		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
293 		ip_rt_multicast_event(in_dev);
294 	}
295 
296 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
297 		unregister_netdevice(dev);
298 
299 	dev_put(dev);
300 	return 0;
301 }
302 
303 /* Destroy an unresolved cache entry, killing queued skbs
304    and reporting an error to netlink readers.
305  */
306 
307 static void ipmr_destroy_unres(struct mfc_cache *c)
308 {
309 	struct sk_buff *skb;
310 	struct nlmsgerr *e;
311 
312 	atomic_dec(&cache_resolve_queue_len);
313 
314 	while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
315 		if (ip_hdr(skb)->version == 0) {
316 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
317 			nlh->nlmsg_type = NLMSG_ERROR;
318 			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
319 			skb_trim(skb, nlh->nlmsg_len);
320 			e = NLMSG_DATA(nlh);
321 			e->error = -ETIMEDOUT;
322 			memset(&e->msg, 0, sizeof(e->msg));
323 
324 			rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
325 		} else
326 			kfree_skb(skb);
327 	}
328 
329 	kmem_cache_free(mrt_cachep, c);
330 }
331 
332 
333 /* Single timer process for all the unresolved queue. */
334 
335 static void ipmr_expire_process(unsigned long dummy)
336 {
337 	unsigned long now;
338 	unsigned long expires;
339 	struct mfc_cache *c, **cp;
340 
341 	if (!spin_trylock(&mfc_unres_lock)) {
342 		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
343 		return;
344 	}
345 
346 	if (atomic_read(&cache_resolve_queue_len) == 0)
347 		goto out;
348 
349 	now = jiffies;
350 	expires = 10*HZ;
351 	cp = &mfc_unres_queue;
352 
353 	while ((c=*cp) != NULL) {
354 		if (time_after(c->mfc_un.unres.expires, now)) {
355 			unsigned long interval = c->mfc_un.unres.expires - now;
356 			if (interval < expires)
357 				expires = interval;
358 			cp = &c->next;
359 			continue;
360 		}
361 
362 		*cp = c->next;
363 
364 		ipmr_destroy_unres(c);
365 	}
366 
367 	if (atomic_read(&cache_resolve_queue_len))
368 		mod_timer(&ipmr_expire_timer, jiffies + expires);
369 
370 out:
371 	spin_unlock(&mfc_unres_lock);
372 }
373 
374 /* Fill oifs list. It is called under write locked mrt_lock. */
375 
376 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
377 {
378 	int vifi;
379 
380 	cache->mfc_un.res.minvif = MAXVIFS;
381 	cache->mfc_un.res.maxvif = 0;
382 	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
383 
384 	for (vifi=0; vifi<maxvif; vifi++) {
385 		if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
386 			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
387 			if (cache->mfc_un.res.minvif > vifi)
388 				cache->mfc_un.res.minvif = vifi;
389 			if (cache->mfc_un.res.maxvif <= vifi)
390 				cache->mfc_un.res.maxvif = vifi + 1;
391 		}
392 	}
393 }
394 
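/*
 *	Add a virtual interface on behalf of the daemon (MRT_ADD_VIF).
 *	Called under rtnl_lock; takes mrt_lock only to publish the new entry.
 */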
395 static int vif_add(struct vifctl *vifc, int mrtsock)
396 {
397 	int vifi = vifc->vifc_vifi;
398 	struct vif_device *v = &vif_table[vifi];
399 	struct net_device *dev;
400 	struct in_device *in_dev;
401 
402 	/* Is vif busy ? */
403 	if (VIF_EXISTS(vifi))
404 		return -EADDRINUSE;
405 
406 	switch (vifc->vifc_flags) {
407 #ifdef CONFIG_IP_PIMSM
408 	case VIFF_REGISTER:
409 		/*
410 		 * Special Purpose VIF in PIM
411 		 * All the packets will be sent to the daemon
412 		 */
413 		if (reg_vif_num >= 0)
414 			return -EADDRINUSE;
415 		dev = ipmr_reg_vif();
416 		if (!dev)
417 			return -ENOBUFS;
418 		break;
419 #endif
420 	case VIFF_TUNNEL:
421 		dev = ipmr_new_tunnel(vifc);
422 		if (!dev)
423 			return -ENOBUFS;
424 		break;
425 	case 0:
426 		dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
427 		if (!dev)
428 			return -EADDRNOTAVAIL;
429 		dev_put(dev);
430 		break;
431 	default:
432 		return -EINVAL;
433 	}
434 
435 	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
436 		return -EADDRNOTAVAIL;
437 	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
438 	dev_set_allmulti(dev, +1);
439 	ip_rt_multicast_event(in_dev);
440 
441 	/*
442 	 *	Fill in the VIF structures
443 	 */
444 	v->rate_limit=vifc->vifc_rate_limit;
445 	v->local=vifc->vifc_lcl_addr.s_addr;
446 	v->remote=vifc->vifc_rmt_addr.s_addr;
447 	v->flags=vifc->vifc_flags;
448 	if (!mrtsock)
449 		v->flags |= VIFF_STATIC;
450 	v->threshold=vifc->vifc_threshold;
451 	v->bytes_in = 0;
452 	v->bytes_out = 0;
453 	v->pkt_in = 0;
454 	v->pkt_out = 0;
455 	v->link = dev->ifindex;
456 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
457 		v->link = dev->iflink;
458 
459 	/* And finish the update by writing the critical data */
460 	write_lock_bh(&mrt_lock);
461 	dev_hold(dev);
462 	v->dev=dev;
463 #ifdef CONFIG_IP_PIMSM
464 	if (v->flags&VIFF_REGISTER)
465 		reg_vif_num = vifi;
466 #endif
467 	if (vifi+1 > maxvif)
468 		maxvif = vifi+1;
469 	write_unlock_bh(&mrt_lock);
470 	return 0;
471 }
472 
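/*
 *	Look up a resolved (origin, group) entry in the forwarding cache.
 *	Callers hold mrt_lock.
 */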
473 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
474 {
475 	int line=MFC_HASH(mcastgrp,origin);
476 	struct mfc_cache *c;
477 
478 	for (c=mfc_cache_array[line]; c; c = c->next) {
479 		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
480 			break;
481 	}
482 	return c;
483 }
484 
485 /*
486  *	Allocate a multicast cache entry
487  */
488 static struct mfc_cache *ipmr_cache_alloc(void)
489 {
490 	struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
491 	if (c==NULL)
492 		return NULL;
493 	c->mfc_un.res.minvif = MAXVIFS;
494 	return c;
495 }
496 
497 static struct mfc_cache *ipmr_cache_alloc_unres(void)
498 {
499 	struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
500 	if (c==NULL)
501 		return NULL;
502 	skb_queue_head_init(&c->mfc_un.unres.unresolved);
503 	c->mfc_un.unres.expires = jiffies + 10*HZ;
504 	return c;
505 }
506 
507 /*
508  *	A cache entry has moved from the unresolved queue into a resolved state
509  */
510 
511 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
512 {
513 	struct sk_buff *skb;
514 	struct nlmsgerr *e;
515 
516 	/*
517 	 *	Play the pending entries through our router
518 	 */
519 
520 	while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
521 		if (ip_hdr(skb)->version == 0) {
522 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
523 
524 			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
525 				nlh->nlmsg_len = (skb_tail_pointer(skb) -
526 						  (u8 *)nlh);
527 			} else {
528 				nlh->nlmsg_type = NLMSG_ERROR;
529 				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
530 				skb_trim(skb, nlh->nlmsg_len);
531 				e = NLMSG_DATA(nlh);
532 				e->error = -EMSGSIZE;
533 				memset(&e->msg, 0, sizeof(e->msg));
534 			}
535 
536 			rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
537 		} else
538 			ip_mr_forward(skb, c, 0);
539 	}
540 }
541 
542 /*
543  *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
544  *	expects the following bizarre scheme.
545  *
546  *	Called under mrt_lock.
547  */
548 
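/*
 *	For reference, a rough sketch of the user-space side (illustrative
 *	only, not part of this file): the daemon reads these reports as a
 *	struct igmpmsg from its raw IGMP socket, e.g.
 *
 *		struct igmpmsg m;
 *		if (recv(mroute_fd, &m, sizeof(m), 0) >= (int)sizeof(m) &&
 *		    m.im_msgtype == IGMPMSG_NOCACHE)
 *			add_mfc_for(m.im_src, m.im_dst, m.im_vif);
 *
 *	where mroute_fd and add_mfc_for() are hypothetical daemon-side names.
 */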
549 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
550 {
551 	struct sk_buff *skb;
552 	const int ihl = ip_hdrlen(pkt);
553 	struct igmphdr *igmp;
554 	struct igmpmsg *msg;
555 	int ret;
556 
557 #ifdef CONFIG_IP_PIMSM
558 	if (assert == IGMPMSG_WHOLEPKT)
559 		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
560 	else
561 #endif
562 		skb = alloc_skb(128, GFP_ATOMIC);
563 
564 	if (!skb)
565 		return -ENOBUFS;
566 
567 #ifdef CONFIG_IP_PIMSM
568 	if (assert == IGMPMSG_WHOLEPKT) {
569 		/* Ugly, but we have no choice with this interface.
570 		   Duplicate old header, fix ihl, length etc.
571 		   And all this only to mangle msg->im_msgtype and
572 		   to set msg->im_mbz to "mbz" :-)
573 		 */
574 		skb_push(skb, sizeof(struct iphdr));
575 		skb_reset_network_header(skb);
576 		skb_reset_transport_header(skb);
577 		msg = (struct igmpmsg *)skb_network_header(skb);
578 		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
579 		msg->im_msgtype = IGMPMSG_WHOLEPKT;
580 		msg->im_mbz = 0;
581 		msg->im_vif = reg_vif_num;
582 		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
583 		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
584 					     sizeof(struct iphdr));
585 	} else
586 #endif
587 	{
588 
589 	/*
590 	 *	Copy the IP header
591 	 */
592 
593 	skb->network_header = skb->tail;
594 	skb_put(skb, ihl);
595 	skb_copy_to_linear_data(skb, pkt->data, ihl);
596 	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
597 	msg = (struct igmpmsg *)skb_network_header(skb);
598 	msg->im_vif = vifi;
599 	skb->dst = dst_clone(pkt->dst);
600 
601 	/*
602 	 *	Add our header
603 	 */
604 
605 	igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
606 	igmp->type	=
607 	msg->im_msgtype = assert;
608 	igmp->code 	=	0;
609 	ip_hdr(skb)->tot_len = htons(skb->len);			/* Fix the length */
610 	skb->transport_header = skb->network_header;
611 	}
612 
613 	if (mroute_socket == NULL) {
614 		kfree_skb(skb);
615 		return -EINVAL;
616 	}
617 
618 	/*
619 	 *	Deliver to mrouted
620 	 */
621 	if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
622 		if (net_ratelimit())
623 			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
624 		kfree_skb(skb);
625 	}
626 
627 	return ret;
628 }
629 
630 /*
631  *	Queue a packet awaiting resolution; a new unresolved cache entry is created if needed (under mfc_unres_lock).
632  */
633 
634 static int
635 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
636 {
637 	int err;
638 	struct mfc_cache *c;
639 	const struct iphdr *iph = ip_hdr(skb);
640 
641 	spin_lock_bh(&mfc_unres_lock);
642 	for (c=mfc_unres_queue; c; c=c->next) {
643 		if (c->mfc_mcastgrp == iph->daddr &&
644 		    c->mfc_origin == iph->saddr)
645 			break;
646 	}
647 
648 	if (c == NULL) {
649 		/*
650 		 *	Create a new entry if allowable
651 		 */
652 
653 		if (atomic_read(&cache_resolve_queue_len)>=10 ||
654 		    (c=ipmr_cache_alloc_unres())==NULL) {
655 			spin_unlock_bh(&mfc_unres_lock);
656 
657 			kfree_skb(skb);
658 			return -ENOBUFS;
659 		}
660 
661 		/*
662 		 *	Fill in the new cache entry
663 		 */
664 		c->mfc_parent	= -1;
665 		c->mfc_origin	= iph->saddr;
666 		c->mfc_mcastgrp	= iph->daddr;
667 
668 		/*
669 		 *	Reflect first query at mrouted.
670 		 */
671 		if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
672 			/* If the report failed throw the cache entry
673 			   out - Brad Parker
674 			 */
675 			spin_unlock_bh(&mfc_unres_lock);
676 
677 			kmem_cache_free(mrt_cachep, c);
678 			kfree_skb(skb);
679 			return err;
680 		}
681 
682 		atomic_inc(&cache_resolve_queue_len);
683 		c->next = mfc_unres_queue;
684 		mfc_unres_queue = c;
685 
686 		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
687 	}
688 
689 	/*
690 	 *	See if we can append the packet
691 	 */
692 	if (c->mfc_un.unres.unresolved.qlen>3) {
693 		kfree_skb(skb);
694 		err = -ENOBUFS;
695 	} else {
696 		skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
697 		err = 0;
698 	}
699 
700 	spin_unlock_bh(&mfc_unres_lock);
701 	return err;
702 }
703 
704 /*
705  *	MFC cache manipulation by user space mroute daemon
706  */
707 
708 static int ipmr_mfc_delete(struct mfcctl *mfc)
709 {
710 	int line;
711 	struct mfc_cache *c, **cp;
712 
713 	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
714 
715 	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
716 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
717 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
718 			write_lock_bh(&mrt_lock);
719 			*cp = c->next;
720 			write_unlock_bh(&mrt_lock);
721 
722 			kmem_cache_free(mrt_cachep, c);
723 			return 0;
724 		}
725 	}
726 	return -ENOENT;
727 }
728 
729 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
730 {
731 	int line;
732 	struct mfc_cache *uc, *c, **cp;
733 
734 	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
735 
736 	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
737 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
738 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
739 			break;
740 	}
741 
742 	if (c != NULL) {
743 		write_lock_bh(&mrt_lock);
744 		c->mfc_parent = mfc->mfcc_parent;
745 		ipmr_update_thresholds(c, mfc->mfcc_ttls);
746 		if (!mrtsock)
747 			c->mfc_flags |= MFC_STATIC;
748 		write_unlock_bh(&mrt_lock);
749 		return 0;
750 	}
751 
752 	if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
753 		return -EINVAL;
754 
755 	c=ipmr_cache_alloc();
756 	if (c==NULL)
757 		return -ENOMEM;
758 
759 	c->mfc_origin=mfc->mfcc_origin.s_addr;
760 	c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
761 	c->mfc_parent=mfc->mfcc_parent;
762 	ipmr_update_thresholds(c, mfc->mfcc_ttls);
763 	if (!mrtsock)
764 		c->mfc_flags |= MFC_STATIC;
765 
766 	write_lock_bh(&mrt_lock);
767 	c->next = mfc_cache_array[line];
768 	mfc_cache_array[line] = c;
769 	write_unlock_bh(&mrt_lock);
770 
771 	/*
772 	 *	Check to see if this entry resolves a queued unresolved
773 	 *	entry. If so, send on the queued frames and tidy up.
774 	 */
775 	spin_lock_bh(&mfc_unres_lock);
776 	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
777 	     cp = &uc->next) {
778 		if (uc->mfc_origin == c->mfc_origin &&
779 		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
780 			*cp = uc->next;
781 			if (atomic_dec_and_test(&cache_resolve_queue_len))
782 				del_timer(&ipmr_expire_timer);
783 			break;
784 		}
785 	}
786 	spin_unlock_bh(&mfc_unres_lock);
787 
788 	if (uc) {
789 		ipmr_cache_resolve(uc, c);
790 		kmem_cache_free(mrt_cachep, uc);
791 	}
792 	return 0;
793 }
794 
795 /*
796  *	Close the multicast socket, and clear the vif tables etc
797  */
798 
799 static void mroute_clean_tables(struct sock *sk)
800 {
801 	int i;
802 
803 	/*
804 	 *	Shut down all active vif entries
805 	 */
806 	for (i=0; i<maxvif; i++) {
807 		if (!(vif_table[i].flags&VIFF_STATIC))
808 			vif_delete(i);
809 	}
810 
811 	/*
812 	 *	Wipe the cache
813 	 */
814 	for (i=0;i<MFC_LINES;i++) {
815 		struct mfc_cache *c, **cp;
816 
817 		cp = &mfc_cache_array[i];
818 		while ((c = *cp) != NULL) {
819 			if (c->mfc_flags&MFC_STATIC) {
820 				cp = &c->next;
821 				continue;
822 			}
823 			write_lock_bh(&mrt_lock);
824 			*cp = c->next;
825 			write_unlock_bh(&mrt_lock);
826 
827 			kmem_cache_free(mrt_cachep, c);
828 		}
829 	}
830 
831 	if (atomic_read(&cache_resolve_queue_len) != 0) {
832 		struct mfc_cache *c;
833 
834 		spin_lock_bh(&mfc_unres_lock);
835 		while (mfc_unres_queue != NULL) {
836 			c = mfc_unres_queue;
837 			mfc_unres_queue = c->next;
838 			spin_unlock_bh(&mfc_unres_lock);
839 
840 			ipmr_destroy_unres(c);
841 
842 			spin_lock_bh(&mfc_unres_lock);
843 		}
844 		spin_unlock_bh(&mfc_unres_lock);
845 	}
846 }
847 
848 static void mrtsock_destruct(struct sock *sk)
849 {
850 	rtnl_lock();
851 	if (sk == mroute_socket) {
852 		IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;
853 
854 		write_lock_bh(&mrt_lock);
855 		mroute_socket=NULL;
856 		write_unlock_bh(&mrt_lock);
857 
858 		mroute_clean_tables(sk);
859 	}
860 	rtnl_unlock();
861 }
862 
863 /*
864  *	Socket options and virtual interface manipulation. The whole
865  *	virtual interface system is a complete heap, but unfortunately
866  *	that's how BSD mrouted happens to think. Maybe one day with a proper
867  *	MOSPF/PIM router set up we can clean this up.
868  */
869 
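/*
 *	A minimal user-space sketch (illustrative only, not part of this
 *	file): the daemon opens a raw IGMP socket and becomes the mroute
 *	socket via MRT_INIT before configuring VIFs, e.g.
 *
 *		int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *		int one = 1;
 *		setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 *		struct vifctl vc = { .vifc_vifi = 0, .vifc_threshold = 1 };
 *		vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
 *		setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *
 *	The address is only a placeholder from the documentation range.
 */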
870 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
871 {
872 	int ret;
873 	struct vifctl vif;
874 	struct mfcctl mfc;
875 
876 	if (optname != MRT_INIT) {
877 		if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
878 			return -EACCES;
879 	}
880 
881 	switch (optname) {
882 	case MRT_INIT:
883 		if (sk->sk_type != SOCK_RAW ||
884 		    inet_sk(sk)->num != IPPROTO_IGMP)
885 			return -EOPNOTSUPP;
886 		if (optlen!=sizeof(int))
887 			return -ENOPROTOOPT;
888 
889 		rtnl_lock();
890 		if (mroute_socket) {
891 			rtnl_unlock();
892 			return -EADDRINUSE;
893 		}
894 
895 		ret = ip_ra_control(sk, 1, mrtsock_destruct);
896 		if (ret == 0) {
897 			write_lock_bh(&mrt_lock);
898 			mroute_socket=sk;
899 			write_unlock_bh(&mrt_lock);
900 
901 			IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
902 		}
903 		rtnl_unlock();
904 		return ret;
905 	case MRT_DONE:
906 		if (sk!=mroute_socket)
907 			return -EACCES;
908 		return ip_ra_control(sk, 0, NULL);
909 	case MRT_ADD_VIF:
910 	case MRT_DEL_VIF:
911 		if (optlen!=sizeof(vif))
912 			return -EINVAL;
913 		if (copy_from_user(&vif,optval,sizeof(vif)))
914 			return -EFAULT;
915 		if (vif.vifc_vifi >= MAXVIFS)
916 			return -ENFILE;
917 		rtnl_lock();
918 		if (optname==MRT_ADD_VIF) {
919 			ret = vif_add(&vif, sk==mroute_socket);
920 		} else {
921 			ret = vif_delete(vif.vifc_vifi);
922 		}
923 		rtnl_unlock();
924 		return ret;
925 
926 		/*
927 		 *	Manipulate the forwarding caches. These live
928 		 *	in a sort of kernel/user symbiosis.
929 		 */
930 	case MRT_ADD_MFC:
931 	case MRT_DEL_MFC:
932 		if (optlen!=sizeof(mfc))
933 			return -EINVAL;
934 		if (copy_from_user(&mfc,optval, sizeof(mfc)))
935 			return -EFAULT;
936 		rtnl_lock();
937 		if (optname==MRT_DEL_MFC)
938 			ret = ipmr_mfc_delete(&mfc);
939 		else
940 			ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
941 		rtnl_unlock();
942 		return ret;
943 		/*
944 		 *	Control PIM assert.
945 		 */
946 	case MRT_ASSERT:
947 	{
948 		int v;
949 		if (get_user(v,(int __user *)optval))
950 			return -EFAULT;
951 		mroute_do_assert=(v)?1:0;
952 		return 0;
953 	}
954 #ifdef CONFIG_IP_PIMSM
955 	case MRT_PIM:
956 	{
957 		int v;
958 
959 		if (get_user(v,(int __user *)optval))
960 			return -EFAULT;
961 		v = (v) ? 1 : 0;
962 
963 		rtnl_lock();
964 		ret = 0;
965 		if (v != mroute_do_pim) {
966 			mroute_do_pim = v;
967 			mroute_do_assert = v;
968 #ifdef CONFIG_IP_PIMSM_V2
969 			if (mroute_do_pim)
970 				ret = inet_add_protocol(&pim_protocol,
971 							IPPROTO_PIM);
972 			else
973 				ret = inet_del_protocol(&pim_protocol,
974 							IPPROTO_PIM);
975 			if (ret < 0)
976 				ret = -EAGAIN;
977 #endif
978 		}
979 		rtnl_unlock();
980 		return ret;
981 	}
982 #endif
983 	/*
984 	 *	Spurious command, or MRT_VERSION which you cannot
985 	 *	set.
986 	 */
987 	default:
988 		return -ENOPROTOOPT;
989 	}
990 }
991 
992 /*
993  *	Getsock opt support for the multicast routing system.
994  */
995 
996 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
997 {
998 	int olr;
999 	int val;
1000 
1001 	if (optname!=MRT_VERSION &&
1002 #ifdef CONFIG_IP_PIMSM
1003 	   optname!=MRT_PIM &&
1004 #endif
1005 	   optname!=MRT_ASSERT)
1006 		return -ENOPROTOOPT;
1007 
1008 	if (get_user(olr, optlen))
1009 		return -EFAULT;
1010 
1011 	olr = min_t(unsigned int, olr, sizeof(int));
1012 	if (olr < 0)
1013 		return -EINVAL;
1014 
1015 	if (put_user(olr,optlen))
1016 		return -EFAULT;
1017 	if (optname==MRT_VERSION)
1018 		val=0x0305;
1019 #ifdef CONFIG_IP_PIMSM
1020 	else if (optname==MRT_PIM)
1021 		val=mroute_do_pim;
1022 #endif
1023 	else
1024 		val=mroute_do_assert;
1025 	if (copy_to_user(optval,&val,olr))
1026 		return -EFAULT;
1027 	return 0;
1028 }
1029 
1030 /*
1031  *	The IP multicast ioctl support routines.
1032  */
1033 
1034 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1035 {
1036 	struct sioc_sg_req sr;
1037 	struct sioc_vif_req vr;
1038 	struct vif_device *vif;
1039 	struct mfc_cache *c;
1040 
1041 	switch (cmd) {
1042 	case SIOCGETVIFCNT:
1043 		if (copy_from_user(&vr,arg,sizeof(vr)))
1044 			return -EFAULT;
1045 		if (vr.vifi>=maxvif)
1046 			return -EINVAL;
1047 		read_lock(&mrt_lock);
1048 		vif=&vif_table[vr.vifi];
1049 		if (VIF_EXISTS(vr.vifi))	{
1050 			vr.icount=vif->pkt_in;
1051 			vr.ocount=vif->pkt_out;
1052 			vr.ibytes=vif->bytes_in;
1053 			vr.obytes=vif->bytes_out;
1054 			read_unlock(&mrt_lock);
1055 
1056 			if (copy_to_user(arg,&vr,sizeof(vr)))
1057 				return -EFAULT;
1058 			return 0;
1059 		}
1060 		read_unlock(&mrt_lock);
1061 		return -EADDRNOTAVAIL;
1062 	case SIOCGETSGCNT:
1063 		if (copy_from_user(&sr,arg,sizeof(sr)))
1064 			return -EFAULT;
1065 
1066 		read_lock(&mrt_lock);
1067 		c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1068 		if (c) {
1069 			sr.pktcnt = c->mfc_un.res.pkt;
1070 			sr.bytecnt = c->mfc_un.res.bytes;
1071 			sr.wrong_if = c->mfc_un.res.wrong_if;
1072 			read_unlock(&mrt_lock);
1073 
1074 			if (copy_to_user(arg,&sr,sizeof(sr)))
1075 				return -EFAULT;
1076 			return 0;
1077 		}
1078 		read_unlock(&mrt_lock);
1079 		return -EADDRNOTAVAIL;
1080 	default:
1081 		return -ENOIOCTLCMD;
1082 	}
1083 }
1084 
1085 
1086 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1087 {
1088 	struct net_device *dev = ptr;
1089 	struct vif_device *v;
1090 	int ct;
1091 
1092 	if (dev_net(dev) != &init_net)
1093 		return NOTIFY_DONE;
1094 
1095 	if (event != NETDEV_UNREGISTER)
1096 		return NOTIFY_DONE;
1097 	v=&vif_table[0];
1098 	for (ct=0;ct<maxvif;ct++,v++) {
1099 		if (v->dev==dev)
1100 			vif_delete(ct);
1101 	}
1102 	return NOTIFY_DONE;
1103 }
1104 
1105 
1106 static struct notifier_block ip_mr_notifier={
1107 	.notifier_call = ipmr_device_event,
1108 };
1109 
1110 /*
1111  * 	Encapsulate a packet by attaching a valid IPIP header to it.
1112  *	This avoids tunnel drivers and other mess and gives us the speed so
1113  *	important for multicast video.
1114  */
1115 
1116 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1117 {
1118 	struct iphdr *iph;
1119 	struct iphdr *old_iph = ip_hdr(skb);
1120 
1121 	skb_push(skb, sizeof(struct iphdr));
1122 	skb->transport_header = skb->network_header;
1123 	skb_reset_network_header(skb);
1124 	iph = ip_hdr(skb);
1125 
1126 	iph->version	= 	4;
1127 	iph->tos	=	old_iph->tos;
1128 	iph->ttl	=	old_iph->ttl;
1129 	iph->frag_off	=	0;
1130 	iph->daddr	=	daddr;
1131 	iph->saddr	=	saddr;
1132 	iph->protocol	=	IPPROTO_IPIP;
1133 	iph->ihl	=	5;
1134 	iph->tot_len	=	htons(skb->len);
1135 	ip_select_ident(iph, skb->dst, NULL);
1136 	ip_send_check(iph);
1137 
1138 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1139 	nf_reset(skb);
1140 }
1141 
1142 static inline int ipmr_forward_finish(struct sk_buff *skb)
1143 {
1144 	struct ip_options * opt	= &(IPCB(skb)->opt);
1145 
1146 	IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1147 
1148 	if (unlikely(opt->optlen))
1149 		ip_forward_options(skb);
1150 
1151 	return dst_output(skb);
1152 }
1153 
1154 /*
1155  *	Processing handlers for ipmr_forward
1156  */
1157 
1158 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1159 {
1160 	const struct iphdr *iph = ip_hdr(skb);
1161 	struct vif_device *vif = &vif_table[vifi];
1162 	struct net_device *dev;
1163 	struct rtable *rt;
1164 	int    encap = 0;
1165 
1166 	if (vif->dev == NULL)
1167 		goto out_free;
1168 
1169 #ifdef CONFIG_IP_PIMSM
1170 	if (vif->flags & VIFF_REGISTER) {
1171 		vif->pkt_out++;
1172 		vif->bytes_out+=skb->len;
1173 		((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
1174 		((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
1175 		ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1176 		kfree_skb(skb);
1177 		return;
1178 	}
1179 #endif
1180 
1181 	if (vif->flags&VIFF_TUNNEL) {
1182 		struct flowi fl = { .oif = vif->link,
1183 				    .nl_u = { .ip4_u =
1184 					      { .daddr = vif->remote,
1185 						.saddr = vif->local,
1186 						.tos = RT_TOS(iph->tos) } },
1187 				    .proto = IPPROTO_IPIP };
1188 		if (ip_route_output_key(&init_net, &rt, &fl))
1189 			goto out_free;
1190 		encap = sizeof(struct iphdr);
1191 	} else {
1192 		struct flowi fl = { .oif = vif->link,
1193 				    .nl_u = { .ip4_u =
1194 					      { .daddr = iph->daddr,
1195 						.tos = RT_TOS(iph->tos) } },
1196 				    .proto = IPPROTO_IPIP };
1197 		if (ip_route_output_key(&init_net, &rt, &fl))
1198 			goto out_free;
1199 	}
1200 
1201 	dev = rt->u.dst.dev;
1202 
1203 	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1204 		/* Do not fragment multicasts. Alas, IPv4 does not allow
1205 		   us to send ICMP here, so the packets will disappear
1206 		   into a black hole.
1207 		 */
1208 
1209 		IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1210 		ip_rt_put(rt);
1211 		goto out_free;
1212 	}
1213 
1214 	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1215 
1216 	if (skb_cow(skb, encap)) {
1217 		ip_rt_put(rt);
1218 		goto out_free;
1219 	}
1220 
1221 	vif->pkt_out++;
1222 	vif->bytes_out+=skb->len;
1223 
1224 	dst_release(skb->dst);
1225 	skb->dst = &rt->u.dst;
1226 	ip_decrease_ttl(ip_hdr(skb));
1227 
1228 	/* FIXME: forward and output firewalls used to be called here.
1229 	 * What do we do with netfilter? -- RR */
1230 	if (vif->flags & VIFF_TUNNEL) {
1231 		ip_encap(skb, vif->local, vif->remote);
1232 		/* FIXME: extra output firewall step used to be here. --RR */
1233 		((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
1234 		((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
1235 	}
1236 
1237 	IPCB(skb)->flags |= IPSKB_FORWARDED;
1238 
1239 	/*
1240 	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1241 	 * not only before forwarding, but also after forwarding on all output
1242 	 * interfaces. Clearly, if the mrouter runs a multicast application,
1243 	 * that application should receive packets regardless of which
1244 	 * interface it has joined on.
1245 	 * If we did not do this, the program would have to join on all
1246 	 * interfaces. On the other hand, a multihomed host (or a router that
1247 	 * is not an mrouter) cannot join on more than one interface - it would
1248 	 * result in receiving multiple copies of each packet.
1249 	 */
1250 	NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
1251 		ipmr_forward_finish);
1252 	return;
1253 
1254 out_free:
1255 	kfree_skb(skb);
1256 	return;
1257 }
1258 
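/*
 *	Map a network device back to its vif index; returns -1 if the device
 *	is not a known vif.
 */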
1259 static int ipmr_find_vif(struct net_device *dev)
1260 {
1261 	int ct;
1262 	for (ct=maxvif-1; ct>=0; ct--) {
1263 		if (vif_table[ct].dev == dev)
1264 			break;
1265 	}
1266 	return ct;
1267 }
1268 
1269 /* "local" means that we should preserve one skb (for local delivery) */
1270 
1271 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1272 {
1273 	int psend = -1;
1274 	int vif, ct;
1275 
1276 	vif = cache->mfc_parent;
1277 	cache->mfc_un.res.pkt++;
1278 	cache->mfc_un.res.bytes += skb->len;
1279 
1280 	/*
1281 	 * Wrong interface: drop packet and (maybe) send PIM assert.
1282 	 */
1283 	if (vif_table[vif].dev != skb->dev) {
1284 		int true_vifi;
1285 
1286 		if (skb->rtable->fl.iif == 0) {
1287 			/* It is our own packet, looped back.
1288 			   A very complicated situation...
1289 
1290 			   The best workaround until the routing daemons are
1291 			   fixed is not to redistribute a packet if it was
1292 			   sent through the wrong interface. This means that
1293 			   multicast applications WILL NOT work for (S,G)
1294 			   entries whose default multicast route points to the
1295 			   wrong oif. In any case, it is not a good idea to
1296 			   run multicast applications on a router.
1297 			 */
1298 			goto dont_forward;
1299 		}
1300 
1301 		cache->mfc_un.res.wrong_if++;
1302 		true_vifi = ipmr_find_vif(skb->dev);
1303 
1304 		if (true_vifi >= 0 && mroute_do_assert &&
1305 		    /* PIM-SM uses asserts when switching from the RPT to the SPT,
1306 		       so we cannot check that the packet arrived on an oif.
1307 		       It is bad, but otherwise we would need to move a pretty
1308 		       large chunk of pimd into the kernel. Ough... --ANK
1309 		     */
1310 		    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1311 		    time_after(jiffies,
1312 			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1313 			cache->mfc_un.res.last_assert = jiffies;
1314 			ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1315 		}
1316 		goto dont_forward;
1317 	}
1318 
1319 	vif_table[vif].pkt_in++;
1320 	vif_table[vif].bytes_in+=skb->len;
1321 
1322 	/*
1323 	 *	Forward the frame
1324 	 */
1325 	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1326 		if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1327 			if (psend != -1) {
1328 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1329 				if (skb2)
1330 					ipmr_queue_xmit(skb2, cache, psend);
1331 			}
1332 			psend=ct;
1333 		}
1334 	}
1335 	if (psend != -1) {
1336 		if (local) {
1337 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1338 			if (skb2)
1339 				ipmr_queue_xmit(skb2, cache, psend);
1340 		} else {
1341 			ipmr_queue_xmit(skb, cache, psend);
1342 			return 0;
1343 		}
1344 	}
1345 
1346 dont_forward:
1347 	if (!local)
1348 		kfree_skb(skb);
1349 	return 0;
1350 }
1351 
1352 
1353 /*
1354  *	Multicast packets for forwarding arrive here
1355  */
1356 
1357 int ip_mr_input(struct sk_buff *skb)
1358 {
1359 	struct mfc_cache *cache;
1360 	int local = skb->rtable->rt_flags&RTCF_LOCAL;
1361 
1362 	/* The packet is looped back after forwarding; it should not be
1363 	   forwarded a second time, but it can still be delivered locally.
1364 	 */
1365 	if (IPCB(skb)->flags&IPSKB_FORWARDED)
1366 		goto dont_forward;
1367 
1368 	if (!local) {
1369 		    if (IPCB(skb)->opt.router_alert) {
1370 			    if (ip_call_ra_chain(skb))
1371 				    return 0;
1372 		    } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1373 			    /* IGMPv1 (and broken IGMPv2 implementations such as
1374 			       Cisco IOS <= 11.2(8)) do not put the router alert
1375 			       option in IGMP packets destined to routable
1376 			       groups. This is very bad, because it means
1377 			       that we can forward NO IGMP messages.
1378 			     */
1379 			    read_lock(&mrt_lock);
1380 			    if (mroute_socket) {
1381 				    nf_reset(skb);
1382 				    raw_rcv(mroute_socket, skb);
1383 				    read_unlock(&mrt_lock);
1384 				    return 0;
1385 			    }
1386 			    read_unlock(&mrt_lock);
1387 		    }
1388 	}
1389 
1390 	read_lock(&mrt_lock);
1391 	cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1392 
1393 	/*
1394 	 *	No usable cache entry
1395 	 */
1396 	if (cache==NULL) {
1397 		int vif;
1398 
1399 		if (local) {
1400 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1401 			ip_local_deliver(skb);
1402 			if (skb2 == NULL) {
1403 				read_unlock(&mrt_lock);
1404 				return -ENOBUFS;
1405 			}
1406 			skb = skb2;
1407 		}
1408 
1409 		vif = ipmr_find_vif(skb->dev);
1410 		if (vif >= 0) {
1411 			int err = ipmr_cache_unresolved(vif, skb);
1412 			read_unlock(&mrt_lock);
1413 
1414 			return err;
1415 		}
1416 		read_unlock(&mrt_lock);
1417 		kfree_skb(skb);
1418 		return -ENODEV;
1419 	}
1420 
1421 	ip_mr_forward(skb, cache, local);
1422 
1423 	read_unlock(&mrt_lock);
1424 
1425 	if (local)
1426 		return ip_local_deliver(skb);
1427 
1428 	return 0;
1429 
1430 dont_forward:
1431 	if (local)
1432 		return ip_local_deliver(skb);
1433 	kfree_skb(skb);
1434 	return 0;
1435 }
1436 
1437 #ifdef CONFIG_IP_PIMSM_V1
1438 /*
1439  * Handle IGMP messages of PIMv1
1440  */
1441 
1442 int pim_rcv_v1(struct sk_buff * skb)
1443 {
1444 	struct igmphdr *pim;
1445 	struct iphdr   *encap;
1446 	struct net_device  *reg_dev = NULL;
1447 
1448 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1449 		goto drop;
1450 
1451 	pim = igmp_hdr(skb);
1452 
1453 	if (!mroute_do_pim ||
1454 	    skb->len < sizeof(*pim) + sizeof(*encap) ||
1455 	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1456 		goto drop;
1457 
1458 	encap = (struct iphdr *)(skb_transport_header(skb) +
1459 				 sizeof(struct igmphdr));
1460 	/*
1461 	   Check that:
1462 	   a. packet is really destined to a multicast group
1463 	   b. packet is not a NULL-REGISTER
1464 	   c. packet is not truncated
1465 	 */
1466 	if (!ipv4_is_multicast(encap->daddr) ||
1467 	    encap->tot_len == 0 ||
1468 	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1469 		goto drop;
1470 
1471 	read_lock(&mrt_lock);
1472 	if (reg_vif_num >= 0)
1473 		reg_dev = vif_table[reg_vif_num].dev;
1474 	if (reg_dev)
1475 		dev_hold(reg_dev);
1476 	read_unlock(&mrt_lock);
1477 
1478 	if (reg_dev == NULL)
1479 		goto drop;
1480 
1481 	skb->mac_header = skb->network_header;
1482 	skb_pull(skb, (u8*)encap - skb->data);
1483 	skb_reset_network_header(skb);
1484 	skb->dev = reg_dev;
1485 	skb->protocol = htons(ETH_P_IP);
1486 	skb->ip_summed = 0;
1487 	skb->pkt_type = PACKET_HOST;
1488 	dst_release(skb->dst);
1489 	skb->dst = NULL;
1490 	((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1491 	((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1492 	nf_reset(skb);
1493 	netif_rx(skb);
1494 	dev_put(reg_dev);
1495 	return 0;
1496  drop:
1497 	kfree_skb(skb);
1498 	return 0;
1499 }
1500 #endif
1501 
1502 #ifdef CONFIG_IP_PIMSM_V2
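/*
 * Handle PIMv2 REGISTER messages: validate the register header and its
 * checksum, then strip the encapsulation and feed the inner multicast
 * packet back into the stack via the pimreg device.
 */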
1503 static int pim_rcv(struct sk_buff * skb)
1504 {
1505 	struct pimreghdr *pim;
1506 	struct iphdr   *encap;
1507 	struct net_device  *reg_dev = NULL;
1508 
1509 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1510 		goto drop;
1511 
1512 	pim = (struct pimreghdr *)skb_transport_header(skb);
1513 	if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1514 	    (pim->flags&PIM_NULL_REGISTER) ||
1515 	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1516 	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1517 		goto drop;
1518 
1519 	/* check if the inner packet is destined to mcast group */
1520 	encap = (struct iphdr *)(skb_transport_header(skb) +
1521 				 sizeof(struct pimreghdr));
1522 	if (!ipv4_is_multicast(encap->daddr) ||
1523 	    encap->tot_len == 0 ||
1524 	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1525 		goto drop;
1526 
1527 	read_lock(&mrt_lock);
1528 	if (reg_vif_num >= 0)
1529 		reg_dev = vif_table[reg_vif_num].dev;
1530 	if (reg_dev)
1531 		dev_hold(reg_dev);
1532 	read_unlock(&mrt_lock);
1533 
1534 	if (reg_dev == NULL)
1535 		goto drop;
1536 
1537 	skb->mac_header = skb->network_header;
1538 	skb_pull(skb, (u8*)encap - skb->data);
1539 	skb_reset_network_header(skb);
1540 	skb->dev = reg_dev;
1541 	skb->protocol = htons(ETH_P_IP);
1542 	skb->ip_summed = 0;
1543 	skb->pkt_type = PACKET_HOST;
1544 	dst_release(skb->dst);
1545 	((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1546 	((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1547 	skb->dst = NULL;
1548 	nf_reset(skb);
1549 	netif_rx(skb);
1550 	dev_put(reg_dev);
1551 	return 0;
1552  drop:
1553 	kfree_skb(skb);
1554 	return 0;
1555 }
1556 #endif
1557 
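/*
 *	Format the oif list of a resolved cache entry as an RTA_MULTIPATH
 *	attribute for rtnetlink replies (used by ipmr_get_route() and by
 *	ipmr_cache_resolve() for queued route requests).
 */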
1558 static int
1559 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1560 {
1561 	int ct;
1562 	struct rtnexthop *nhp;
1563 	struct net_device *dev = vif_table[c->mfc_parent].dev;
1564 	u8 *b = skb_tail_pointer(skb);
1565 	struct rtattr *mp_head;
1566 
1567 	if (dev)
1568 		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1569 
1570 	mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1571 
1572 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1573 		if (c->mfc_un.res.ttls[ct] < 255) {
1574 			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1575 				goto rtattr_failure;
1576 			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1577 			nhp->rtnh_flags = 0;
1578 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1579 			nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1580 			nhp->rtnh_len = sizeof(*nhp);
1581 		}
1582 	}
1583 	mp_head->rta_type = RTA_MULTIPATH;
1584 	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1585 	rtm->rtm_type = RTN_MULTICAST;
1586 	return 1;
1587 
1588 rtattr_failure:
1589 	nlmsg_trim(skb, b);
1590 	return -EMSGSIZE;
1591 }
1592 
1593 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1594 {
1595 	int err;
1596 	struct mfc_cache *cache;
1597 	struct rtable *rt = skb->rtable;
1598 
1599 	read_lock(&mrt_lock);
1600 	cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1601 
1602 	if (cache==NULL) {
1603 		struct sk_buff *skb2;
1604 		struct iphdr *iph;
1605 		struct net_device *dev;
1606 		int vif;
1607 
1608 		if (nowait) {
1609 			read_unlock(&mrt_lock);
1610 			return -EAGAIN;
1611 		}
1612 
1613 		dev = skb->dev;
1614 		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1615 			read_unlock(&mrt_lock);
1616 			return -ENODEV;
1617 		}
1618 		skb2 = skb_clone(skb, GFP_ATOMIC);
1619 		if (!skb2) {
1620 			read_unlock(&mrt_lock);
1621 			return -ENOMEM;
1622 		}
1623 
1624 		skb_push(skb2, sizeof(struct iphdr));
1625 		skb_reset_network_header(skb2);
1626 		iph = ip_hdr(skb2);
1627 		iph->ihl = sizeof(struct iphdr) >> 2;
1628 		iph->saddr = rt->rt_src;
1629 		iph->daddr = rt->rt_dst;
1630 		iph->version = 0;
1631 		err = ipmr_cache_unresolved(vif, skb2);
1632 		read_unlock(&mrt_lock);
1633 		return err;
1634 	}
1635 
1636 	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1637 		cache->mfc_flags |= MFC_NOTIFY;
1638 	err = ipmr_fill_mroute(skb, cache, rtm);
1639 	read_unlock(&mrt_lock);
1640 	return err;
1641 }
1642 
1643 #ifdef CONFIG_PROC_FS
1644 /*
1645  *	The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1646  */
1647 struct ipmr_vif_iter {
1648 	int ct;
1649 };
1650 
1651 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1652 					   loff_t pos)
1653 {
1654 	for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1655 		if (!VIF_EXISTS(iter->ct))
1656 			continue;
1657 		if (pos-- == 0)
1658 			return &vif_table[iter->ct];
1659 	}
1660 	return NULL;
1661 }
1662 
1663 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1664 	__acquires(mrt_lock)
1665 {
1666 	read_lock(&mrt_lock);
1667 	return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1668 		: SEQ_START_TOKEN;
1669 }
1670 
1671 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1672 {
1673 	struct ipmr_vif_iter *iter = seq->private;
1674 
1675 	++*pos;
1676 	if (v == SEQ_START_TOKEN)
1677 		return ipmr_vif_seq_idx(iter, 0);
1678 
1679 	while (++iter->ct < maxvif) {
1680 		if (!VIF_EXISTS(iter->ct))
1681 			continue;
1682 		return &vif_table[iter->ct];
1683 	}
1684 	return NULL;
1685 }
1686 
1687 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1688 	__releases(mrt_lock)
1689 {
1690 	read_unlock(&mrt_lock);
1691 }
1692 
1693 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1694 {
1695 	if (v == SEQ_START_TOKEN) {
1696 		seq_puts(seq,
1697 			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1698 	} else {
1699 		const struct vif_device *vif = v;
1700 		const char *name =  vif->dev ? vif->dev->name : "none";
1701 
1702 		seq_printf(seq,
1703 			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1704 			   vif - vif_table,
1705 			   name, vif->bytes_in, vif->pkt_in,
1706 			   vif->bytes_out, vif->pkt_out,
1707 			   vif->flags, vif->local, vif->remote);
1708 	}
1709 	return 0;
1710 }
1711 
1712 static const struct seq_operations ipmr_vif_seq_ops = {
1713 	.start = ipmr_vif_seq_start,
1714 	.next  = ipmr_vif_seq_next,
1715 	.stop  = ipmr_vif_seq_stop,
1716 	.show  = ipmr_vif_seq_show,
1717 };
1718 
1719 static int ipmr_vif_open(struct inode *inode, struct file *file)
1720 {
1721 	return seq_open_private(file, &ipmr_vif_seq_ops,
1722 			sizeof(struct ipmr_vif_iter));
1723 }
1724 
1725 static const struct file_operations ipmr_vif_fops = {
1726 	.owner	 = THIS_MODULE,
1727 	.open    = ipmr_vif_open,
1728 	.read    = seq_read,
1729 	.llseek  = seq_lseek,
1730 	.release = seq_release_private,
1731 };
1732 
1733 struct ipmr_mfc_iter {
1734 	struct mfc_cache **cache;
1735 	int ct;
1736 };
1737 
1738 
1739 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1740 {
1741 	struct mfc_cache *mfc;
1742 
1743 	it->cache = mfc_cache_array;
1744 	read_lock(&mrt_lock);
1745 	for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1746 		for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1747 			if (pos-- == 0)
1748 				return mfc;
1749 	read_unlock(&mrt_lock);
1750 
1751 	it->cache = &mfc_unres_queue;
1752 	spin_lock_bh(&mfc_unres_lock);
1753 	for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1754 		if (pos-- == 0)
1755 			return mfc;
1756 	spin_unlock_bh(&mfc_unres_lock);
1757 
1758 	it->cache = NULL;
1759 	return NULL;
1760 }
1761 
1762 
1763 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1764 {
1765 	struct ipmr_mfc_iter *it = seq->private;
1766 	it->cache = NULL;
1767 	it->ct = 0;
1768 	return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1769 		: SEQ_START_TOKEN;
1770 }
1771 
1772 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1773 {
1774 	struct mfc_cache *mfc = v;
1775 	struct ipmr_mfc_iter *it = seq->private;
1776 
1777 	++*pos;
1778 
1779 	if (v == SEQ_START_TOKEN)
1780 		return ipmr_mfc_seq_idx(seq->private, 0);
1781 
1782 	if (mfc->next)
1783 		return mfc->next;
1784 
1785 	if (it->cache == &mfc_unres_queue)
1786 		goto end_of_list;
1787 
1788 	BUG_ON(it->cache != mfc_cache_array);
1789 
1790 	while (++it->ct < MFC_LINES) {
1791 		mfc = mfc_cache_array[it->ct];
1792 		if (mfc)
1793 			return mfc;
1794 	}
1795 
1796 	/* exhausted cache_array, show unresolved */
1797 	read_unlock(&mrt_lock);
1798 	it->cache = &mfc_unres_queue;
1799 	it->ct = 0;
1800 
1801 	spin_lock_bh(&mfc_unres_lock);
1802 	mfc = mfc_unres_queue;
1803 	if (mfc)
1804 		return mfc;
1805 
1806  end_of_list:
1807 	spin_unlock_bh(&mfc_unres_lock);
1808 	it->cache = NULL;
1809 
1810 	return NULL;
1811 }
1812 
1813 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1814 {
1815 	struct ipmr_mfc_iter *it = seq->private;
1816 
1817 	if (it->cache == &mfc_unres_queue)
1818 		spin_unlock_bh(&mfc_unres_lock);
1819 	else if (it->cache == mfc_cache_array)
1820 		read_unlock(&mrt_lock);
1821 }
1822 
1823 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1824 {
1825 	int n;
1826 
1827 	if (v == SEQ_START_TOKEN) {
1828 		seq_puts(seq,
1829 		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1830 	} else {
1831 		const struct mfc_cache *mfc = v;
1832 		const struct ipmr_mfc_iter *it = seq->private;
1833 
1834 		seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1835 			   (unsigned long) mfc->mfc_mcastgrp,
1836 			   (unsigned long) mfc->mfc_origin,
1837 			   mfc->mfc_parent,
1838 			   mfc->mfc_un.res.pkt,
1839 			   mfc->mfc_un.res.bytes,
1840 			   mfc->mfc_un.res.wrong_if);
1841 
1842 		if (it->cache != &mfc_unres_queue) {
1843 			for (n = mfc->mfc_un.res.minvif;
1844 			     n < mfc->mfc_un.res.maxvif; n++ ) {
1845 				if (VIF_EXISTS(n)
1846 				   && mfc->mfc_un.res.ttls[n] < 255)
1847 				seq_printf(seq,
1848 					   " %2d:%-3d",
1849 					   n, mfc->mfc_un.res.ttls[n]);
1850 			}
1851 		}
1852 		seq_putc(seq, '\n');
1853 	}
1854 	return 0;
1855 }
1856 
1857 static const struct seq_operations ipmr_mfc_seq_ops = {
1858 	.start = ipmr_mfc_seq_start,
1859 	.next  = ipmr_mfc_seq_next,
1860 	.stop  = ipmr_mfc_seq_stop,
1861 	.show  = ipmr_mfc_seq_show,
1862 };
1863 
1864 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1865 {
1866 	return seq_open_private(file, &ipmr_mfc_seq_ops,
1867 			sizeof(struct ipmr_mfc_iter));
1868 }
1869 
1870 static const struct file_operations ipmr_mfc_fops = {
1871 	.owner	 = THIS_MODULE,
1872 	.open    = ipmr_mfc_open,
1873 	.read    = seq_read,
1874 	.llseek  = seq_lseek,
1875 	.release = seq_release_private,
1876 };
1877 #endif
1878 
1879 #ifdef CONFIG_IP_PIMSM_V2
1880 static struct net_protocol pim_protocol = {
1881 	.handler	=	pim_rcv,
1882 };
1883 #endif
1884 
1885 
1886 /*
1887  *	Setup for IP multicast routing
1888  */
1889 
1890 void __init ip_mr_init(void)
1891 {
1892 	mrt_cachep = kmem_cache_create("ip_mrt_cache",
1893 				       sizeof(struct mfc_cache),
1894 				       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1895 				       NULL);
1896 	setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1897 	register_netdevice_notifier(&ip_mr_notifier);
1898 #ifdef CONFIG_PROC_FS
1899 	proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops);
1900 	proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops);
1901 #endif
1902 }
1903