xref: /openbmc/linux/net/ipv4/ipmr.c (revision 04c71976)
/*
 *	IP multicast routing support for mrouted 3.6/3.8
 *
 *		(c) 1995 Alan Cox, <alan@redhat.com>
 *	  Linux Consultancy and Custom Driver Development
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
 *
 *	Fixes:
 *	Michael Chastain	:	Incorrect size of copying.
 *	Alan Cox		:	Added the cache manager code
 *	Alan Cox		:	Fixed the clone/copy bug and device race.
 *	Mike McLagan		:	Routing by source
 *	Malcolm Beattie		:	Buffer handling fixes.
 *	Alexey Kuznetsov	:	Double buffer free and other fixes.
 *	SVR Anand		:	Fixed several multicast bugs and problems.
 *	Alexey Kuznetsov	:	Status, optimisations and more.
 *	Brad Parker		:	Better behaviour on mrouted upcall
 *					overflow.
 *	Carlos Picoto		:	PIMv1 Support
 *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
 *					Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM	1
#endif

static struct sock *mroute_socket;


/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that changes are serialized via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *	Multicast router control variables
 */

static struct vif_device vif_table[MAXVIFS];		/* Devices 		*/
static int maxvif;

#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)

static int mroute_do_assert;				/* Set in PIM assert	*/
static int mroute_do_pim;

static struct mfc_cache *mfc_cache_array[MFC_LINES];	/* Forwarding cache	*/

static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;		/* Size of unresolved	*/

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and protected
   with the weak lock mrt_lock. The queue of unresolved entries
   is protected with the strong spinlock mfc_unres_lock.

   In this case the data path is entirely free of exclusive locks.
 */
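
/* Editorial sketch of the resulting locking rules, as implied by the code
 * below (a summary, not an authoritative statement):
 *
 *	data path (ip_mr_input and friends, softirq context):
 *		read_lock(&mrt_lock) around vif_table / mfc_cache_array use;
 *	control path (setsockopt handlers, already under rtnl_lock):
 *		write_lock_bh(&mrt_lock) for the short table updates;
 *	unresolved queue (reached from both contexts):
 *		spin_lock_bh(&mfc_unres_lock) always.
 */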

static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
	struct net_device  *dev;

	dev = __dev_get_by_name(&init_net, "tunl0");

	if (dev) {
		int err;
		struct ifreq ifr;
		mm_segment_t	oldfs;
		struct ip_tunnel_parm p;
		struct in_device  *in_dev;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (void*)&p;

		oldfs = get_fs(); set_fs(KERNEL_DS);
		err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
		set_fs(oldfs);

		dev = NULL;

		if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
			dev->flags |= IFF_MULTICAST;

			in_dev = __in_dev_get_rtnl(dev);
			if (in_dev == NULL)
				goto failure;

			ipv4_devconf_setall(in_dev);
			IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

			if (dev_open(dev))
				goto failure;
		}
	}
	return dev;

failure:
	/* allow the register to be completed before unregistering. */
	rtnl_unlock();
	rtnl_lock();

	unregister_netdevice(dev);
	return NULL;
}

#ifdef CONFIG_IP_PIMSM

static int reg_vif_num = -1;

static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
	read_lock(&mrt_lock);
	((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
	((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
	ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
	read_unlock(&mrt_lock);
	kfree_skb(skb);
	return 0;
}

static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
	return (struct net_device_stats*)netdev_priv(dev);
}

static void reg_vif_setup(struct net_device *dev)
{
	dev->type		= ARPHRD_PIMREG;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
	dev->flags		= IFF_NOARP;
	dev->hard_start_xmit	= reg_vif_xmit;
	dev->get_stats		= reg_vif_get_stats;
	dev->destructor		= free_netdev;
}

static struct net_device *ipmr_reg_vif(void)
{
	struct net_device *dev;
	struct in_device *in_dev;

	dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
			   reg_vif_setup);

	if (dev == NULL)
		return NULL;

	if (register_netdevice(dev)) {
		free_netdev(dev);
		return NULL;
	}
	dev->iflink = 0;

	rcu_read_lock();
	if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
		rcu_read_unlock();
		goto failure;
	}

	ipv4_devconf_setall(in_dev);
	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
	rcu_read_unlock();

	if (dev_open(dev))
		goto failure;

	return dev;

failure:
	/* allow the register to be completed before unregistering. */
	rtnl_unlock();
	rtnl_lock();

	unregister_netdevice(dev);
	return NULL;
}
#endif

/*
 *	Delete a VIF entry
 */

static int vif_delete(int vifi)
{
	struct vif_device *v;
	struct net_device *dev;
	struct in_device *in_dev;

	if (vifi < 0 || vifi >= maxvif)
		return -EADDRNOTAVAIL;

	v = &vif_table[vifi];

	write_lock_bh(&mrt_lock);
	dev = v->dev;
	v->dev = NULL;

	if (!dev) {
		write_unlock_bh(&mrt_lock);
		return -EADDRNOTAVAIL;
	}

#ifdef CONFIG_IP_PIMSM
	if (vifi == reg_vif_num)
		reg_vif_num = -1;
#endif

	if (vifi+1 == maxvif) {
		int tmp;
		for (tmp=vifi-1; tmp>=0; tmp--) {
			if (VIF_EXISTS(tmp))
				break;
		}
		maxvif = tmp+1;
	}

	write_unlock_bh(&mrt_lock);

	dev_set_allmulti(dev, -1);

	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
		ip_rt_multicast_event(in_dev);
	}

	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
		unregister_netdevice(dev);

	dev_put(dev);
	return 0;
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;

	atomic_dec(&cache_resolve_queue_len);

	while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
		if (ip_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
			nlh->nlmsg_type = NLMSG_ERROR;
			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
			skb_trim(skb, nlh->nlmsg_len);
			e = NLMSG_DATA(nlh);
			e->error = -ETIMEDOUT;
			memset(&e->msg, 0, sizeof(e->msg));

			rtnl_unicast(skb, NETLINK_CB(skb).pid);
		} else
			kfree_skb(skb);
	}

	kmem_cache_free(mrt_cachep, c);
}


/* A single timer process serves the whole unresolved queue. */

static void ipmr_expire_process(unsigned long dummy)
{
	unsigned long now;
	unsigned long expires;
	struct mfc_cache *c, **cp;

	if (!spin_trylock(&mfc_unres_lock)) {
		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
		return;
	}

	if (atomic_read(&cache_resolve_queue_len) == 0)
		goto out;

	now = jiffies;
	expires = 10*HZ;
	cp = &mfc_unres_queue;

	while ((c=*cp) != NULL) {
		if (time_after(c->mfc_un.unres.expires, now)) {
			unsigned long interval = c->mfc_un.unres.expires - now;
			if (interval < expires)
				expires = interval;
			cp = &c->next;
			continue;
		}

		*cp = c->next;

		ipmr_destroy_unres(c);
	}

	if (atomic_read(&cache_resolve_queue_len))
		mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
	spin_unlock(&mfc_unres_lock);
}

/* Fill the oifs list. Called with mrt_lock write-locked. */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
	int vifi;

	cache->mfc_un.res.minvif = MAXVIFS;
	cache->mfc_un.res.maxvif = 0;
	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

	for (vifi=0; vifi<maxvif; vifi++) {
		if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
			if (cache->mfc_un.res.minvif > vifi)
				cache->mfc_un.res.minvif = vifi;
			if (cache->mfc_un.res.maxvif <= vifi)
				cache->mfc_un.res.maxvif = vifi + 1;
		}
	}
}
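
/* Worked example (editorial, derived from the loop above): with
 * ttls = { 1, 0, 3, 255, 0, ... } only entries 0 and 2 are valid
 * (0 and 255 both mean "do not forward"), giving minvif = 0, maxvif = 3
 * and res.ttls = { 1, 255, 3, 255, ... }.  ip_mr_forward() later sends on
 * vif N only when the packet's TTL is strictly greater than res.ttls[N].
 */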

static int vif_add(struct vifctl *vifc, int mrtsock)
{
	int vifi = vifc->vifc_vifi;
	struct vif_device *v = &vif_table[vifi];
	struct net_device *dev;
	struct in_device *in_dev;

	/* Is the VIF busy? */
	if (VIF_EXISTS(vifi))
		return -EADDRINUSE;

	switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
	case VIFF_REGISTER:
		/*
		 * Special Purpose VIF in PIM
		 * All the packets will be sent to the daemon
		 */
		if (reg_vif_num >= 0)
			return -EADDRINUSE;
		dev = ipmr_reg_vif();
		if (!dev)
			return -ENOBUFS;
		break;
#endif
	case VIFF_TUNNEL:
		dev = ipmr_new_tunnel(vifc);
		if (!dev)
			return -ENOBUFS;
		break;
	case 0:
		dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
		if (!dev)
			return -EADDRNOTAVAIL;
		dev_put(dev);
		break;
	default:
		return -EINVAL;
	}

	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
		return -EADDRNOTAVAIL;
	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
	dev_set_allmulti(dev, +1);
	ip_rt_multicast_event(in_dev);

	/*
	 *	Fill in the VIF structures
	 */
	v->rate_limit=vifc->vifc_rate_limit;
	v->local=vifc->vifc_lcl_addr.s_addr;
	v->remote=vifc->vifc_rmt_addr.s_addr;
	v->flags=vifc->vifc_flags;
	if (!mrtsock)
		v->flags |= VIFF_STATIC;
	v->threshold=vifc->vifc_threshold;
	v->bytes_in = 0;
	v->bytes_out = 0;
	v->pkt_in = 0;
	v->pkt_out = 0;
	v->link = dev->ifindex;
	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
		v->link = dev->iflink;

	/* And finish update writing critical data */
	write_lock_bh(&mrt_lock);
	dev_hold(dev);
	v->dev=dev;
#ifdef CONFIG_IP_PIMSM
	if (v->flags&VIFF_REGISTER)
		reg_vif_num = vifi;
#endif
	if (vifi+1 > maxvif)
		maxvif = vifi+1;
	write_unlock_bh(&mrt_lock);
	return 0;
}

static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
{
	int line=MFC_HASH(mcastgrp,origin);
	struct mfc_cache *c;

	for (c=mfc_cache_array[line]; c; c = c->next) {
		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
			break;
	}
	return c;
}
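
/* Editorial note: every caller in this file holds mrt_lock (read side on
 * the data path, write side in the control path) across this lookup, so
 * the returned entry may only be used while that lock is held.
 */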

/*
 *	Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
	struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
	if (c==NULL)
		return NULL;
	c->mfc_un.res.minvif = MAXVIFS;
	return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
	struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
	if (c==NULL)
		return NULL;
	skb_queue_head_init(&c->mfc_un.unres.unresolved);
	c->mfc_un.unres.expires = jiffies + 10*HZ;
	return c;
}

/*
 *	A cache entry has gone from queued to resolved
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;

	/*
	 *	Play the pending entries through our router
	 */

	while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
		if (ip_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
				nlh->nlmsg_len = (skb_tail_pointer(skb) -
						  (u8 *)nlh);
			} else {
				nlh->nlmsg_type = NLMSG_ERROR;
				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
				skb_trim(skb, nlh->nlmsg_len);
				e = NLMSG_DATA(nlh);
				e->error = -EMSGSIZE;
				memset(&e->msg, 0, sizeof(e->msg));
			}

			rtnl_unicast(skb, NETLINK_CB(skb).pid);
		} else
			ip_mr_forward(skb, c, 0);
	}
}

/*
 *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *	expects the following bizarre scheme.
 *
 *	Called under mrt_lock.
 */

static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
	struct sk_buff *skb;
	const int ihl = ip_hdrlen(pkt);
	struct igmphdr *igmp;
	struct igmpmsg *msg;
	int ret;

#ifdef CONFIG_IP_PIMSM
	if (assert == IGMPMSG_WHOLEPKT)
		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
	else
#endif
		skb = alloc_skb(128, GFP_ATOMIC);

	if (!skb)
		return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
	if (assert == IGMPMSG_WHOLEPKT) {
		/* Ugly, but we have no choice with this interface.
		   Duplicate old header, fix ihl, length etc.
		   And all this only to mangle msg->im_msgtype and
		   to set msg->im_mbz to "mbz" :-)
		 */
		skb_push(skb, sizeof(struct iphdr));
		skb_reset_network_header(skb);
		skb_reset_transport_header(skb);
		msg = (struct igmpmsg *)skb_network_header(skb);
		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
		msg->im_msgtype = IGMPMSG_WHOLEPKT;
		msg->im_mbz = 0;
		msg->im_vif = reg_vif_num;
		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
					     sizeof(struct iphdr));
	} else
#endif
	{

	/*
	 *	Copy the IP header
	 */

	skb->network_header = skb->tail;
	skb_put(skb, ihl);
	skb_copy_to_linear_data(skb, pkt->data, ihl);
	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
	msg = (struct igmpmsg *)skb_network_header(skb);
	msg->im_vif = vifi;
	skb->dst = dst_clone(pkt->dst);

	/*
	 *	Add our header
	 */

	igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
	igmp->type	=
	msg->im_msgtype = assert;
	igmp->code 	=	0;
	ip_hdr(skb)->tot_len = htons(skb->len);			/* Fix the length */
	skb->transport_header = skb->network_header;
	}

	if (mroute_socket == NULL) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/*
	 *	Deliver to mrouted
	 */
	if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
		if (net_ratelimit())
			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
		kfree_skb(skb);
	}

	return ret;
}
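
/* Editorial sketch of what mrouted actually reads from its IGMP socket
 * after the non-WHOLEPKT branch above: a struct igmpmsg (linux/mroute.h)
 * overlays the copied IP header, followed by a bare struct igmphdr:
 *
 *	struct igmpmsg {
 *		__u32 unused1, unused2;
 *		unsigned char im_msgtype;	// IGMPMSG_NOCACHE etc.
 *		unsigned char im_mbz;		// must be zero
 *		unsigned char im_vif;		// arrival vif
 *		unsigned char unused3;
 *		struct in_addr im_src, im_dst;
 *	};
 *
 * im_mbz overlays the IP protocol field, which is why the code above
 * zeroes ip_hdr(skb)->protocol: a zero "protocol" is how the daemon tells
 * kernel upcalls apart from genuine IGMP packets on the same socket.
 */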

/*
 *	Queue a packet for resolution. It gets attached to a locked
 *	unresolved cache entry.
 */

static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
	int err;
	struct mfc_cache *c;
	const struct iphdr *iph = ip_hdr(skb);

	spin_lock_bh(&mfc_unres_lock);
	for (c=mfc_unres_queue; c; c=c->next) {
		if (c->mfc_mcastgrp == iph->daddr &&
		    c->mfc_origin == iph->saddr)
			break;
	}

	if (c == NULL) {
		/*
		 *	Create a new entry if allowable
		 */

		if (atomic_read(&cache_resolve_queue_len)>=10 ||
		    (c=ipmr_cache_alloc_unres())==NULL) {
			spin_unlock_bh(&mfc_unres_lock);

			kfree_skb(skb);
			return -ENOBUFS;
		}

		/*
		 *	Fill in the new cache entry
		 */
		c->mfc_parent	= -1;
		c->mfc_origin	= iph->saddr;
		c->mfc_mcastgrp	= iph->daddr;

		/*
		 *	Reflect first query at mrouted.
		 */
		if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
			/* If the report failed throw the cache entry
			   out - Brad Parker
			 */
			spin_unlock_bh(&mfc_unres_lock);

			kmem_cache_free(mrt_cachep, c);
			kfree_skb(skb);
			return err;
		}

		atomic_inc(&cache_resolve_queue_len);
		c->next = mfc_unres_queue;
		mfc_unres_queue = c;

		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
	}

	/*
	 *	See if we can append the packet
	 */
	if (c->mfc_un.unres.unresolved.qlen>3) {
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
		skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
		err = 0;
	}

	spin_unlock_bh(&mfc_unres_lock);
	return err;
}

/*
 *	MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mfcctl *mfc)
{
	int line;
	struct mfc_cache *c, **cp;

	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			kmem_cache_free(mrt_cachep, c);
			return 0;
		}
	}
	return -ENOENT;
}

static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
	int line;
	struct mfc_cache *uc, *c, **cp;

	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
			break;
	}

	if (c != NULL) {
		write_lock_bh(&mrt_lock);
		c->mfc_parent = mfc->mfcc_parent;
		ipmr_update_thresholds(c, mfc->mfcc_ttls);
		if (!mrtsock)
			c->mfc_flags |= MFC_STATIC;
		write_unlock_bh(&mrt_lock);
		return 0;
	}

	if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
		return -EINVAL;

	c=ipmr_cache_alloc();
	if (c==NULL)
		return -ENOMEM;

	c->mfc_origin=mfc->mfcc_origin.s_addr;
	c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
	c->mfc_parent=mfc->mfcc_parent;
	ipmr_update_thresholds(c, mfc->mfcc_ttls);
	if (!mrtsock)
		c->mfc_flags |= MFC_STATIC;

	write_lock_bh(&mrt_lock);
	c->next = mfc_cache_array[line];
	mfc_cache_array[line] = c;
	write_unlock_bh(&mrt_lock);

	/*
	 *	Check to see if we resolved a queued entry. If so we
	 *	need to send the queued frames on and tidy up.
	 */
	spin_lock_bh(&mfc_unres_lock);
	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
	     cp = &uc->next) {
		if (uc->mfc_origin == c->mfc_origin &&
		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
			*cp = uc->next;
			if (atomic_dec_and_test(&cache_resolve_queue_len))
				del_timer(&ipmr_expire_timer);
			break;
		}
	}
	spin_unlock_bh(&mfc_unres_lock);

	if (uc) {
		ipmr_cache_resolve(uc, c);
		kmem_cache_free(mrt_cachep, uc);
	}
	return 0;
}
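
/* Editorial sketch of the user-space half of this symbiosis (not part of
 * this file; "fd" is assumed to be the raw IGMP socket on which MRT_INIT
 * succeeded, see ip_mroute_setsockopt() below):
 *
 *	struct mfcctl mc;
 *
 *	memset(&mc, 0, sizeof(mc));
 *	mc.mfcc_origin.s_addr   = inet_addr("10.0.0.1");	// source S
 *	mc.mfcc_mcastgrp.s_addr = inet_addr("239.1.1.1");	// group G
 *	mc.mfcc_parent  = 0;					// input vif
 *	mc.mfcc_ttls[1] = 1;			// forward on vif 1, threshold 1
 *	setsockopt(fd, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 *
 * This lands in ipmr_mfc_add() above; if an IGMPMSG_NOCACHE upcall was
 * pending for (S,G), the queued skbs are flushed via ipmr_cache_resolve().
 */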

/*
 *	Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
	int i;

	/*
	 *	Shut down all active vif entries
	 */
	for (i=0; i<maxvif; i++) {
		if (!(vif_table[i].flags&VIFF_STATIC))
			vif_delete(i);
	}

	/*
	 *	Wipe the cache
	 */
	for (i=0;i<MFC_LINES;i++) {
		struct mfc_cache *c, **cp;

		cp = &mfc_cache_array[i];
		while ((c = *cp) != NULL) {
			if (c->mfc_flags&MFC_STATIC) {
				cp = &c->next;
				continue;
			}
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			kmem_cache_free(mrt_cachep, c);
		}
	}

	if (atomic_read(&cache_resolve_queue_len) != 0) {
		struct mfc_cache *c;

		spin_lock_bh(&mfc_unres_lock);
		while (mfc_unres_queue != NULL) {
			c = mfc_unres_queue;
			mfc_unres_queue = c->next;
			spin_unlock_bh(&mfc_unres_lock);

			ipmr_destroy_unres(c);

			spin_lock_bh(&mfc_unres_lock);
		}
		spin_unlock_bh(&mfc_unres_lock);
	}
}

static void mrtsock_destruct(struct sock *sk)
{
	rtnl_lock();
	if (sk == mroute_socket) {
		IPV4_DEVCONF_ALL(MC_FORWARDING)--;

		write_lock_bh(&mrt_lock);
		mroute_socket=NULL;
		write_unlock_bh(&mrt_lock);

		mroute_clean_tables(sk);
	}
	rtnl_unlock();
}

/*
 *	Socket options and virtual interface manipulation. The whole
 *	virtual interface system is a complete heap, but unfortunately
 *	that's how BSD mrouted happens to think. Maybe one day with a proper
 *	MOSPF/PIM router set up we can clean this up.
 */

int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
{
	int ret;
	struct vifctl vif;
	struct mfcctl mfc;

	if (optname != MRT_INIT) {
		if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
			return -EACCES;
	}

	switch (optname) {
	case MRT_INIT:
		if (sk->sk_type != SOCK_RAW ||
		    inet_sk(sk)->num != IPPROTO_IGMP)
			return -EOPNOTSUPP;
		if (optlen!=sizeof(int))
			return -ENOPROTOOPT;

		rtnl_lock();
		if (mroute_socket) {
			rtnl_unlock();
			return -EADDRINUSE;
		}

		ret = ip_ra_control(sk, 1, mrtsock_destruct);
		if (ret == 0) {
			write_lock_bh(&mrt_lock);
			mroute_socket=sk;
			write_unlock_bh(&mrt_lock);

			IPV4_DEVCONF_ALL(MC_FORWARDING)++;
		}
		rtnl_unlock();
		return ret;
	case MRT_DONE:
		if (sk!=mroute_socket)
			return -EACCES;
		return ip_ra_control(sk, 0, NULL);
	case MRT_ADD_VIF:
	case MRT_DEL_VIF:
		if (optlen!=sizeof(vif))
			return -EINVAL;
		if (copy_from_user(&vif,optval,sizeof(vif)))
			return -EFAULT;
		if (vif.vifc_vifi >= MAXVIFS)
			return -ENFILE;
		rtnl_lock();
		if (optname==MRT_ADD_VIF) {
			ret = vif_add(&vif, sk==mroute_socket);
		} else {
			ret = vif_delete(vif.vifc_vifi);
		}
		rtnl_unlock();
		return ret;

		/*
		 *	Manipulate the forwarding caches. These live
		 *	in a sort of kernel/user symbiosis.
		 */
	case MRT_ADD_MFC:
	case MRT_DEL_MFC:
		if (optlen!=sizeof(mfc))
			return -EINVAL;
		if (copy_from_user(&mfc,optval, sizeof(mfc)))
			return -EFAULT;
		rtnl_lock();
		if (optname==MRT_DEL_MFC)
			ret = ipmr_mfc_delete(&mfc);
		else
			ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
		rtnl_unlock();
		return ret;
		/*
		 *	Control PIM assert.
		 */
	case MRT_ASSERT:
	{
		int v;
		if (get_user(v,(int __user *)optval))
			return -EFAULT;
		mroute_do_assert=(v)?1:0;
		return 0;
	}
#ifdef CONFIG_IP_PIMSM
	case MRT_PIM:
	{
		int v, ret;
		if (get_user(v,(int __user *)optval))
			return -EFAULT;
		v = (v)?1:0;
		rtnl_lock();
		ret = 0;
		if (v != mroute_do_pim) {
			mroute_do_pim = v;
			mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
			if (mroute_do_pim)
				ret = inet_add_protocol(&pim_protocol,
							IPPROTO_PIM);
			else
				ret = inet_del_protocol(&pim_protocol,
							IPPROTO_PIM);
			if (ret < 0)
				ret = -EAGAIN;
#endif
		}
		rtnl_unlock();
		return ret;
	}
#endif
	/*
	 *	Spurious command, or MRT_VERSION which you cannot
	 *	set.
	 */
	default:
		return -ENOPROTOOPT;
	}
}
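
/* Editorial sketch of a routing daemon's lifecycle against the options
 * above (assumes CAP_NET_ADMIN; error handling omitted):
 *
 *	int one = 1;
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *
 *	setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *	// ... MRT_ADD_VIF / MRT_ADD_MFC as topology is learned,
 *	// ... read(fd, ...) for IGMP packets and IGMPMSG_* upcalls,
 *	setsockopt(fd, IPPROTO_IP, MRT_DONE, &one, sizeof(one));
 *
 * Only one such socket may exist at a time (mroute_socket); closing it
 * triggers mrtsock_destruct() above, which tears down all non-static state.
 */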

/*
 *	Getsock opt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
{
	int olr;
	int val;

	if (optname!=MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
	   optname!=MRT_PIM &&
#endif
	   optname!=MRT_ASSERT)
		return -ENOPROTOOPT;

	if (get_user(olr, optlen))
		return -EFAULT;

	/* Reject a negative length before the unsigned min_t() clamp,
	   which would otherwise make this check unreachable. */
	if (olr < 0)
		return -EINVAL;
	olr = min_t(unsigned int, olr, sizeof(int));

	if (put_user(olr,optlen))
		return -EFAULT;
	if (optname==MRT_VERSION)
		val=0x0305;
#ifdef CONFIG_IP_PIMSM
	else if (optname==MRT_PIM)
		val=mroute_do_pim;
#endif
	else
		val=mroute_do_assert;
	if (copy_to_user(optval,&val,olr))
		return -EFAULT;
	return 0;
}

/*
 *	The IP multicast ioctl support routines.
 */

int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
	struct sioc_sg_req sr;
	struct sioc_vif_req vr;
	struct vif_device *vif;
	struct mfc_cache *c;

	switch (cmd) {
	case SIOCGETVIFCNT:
		if (copy_from_user(&vr,arg,sizeof(vr)))
			return -EFAULT;
		if (vr.vifi>=maxvif)
			return -EINVAL;
		read_lock(&mrt_lock);
		vif=&vif_table[vr.vifi];
		if (VIF_EXISTS(vr.vifi))	{
			vr.icount=vif->pkt_in;
			vr.ocount=vif->pkt_out;
			vr.ibytes=vif->bytes_in;
			vr.obytes=vif->bytes_out;
			read_unlock(&mrt_lock);

			if (copy_to_user(arg,&vr,sizeof(vr)))
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	case SIOCGETSGCNT:
		if (copy_from_user(&sr,arg,sizeof(sr)))
			return -EFAULT;

		read_lock(&mrt_lock);
		c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
		if (c) {
			sr.pktcnt = c->mfc_un.res.pkt;
			sr.bytecnt = c->mfc_un.res.bytes;
			sr.wrong_if = c->mfc_un.res.wrong_if;
			read_unlock(&mrt_lock);

			if (copy_to_user(arg,&sr,sizeof(sr)))
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	default:
		return -ENOIOCTLCMD;
	}
}


static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
	struct vif_device *v;
	int ct;

	if (dev->nd_net != &init_net)
		return NOTIFY_DONE;

	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;
	v=&vif_table[0];
	for (ct=0;ct<maxvif;ct++,v++) {
		if (v->dev==dev)
			vif_delete(ct);
	}
	return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier={
	.notifier_call = ipmr_device_event,
};

/*
 * 	Encapsulate a packet by attaching a valid IPIP header to it.
 *	This avoids tunnel drivers and other mess and gives us the speed so
 *	important for multicast video.
 */

static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct iphdr *iph;
	struct iphdr *old_iph = ip_hdr(skb);

	skb_push(skb, sizeof(struct iphdr));
	skb->transport_header = skb->network_header;
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);

	iph->version	= 	4;
	iph->tos	=	old_iph->tos;
	iph->ttl	=	old_iph->ttl;
	iph->frag_off	=	0;
	iph->daddr	=	daddr;
	iph->saddr	=	saddr;
	iph->protocol	=	IPPROTO_IPIP;
	iph->ihl	=	5;
	iph->tot_len	=	htons(skb->len);
	ip_select_ident(iph, skb->dst, NULL);
	ip_send_check(iph);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	nf_reset(skb);
}
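
/* Resulting wire format (editorial): the original datagram is prefixed,
 * not rewritten, so the tunnel peer's IPIP decapsulation recovers it
 * unchanged:
 *
 *	before:	[ inner IP | payload ]
 *	after:	[ outer IP, proto 4 (IPPROTO_IPIP) | inner IP | payload ]
 *
 * The outer TTL and TOS are copied from the inner header rather than
 * reset, so an encapsulated packet cannot outlive the original.
 */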

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
	struct ip_options * opt	= &(IPCB(skb)->opt);

	IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

	if (unlikely(opt->optlen))
		ip_forward_options(skb);

	return dst_output(skb);
}

/*
 *	Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct vif_device *vif = &vif_table[vifi];
	struct net_device *dev;
	struct rtable *rt;
	int    encap = 0;

	if (vif->dev == NULL)
		goto out_free;

#ifdef CONFIG_IP_PIMSM
	if (vif->flags & VIFF_REGISTER) {
		vif->pkt_out++;
		vif->bytes_out+=skb->len;
		((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
		((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
		ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
		kfree_skb(skb);
		return;
	}
#endif

	if (vif->flags&VIFF_TUNNEL) {
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = vif->remote,
						.saddr = vif->local,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(&rt, &fl))
			goto out_free;
		encap = sizeof(struct iphdr);
	} else {
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(&rt, &fl))
			goto out_free;
	}

	dev = rt->u.dst.dev;

	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
		/* Do not fragment multicasts. Alas, IPv4 does not
		   allow us to send ICMP here, so such packets will
		   disappear into a black hole.
		 */

		IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		ip_rt_put(rt);
		goto out_free;
	}

	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

	if (skb_cow(skb, encap)) {
		ip_rt_put(rt);
		goto out_free;
	}

	vif->pkt_out++;
	vif->bytes_out+=skb->len;

	dst_release(skb->dst);
	skb->dst = &rt->u.dst;
	ip_decrease_ttl(ip_hdr(skb));

	/* FIXME: forward and output firewalls used to be called here.
	 * What do we do with netfilter? -- RR */
	if (vif->flags & VIFF_TUNNEL) {
		ip_encap(skb, vif->local, vif->remote);
		/* FIXME: extra output firewall step used to be here. --RR */
		((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
		((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
	}

	IPCB(skb)->flags |= IPSKB_FORWARDED;

	/*
	 * RFC1584 teaches that a DVMRP/PIM router must deliver packets
	 * locally not only before forwarding, but also after forwarding on
	 * all output interfaces. Clearly, if the mrouter runs a multicast
	 * program, it should receive packets regardless of which interface
	 * the program has joined on. If we did not do this, the program
	 * would have to join on all interfaces. On the other hand, a
	 * multihomed host (or router, but not mrouter) cannot join on more
	 * than one interface - that would result in receiving multiple
	 * copies of each packet.
	 */
	NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
		ipmr_forward_finish);
	return;

out_free:
	kfree_skb(skb);
	return;
}

static int ipmr_find_vif(struct net_device *dev)
{
	int ct;
	for (ct=maxvif-1; ct>=0; ct--) {
		if (vif_table[ct].dev == dev)
			break;
	}
	return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
	int psend = -1;
	int vif, ct;

	vif = cache->mfc_parent;
	cache->mfc_un.res.pkt++;
	cache->mfc_un.res.bytes += skb->len;

	/*
	 * Wrong interface: drop packet and (maybe) send PIM assert.
	 */
	if (vif_table[vif].dev != skb->dev) {
		int true_vifi;

		if (((struct rtable*)skb->dst)->fl.iif == 0) {
			/* It is our own packet, looped back.
			   Very complicated situation...

			   The best workaround until routing daemons are
			   fixed is not to redistribute a packet if it was
			   sent through the wrong interface. It means that
			   multicast applications WILL NOT work for (S,G)
			   entries whose default multicast route points to
			   the wrong oif. In any case, it is not a good
			   idea to run multicast applications on a router.
			 */
			goto dont_forward;
		}

		cache->mfc_un.res.wrong_if++;
		true_vifi = ipmr_find_vif(skb->dev);

		if (true_vifi >= 0 && mroute_do_assert &&
		    /* pimsm uses asserts when switching from RPT to SPT,
		       so we cannot check that the packet arrived on an oif.
		       It is bad, but otherwise we would need to move a
		       pretty large chunk of pimd into the kernel. Ough... --ANK
		     */
		    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
		    time_after(jiffies,
			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
			cache->mfc_un.res.last_assert = jiffies;
			ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
		}
		goto dont_forward;
	}

	vif_table[vif].pkt_in++;
	vif_table[vif].bytes_in+=skb->len;

	/*
	 *	Forward the frame
	 */
	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
		if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
			if (psend != -1) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					ipmr_queue_xmit(skb2, cache, psend);
			}
			psend=ct;
		}
	}
	if (psend != -1) {
		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (skb2)
				ipmr_queue_xmit(skb2, cache, psend);
		} else {
			ipmr_queue_xmit(skb, cache, psend);
			return 0;
		}
	}

dont_forward:
	if (!local)
		kfree_skb(skb);
	return 0;
}


/*
 *	Multicast packets for forwarding arrive here
 */
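/* Editorial note (an assumption about the caller, based on reading
 * net/ipv4/route.c): this is the input handler installed on multicast dst
 * entries when CONFIG_IP_MROUTE is set, so it runs in softirq context -
 * hence the plain read_lock(&mrt_lock) here versus the _bh variants used
 * by the process-context writers above.
 */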

int ip_mr_input(struct sk_buff *skb)
{
	struct mfc_cache *cache;
	int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;

	/* If the packet is looped back after forwarding, it must not be
	   forwarded a second time, but it can still be delivered locally.
	 */
	if (IPCB(skb)->flags&IPSKB_FORWARDED)
		goto dont_forward;

	if (!local) {
		    if (IPCB(skb)->opt.router_alert) {
			    if (ip_call_ra_chain(skb))
				    return 0;
		    } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
			    /* IGMPv1 (and broken IGMPv2 implementations such
			       as Cisco IOS <= 11.2(8)) do not put the router
			       alert option into IGMP packets destined to
			       routable groups. It is very bad, because it
			       means that we can forward NO IGMP messages.
			     */
			    read_lock(&mrt_lock);
			    if (mroute_socket) {
				    nf_reset(skb);
				    raw_rcv(mroute_socket, skb);
				    read_unlock(&mrt_lock);
				    return 0;
			    }
			    read_unlock(&mrt_lock);
		    }
	}

	read_lock(&mrt_lock);
	cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);

	/*
	 *	No usable cache entry
	 */
	if (cache==NULL) {
		int vif;

		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			ip_local_deliver(skb);
			if (skb2 == NULL) {
				read_unlock(&mrt_lock);
				return -ENOBUFS;
			}
			skb = skb2;
		}

		vif = ipmr_find_vif(skb->dev);
		if (vif >= 0) {
			int err = ipmr_cache_unresolved(vif, skb);
			read_unlock(&mrt_lock);

			return err;
		}
		read_unlock(&mrt_lock);
		kfree_skb(skb);
		return -ENODEV;
	}

	ip_mr_forward(skb, cache, local);

	read_unlock(&mrt_lock);

	if (local)
		return ip_local_deliver(skb);

	return 0;

dont_forward:
	if (local)
		return ip_local_deliver(skb);
	kfree_skb(skb);
	return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/*
 * Handle IGMP messages of PIMv1
 */

int pim_rcv_v1(struct sk_buff * skb)
{
	struct igmphdr *pim;
	struct iphdr   *encap;
	struct net_device  *reg_dev = NULL;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
		goto drop;

	pim = igmp_hdr(skb);

	if (!mroute_do_pim ||
	    skb->len < sizeof(*pim) + sizeof(*encap) ||
	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
		goto drop;

	encap = (struct iphdr *)(skb_transport_header(skb) +
				 sizeof(struct igmphdr));
	/*
	   Check that:
	   a. the packet is really destined to a multicast group
	   b. the packet is not a NULL-REGISTER
	   c. the packet is not truncated
	 */
	if (!MULTICAST(encap->daddr) ||
	    encap->tot_len == 0 ||
	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
		goto drop;

	read_lock(&mrt_lock);
	if (reg_vif_num >= 0)
		reg_dev = vif_table[reg_vif_num].dev;
	if (reg_dev)
		dev_hold(reg_dev);
	read_unlock(&mrt_lock);

	if (reg_dev == NULL)
		goto drop;

	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8*)encap - skb->data);
	skb_reset_network_header(skb);
	skb->dev = reg_dev;
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = 0;
	skb->pkt_type = PACKET_HOST;
	dst_release(skb->dst);
	skb->dst = NULL;
	((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
	((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
	nf_reset(skb);
	netif_rx(skb);
	dev_put(reg_dev);
	return 0;
 drop:
	kfree_skb(skb);
	return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
static int pim_rcv(struct sk_buff * skb)
{
	struct pimreghdr *pim;
	struct iphdr   *encap;
	struct net_device  *reg_dev = NULL;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
		goto drop;

	pim = (struct pimreghdr *)skb_transport_header(skb);
	if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
	    (pim->flags&PIM_NULL_REGISTER) ||
	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
		goto drop;

	/* check that the inner packet is destined to a mcast group */
	encap = (struct iphdr *)(skb_transport_header(skb) +
				 sizeof(struct pimreghdr));
	if (!MULTICAST(encap->daddr) ||
	    encap->tot_len == 0 ||
	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
		goto drop;

	read_lock(&mrt_lock);
	if (reg_vif_num >= 0)
		reg_dev = vif_table[reg_vif_num].dev;
	if (reg_dev)
		dev_hold(reg_dev);
	read_unlock(&mrt_lock);

	if (reg_dev == NULL)
		goto drop;

	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8*)encap - skb->data);
	skb_reset_network_header(skb);
	skb->dev = reg_dev;
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = 0;
	skb->pkt_type = PACKET_HOST;
	dst_release(skb->dst);
	((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
	((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
	skb->dst = NULL;
	nf_reset(skb);
	netif_rx(skb);
	dev_put(reg_dev);
	return 0;
 drop:
	kfree_skb(skb);
	return 0;
}
#endif

static int
ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
	int ct;
	struct rtnexthop *nhp;
	struct net_device *dev = vif_table[c->mfc_parent].dev;
	u8 *b = skb_tail_pointer(skb);
	struct rtattr *mp_head;

	if (dev)
		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

	mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));

	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
		if (c->mfc_un.res.ttls[ct] < 255) {
			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
				goto rtattr_failure;
			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
			nhp->rtnh_flags = 0;
			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
			nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
			nhp->rtnh_len = sizeof(*nhp);
		}
	}
	mp_head->rta_type = RTA_MULTIPATH;
	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
	rtm->rtm_type = RTN_MULTICAST;
	return 1;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -EMSGSIZE;
}

int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
	int err;
	struct mfc_cache *cache;
	struct rtable *rt = (struct rtable*)skb->dst;

	read_lock(&mrt_lock);
	cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);

	if (cache==NULL) {
		struct sk_buff *skb2;
		struct iphdr *iph;
		struct net_device *dev;
		int vif;

		if (nowait) {
			read_unlock(&mrt_lock);
			return -EAGAIN;
		}

		dev = skb->dev;
		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
			read_unlock(&mrt_lock);
			return -ENODEV;
		}
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2) {
			read_unlock(&mrt_lock);
			return -ENOMEM;
		}

		skb_push(skb2, sizeof(struct iphdr));
		skb_reset_network_header(skb2);
		iph = ip_hdr(skb2);
		iph->ihl = sizeof(struct iphdr) >> 2;
		iph->saddr = rt->rt_src;
		iph->daddr = rt->rt_dst;
		iph->version = 0;
		err = ipmr_cache_unresolved(vif, skb2);
		read_unlock(&mrt_lock);
		return err;
	}

	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
		cache->mfc_flags |= MFC_NOTIFY;
	err = ipmr_fill_mroute(skb, cache, rtm);
	read_unlock(&mrt_lock);
	return err;
}

#ifdef CONFIG_PROC_FS
/*
 *	The /proc interfaces to multicast routing:
 *	/proc/net/ip_mr_cache and /proc/net/ip_mr_vif
 */
struct ipmr_vif_iter {
	int ct;
};

static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
					   loff_t pos)
{
	for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
		if (!VIF_EXISTS(iter->ct))
			continue;
		if (pos-- == 0)
			return &vif_table[iter->ct];
	}
	return NULL;
}

static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&mrt_lock);
	return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}

static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ipmr_vif_iter *iter = seq->private;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ipmr_vif_seq_idx(iter, 0);

	while (++iter->ct < maxvif) {
		if (!VIF_EXISTS(iter->ct))
			continue;
		return &vif_table[iter->ct];
	}
	return NULL;
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&mrt_lock);
}

static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
	} else {
		const struct vif_device *vif = v;
		const char *name =  vif->dev ? vif->dev->name : "none";

		seq_printf(seq,
			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
			   vif - vif_table,
			   name, vif->bytes_in, vif->pkt_in,
			   vif->bytes_out, vif->pkt_out,
			   vif->flags, vif->local, vif->remote);
	}
	return 0;
}

static const struct seq_operations ipmr_vif_seq_ops = {
	.start = ipmr_vif_seq_start,
	.next  = ipmr_vif_seq_next,
	.stop  = ipmr_vif_seq_stop,
	.show  = ipmr_vif_seq_show,
};

static int ipmr_vif_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &ipmr_vif_seq_ops,
			sizeof(struct ipmr_vif_iter));
}

static const struct file_operations ipmr_vif_fops = {
	.owner	 = THIS_MODULE,
	.open    = ipmr_vif_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};

struct ipmr_mfc_iter {
	struct mfc_cache **cache;
	int ct;
};


static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
{
	struct mfc_cache *mfc;

	it->cache = mfc_cache_array;
	read_lock(&mrt_lock);
	for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
		for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
			if (pos-- == 0)
				return mfc;
	read_unlock(&mrt_lock);

	it->cache = &mfc_unres_queue;
	spin_lock_bh(&mfc_unres_lock);
	for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
		if (pos-- == 0)
			return mfc;
	spin_unlock_bh(&mfc_unres_lock);

	it->cache = NULL;
	return NULL;
}


static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct ipmr_mfc_iter *it = seq->private;
	it->cache = NULL;
	it->ct = 0;
	return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}

static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct mfc_cache *mfc = v;
	struct ipmr_mfc_iter *it = seq->private;

	++*pos;

	if (v == SEQ_START_TOKEN)
		return ipmr_mfc_seq_idx(seq->private, 0);

	if (mfc->next)
		return mfc->next;

	if (it->cache == &mfc_unres_queue)
		goto end_of_list;

	BUG_ON(it->cache != mfc_cache_array);

	while (++it->ct < MFC_LINES) {
		mfc = mfc_cache_array[it->ct];
		if (mfc)
			return mfc;
	}

	/* exhausted cache_array, show unresolved */
	read_unlock(&mrt_lock);
	it->cache = &mfc_unres_queue;
	it->ct = 0;

	spin_lock_bh(&mfc_unres_lock);
	mfc = mfc_unres_queue;
	if (mfc)
		return mfc;

 end_of_list:
	spin_unlock_bh(&mfc_unres_lock);
	it->cache = NULL;

	return NULL;
}

static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
	struct ipmr_mfc_iter *it = seq->private;

	if (it->cache == &mfc_unres_queue)
		spin_unlock_bh(&mfc_unres_lock);
	else if (it->cache == mfc_cache_array)
		read_unlock(&mrt_lock);
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
	int n;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
	} else {
		const struct mfc_cache *mfc = v;
		const struct ipmr_mfc_iter *it = seq->private;

		seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
			   (unsigned long) mfc->mfc_mcastgrp,
			   (unsigned long) mfc->mfc_origin,
			   mfc->mfc_parent,
			   mfc->mfc_un.res.pkt,
			   mfc->mfc_un.res.bytes,
			   mfc->mfc_un.res.wrong_if);

		if (it->cache != &mfc_unres_queue) {
			for (n = mfc->mfc_un.res.minvif;
			     n < mfc->mfc_un.res.maxvif; n++) {
				if (VIF_EXISTS(n) &&
				    mfc->mfc_un.res.ttls[n] < 255)
					seq_printf(seq,
						   " %2d:%-3d",
						   n, mfc->mfc_un.res.ttls[n]);
			}
		}
		seq_putc(seq, '\n');
	}
	return 0;
}

static const struct seq_operations ipmr_mfc_seq_ops = {
	.start = ipmr_mfc_seq_start,
	.next  = ipmr_mfc_seq_next,
	.stop  = ipmr_mfc_seq_stop,
	.show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &ipmr_mfc_seq_ops,
			sizeof(struct ipmr_mfc_iter));
}

static const struct file_operations ipmr_mfc_fops = {
	.owner	 = THIS_MODULE,
	.open    = ipmr_mfc_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};
#endif

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol = {
	.handler	=	pim_rcv,
};
#endif


/*
 *	Setup for IP multicast routing
 */

void __init ip_mr_init(void)
{
	mrt_cachep = kmem_cache_create("ip_mrt_cache",
				       sizeof(struct mfc_cache),
				       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
				       NULL);
	init_timer(&ipmr_expire_timer);
	ipmr_expire_timer.function=ipmr_expire_process;
	register_netdevice_notifier(&ip_mr_notifier);
#ifdef CONFIG_PROC_FS
	proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops);
	proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops);
#endif
}