xref: /openbmc/linux/net/ipv4/ipmr.c (revision 1fa6ac37)
1 /*
2  *	IP multicast routing support for mrouted 3.6/3.8
3  *
4  *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *	  Linux Consultancy and Custom Driver Development
6  *
7  *	This program is free software; you can redistribute it and/or
8  *	modify it under the terms of the GNU General Public License
9  *	as published by the Free Software Foundation; either version
10  *	2 of the License, or (at your option) any later version.
11  *
12  *	Fixes:
13  *	Michael Chastain	:	Incorrect size of copying.
14  *	Alan Cox		:	Added the cache manager code
15  *	Alan Cox		:	Fixed the clone/copy bug and device race.
16  *	Mike McLagan		:	Routing by source
17  *	Malcolm Beattie		:	Buffer handling fixes.
18  *	Alexey Kuznetsov	:	Double buffer free and other fixes.
19  *	SVR Anand		:	Fixed several multicast bugs and problems.
20  *	Alexey Kuznetsov	:	Status, optimisations and more.
21  *	Brad Parker		:	Better behaviour on mrouted upcall
22  *					overflow.
23  *      Carlos Picoto           :       PIMv1 Support
24  *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
25  *					Relax this requirement to work with older peers.
26  *
27  */
28 
29 #include <asm/system.h>
30 #include <asm/uaccess.h>
31 #include <linux/types.h>
32 #include <linux/capability.h>
33 #include <linux/errno.h>
34 #include <linux/timer.h>
35 #include <linux/mm.h>
36 #include <linux/kernel.h>
37 #include <linux/fcntl.h>
38 #include <linux/stat.h>
39 #include <linux/socket.h>
40 #include <linux/in.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/inetdevice.h>
44 #include <linux/igmp.h>
45 #include <linux/proc_fs.h>
46 #include <linux/seq_file.h>
47 #include <linux/mroute.h>
48 #include <linux/init.h>
49 #include <linux/if_ether.h>
50 #include <linux/slab.h>
51 #include <net/net_namespace.h>
52 #include <net/ip.h>
53 #include <net/protocol.h>
54 #include <linux/skbuff.h>
55 #include <net/route.h>
56 #include <net/sock.h>
57 #include <net/icmp.h>
58 #include <net/udp.h>
59 #include <net/raw.h>
60 #include <linux/notifier.h>
61 #include <linux/if_arp.h>
62 #include <linux/netfilter_ipv4.h>
63 #include <net/ipip.h>
64 #include <net/checksum.h>
65 #include <net/netlink.h>
66 #include <net/fib_rules.h>
67 
68 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
69 #define CONFIG_IP_PIMSM	1
70 #endif
71 
72 struct mr_table {
73 	struct list_head	list;
74 #ifdef CONFIG_NET_NS
75 	struct net		*net;
76 #endif
77 	u32			id;
78 	struct sock		*mroute_sk;
79 	struct timer_list	ipmr_expire_timer;
80 	struct list_head	mfc_unres_queue;
81 	struct list_head	mfc_cache_array[MFC_LINES];
82 	struct vif_device	vif_table[MAXVIFS];
83 	int			maxvif;
84 	atomic_t		cache_resolve_queue_len;
85 	int			mroute_do_assert;
86 	int			mroute_do_pim;
87 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
88 	int			mroute_reg_vif_num;
89 #endif
90 };
91 
92 struct ipmr_rule {
93 	struct fib_rule		common;
94 };
95 
96 struct ipmr_result {
97 	struct mr_table		*mrt;
98 };
99 
100 /* Big lock, protecting the vif table, the mrt cache and mroute socket state.
101    Note that changes are serialized via rtnl_lock.
102  */
103 
104 static DEFINE_RWLOCK(mrt_lock);
105 
106 /*
107  *	Multicast router control variables
108  */
109 
110 #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
111 
112 /* Special spinlock for queue of unresolved entries */
113 static DEFINE_SPINLOCK(mfc_unres_lock);
114 
115 /* We return to Alan's original scheme. The hash table of resolved
116    entries is changed only in process context and is protected by
117    the weak lock mrt_lock. The queue of unresolved entries is
118    protected by the strong spinlock mfc_unres_lock.
119 
120    This way the data path is entirely free of exclusive locks.
121  */
122 
123 static struct kmem_cache *mrt_cachep __read_mostly;
124 
125 static struct mr_table *ipmr_new_table(struct net *net, u32 id);
126 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
127 			 struct sk_buff *skb, struct mfc_cache *cache,
128 			 int local);
129 static int ipmr_cache_report(struct mr_table *mrt,
130 			     struct sk_buff *pkt, vifi_t vifi, int assert);
131 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
132 			      struct mfc_cache *c, struct rtmsg *rtm);
133 static void ipmr_expire_process(unsigned long arg);
134 
135 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
136 #define ipmr_for_each_table(mrt, net) \
137 	list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
138 
139 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
140 {
141 	struct mr_table *mrt;
142 
143 	ipmr_for_each_table(mrt, net) {
144 		if (mrt->id == id)
145 			return mrt;
146 	}
147 	return NULL;
148 }
149 
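/* Resolve which multicast routing table a flow should use by running it
 * through the IPMR fib rules of this namespace.
 */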
150 static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
151 			   struct mr_table **mrt)
152 {
153 	struct ipmr_result res;
154 	struct fib_lookup_arg arg = { .result = &res, };
155 	int err;
156 
157 	err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
158 	if (err < 0)
159 		return err;
160 	*mrt = res.mrt;
161 	return 0;
162 }
163 
164 static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
165 			    int flags, struct fib_lookup_arg *arg)
166 {
167 	struct ipmr_result *res = arg->result;
168 	struct mr_table *mrt;
169 
170 	switch (rule->action) {
171 	case FR_ACT_TO_TBL:
172 		break;
173 	case FR_ACT_UNREACHABLE:
174 		return -ENETUNREACH;
175 	case FR_ACT_PROHIBIT:
176 		return -EACCES;
177 	case FR_ACT_BLACKHOLE:
178 	default:
179 		return -EINVAL;
180 	}
181 
182 	mrt = ipmr_get_table(rule->fr_net, rule->table);
183 	if (mrt == NULL)
184 		return -EAGAIN;
185 	res->mrt = mrt;
186 	return 0;
187 }
188 
189 static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
190 {
191 	return 1;
192 }
193 
194 static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
195 	FRA_GENERIC_POLICY,
196 };
197 
198 static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
199 			       struct fib_rule_hdr *frh, struct nlattr **tb)
200 {
201 	return 0;
202 }
203 
204 static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
205 			     struct nlattr **tb)
206 {
207 	return 1;
208 }
209 
210 static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
211 			  struct fib_rule_hdr *frh)
212 {
213 	frh->dst_len = 0;
214 	frh->src_len = 0;
215 	frh->tos     = 0;
216 	return 0;
217 }
218 
219 static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
220 	.family		= RTNL_FAMILY_IPMR,
221 	.rule_size	= sizeof(struct ipmr_rule),
222 	.addr_size	= sizeof(u32),
223 	.action		= ipmr_rule_action,
224 	.match		= ipmr_rule_match,
225 	.configure	= ipmr_rule_configure,
226 	.compare	= ipmr_rule_compare,
227 	.default_pref	= fib_default_rule_pref,
228 	.fill		= ipmr_rule_fill,
229 	.nlgroup	= RTNLGRP_IPV4_RULE,
230 	.policy		= ipmr_rule_policy,
231 	.owner		= THIS_MODULE,
232 };
233 
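/* Per-namespace setup for the multi-table case: register the IPMR rule ops,
 * create the default table and install a catch-all rule pointing at it.
 */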
234 static int __net_init ipmr_rules_init(struct net *net)
235 {
236 	struct fib_rules_ops *ops;
237 	struct mr_table *mrt;
238 	int err;
239 
240 	ops = fib_rules_register(&ipmr_rules_ops_template, net);
241 	if (IS_ERR(ops))
242 		return PTR_ERR(ops);
243 
244 	INIT_LIST_HEAD(&net->ipv4.mr_tables);
245 
246 	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
247 	if (mrt == NULL) {
248 		err = -ENOMEM;
249 		goto err1;
250 	}
251 
252 	err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
253 	if (err < 0)
254 		goto err2;
255 
256 	net->ipv4.mr_rules_ops = ops;
257 	return 0;
258 
259 err2:
260 	kfree(mrt);
261 err1:
262 	fib_rules_unregister(ops);
263 	return err;
264 }
265 
266 static void __net_exit ipmr_rules_exit(struct net *net)
267 {
268 	struct mr_table *mrt, *next;
269 
270 	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
271 		list_del(&mrt->list);
272 		kfree(mrt);
273 	}
274 	fib_rules_unregister(net->ipv4.mr_rules_ops);
275 }
276 #else
277 #define ipmr_for_each_table(mrt, net) \
278 	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
279 
280 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
281 {
282 	return net->ipv4.mrt;
283 }
284 
285 static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
286 			   struct mr_table **mrt)
287 {
288 	*mrt = net->ipv4.mrt;
289 	return 0;
290 }
291 
292 static int __net_init ipmr_rules_init(struct net *net)
293 {
294 	net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
295 	return net->ipv4.mrt ? 0 : -ENOMEM;
296 }
297 
298 static void __net_exit ipmr_rules_exit(struct net *net)
299 {
300 	kfree(net->ipv4.mrt);
301 }
302 #endif
303 
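/* Return the multicast routing table with the given id, creating it if it
 * does not exist yet.
 */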
304 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
305 {
306 	struct mr_table *mrt;
307 	unsigned int i;
308 
309 	mrt = ipmr_get_table(net, id);
310 	if (mrt != NULL)
311 		return mrt;
312 
313 	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
314 	if (mrt == NULL)
315 		return NULL;
316 	write_pnet(&mrt->net, net);
317 	mrt->id = id;
318 
319 	/* Forwarding cache */
320 	for (i = 0; i < MFC_LINES; i++)
321 		INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
322 
323 	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
324 
325 	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
326 		    (unsigned long)mrt);
327 
328 #ifdef CONFIG_IP_PIMSM
329 	mrt->mroute_reg_vif_num = -1;
330 #endif
331 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
332 	list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
333 #endif
334 	return mrt;
335 }
336 
337 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
338 
339 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
340 {
341 	struct net *net = dev_net(dev);
342 
343 	dev_close(dev);
344 
345 	dev = __dev_get_by_name(net, "tunl0");
346 	if (dev) {
347 		const struct net_device_ops *ops = dev->netdev_ops;
348 		struct ifreq ifr;
349 		struct ip_tunnel_parm p;
350 
351 		memset(&p, 0, sizeof(p));
352 		p.iph.daddr = v->vifc_rmt_addr.s_addr;
353 		p.iph.saddr = v->vifc_lcl_addr.s_addr;
354 		p.iph.version = 4;
355 		p.iph.ihl = 5;
356 		p.iph.protocol = IPPROTO_IPIP;
357 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
358 		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
359 
360 		if (ops->ndo_do_ioctl) {
361 			mm_segment_t oldfs = get_fs();
362 
363 			set_fs(KERNEL_DS);
364 			ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
365 			set_fs(oldfs);
366 		}
367 	}
368 }
369 
370 static
371 struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
372 {
373 	struct net_device  *dev;
374 
375 	dev = __dev_get_by_name(net, "tunl0");
376 
377 	if (dev) {
378 		const struct net_device_ops *ops = dev->netdev_ops;
379 		int err;
380 		struct ifreq ifr;
381 		struct ip_tunnel_parm p;
382 		struct in_device  *in_dev;
383 
384 		memset(&p, 0, sizeof(p));
385 		p.iph.daddr = v->vifc_rmt_addr.s_addr;
386 		p.iph.saddr = v->vifc_lcl_addr.s_addr;
387 		p.iph.version = 4;
388 		p.iph.ihl = 5;
389 		p.iph.protocol = IPPROTO_IPIP;
390 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
391 		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
392 
393 		if (ops->ndo_do_ioctl) {
394 			mm_segment_t oldfs = get_fs();
395 
396 			set_fs(KERNEL_DS);
397 			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
398 			set_fs(oldfs);
399 		} else
400 			err = -EOPNOTSUPP;
401 
402 		dev = NULL;
403 
404 		if (err == 0 &&
405 		    (dev = __dev_get_by_name(net, p.name)) != NULL) {
406 			dev->flags |= IFF_MULTICAST;
407 
408 			in_dev = __in_dev_get_rtnl(dev);
409 			if (in_dev == NULL)
410 				goto failure;
411 
412 			ipv4_devconf_setall(in_dev);
413 			IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
414 
415 			if (dev_open(dev))
416 				goto failure;
417 			dev_hold(dev);
418 		}
419 	}
420 	return dev;
421 
422 failure:
423 	/* allow the register to be completed before unregistering. */
424 	rtnl_unlock();
425 	rtnl_lock();
426 
427 	unregister_netdevice(dev);
428 	return NULL;
429 }
430 
431 #ifdef CONFIG_IP_PIMSM
432 
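/* "Transmit" on the PIM register device: account the packet, bounce it up to
 * the daemon as an IGMPMSG_WHOLEPKT report and then free it.
 */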
433 static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
434 {
435 	struct net *net = dev_net(dev);
436 	struct mr_table *mrt;
437 	struct flowi fl = {
438 		.oif		= dev->ifindex,
439 		.iif		= skb->skb_iif,
440 		.mark		= skb->mark,
441 	};
442 	int err;
443 
444 	err = ipmr_fib_lookup(net, &fl, &mrt);
445 	if (err < 0)
446 		return err;
447 
448 	read_lock(&mrt_lock);
449 	dev->stats.tx_bytes += skb->len;
450 	dev->stats.tx_packets++;
451 	ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
452 	read_unlock(&mrt_lock);
453 	kfree_skb(skb);
454 	return NETDEV_TX_OK;
455 }
456 
457 static const struct net_device_ops reg_vif_netdev_ops = {
458 	.ndo_start_xmit	= reg_vif_xmit,
459 };
460 
461 static void reg_vif_setup(struct net_device *dev)
462 {
463 	dev->type		= ARPHRD_PIMREG;
464 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
465 	dev->flags		= IFF_NOARP;
466 	dev->netdev_ops		= &reg_vif_netdev_ops;
467 	dev->destructor		= free_netdev;
468 	dev->features		|= NETIF_F_NETNS_LOCAL;
469 }
470 
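/* Allocate, register and bring up the pimreg device that backs the PIM
 * register VIF of this table.
 */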
471 static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
472 {
473 	struct net_device *dev;
474 	struct in_device *in_dev;
475 	char name[IFNAMSIZ];
476 
477 	if (mrt->id == RT_TABLE_DEFAULT)
478 		sprintf(name, "pimreg");
479 	else
480 		sprintf(name, "pimreg%u", mrt->id);
481 
482 	dev = alloc_netdev(0, name, reg_vif_setup);
483 
484 	if (dev == NULL)
485 		return NULL;
486 
487 	dev_net_set(dev, net);
488 
489 	if (register_netdevice(dev)) {
490 		free_netdev(dev);
491 		return NULL;
492 	}
493 	dev->iflink = 0;
494 
495 	rcu_read_lock();
496 	if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
497 		rcu_read_unlock();
498 		goto failure;
499 	}
500 
501 	ipv4_devconf_setall(in_dev);
502 	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
503 	rcu_read_unlock();
504 
505 	if (dev_open(dev))
506 		goto failure;
507 
508 	dev_hold(dev);
509 
510 	return dev;
511 
512 failure:
513 	/* allow the register to be completed before unregistering. */
514 	rtnl_unlock();
515 	rtnl_lock();
516 
517 	unregister_netdevice(dev);
518 	return NULL;
519 }
520 #endif
521 
522 /*
523  *	Delete a VIF entry
524  *	@notify: Set to 1 if the caller is a notifier_call
525  */
526 
527 static int vif_delete(struct mr_table *mrt, int vifi, int notify,
528 		      struct list_head *head)
529 {
530 	struct vif_device *v;
531 	struct net_device *dev;
532 	struct in_device *in_dev;
533 
534 	if (vifi < 0 || vifi >= mrt->maxvif)
535 		return -EADDRNOTAVAIL;
536 
537 	v = &mrt->vif_table[vifi];
538 
539 	write_lock_bh(&mrt_lock);
540 	dev = v->dev;
541 	v->dev = NULL;
542 
543 	if (!dev) {
544 		write_unlock_bh(&mrt_lock);
545 		return -EADDRNOTAVAIL;
546 	}
547 
548 #ifdef CONFIG_IP_PIMSM
549 	if (vifi == mrt->mroute_reg_vif_num)
550 		mrt->mroute_reg_vif_num = -1;
551 #endif
552 
553 	if (vifi+1 == mrt->maxvif) {
554 		int tmp;
555 		for (tmp=vifi-1; tmp>=0; tmp--) {
556 			if (VIF_EXISTS(mrt, tmp))
557 				break;
558 		}
559 		mrt->maxvif = tmp+1;
560 	}
561 
562 	write_unlock_bh(&mrt_lock);
563 
564 	dev_set_allmulti(dev, -1);
565 
566 	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
567 		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
568 		ip_rt_multicast_event(in_dev);
569 	}
570 
571 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
572 		unregister_netdevice_queue(dev, head);
573 
574 	dev_put(dev);
575 	return 0;
576 }
577 
578 static inline void ipmr_cache_free(struct mfc_cache *c)
579 {
580 	kmem_cache_free(mrt_cachep, c);
581 }
582 
583 /* Destroy an unresolved cache entry, killing queued skbs
584    and reporting error to netlink readers.
585  */
586 
587 static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
588 {
589 	struct net *net = read_pnet(&mrt->net);
590 	struct sk_buff *skb;
591 	struct nlmsgerr *e;
592 
593 	atomic_dec(&mrt->cache_resolve_queue_len);
594 
595 	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
596 		if (ip_hdr(skb)->version == 0) {
597 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
598 			nlh->nlmsg_type = NLMSG_ERROR;
599 			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
600 			skb_trim(skb, nlh->nlmsg_len);
601 			e = NLMSG_DATA(nlh);
602 			e->error = -ETIMEDOUT;
603 			memset(&e->msg, 0, sizeof(e->msg));
604 
605 			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
606 		} else
607 			kfree_skb(skb);
608 	}
609 
610 	ipmr_cache_free(c);
611 }
612 
613 
614 /* Timer process for the unresolved queue. */
615 
616 static void ipmr_expire_process(unsigned long arg)
617 {
618 	struct mr_table *mrt = (struct mr_table *)arg;
619 	unsigned long now;
620 	unsigned long expires;
621 	struct mfc_cache *c, *next;
622 
623 	if (!spin_trylock(&mfc_unres_lock)) {
624 		mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
625 		return;
626 	}
627 
628 	if (list_empty(&mrt->mfc_unres_queue))
629 		goto out;
630 
631 	now = jiffies;
632 	expires = 10*HZ;
633 
634 	list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
635 		if (time_after(c->mfc_un.unres.expires, now)) {
636 			unsigned long interval = c->mfc_un.unres.expires - now;
637 			if (interval < expires)
638 				expires = interval;
639 			continue;
640 		}
641 
642 		list_del(&c->list);
643 		ipmr_destroy_unres(mrt, c);
644 	}
645 
646 	if (!list_empty(&mrt->mfc_unres_queue))
647 		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
648 
649 out:
650 	spin_unlock(&mfc_unres_lock);
651 }
652 
653 /* Fill oifs list. It is called under write locked mrt_lock. */
654 
655 static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
656 				   unsigned char *ttls)
657 {
658 	int vifi;
659 
660 	cache->mfc_un.res.minvif = MAXVIFS;
661 	cache->mfc_un.res.maxvif = 0;
662 	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
663 
664 	for (vifi = 0; vifi < mrt->maxvif; vifi++) {
665 		if (VIF_EXISTS(mrt, vifi) &&
666 		    ttls[vifi] && ttls[vifi] < 255) {
667 			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
668 			if (cache->mfc_un.res.minvif > vifi)
669 				cache->mfc_un.res.minvif = vifi;
670 			if (cache->mfc_un.res.maxvif <= vifi)
671 				cache->mfc_un.res.maxvif = vifi + 1;
672 		}
673 	}
674 }
675 
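/* Add a virtual interface. Depending on vifc_flags the VIF is backed by the
 * PIM register device, a freshly created DVMRP tunnel, or an existing device
 * looked up by ifindex or local address.
 */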
676 static int vif_add(struct net *net, struct mr_table *mrt,
677 		   struct vifctl *vifc, int mrtsock)
678 {
679 	int vifi = vifc->vifc_vifi;
680 	struct vif_device *v = &mrt->vif_table[vifi];
681 	struct net_device *dev;
682 	struct in_device *in_dev;
683 	int err;
684 
685 	/* Is vif busy ? */
686 	if (VIF_EXISTS(mrt, vifi))
687 		return -EADDRINUSE;
688 
689 	switch (vifc->vifc_flags) {
690 #ifdef CONFIG_IP_PIMSM
691 	case VIFF_REGISTER:
692 		/*
693 		 * Special Purpose VIF in PIM
694 		 * All the packets will be sent to the daemon
695 		 */
696 		if (mrt->mroute_reg_vif_num >= 0)
697 			return -EADDRINUSE;
698 		dev = ipmr_reg_vif(net, mrt);
699 		if (!dev)
700 			return -ENOBUFS;
701 		err = dev_set_allmulti(dev, 1);
702 		if (err) {
703 			unregister_netdevice(dev);
704 			dev_put(dev);
705 			return err;
706 		}
707 		break;
708 #endif
709 	case VIFF_TUNNEL:
710 		dev = ipmr_new_tunnel(net, vifc);
711 		if (!dev)
712 			return -ENOBUFS;
713 		err = dev_set_allmulti(dev, 1);
714 		if (err) {
715 			ipmr_del_tunnel(dev, vifc);
716 			dev_put(dev);
717 			return err;
718 		}
719 		break;
720 
721 	case VIFF_USE_IFINDEX:
722 	case 0:
723 		if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
724 			dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
725 			if (dev && dev->ip_ptr == NULL) {
726 				dev_put(dev);
727 				return -EADDRNOTAVAIL;
728 			}
729 		} else
730 			dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
731 
732 		if (!dev)
733 			return -EADDRNOTAVAIL;
734 		err = dev_set_allmulti(dev, 1);
735 		if (err) {
736 			dev_put(dev);
737 			return err;
738 		}
739 		break;
740 	default:
741 		return -EINVAL;
742 	}
743 
744 	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
745 		dev_put(dev);
746 		return -EADDRNOTAVAIL;
747 	}
748 	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
749 	ip_rt_multicast_event(in_dev);
750 
751 	/*
752 	 *	Fill in the VIF structures
753 	 */
754 	v->rate_limit = vifc->vifc_rate_limit;
755 	v->local = vifc->vifc_lcl_addr.s_addr;
756 	v->remote = vifc->vifc_rmt_addr.s_addr;
757 	v->flags = vifc->vifc_flags;
758 	if (!mrtsock)
759 		v->flags |= VIFF_STATIC;
760 	v->threshold = vifc->vifc_threshold;
761 	v->bytes_in = 0;
762 	v->bytes_out = 0;
763 	v->pkt_in = 0;
764 	v->pkt_out = 0;
765 	v->link = dev->ifindex;
766 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
767 		v->link = dev->iflink;
768 
769 	/* And finish update writing critical data */
770 	write_lock_bh(&mrt_lock);
771 	v->dev = dev;
772 #ifdef CONFIG_IP_PIMSM
773 	if (v->flags&VIFF_REGISTER)
774 		mrt->mroute_reg_vif_num = vifi;
775 #endif
776 	if (vifi+1 > mrt->maxvif)
777 		mrt->maxvif = vifi+1;
778 	write_unlock_bh(&mrt_lock);
779 	return 0;
780 }
781 
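/* Look up a resolved (origin, group) entry; callers hold mrt_lock. */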
782 static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
783 					 __be32 origin,
784 					 __be32 mcastgrp)
785 {
786 	int line = MFC_HASH(mcastgrp, origin);
787 	struct mfc_cache *c;
788 
789 	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
790 		if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
791 			return c;
792 	}
793 	return NULL;
794 }
795 
796 /*
797  *	Allocate a multicast cache entry
798  */
799 static struct mfc_cache *ipmr_cache_alloc(void)
800 {
801 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
802 	if (c == NULL)
803 		return NULL;
804 	c->mfc_un.res.minvif = MAXVIFS;
805 	return c;
806 }
807 
808 static struct mfc_cache *ipmr_cache_alloc_unres(void)
809 {
810 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
811 	if (c == NULL)
812 		return NULL;
813 	skb_queue_head_init(&c->mfc_un.unres.unresolved);
814 	c->mfc_un.unres.expires = jiffies + 10*HZ;
815 	return c;
816 }
817 
818 /*
819  *	A cache entry has gone into a resolved state from queued
820  */
821 
822 static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
823 			       struct mfc_cache *uc, struct mfc_cache *c)
824 {
825 	struct sk_buff *skb;
826 	struct nlmsgerr *e;
827 
828 	/*
829 	 *	Play the pending entries through our router
830 	 */
831 
832 	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
833 		if (ip_hdr(skb)->version == 0) {
834 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
835 
836 			if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
837 				nlh->nlmsg_len = (skb_tail_pointer(skb) -
838 						  (u8 *)nlh);
839 			} else {
840 				nlh->nlmsg_type = NLMSG_ERROR;
841 				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
842 				skb_trim(skb, nlh->nlmsg_len);
843 				e = NLMSG_DATA(nlh);
844 				e->error = -EMSGSIZE;
845 				memset(&e->msg, 0, sizeof(e->msg));
846 			}
847 
848 			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
849 		} else
850 			ip_mr_forward(net, mrt, skb, c, 0);
851 	}
852 }
853 
854 /*
855  *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
856  *	expects the following bizarre scheme.
857  *
858  *	Called under mrt_lock.
859  */
860 
861 static int ipmr_cache_report(struct mr_table *mrt,
862 			     struct sk_buff *pkt, vifi_t vifi, int assert)
863 {
864 	struct sk_buff *skb;
865 	const int ihl = ip_hdrlen(pkt);
866 	struct igmphdr *igmp;
867 	struct igmpmsg *msg;
868 	int ret;
869 
870 #ifdef CONFIG_IP_PIMSM
871 	if (assert == IGMPMSG_WHOLEPKT)
872 		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
873 	else
874 #endif
875 		skb = alloc_skb(128, GFP_ATOMIC);
876 
877 	if (!skb)
878 		return -ENOBUFS;
879 
880 #ifdef CONFIG_IP_PIMSM
881 	if (assert == IGMPMSG_WHOLEPKT) {
882 		/* Ugly, but we have no choice with this interface.
883 		   Duplicate old header, fix ihl, length etc.
884 		   And all this only to mangle msg->im_msgtype and
885 		   to set msg->im_mbz to "mbz" :-)
886 		 */
887 		skb_push(skb, sizeof(struct iphdr));
888 		skb_reset_network_header(skb);
889 		skb_reset_transport_header(skb);
890 		msg = (struct igmpmsg *)skb_network_header(skb);
891 		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
892 		msg->im_msgtype = IGMPMSG_WHOLEPKT;
893 		msg->im_mbz = 0;
894 		msg->im_vif = mrt->mroute_reg_vif_num;
895 		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
896 		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
897 					     sizeof(struct iphdr));
898 	} else
899 #endif
900 	{
901 
902 	/*
903 	 *	Copy the IP header
904 	 */
905 
906 	skb->network_header = skb->tail;
907 	skb_put(skb, ihl);
908 	skb_copy_to_linear_data(skb, pkt->data, ihl);
909 	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
910 	msg = (struct igmpmsg *)skb_network_header(skb);
911 	msg->im_vif = vifi;
912 	skb_dst_set(skb, dst_clone(skb_dst(pkt)));
913 
914 	/*
915 	 *	Add our header
916 	 */
917 
918 	igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
919 	igmp->type	=
920 	msg->im_msgtype = assert;
921 	igmp->code 	=	0;
922 	ip_hdr(skb)->tot_len = htons(skb->len);			/* Fix the length */
923 	skb->transport_header = skb->network_header;
924 	}
925 
926 	if (mrt->mroute_sk == NULL) {
927 		kfree_skb(skb);
928 		return -EINVAL;
929 	}
930 
931 	/*
932 	 *	Deliver to mrouted
933 	 */
934 	ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
935 	if (ret < 0) {
936 		if (net_ratelimit())
937 			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
938 		kfree_skb(skb);
939 	}
940 
941 	return ret;
942 }
943 
944 /*
945  *	Queue a packet for resolution. The packet is attached to an unresolved
 *	cache entry (created if necessary) under mfc_unres_lock.
946  */
947 
948 static int
949 ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
950 {
951 	bool found = false;
952 	int err;
953 	struct mfc_cache *c;
954 	const struct iphdr *iph = ip_hdr(skb);
955 
956 	spin_lock_bh(&mfc_unres_lock);
957 	list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
958 		if (c->mfc_mcastgrp == iph->daddr &&
959 		    c->mfc_origin == iph->saddr) {
960 			found = true;
961 			break;
962 		}
963 	}
964 
965 	if (!found) {
966 		/*
967 		 *	Create a new entry if allowable
968 		 */
969 
970 		if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
971 		    (c = ipmr_cache_alloc_unres()) == NULL) {
972 			spin_unlock_bh(&mfc_unres_lock);
973 
974 			kfree_skb(skb);
975 			return -ENOBUFS;
976 		}
977 
978 		/*
979 		 *	Fill in the new cache entry
980 		 */
981 		c->mfc_parent	= -1;
982 		c->mfc_origin	= iph->saddr;
983 		c->mfc_mcastgrp	= iph->daddr;
984 
985 		/*
986 		 *	Reflect first query at mrouted.
987 		 */
988 		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
989 		if (err < 0) {
990 			/* If the report failed throw the cache entry
991 			   out - Brad Parker
992 			 */
993 			spin_unlock_bh(&mfc_unres_lock);
994 
995 			ipmr_cache_free(c);
996 			kfree_skb(skb);
997 			return err;
998 		}
999 
1000 		atomic_inc(&mrt->cache_resolve_queue_len);
1001 		list_add(&c->list, &mrt->mfc_unres_queue);
1002 
1003 		if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
1004 			mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
1005 	}
1006 
1007 	/*
1008 	 *	See if we can append the packet
1009 	 */
1010 	if (c->mfc_un.unres.unresolved.qlen>3) {
1011 		kfree_skb(skb);
1012 		err = -ENOBUFS;
1013 	} else {
1014 		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
1015 		err = 0;
1016 	}
1017 
1018 	spin_unlock_bh(&mfc_unres_lock);
1019 	return err;
1020 }
1021 
1022 /*
1023  *	MFC cache manipulation by user space mroute daemon
1024  */
1025 
1026 static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
1027 {
1028 	int line;
1029 	struct mfc_cache *c, *next;
1030 
1031 	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
1032 
1033 	list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
1034 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1035 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1036 			write_lock_bh(&mrt_lock);
1037 			list_del(&c->list);
1038 			write_unlock_bh(&mrt_lock);
1039 
1040 			ipmr_cache_free(c);
1041 			return 0;
1042 		}
1043 	}
1044 	return -ENOENT;
1045 }
1046 
1047 static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1048 			struct mfcctl *mfc, int mrtsock)
1049 {
1050 	bool found = false;
1051 	int line;
1052 	struct mfc_cache *uc, *c;
1053 
1054 	if (mfc->mfcc_parent >= MAXVIFS)
1055 		return -ENFILE;
1056 
1057 	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
1058 
1059 	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
1060 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1061 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1062 			found = true;
1063 			break;
1064 		}
1065 	}
1066 
1067 	if (found) {
1068 		write_lock_bh(&mrt_lock);
1069 		c->mfc_parent = mfc->mfcc_parent;
1070 		ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
1071 		if (!mrtsock)
1072 			c->mfc_flags |= MFC_STATIC;
1073 		write_unlock_bh(&mrt_lock);
1074 		return 0;
1075 	}
1076 
1077 	if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
1078 		return -EINVAL;
1079 
1080 	c = ipmr_cache_alloc();
1081 	if (c == NULL)
1082 		return -ENOMEM;
1083 
1084 	c->mfc_origin = mfc->mfcc_origin.s_addr;
1085 	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
1086 	c->mfc_parent = mfc->mfcc_parent;
1087 	ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
1088 	if (!mrtsock)
1089 		c->mfc_flags |= MFC_STATIC;
1090 
1091 	write_lock_bh(&mrt_lock);
1092 	list_add(&c->list, &mrt->mfc_cache_array[line]);
1093 	write_unlock_bh(&mrt_lock);
1094 
1095 	/*
1096 	 *	Check to see if we resolved a queued (unresolved) entry. If so
1097 	 *	we need to send on the queued frames and tidy up.
1098 	 */
1099 	found = false;
1100 	spin_lock_bh(&mfc_unres_lock);
1101 	list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
1102 		if (uc->mfc_origin == c->mfc_origin &&
1103 		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
1104 			list_del(&uc->list);
1105 			atomic_dec(&mrt->cache_resolve_queue_len);
1106 			found = true;
1107 			break;
1108 		}
1109 	}
1110 	if (list_empty(&mrt->mfc_unres_queue))
1111 		del_timer(&mrt->ipmr_expire_timer);
1112 	spin_unlock_bh(&mfc_unres_lock);
1113 
1114 	if (found) {
1115 		ipmr_cache_resolve(net, mrt, uc, c);
1116 		ipmr_cache_free(uc);
1117 	}
1118 	return 0;
1119 }
1120 
1121 /*
1122  *	Close the multicast socket, and clear the vif tables etc
1123  */
1124 
1125 static void mroute_clean_tables(struct mr_table *mrt)
1126 {
1127 	int i;
1128 	LIST_HEAD(list);
1129 	struct mfc_cache *c, *next;
1130 
1131 	/*
1132 	 *	Shut down all active vif entries
1133 	 */
1134 	for (i = 0; i < mrt->maxvif; i++) {
1135 		if (!(mrt->vif_table[i].flags&VIFF_STATIC))
1136 			vif_delete(mrt, i, 0, &list);
1137 	}
1138 	unregister_netdevice_many(&list);
1139 
1140 	/*
1141 	 *	Wipe the cache
1142 	 */
1143 	for (i = 0; i < MFC_LINES; i++) {
1144 		list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
1145 			if (c->mfc_flags&MFC_STATIC)
1146 				continue;
1147 			write_lock_bh(&mrt_lock);
1148 			list_del(&c->list);
1149 			write_unlock_bh(&mrt_lock);
1150 
1151 			ipmr_cache_free(c);
1152 		}
1153 	}
1154 
1155 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
1156 		spin_lock_bh(&mfc_unres_lock);
1157 		list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
1158 			list_del(&c->list);
1159 			ipmr_destroy_unres(mrt, c);
1160 		}
1161 		spin_unlock_bh(&mfc_unres_lock);
1162 	}
1163 }
1164 
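/* Destructor for the mroute socket: disable multicast forwarding and flush
 * the VIF and MFC state that the socket controlled.
 */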
1165 static void mrtsock_destruct(struct sock *sk)
1166 {
1167 	struct net *net = sock_net(sk);
1168 	struct mr_table *mrt;
1169 
1170 	rtnl_lock();
1171 	ipmr_for_each_table(mrt, net) {
1172 		if (sk == mrt->mroute_sk) {
1173 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1174 
1175 			write_lock_bh(&mrt_lock);
1176 			mrt->mroute_sk = NULL;
1177 			write_unlock_bh(&mrt_lock);
1178 
1179 			mroute_clean_tables(mrt);
1180 		}
1181 	}
1182 	rtnl_unlock();
1183 }
1184 
1185 /*
1186  *	Socket options and virtual interface manipulation. The whole
1187  *	virtual interface system is a complete heap, but unfortunately
1188  *	that's how BSD mrouted happens to think. Maybe one day with a proper
1189  *	MOSPF/PIM router set up we can clean this up.
1190  */
1191 
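/* For illustration only: a routing daemon typically enables forwarding on a
 * raw IGMP socket with
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	int one = 1;
 *	setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 * and tears it down again with MRT_DONE; the MRT_INIT checks below enforce
 * exactly this calling convention.
 */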
1192 int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
1193 {
1194 	int ret;
1195 	struct vifctl vif;
1196 	struct mfcctl mfc;
1197 	struct net *net = sock_net(sk);
1198 	struct mr_table *mrt;
1199 
1200 	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1201 	if (mrt == NULL)
1202 		return -ENOENT;
1203 
1204 	if (optname != MRT_INIT) {
1205 		if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
1206 			return -EACCES;
1207 	}
1208 
1209 	switch (optname) {
1210 	case MRT_INIT:
1211 		if (sk->sk_type != SOCK_RAW ||
1212 		    inet_sk(sk)->inet_num != IPPROTO_IGMP)
1213 			return -EOPNOTSUPP;
1214 		if (optlen != sizeof(int))
1215 			return -ENOPROTOOPT;
1216 
1217 		rtnl_lock();
1218 		if (mrt->mroute_sk) {
1219 			rtnl_unlock();
1220 			return -EADDRINUSE;
1221 		}
1222 
1223 		ret = ip_ra_control(sk, 1, mrtsock_destruct);
1224 		if (ret == 0) {
1225 			write_lock_bh(&mrt_lock);
1226 			mrt->mroute_sk = sk;
1227 			write_unlock_bh(&mrt_lock);
1228 
1229 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1230 		}
1231 		rtnl_unlock();
1232 		return ret;
1233 	case MRT_DONE:
1234 		if (sk != mrt->mroute_sk)
1235 			return -EACCES;
1236 		return ip_ra_control(sk, 0, NULL);
1237 	case MRT_ADD_VIF:
1238 	case MRT_DEL_VIF:
1239 		if (optlen != sizeof(vif))
1240 			return -EINVAL;
1241 		if (copy_from_user(&vif, optval, sizeof(vif)))
1242 			return -EFAULT;
1243 		if (vif.vifc_vifi >= MAXVIFS)
1244 			return -ENFILE;
1245 		rtnl_lock();
1246 		if (optname == MRT_ADD_VIF) {
1247 			ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
1248 		} else {
1249 			ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
1250 		}
1251 		rtnl_unlock();
1252 		return ret;
1253 
1254 		/*
1255 		 *	Manipulate the forwarding caches. These live
1256 		 *	in a sort of kernel/user symbiosis.
1257 		 */
1258 	case MRT_ADD_MFC:
1259 	case MRT_DEL_MFC:
1260 		if (optlen != sizeof(mfc))
1261 			return -EINVAL;
1262 		if (copy_from_user(&mfc, optval, sizeof(mfc)))
1263 			return -EFAULT;
1264 		rtnl_lock();
1265 		if (optname == MRT_DEL_MFC)
1266 			ret = ipmr_mfc_delete(mrt, &mfc);
1267 		else
1268 			ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
1269 		rtnl_unlock();
1270 		return ret;
1271 		/*
1272 		 *	Control PIM assert.
1273 		 */
1274 	case MRT_ASSERT:
1275 	{
1276 		int v;
1277 		if (get_user(v,(int __user *)optval))
1278 			return -EFAULT;
1279 		mrt->mroute_do_assert = (v) ? 1 : 0;
1280 		return 0;
1281 	}
1282 #ifdef CONFIG_IP_PIMSM
1283 	case MRT_PIM:
1284 	{
1285 		int v;
1286 
1287 		if (get_user(v,(int __user *)optval))
1288 			return -EFAULT;
1289 		v = (v) ? 1 : 0;
1290 
1291 		rtnl_lock();
1292 		ret = 0;
1293 		if (v != mrt->mroute_do_pim) {
1294 			mrt->mroute_do_pim = v;
1295 			mrt->mroute_do_assert = v;
1296 		}
1297 		rtnl_unlock();
1298 		return ret;
1299 	}
1300 #endif
1301 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
1302 	case MRT_TABLE:
1303 	{
1304 		u32 v;
1305 
1306 		if (optlen != sizeof(u32))
1307 			return -EINVAL;
1308 		if (get_user(v, (u32 __user *)optval))
1309 			return -EFAULT;
1310 		if (sk == mrt->mroute_sk)
1311 			return -EBUSY;
1312 
1313 		rtnl_lock();
1314 		ret = 0;
1315 		if (!ipmr_new_table(net, v))
1316 			ret = -ENOMEM;
1317 		raw_sk(sk)->ipmr_table = v;
1318 		rtnl_unlock();
1319 		return ret;
1320 	}
1321 #endif
1322 	/*
1323 	 *	Spurious command, or MRT_VERSION which you cannot
1324 	 *	set.
1325 	 */
1326 	default:
1327 		return -ENOPROTOOPT;
1328 	}
1329 }
1330 
1331 /*
1332  *	Getsock opt support for the multicast routing system.
1333  */
1334 
1335 int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1336 {
1337 	int olr;
1338 	int val;
1339 	struct net *net = sock_net(sk);
1340 	struct mr_table *mrt;
1341 
1342 	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1343 	if (mrt == NULL)
1344 		return -ENOENT;
1345 
1346 	if (optname != MRT_VERSION &&
1347 #ifdef CONFIG_IP_PIMSM
1348 	   optname!=MRT_PIM &&
1349 #endif
1350 	   optname!=MRT_ASSERT)
1351 		return -ENOPROTOOPT;
1352 
1353 	if (get_user(olr, optlen))
1354 		return -EFAULT;
1355 
1356 	if (olr < 0)
1357 		return -EINVAL;
1358 	olr = min_t(unsigned int, olr, sizeof(int));
1359 
1360 	if (put_user(olr, optlen))
1361 		return -EFAULT;
1362 	if (optname == MRT_VERSION)
1363 		val = 0x0305;
1364 #ifdef CONFIG_IP_PIMSM
1365 	else if (optname == MRT_PIM)
1366 		val = mrt->mroute_do_pim;
1367 #endif
1368 	else
1369 		val = mrt->mroute_do_assert;
1370 	if (copy_to_user(optval, &val, olr))
1371 		return -EFAULT;
1372 	return 0;
1373 }
1374 
1375 /*
1376  *	The IP multicast ioctl support routines.
1377  */
1378 
1379 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1380 {
1381 	struct sioc_sg_req sr;
1382 	struct sioc_vif_req vr;
1383 	struct vif_device *vif;
1384 	struct mfc_cache *c;
1385 	struct net *net = sock_net(sk);
1386 	struct mr_table *mrt;
1387 
1388 	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1389 	if (mrt == NULL)
1390 		return -ENOENT;
1391 
1392 	switch (cmd) {
1393 	case SIOCGETVIFCNT:
1394 		if (copy_from_user(&vr, arg, sizeof(vr)))
1395 			return -EFAULT;
1396 		if (vr.vifi >= mrt->maxvif)
1397 			return -EINVAL;
1398 		read_lock(&mrt_lock);
1399 		vif = &mrt->vif_table[vr.vifi];
1400 		if (VIF_EXISTS(mrt, vr.vifi)) {
1401 			vr.icount = vif->pkt_in;
1402 			vr.ocount = vif->pkt_out;
1403 			vr.ibytes = vif->bytes_in;
1404 			vr.obytes = vif->bytes_out;
1405 			read_unlock(&mrt_lock);
1406 
1407 			if (copy_to_user(arg, &vr, sizeof(vr)))
1408 				return -EFAULT;
1409 			return 0;
1410 		}
1411 		read_unlock(&mrt_lock);
1412 		return -EADDRNOTAVAIL;
1413 	case SIOCGETSGCNT:
1414 		if (copy_from_user(&sr, arg, sizeof(sr)))
1415 			return -EFAULT;
1416 
1417 		read_lock(&mrt_lock);
1418 		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1419 		if (c) {
1420 			sr.pktcnt = c->mfc_un.res.pkt;
1421 			sr.bytecnt = c->mfc_un.res.bytes;
1422 			sr.wrong_if = c->mfc_un.res.wrong_if;
1423 			read_unlock(&mrt_lock);
1424 
1425 			if (copy_to_user(arg, &sr, sizeof(sr)))
1426 				return -EFAULT;
1427 			return 0;
1428 		}
1429 		read_unlock(&mrt_lock);
1430 		return -EADDRNOTAVAIL;
1431 	default:
1432 		return -ENOIOCTLCMD;
1433 	}
1434 }
1435 
1436 
1437 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1438 {
1439 	struct net_device *dev = ptr;
1440 	struct net *net = dev_net(dev);
1441 	struct mr_table *mrt;
1442 	struct vif_device *v;
1443 	int ct;
1444 	LIST_HEAD(list);
1445 
1446 	if (event != NETDEV_UNREGISTER)
1447 		return NOTIFY_DONE;
1448 
1449 	ipmr_for_each_table(mrt, net) {
1450 		v = &mrt->vif_table[0];
1451 		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1452 			if (v->dev == dev)
1453 				vif_delete(mrt, ct, 1, &list);
1454 		}
1455 	}
1456 	unregister_netdevice_many(&list);
1457 	return NOTIFY_DONE;
1458 }
1459 
1460 
1461 static struct notifier_block ip_mr_notifier = {
1462 	.notifier_call = ipmr_device_event,
1463 };
1464 
1465 /*
1466  * 	Encapsulate a packet by attaching a valid IPIP header to it.
1467  *	This avoids tunnel drivers and other mess and gives us the speed so
1468  *	important for multicast video.
1469  */
1470 
1471 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1472 {
1473 	struct iphdr *iph;
1474 	struct iphdr *old_iph = ip_hdr(skb);
1475 
1476 	skb_push(skb, sizeof(struct iphdr));
1477 	skb->transport_header = skb->network_header;
1478 	skb_reset_network_header(skb);
1479 	iph = ip_hdr(skb);
1480 
1481 	iph->version	= 	4;
1482 	iph->tos	=	old_iph->tos;
1483 	iph->ttl	=	old_iph->ttl;
1484 	iph->frag_off	=	0;
1485 	iph->daddr	=	daddr;
1486 	iph->saddr	=	saddr;
1487 	iph->protocol	=	IPPROTO_IPIP;
1488 	iph->ihl	=	5;
1489 	iph->tot_len	=	htons(skb->len);
1490 	ip_select_ident(iph, skb_dst(skb), NULL);
1491 	ip_send_check(iph);
1492 
1493 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1494 	nf_reset(skb);
1495 }
1496 
1497 static inline int ipmr_forward_finish(struct sk_buff *skb)
1498 {
1499 	struct ip_options * opt	= &(IPCB(skb)->opt);
1500 
1501 	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1502 
1503 	if (unlikely(opt->optlen))
1504 		ip_forward_options(skb);
1505 
1506 	return dst_output(skb);
1507 }
1508 
1509 /*
1510  *	Processing handlers for ipmr_forward
1511  */
1512 
1513 static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1514 			    struct sk_buff *skb, struct mfc_cache *c, int vifi)
1515 {
1516 	const struct iphdr *iph = ip_hdr(skb);
1517 	struct vif_device *vif = &mrt->vif_table[vifi];
1518 	struct net_device *dev;
1519 	struct rtable *rt;
1520 	int    encap = 0;
1521 
1522 	if (vif->dev == NULL)
1523 		goto out_free;
1524 
1525 #ifdef CONFIG_IP_PIMSM
1526 	if (vif->flags & VIFF_REGISTER) {
1527 		vif->pkt_out++;
1528 		vif->bytes_out += skb->len;
1529 		vif->dev->stats.tx_bytes += skb->len;
1530 		vif->dev->stats.tx_packets++;
1531 		ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
1532 		goto out_free;
1533 	}
1534 #endif
1535 
1536 	if (vif->flags&VIFF_TUNNEL) {
1537 		struct flowi fl = { .oif = vif->link,
1538 				    .nl_u = { .ip4_u =
1539 					      { .daddr = vif->remote,
1540 						.saddr = vif->local,
1541 						.tos = RT_TOS(iph->tos) } },
1542 				    .proto = IPPROTO_IPIP };
1543 		if (ip_route_output_key(net, &rt, &fl))
1544 			goto out_free;
1545 		encap = sizeof(struct iphdr);
1546 	} else {
1547 		struct flowi fl = { .oif = vif->link,
1548 				    .nl_u = { .ip4_u =
1549 					      { .daddr = iph->daddr,
1550 						.tos = RT_TOS(iph->tos) } },
1551 				    .proto = IPPROTO_IPIP };
1552 		if (ip_route_output_key(net, &rt, &fl))
1553 			goto out_free;
1554 	}
1555 
1556 	dev = rt->u.dst.dev;
1557 
1558 	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1559 		/* Do not fragment multicasts. Alas, IPv4 does not
1560 		   allow us to send an ICMP error here, so the packets
1561 		   will simply disappear into a black hole.
1562 		 */
1563 
1564 		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1565 		ip_rt_put(rt);
1566 		goto out_free;
1567 	}
1568 
1569 	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1570 
1571 	if (skb_cow(skb, encap)) {
1572 		ip_rt_put(rt);
1573 		goto out_free;
1574 	}
1575 
1576 	vif->pkt_out++;
1577 	vif->bytes_out += skb->len;
1578 
1579 	skb_dst_drop(skb);
1580 	skb_dst_set(skb, &rt->u.dst);
1581 	ip_decrease_ttl(ip_hdr(skb));
1582 
1583 	/* FIXME: forward and output firewalls used to be called here.
1584 	 * What do we do with netfilter? -- RR */
1585 	if (vif->flags & VIFF_TUNNEL) {
1586 		ip_encap(skb, vif->local, vif->remote);
1587 		/* FIXME: extra output firewall step used to be here. --RR */
1588 		vif->dev->stats.tx_packets++;
1589 		vif->dev->stats.tx_bytes += skb->len;
1590 	}
1591 
1592 	IPCB(skb)->flags |= IPSKB_FORWARDED;
1593 
1594 	/*
1595 	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1596 	 * not only before forwarding, but also after forwarding on all output
1597 	 * interfaces. Clearly, if the mrouter runs a multicasting
1598 	 * program, that program should receive packets regardless of the
1599 	 * interface it is joined on.
1600 	 * If we do not do this, the program would have to join on all
1601 	 * interfaces. On the other hand, a multihomed host (or router, but
1602 	 * not mrouter) cannot join on more than one interface - it would
1603 	 * result in receiving multiple copies of the same packet.
1604 	 */
1605 	NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
1606 		ipmr_forward_finish);
1607 	return;
1608 
1609 out_free:
1610 	kfree_skb(skb);
1611 }
1612 
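/* Map a device to its VIF index in this table, or return -1 if it has none. */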
1613 static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
1614 {
1615 	int ct;
1616 
1617 	for (ct = mrt->maxvif-1; ct >= 0; ct--) {
1618 		if (mrt->vif_table[ct].dev == dev)
1619 			break;
1620 	}
1621 	return ct;
1622 }
1623 
1624 /* "local" means that we should preserve one skb (for local delivery) */
1625 
1626 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1627 			 struct sk_buff *skb, struct mfc_cache *cache,
1628 			 int local)
1629 {
1630 	int psend = -1;
1631 	int vif, ct;
1632 
1633 	vif = cache->mfc_parent;
1634 	cache->mfc_un.res.pkt++;
1635 	cache->mfc_un.res.bytes += skb->len;
1636 
1637 	/*
1638 	 * Wrong interface: drop packet and (maybe) send PIM assert.
1639 	 */
1640 	if (mrt->vif_table[vif].dev != skb->dev) {
1641 		int true_vifi;
1642 
1643 		if (skb_rtable(skb)->fl.iif == 0) {
1644 			/* It is our own packet, looped back.
1645 			   A very complicated situation...
1646 
1647 			   The best workaround, until the routing daemons
1648 			   are fixed, is not to redistribute a packet if it
1649 			   was sent through the wrong interface. This means
1650 			   that multicast applications WILL NOT work for
1651 			   (S,G) entries whose default multicast route points
1652 			   to the wrong oif. In any case, it is not a good
1653 			   idea to run multicasting applications on a router.
1654 			 */
1655 			goto dont_forward;
1656 		}
1657 
1658 		cache->mfc_un.res.wrong_if++;
1659 		true_vifi = ipmr_find_vif(mrt, skb->dev);
1660 
1661 		if (true_vifi >= 0 && mrt->mroute_do_assert &&
1662 		    /* pimsm uses asserts, when switching from RPT to SPT,
1663 		       so that we cannot check that packet arrived on an oif.
1664 		       It is bad, but otherwise we would need to move pretty
1665 		       large chunk of pimd to kernel. Ough... --ANK
1666 		     */
1667 		    (mrt->mroute_do_pim ||
1668 		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
1669 		    time_after(jiffies,
1670 			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1671 			cache->mfc_un.res.last_assert = jiffies;
1672 			ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
1673 		}
1674 		goto dont_forward;
1675 	}
1676 
1677 	mrt->vif_table[vif].pkt_in++;
1678 	mrt->vif_table[vif].bytes_in += skb->len;
1679 
1680 	/*
1681 	 *	Forward the frame
1682 	 */
1683 	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1684 		if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1685 			if (psend != -1) {
1686 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1687 				if (skb2)
1688 					ipmr_queue_xmit(net, mrt, skb2, cache,
1689 							psend);
1690 			}
1691 			psend = ct;
1692 		}
1693 	}
1694 	if (psend != -1) {
1695 		if (local) {
1696 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1697 			if (skb2)
1698 				ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1699 		} else {
1700 			ipmr_queue_xmit(net, mrt, skb, cache, psend);
1701 			return 0;
1702 		}
1703 	}
1704 
1705 dont_forward:
1706 	if (!local)
1707 		kfree_skb(skb);
1708 	return 0;
1709 }
1710 
1711 
1712 /*
1713  *	Multicast packets for forwarding arrive here
1714  */
1715 
1716 int ip_mr_input(struct sk_buff *skb)
1717 {
1718 	struct mfc_cache *cache;
1719 	struct net *net = dev_net(skb->dev);
1720 	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1721 	struct mr_table *mrt;
1722 	int err;
1723 
1724 	/* The packet is looped back after forwarding; it should not be
1725 	   forwarded a second time, but it can still be delivered locally.
1726 	 */
1727 	if (IPCB(skb)->flags&IPSKB_FORWARDED)
1728 		goto dont_forward;
1729 
1730 	err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
1731 	if (err < 0)
1732 		return err;
1733 
1734 	if (!local) {
1735 		if (IPCB(skb)->opt.router_alert) {
1736 			if (ip_call_ra_chain(skb))
1737 				return 0;
1738 		} else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1739 			/* IGMPv1 (and broken IGMPv2 implementations such as
1740 			   Cisco IOS <= 11.2(8)) do not put the router alert
1741 			   option in IGMP packets destined to routable
1742 			   groups. This is very bad, because it means
1743 			   that we can forward NO IGMP messages.
1744 			 */
1745 			read_lock(&mrt_lock);
1746 			if (mrt->mroute_sk) {
1747 				nf_reset(skb);
1748 				raw_rcv(mrt->mroute_sk, skb);
1749 				read_unlock(&mrt_lock);
1750 				return 0;
1751 			}
1752 			read_unlock(&mrt_lock);
1753 		}
1754 	}
1755 
1756 	read_lock(&mrt_lock);
1757 	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1758 
1759 	/*
1760 	 *	No usable cache entry
1761 	 */
1762 	if (cache == NULL) {
1763 		int vif;
1764 
1765 		if (local) {
1766 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1767 			ip_local_deliver(skb);
1768 			if (skb2 == NULL) {
1769 				read_unlock(&mrt_lock);
1770 				return -ENOBUFS;
1771 			}
1772 			skb = skb2;
1773 		}
1774 
1775 		vif = ipmr_find_vif(mrt, skb->dev);
1776 		if (vif >= 0) {
1777 			int err2 = ipmr_cache_unresolved(mrt, vif, skb);
1778 			read_unlock(&mrt_lock);
1779 
1780 			return err2;
1781 		}
1782 		read_unlock(&mrt_lock);
1783 		kfree_skb(skb);
1784 		return -ENODEV;
1785 	}
1786 
1787 	ip_mr_forward(net, mrt, skb, cache, local);
1788 
1789 	read_unlock(&mrt_lock);
1790 
1791 	if (local)
1792 		return ip_local_deliver(skb);
1793 
1794 	return 0;
1795 
1796 dont_forward:
1797 	if (local)
1798 		return ip_local_deliver(skb);
1799 	kfree_skb(skb);
1800 	return 0;
1801 }
1802 
1803 #ifdef CONFIG_IP_PIMSM
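/* Common PIM register handling for PIMv1 and PIMv2: sanity-check the inner
 * packet, strip the outer IP/PIM headers and feed it back into the stack
 * through the register VIF device.
 */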
1804 static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1805 		     unsigned int pimlen)
1806 {
1807 	struct net_device *reg_dev = NULL;
1808 	struct iphdr *encap;
1809 
1810 	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1811 	/*
1812 	   Check that:
1813 	   a. packet is really destined to a multicast group
1814 	   b. packet is not a NULL-REGISTER
1815 	   c. packet is not truncated
1816 	 */
1817 	if (!ipv4_is_multicast(encap->daddr) ||
1818 	    encap->tot_len == 0 ||
1819 	    ntohs(encap->tot_len) + pimlen > skb->len)
1820 		return 1;
1821 
1822 	read_lock(&mrt_lock);
1823 	if (mrt->mroute_reg_vif_num >= 0)
1824 		reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1825 	if (reg_dev)
1826 		dev_hold(reg_dev);
1827 	read_unlock(&mrt_lock);
1828 
1829 	if (reg_dev == NULL)
1830 		return 1;
1831 
1832 	skb->mac_header = skb->network_header;
1833 	skb_pull(skb, (u8*)encap - skb->data);
1834 	skb_reset_network_header(skb);
1835 	skb->protocol = htons(ETH_P_IP);
1836 	skb->ip_summed = CHECKSUM_NONE;
1837 	skb->pkt_type = PACKET_HOST;
1838 
1839 	skb_tunnel_rx(skb, reg_dev);
1840 
1841 	netif_rx(skb);
1842 	dev_put(reg_dev);
1843 
1844 	return 0;
1845 }
1846 #endif
1847 
1848 #ifdef CONFIG_IP_PIMSM_V1
1849 /*
1850  * Handle IGMP messages of PIMv1
1851  */
1852 
1853 int pim_rcv_v1(struct sk_buff * skb)
1854 {
1855 	struct igmphdr *pim;
1856 	struct net *net = dev_net(skb->dev);
1857 	struct mr_table *mrt;
1858 
1859 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1860 		goto drop;
1861 
1862 	pim = igmp_hdr(skb);
1863 
1864 	if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1865 		goto drop;
1866 
1867 	if (!mrt->mroute_do_pim ||
1868 	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1869 		goto drop;
1870 
1871 	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1872 drop:
1873 		kfree_skb(skb);
1874 	}
1875 	return 0;
1876 }
1877 #endif
1878 
1879 #ifdef CONFIG_IP_PIMSM_V2
1880 static int pim_rcv(struct sk_buff * skb)
1881 {
1882 	struct pimreghdr *pim;
1883 	struct net *net = dev_net(skb->dev);
1884 	struct mr_table *mrt;
1885 
1886 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1887 		goto drop;
1888 
1889 	pim = (struct pimreghdr *)skb_transport_header(skb);
1890 	if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1891 	    (pim->flags&PIM_NULL_REGISTER) ||
1892 	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1893 	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1894 		goto drop;
1895 
1896 	if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1897 		goto drop;
1898 
1899 	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1900 drop:
1901 		kfree_skb(skb);
1902 	}
1903 	return 0;
1904 }
1905 #endif
1906 
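/* Append the RTA_IIF and RTA_MULTIPATH (oif list) attributes describing an
 * MFC entry to a netlink message.
 */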
1907 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
1908 			      struct mfc_cache *c, struct rtmsg *rtm)
1909 {
1910 	int ct;
1911 	struct rtnexthop *nhp;
1912 	u8 *b = skb_tail_pointer(skb);
1913 	struct rtattr *mp_head;
1914 
1915 	/* If cache is unresolved, don't try to parse IIF and OIF */
1916 	if (c->mfc_parent >= MAXVIFS)
1917 		return -ENOENT;
1918 
1919 	if (VIF_EXISTS(mrt, c->mfc_parent))
1920 		RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
1921 
1922 	mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1923 
1924 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1925 		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
1926 			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1927 				goto rtattr_failure;
1928 			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1929 			nhp->rtnh_flags = 0;
1930 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1931 			nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
1932 			nhp->rtnh_len = sizeof(*nhp);
1933 		}
1934 	}
1935 	mp_head->rta_type = RTA_MULTIPATH;
1936 	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1937 	rtm->rtm_type = RTN_MULTICAST;
1938 	return 1;
1939 
1940 rtattr_failure:
1941 	nlmsg_trim(skb, b);
1942 	return -EMSGSIZE;
1943 }
1944 
1945 int ipmr_get_route(struct net *net,
1946 		   struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1947 {
1948 	int err;
1949 	struct mr_table *mrt;
1950 	struct mfc_cache *cache;
1951 	struct rtable *rt = skb_rtable(skb);
1952 
1953 	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
1954 	if (mrt == NULL)
1955 		return -ENOENT;
1956 
1957 	read_lock(&mrt_lock);
1958 	cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
1959 
1960 	if (cache == NULL) {
1961 		struct sk_buff *skb2;
1962 		struct iphdr *iph;
1963 		struct net_device *dev;
1964 		int vif;
1965 
1966 		if (nowait) {
1967 			read_unlock(&mrt_lock);
1968 			return -EAGAIN;
1969 		}
1970 
1971 		dev = skb->dev;
1972 		if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
1973 			read_unlock(&mrt_lock);
1974 			return -ENODEV;
1975 		}
1976 		skb2 = skb_clone(skb, GFP_ATOMIC);
1977 		if (!skb2) {
1978 			read_unlock(&mrt_lock);
1979 			return -ENOMEM;
1980 		}
1981 
1982 		skb_push(skb2, sizeof(struct iphdr));
1983 		skb_reset_network_header(skb2);
1984 		iph = ip_hdr(skb2);
1985 		iph->ihl = sizeof(struct iphdr) >> 2;
1986 		iph->saddr = rt->rt_src;
1987 		iph->daddr = rt->rt_dst;
1988 		iph->version = 0;
1989 		err = ipmr_cache_unresolved(mrt, vif, skb2);
1990 		read_unlock(&mrt_lock);
1991 		return err;
1992 	}
1993 
1994 	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1995 		cache->mfc_flags |= MFC_NOTIFY;
1996 	err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
1997 	read_unlock(&mrt_lock);
1998 	return err;
1999 }
2000 
2001 static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2002 			    u32 pid, u32 seq, struct mfc_cache *c)
2003 {
2004 	struct nlmsghdr *nlh;
2005 	struct rtmsg *rtm;
2006 
2007 	nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2008 	if (nlh == NULL)
2009 		return -EMSGSIZE;
2010 
2011 	rtm = nlmsg_data(nlh);
2012 	rtm->rtm_family   = RTNL_FAMILY_IPMR;
2013 	rtm->rtm_dst_len  = 32;
2014 	rtm->rtm_src_len  = 32;
2015 	rtm->rtm_tos      = 0;
2016 	rtm->rtm_table    = mrt->id;
2017 	NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
2018 	rtm->rtm_type     = RTN_MULTICAST;
2019 	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
2020 	rtm->rtm_protocol = RTPROT_UNSPEC;
2021 	rtm->rtm_flags    = 0;
2022 
2023 	NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
2024 	NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
2025 
2026 	if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
2027 		goto nla_put_failure;
2028 
2029 	return nlmsg_end(skb, nlh);
2030 
2031 nla_put_failure:
2032 	nlmsg_cancel(skb, nlh);
2033 	return -EMSGSIZE;
2034 }
2035 
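/* Netlink dump callback: walk every table and every MFC hash line, resuming
 * from the table/hash/entry position saved in cb->args.
 */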
2036 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2037 {
2038 	struct net *net = sock_net(skb->sk);
2039 	struct mr_table *mrt;
2040 	struct mfc_cache *mfc;
2041 	unsigned int t = 0, s_t;
2042 	unsigned int h = 0, s_h;
2043 	unsigned int e = 0, s_e;
2044 
2045 	s_t = cb->args[0];
2046 	s_h = cb->args[1];
2047 	s_e = cb->args[2];
2048 
2049 	read_lock(&mrt_lock);
2050 	ipmr_for_each_table(mrt, net) {
2051 		if (t < s_t)
2052 			goto next_table;
2053 		if (t > s_t)
2054 			s_h = 0;
2055 		for (h = s_h; h < MFC_LINES; h++) {
2056 			list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
2057 				if (e < s_e)
2058 					goto next_entry;
2059 				if (ipmr_fill_mroute(mrt, skb,
2060 						     NETLINK_CB(cb->skb).pid,
2061 						     cb->nlh->nlmsg_seq,
2062 						     mfc) < 0)
2063 					goto done;
2064 next_entry:
2065 				e++;
2066 			}
2067 			e = s_e = 0;
2068 		}
2069 		s_h = 0;
2070 next_table:
2071 		t++;
2072 	}
2073 done:
2074 	read_unlock(&mrt_lock);
2075 
2076 	cb->args[2] = e;
2077 	cb->args[1] = h;
2078 	cb->args[0] = t;
2079 
2080 	return skb->len;
2081 }
2082 
2083 #ifdef CONFIG_PROC_FS
2084 /*
2085  *	The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
2086  */
2087 struct ipmr_vif_iter {
2088 	struct seq_net_private p;
2089 	struct mr_table *mrt;
2090 	int ct;
2091 };
2092 
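/*
 *	Return the pos'th existing vif of the table, or NULL once the
 *	walk is exhausted.
 */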
2093 static struct vif_device *ipmr_vif_seq_idx(struct net *net,
2094 					   struct ipmr_vif_iter *iter,
2095 					   loff_t pos)
2096 {
2097 	struct mr_table *mrt = iter->mrt;
2098 
2099 	for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
2100 		if (!VIF_EXISTS(mrt, iter->ct))
2101 			continue;
2102 		if (pos-- == 0)
2103 			return &mrt->vif_table[iter->ct];
2104 	}
2105 	return NULL;
2106 }
2107 
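/*
 *	Begin a walk of the vif table for /proc/net/ip_mr_vif.  mrt_lock
 *	is taken here and released in ipmr_vif_seq_stop().
 */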
2108 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
2109 	__acquires(mrt_lock)
2110 {
2111 	struct ipmr_vif_iter *iter = seq->private;
2112 	struct net *net = seq_file_net(seq);
2113 	struct mr_table *mrt;
2114 
2115 	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2116 	if (mrt == NULL)
2117 		return ERR_PTR(-ENOENT);
2118 
2119 	iter->mrt = mrt;
2120 
2121 	read_lock(&mrt_lock);
2122 	return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
2123 		: SEQ_START_TOKEN;
2124 }
2125 
2126 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2127 {
2128 	struct ipmr_vif_iter *iter = seq->private;
2129 	struct net *net = seq_file_net(seq);
2130 	struct mr_table *mrt = iter->mrt;
2131 
2132 	++*pos;
2133 	if (v == SEQ_START_TOKEN)
2134 		return ipmr_vif_seq_idx(net, iter, 0);
2135 
2136 	while (++iter->ct < mrt->maxvif) {
2137 		if (!VIF_EXISTS(mrt, iter->ct))
2138 			continue;
2139 		return &mrt->vif_table[iter->ct];
2140 	}
2141 	return NULL;
2142 }
2143 
2144 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
2145 	__releases(mrt_lock)
2146 {
2147 	read_unlock(&mrt_lock);
2148 }
2149 
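/*
 *	Emit the /proc/net/ip_mr_vif header line, or one line per vif.
 */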
2150 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
2151 {
2152 	struct ipmr_vif_iter *iter = seq->private;
2153 	struct mr_table *mrt = iter->mrt;
2154 
2155 	if (v == SEQ_START_TOKEN) {
2156 		seq_puts(seq,
2157 			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
2158 	} else {
2159 		const struct vif_device *vif = v;
2160 		const char *name = vif->dev ? vif->dev->name : "none";
2161 
2162 		seq_printf(seq,
2163 			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
2164 			   vif - mrt->vif_table,
2165 			   name, vif->bytes_in, vif->pkt_in,
2166 			   vif->bytes_out, vif->pkt_out,
2167 			   vif->flags, vif->local, vif->remote);
2168 	}
2169 	return 0;
2170 }
2171 
2172 static const struct seq_operations ipmr_vif_seq_ops = {
2173 	.start = ipmr_vif_seq_start,
2174 	.next  = ipmr_vif_seq_next,
2175 	.stop  = ipmr_vif_seq_stop,
2176 	.show  = ipmr_vif_seq_show,
2177 };
2178 
2179 static int ipmr_vif_open(struct inode *inode, struct file *file)
2180 {
2181 	return seq_open_net(inode, file, &ipmr_vif_seq_ops,
2182 			    sizeof(struct ipmr_vif_iter));
2183 }
2184 
2185 static const struct file_operations ipmr_vif_fops = {
2186 	.owner	 = THIS_MODULE,
2187 	.open    = ipmr_vif_open,
2188 	.read    = seq_read,
2189 	.llseek  = seq_lseek,
2190 	.release = seq_release_net,
2191 };
2192 
2193 struct ipmr_mfc_iter {
2194 	struct seq_net_private p;
2195 	struct mr_table *mrt;
2196 	struct list_head *cache;
2197 	int ct;
2198 };
2199 
2200 
2201 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2202 					  struct ipmr_mfc_iter *it, loff_t pos)
2203 {
2204 	struct mr_table *mrt = it->mrt;
2205 	struct mfc_cache *mfc;
2206 
2207 	read_lock(&mrt_lock);
2208 	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
2209 		it->cache = &mrt->mfc_cache_array[it->ct];
2210 		list_for_each_entry(mfc, it->cache, list)
2211 			if (pos-- == 0)
2212 				return mfc;
2213 	}
2214 	read_unlock(&mrt_lock);
2215 
2216 	spin_lock_bh(&mfc_unres_lock);
2217 	it->cache = &mrt->mfc_unres_queue;
2218 	list_for_each_entry(mfc, it->cache, list)
2219 		if (pos-- == 0)
2220 			return mfc;
2221 	spin_unlock_bh(&mfc_unres_lock);
2222 
2223 	it->cache = NULL;
2224 	return NULL;
2225 }
2226 
2227 
2228 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
2229 {
2230 	struct ipmr_mfc_iter *it = seq->private;
2231 	struct net *net = seq_file_net(seq);
2232 	struct mr_table *mrt;
2233 
2234 	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2235 	if (mrt == NULL)
2236 		return ERR_PTR(-ENOENT);
2237 
2238 	it->mrt = mrt;
2239 	it->cache = NULL;
2240 	it->ct = 0;
2241 	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
2242 		: SEQ_START_TOKEN;
2243 }
2244 
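/*
 *	Advance to the next cache entry, moving on to the unresolved
 *	queue (and swapping mrt_lock for mfc_unres_lock) once the
 *	resolved hash buckets are exhausted.
 */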
2245 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2246 {
2247 	struct mfc_cache *mfc = v;
2248 	struct ipmr_mfc_iter *it = seq->private;
2249 	struct net *net = seq_file_net(seq);
2250 	struct mr_table *mrt = it->mrt;
2251 
2252 	++*pos;
2253 
2254 	if (v == SEQ_START_TOKEN)
2255 		return ipmr_mfc_seq_idx(net, seq->private, 0);
2256 
2257 	if (mfc->list.next != it->cache)
2258 		return list_entry(mfc->list.next, struct mfc_cache, list);
2259 
2260 	if (it->cache == &mrt->mfc_unres_queue)
2261 		goto end_of_list;
2262 
2263 	BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
2264 
2265 	while (++it->ct < MFC_LINES) {
2266 		it->cache = &mrt->mfc_cache_array[it->ct];
2267 		if (list_empty(it->cache))
2268 			continue;
2269 		return list_first_entry(it->cache, struct mfc_cache, list);
2270 	}
2271 
2272 	/* exhausted cache_array, show unresolved */
2273 	read_unlock(&mrt_lock);
2274 	it->cache = &mrt->mfc_unres_queue;
2275 	it->ct = 0;
2276 
2277 	spin_lock_bh(&mfc_unres_lock);
2278 	if (!list_empty(it->cache))
2279 		return list_first_entry(it->cache, struct mfc_cache, list);
2280 
2281  end_of_list:
2282 	spin_unlock_bh(&mfc_unres_lock);
2283 	it->cache = NULL;
2284 
2285 	return NULL;
2286 }
2287 
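/*
 *	Drop whichever lock ipmr_mfc_seq_idx()/_next() left held.
 */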
2288 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2289 {
2290 	struct ipmr_mfc_iter *it = seq->private;
2291 	struct mr_table *mrt = it->mrt;
2292 
2293 	if (it->cache == &mrt->mfc_unres_queue)
2294 		spin_unlock_bh(&mfc_unres_lock);
2295 	else if (it->cache == &mrt->mfc_cache_array[it->ct])
2296 		read_unlock(&mrt_lock);
2297 }
2298 
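/*
 *	Emit the /proc/net/ip_mr_cache header line, or one line per
 *	cache entry; unresolved entries show zero counters.
 */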
2299 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2300 {
2301 	int n;
2302 
2303 	if (v == SEQ_START_TOKEN) {
2304 		seq_puts(seq,
2305 		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
2306 	} else {
2307 		const struct mfc_cache *mfc = v;
2308 		const struct ipmr_mfc_iter *it = seq->private;
2309 		const struct mr_table *mrt = it->mrt;
2310 
2311 		seq_printf(seq, "%08X %08X %-3hd",
2312 			   (__force u32) mfc->mfc_mcastgrp,
2313 			   (__force u32) mfc->mfc_origin,
2314 			   mfc->mfc_parent);
2315 
2316 		if (it->cache != &mrt->mfc_unres_queue) {
2317 			seq_printf(seq, " %8lu %8lu %8lu",
2318 				   mfc->mfc_un.res.pkt,
2319 				   mfc->mfc_un.res.bytes,
2320 				   mfc->mfc_un.res.wrong_if);
2321 			for (n = mfc->mfc_un.res.minvif;
2322 			     n < mfc->mfc_un.res.maxvif; n++) {
2323 				if (VIF_EXISTS(mrt, n) &&
2324 				    mfc->mfc_un.res.ttls[n] < 255)
2325 					seq_printf(seq,
2326 					   " %2d:%-3d",
2327 					   n, mfc->mfc_un.res.ttls[n]);
2328 			}
2329 		} else {
2330 			/* unresolved mfc_caches don't contain
2331 			 * pkt, bytes, or wrong_if values
2332 			 */
2333 			seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
2334 		}
2335 		seq_putc(seq, '\n');
2336 	}
2337 	return 0;
2338 }
2339 
2340 static const struct seq_operations ipmr_mfc_seq_ops = {
2341 	.start = ipmr_mfc_seq_start,
2342 	.next  = ipmr_mfc_seq_next,
2343 	.stop  = ipmr_mfc_seq_stop,
2344 	.show  = ipmr_mfc_seq_show,
2345 };
2346 
2347 static int ipmr_mfc_open(struct inode *inode, struct file *file)
2348 {
2349 	return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
2350 			    sizeof(struct ipmr_mfc_iter));
2351 }
2352 
2353 static const struct file_operations ipmr_mfc_fops = {
2354 	.owner	 = THIS_MODULE,
2355 	.open    = ipmr_mfc_open,
2356 	.read    = seq_read,
2357 	.llseek  = seq_lseek,
2358 	.release = seq_release_net,
2359 };
2360 #endif
2361 
2362 #ifdef CONFIG_IP_PIMSM_V2
2363 static const struct net_protocol pim_protocol = {
2364 	.handler	=	pim_rcv,
2365 	.netns_ok	=	1,
2366 };
2367 #endif
2368 
2369 
2370 /*
2371  *	Setup for IP multicast routing
2372  */
2373 static int __net_init ipmr_net_init(struct net *net)
2374 {
2375 	int err;
2376 
2377 	err = ipmr_rules_init(net);
2378 	if (err < 0)
2379 		goto fail;
2380 
2381 #ifdef CONFIG_PROC_FS
2382 	err = -ENOMEM;
2383 	if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
2384 		goto proc_vif_fail;
2385 	if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
2386 		goto proc_cache_fail;
2387 #endif
2388 	return 0;
2389 
2390 #ifdef CONFIG_PROC_FS
2391 proc_cache_fail:
2392 	proc_net_remove(net, "ip_mr_vif");
2393 proc_vif_fail:
2394 	ipmr_rules_exit(net);
2395 #endif
2396 fail:
2397 	return err;
2398 }
2399 
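/*
 *	Per-namespace cleanup: remove the proc entries and release the
 *	routing tables via ipmr_rules_exit().
 */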
2400 static void __net_exit ipmr_net_exit(struct net *net)
2401 {
2402 #ifdef CONFIG_PROC_FS
2403 	proc_net_remove(net, "ip_mr_cache");
2404 	proc_net_remove(net, "ip_mr_vif");
2405 #endif
2406 	ipmr_rules_exit(net);
2407 }
2408 
2409 static struct pernet_operations ipmr_net_ops = {
2410 	.init = ipmr_net_init,
2411 	.exit = ipmr_net_exit,
2412 };
2413 
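/*
 *	Boot/module init: create the mfc cache slab, then register the
 *	per-namespace ops, the netdevice notifier, the PIMv2 protocol
 *	handler (when configured) and the RTM_GETROUTE dump hook.
 */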
2414 int __init ip_mr_init(void)
2415 {
2416 	int err;
2417 
2418 	mrt_cachep = kmem_cache_create("ip_mrt_cache",
2419 				       sizeof(struct mfc_cache),
2420 				       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2421 				       NULL);
2422 	if (!mrt_cachep)
2423 		return -ENOMEM;
2424 
2425 	err = register_pernet_subsys(&ipmr_net_ops);
2426 	if (err)
2427 		goto reg_pernet_fail;
2428 
2429 	err = register_netdevice_notifier(&ip_mr_notifier);
2430 	if (err)
2431 		goto reg_notif_fail;
2432 #ifdef CONFIG_IP_PIMSM_V2
2433 	if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
2434 		printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
2435 		err = -EAGAIN;
2436 		goto add_proto_fail;
2437 	}
2438 #endif
2439 	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
2440 	return 0;
2441 
2442 #ifdef CONFIG_IP_PIMSM_V2
2443 add_proto_fail:
2444 	unregister_netdevice_notifier(&ip_mr_notifier);
2445 #endif
2446 reg_notif_fail:
2447 	unregister_pernet_subsys(&ipmr_net_ops);
2448 reg_pernet_fail:
2449 	kmem_cache_destroy(mrt_cachep);
2450 	return err;
2451 }
2452