xref: /openbmc/linux/net/ipv4/ipmr.c (revision 61a3e166)
1 /*
2  *	IP multicast routing support for mrouted 3.6/3.8
3  *
4  *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *	  Linux Consultancy and Custom Driver Development
6  *
7  *	This program is free software; you can redistribute it and/or
8  *	modify it under the terms of the GNU General Public License
9  *	as published by the Free Software Foundation; either version
10  *	2 of the License, or (at your option) any later version.
11  *
12  *	Fixes:
13  *	Michael Chastain	:	Incorrect size of copying.
14  *	Alan Cox		:	Added the cache manager code
15  *	Alan Cox		:	Fixed the clone/copy bug and device race.
16  *	Mike McLagan		:	Routing by source
17  *	Malcolm Beattie		:	Buffer handling fixes.
18  *	Alexey Kuznetsov	:	Double buffer free and other fixes.
19  *	SVR Anand		:	Fixed several multicast bugs and problems.
20  *	Alexey Kuznetsov	:	Status, optimisations and more.
21  *	Brad Parker		:	Better behaviour on mrouted upcall
22  *					overflow.
23  *      Carlos Picoto           :       PIMv1 Support
24  *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
25  *					Relax this requirement to work with older peers.
26  *
27  */
28 
29 #include <asm/system.h>
30 #include <asm/uaccess.h>
31 #include <linux/types.h>
32 #include <linux/capability.h>
33 #include <linux/errno.h>
34 #include <linux/timer.h>
35 #include <linux/mm.h>
36 #include <linux/kernel.h>
37 #include <linux/fcntl.h>
38 #include <linux/stat.h>
39 #include <linux/socket.h>
40 #include <linux/in.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/inetdevice.h>
44 #include <linux/igmp.h>
45 #include <linux/proc_fs.h>
46 #include <linux/seq_file.h>
47 #include <linux/mroute.h>
48 #include <linux/init.h>
49 #include <linux/if_ether.h>
50 #include <linux/slab.h>
51 #include <net/net_namespace.h>
52 #include <net/ip.h>
53 #include <net/protocol.h>
54 #include <linux/skbuff.h>
55 #include <net/route.h>
56 #include <net/sock.h>
57 #include <net/icmp.h>
58 #include <net/udp.h>
59 #include <net/raw.h>
60 #include <linux/notifier.h>
61 #include <linux/if_arp.h>
62 #include <linux/netfilter_ipv4.h>
63 #include <net/ipip.h>
64 #include <net/checksum.h>
65 #include <net/netlink.h>
66 #include <net/fib_rules.h>
67 
68 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
69 #define CONFIG_IP_PIMSM	1
70 #endif
71 
72 struct mr_table {
73 	struct list_head	list;
74 #ifdef CONFIG_NET_NS
75 	struct net		*net;
76 #endif
77 	u32			id;
78 	struct sock		*mroute_sk;
79 	struct timer_list	ipmr_expire_timer;
80 	struct list_head	mfc_unres_queue;
81 	struct list_head	mfc_cache_array[MFC_LINES];
82 	struct vif_device	vif_table[MAXVIFS];
83 	int			maxvif;
84 	atomic_t		cache_resolve_queue_len;
85 	int			mroute_do_assert;
86 	int			mroute_do_pim;
87 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
88 	int			mroute_reg_vif_num;
89 #endif
90 };
91 
92 struct ipmr_rule {
93 	struct fib_rule		common;
94 };
95 
96 struct ipmr_result {
97 	struct mr_table		*mrt;
98 };
99 
100 /* Big lock, protecting the vif table, the mrt cache and the mroute socket state.
101    Note that updates are serialized under rtnl_lock.
102  */
103 
104 static DEFINE_RWLOCK(mrt_lock);
105 
106 /*
107  *	Multicast router control variables
108  */
109 
110 #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
111 
112 /* Special spinlock for queue of unresolved entries */
113 static DEFINE_SPINLOCK(mfc_unres_lock);
114 
115 /* We return to Alan's original scheme. The hash table of resolved
116    entries is changed only in process context and is protected
117    by the weak lock mrt_lock. The queue of unresolved entries is
118    protected by the strong spinlock mfc_unres_lock.
119 
120    As a result, the data path is entirely free of exclusive locks.
121  */
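/* A minimal sketch of the locking discipline described above, as it is
 * used throughout this file (an illustration only, not a new API):
 *
 *	read_lock(&mrt_lock);			// data path: look up vifs / resolved cache
 *	c = ipmr_cache_find(mrt, saddr, daddr);
 *	read_unlock(&mrt_lock);
 *
 *	write_lock_bh(&mrt_lock);		// process context: modify vif table / cache
 *	list_add(&c->list, &mrt->mfc_cache_array[line]);
 *	write_unlock_bh(&mrt_lock);
 *
 *	spin_lock_bh(&mfc_unres_lock);		// unresolved queue: always the spinlock
 *	list_add(&c->list, &mrt->mfc_unres_queue);
 *	spin_unlock_bh(&mfc_unres_lock);
 */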
122 
123 static struct kmem_cache *mrt_cachep __read_mostly;
124 
125 static struct mr_table *ipmr_new_table(struct net *net, u32 id);
126 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
127 			 struct sk_buff *skb, struct mfc_cache *cache,
128 			 int local);
129 static int ipmr_cache_report(struct mr_table *mrt,
130 			     struct sk_buff *pkt, vifi_t vifi, int assert);
131 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
132 			      struct mfc_cache *c, struct rtmsg *rtm);
133 static void ipmr_expire_process(unsigned long arg);
134 
135 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
136 #define ipmr_for_each_table(mrt, net) \
137 	list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
138 
139 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
140 {
141 	struct mr_table *mrt;
142 
143 	ipmr_for_each_table(mrt, net) {
144 		if (mrt->id == id)
145 			return mrt;
146 	}
147 	return NULL;
148 }
149 
150 static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
151 			   struct mr_table **mrt)
152 {
153 	struct ipmr_result res;
154 	struct fib_lookup_arg arg = { .result = &res, };
155 	int err;
156 
157 	err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
158 	if (err < 0)
159 		return err;
160 	*mrt = res.mrt;
161 	return 0;
162 }
163 
164 static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
165 			    int flags, struct fib_lookup_arg *arg)
166 {
167 	struct ipmr_result *res = arg->result;
168 	struct mr_table *mrt;
169 
170 	switch (rule->action) {
171 	case FR_ACT_TO_TBL:
172 		break;
173 	case FR_ACT_UNREACHABLE:
174 		return -ENETUNREACH;
175 	case FR_ACT_PROHIBIT:
176 		return -EACCES;
177 	case FR_ACT_BLACKHOLE:
178 	default:
179 		return -EINVAL;
180 	}
181 
182 	mrt = ipmr_get_table(rule->fr_net, rule->table);
183 	if (mrt == NULL)
184 		return -EAGAIN;
185 	res->mrt = mrt;
186 	return 0;
187 }
188 
189 static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
190 {
191 	return 1;
192 }
193 
194 static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
195 	FRA_GENERIC_POLICY,
196 };
197 
198 static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
199 			       struct fib_rule_hdr *frh, struct nlattr **tb)
200 {
201 	return 0;
202 }
203 
204 static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
205 			     struct nlattr **tb)
206 {
207 	return 1;
208 }
209 
210 static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
211 			  struct fib_rule_hdr *frh)
212 {
213 	frh->dst_len = 0;
214 	frh->src_len = 0;
215 	frh->tos     = 0;
216 	return 0;
217 }
218 
219 static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
220 	.family		= RTNL_FAMILY_IPMR,
221 	.rule_size	= sizeof(struct ipmr_rule),
222 	.addr_size	= sizeof(u32),
223 	.action		= ipmr_rule_action,
224 	.match		= ipmr_rule_match,
225 	.configure	= ipmr_rule_configure,
226 	.compare	= ipmr_rule_compare,
227 	.default_pref	= fib_default_rule_pref,
228 	.fill		= ipmr_rule_fill,
229 	.nlgroup	= RTNLGRP_IPV4_RULE,
230 	.policy		= ipmr_rule_policy,
231 	.owner		= THIS_MODULE,
232 };
233 
234 static int __net_init ipmr_rules_init(struct net *net)
235 {
236 	struct fib_rules_ops *ops;
237 	struct mr_table *mrt;
238 	int err;
239 
240 	ops = fib_rules_register(&ipmr_rules_ops_template, net);
241 	if (IS_ERR(ops))
242 		return PTR_ERR(ops);
243 
244 	INIT_LIST_HEAD(&net->ipv4.mr_tables);
245 
246 	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
247 	if (mrt == NULL) {
248 		err = -ENOMEM;
249 		goto err1;
250 	}
251 
252 	err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
253 	if (err < 0)
254 		goto err2;
255 
256 	net->ipv4.mr_rules_ops = ops;
257 	return 0;
258 
259 err2:
260 	kfree(mrt);
261 err1:
262 	fib_rules_unregister(ops);
263 	return err;
264 }
265 
266 static void __net_exit ipmr_rules_exit(struct net *net)
267 {
268 	struct mr_table *mrt, *next;
269 
270 	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
271 		list_del(&mrt->list);
272 		kfree(mrt);
273 	}
274 	fib_rules_unregister(net->ipv4.mr_rules_ops);
275 }
276 #else
277 #define ipmr_for_each_table(mrt, net) \
278 	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
279 
280 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
281 {
282 	return net->ipv4.mrt;
283 }
284 
285 static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
286 			   struct mr_table **mrt)
287 {
288 	*mrt = net->ipv4.mrt;
289 	return 0;
290 }
291 
292 static int __net_init ipmr_rules_init(struct net *net)
293 {
294 	net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
295 	return net->ipv4.mrt ? 0 : -ENOMEM;
296 }
297 
298 static void __net_exit ipmr_rules_exit(struct net *net)
299 {
300 	kfree(net->ipv4.mrt);
301 }
302 #endif
303 
304 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
305 {
306 	struct mr_table *mrt;
307 	unsigned int i;
308 
309 	mrt = ipmr_get_table(net, id);
310 	if (mrt != NULL)
311 		return mrt;
312 
313 	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
314 	if (mrt == NULL)
315 		return NULL;
316 	write_pnet(&mrt->net, net);
317 	mrt->id = id;
318 
319 	/* Forwarding cache */
320 	for (i = 0; i < MFC_LINES; i++)
321 		INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
322 
323 	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
324 
325 	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
326 		    (unsigned long)mrt);
327 
328 #ifdef CONFIG_IP_PIMSM
329 	mrt->mroute_reg_vif_num = -1;
330 #endif
331 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
332 	list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
333 #endif
334 	return mrt;
335 }
336 
337 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
338 
339 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
340 {
341 	struct net *net = dev_net(dev);
342 
343 	dev_close(dev);
344 
345 	dev = __dev_get_by_name(net, "tunl0");
346 	if (dev) {
347 		const struct net_device_ops *ops = dev->netdev_ops;
348 		struct ifreq ifr;
349 		struct ip_tunnel_parm p;
350 
351 		memset(&p, 0, sizeof(p));
352 		p.iph.daddr = v->vifc_rmt_addr.s_addr;
353 		p.iph.saddr = v->vifc_lcl_addr.s_addr;
354 		p.iph.version = 4;
355 		p.iph.ihl = 5;
356 		p.iph.protocol = IPPROTO_IPIP;
357 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
358 		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
359 
360 		if (ops->ndo_do_ioctl) {
361 			mm_segment_t oldfs = get_fs();
362 
363 			set_fs(KERNEL_DS);
364 			ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
365 			set_fs(oldfs);
366 		}
367 	}
368 }
369 
370 static
371 struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
372 {
373 	struct net_device  *dev;
374 
375 	dev = __dev_get_by_name(net, "tunl0");
376 
377 	if (dev) {
378 		const struct net_device_ops *ops = dev->netdev_ops;
379 		int err;
380 		struct ifreq ifr;
381 		struct ip_tunnel_parm p;
382 		struct in_device  *in_dev;
383 
384 		memset(&p, 0, sizeof(p));
385 		p.iph.daddr = v->vifc_rmt_addr.s_addr;
386 		p.iph.saddr = v->vifc_lcl_addr.s_addr;
387 		p.iph.version = 4;
388 		p.iph.ihl = 5;
389 		p.iph.protocol = IPPROTO_IPIP;
390 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
391 		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
392 
393 		if (ops->ndo_do_ioctl) {
394 			mm_segment_t oldfs = get_fs();
395 
396 			set_fs(KERNEL_DS);
397 			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
398 			set_fs(oldfs);
399 		} else
400 			err = -EOPNOTSUPP;
401 
402 		dev = NULL;
403 
404 		if (err == 0 &&
405 		    (dev = __dev_get_by_name(net, p.name)) != NULL) {
406 			dev->flags |= IFF_MULTICAST;
407 
408 			in_dev = __in_dev_get_rtnl(dev);
409 			if (in_dev == NULL)
410 				goto failure;
411 
412 			ipv4_devconf_setall(in_dev);
413 			IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
414 
415 			if (dev_open(dev))
416 				goto failure;
417 			dev_hold(dev);
418 		}
419 	}
420 	return dev;
421 
422 failure:
423 	/* allow the register to be completed before unregistering. */
424 	rtnl_unlock();
425 	rtnl_lock();
426 
427 	unregister_netdevice(dev);
428 	return NULL;
429 }
430 
431 #ifdef CONFIG_IP_PIMSM
432 
433 static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
434 {
435 	struct net *net = dev_net(dev);
436 	struct mr_table *mrt;
437 	struct flowi fl = {
438 		.oif		= dev->ifindex,
439 		.iif		= skb->skb_iif,
440 		.mark		= skb->mark,
441 	};
442 	int err;
443 
444 	err = ipmr_fib_lookup(net, &fl, &mrt);
445 	if (err < 0) {
446 		kfree_skb(skb);
447 		return err;
448 	}
449 
450 	read_lock(&mrt_lock);
451 	dev->stats.tx_bytes += skb->len;
452 	dev->stats.tx_packets++;
453 	ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
454 	read_unlock(&mrt_lock);
455 	kfree_skb(skb);
456 	return NETDEV_TX_OK;
457 }
458 
459 static const struct net_device_ops reg_vif_netdev_ops = {
460 	.ndo_start_xmit	= reg_vif_xmit,
461 };
462 
463 static void reg_vif_setup(struct net_device *dev)
464 {
465 	dev->type		= ARPHRD_PIMREG;
466 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
467 	dev->flags		= IFF_NOARP;
468 	dev->netdev_ops		= &reg_vif_netdev_ops,
469 	dev->destructor		= free_netdev;
470 	dev->features		|= NETIF_F_NETNS_LOCAL;
471 }
472 
473 static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
474 {
475 	struct net_device *dev;
476 	struct in_device *in_dev;
477 	char name[IFNAMSIZ];
478 
479 	if (mrt->id == RT_TABLE_DEFAULT)
480 		sprintf(name, "pimreg");
481 	else
482 		sprintf(name, "pimreg%u", mrt->id);
483 
484 	dev = alloc_netdev(0, name, reg_vif_setup);
485 
486 	if (dev == NULL)
487 		return NULL;
488 
489 	dev_net_set(dev, net);
490 
491 	if (register_netdevice(dev)) {
492 		free_netdev(dev);
493 		return NULL;
494 	}
495 	dev->iflink = 0;
496 
497 	rcu_read_lock();
498 	if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
499 		rcu_read_unlock();
500 		goto failure;
501 	}
502 
503 	ipv4_devconf_setall(in_dev);
504 	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
505 	rcu_read_unlock();
506 
507 	if (dev_open(dev))
508 		goto failure;
509 
510 	dev_hold(dev);
511 
512 	return dev;
513 
514 failure:
515 	/* allow the register to be completed before unregistering. */
516 	rtnl_unlock();
517 	rtnl_lock();
518 
519 	unregister_netdevice(dev);
520 	return NULL;
521 }
522 #endif
523 
524 /*
525  *	Delete a VIF entry
526  *	@notify: Set to 1 if the caller is a notifier_call
527  */
528 
529 static int vif_delete(struct mr_table *mrt, int vifi, int notify,
530 		      struct list_head *head)
531 {
532 	struct vif_device *v;
533 	struct net_device *dev;
534 	struct in_device *in_dev;
535 
536 	if (vifi < 0 || vifi >= mrt->maxvif)
537 		return -EADDRNOTAVAIL;
538 
539 	v = &mrt->vif_table[vifi];
540 
541 	write_lock_bh(&mrt_lock);
542 	dev = v->dev;
543 	v->dev = NULL;
544 
545 	if (!dev) {
546 		write_unlock_bh(&mrt_lock);
547 		return -EADDRNOTAVAIL;
548 	}
549 
550 #ifdef CONFIG_IP_PIMSM
551 	if (vifi == mrt->mroute_reg_vif_num)
552 		mrt->mroute_reg_vif_num = -1;
553 #endif
554 
555 	if (vifi+1 == mrt->maxvif) {
556 		int tmp;
557 		for (tmp=vifi-1; tmp>=0; tmp--) {
558 			if (VIF_EXISTS(mrt, tmp))
559 				break;
560 		}
561 		mrt->maxvif = tmp+1;
562 	}
563 
564 	write_unlock_bh(&mrt_lock);
565 
566 	dev_set_allmulti(dev, -1);
567 
568 	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
569 		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
570 		ip_rt_multicast_event(in_dev);
571 	}
572 
573 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
574 		unregister_netdevice_queue(dev, head);
575 
576 	dev_put(dev);
577 	return 0;
578 }
579 
580 static inline void ipmr_cache_free(struct mfc_cache *c)
581 {
582 	kmem_cache_free(mrt_cachep, c);
583 }
584 
585 /* Destroy an unresolved cache entry, killing queued skbs
586    and reporting error to netlink readers.
587  */
588 
589 static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
590 {
591 	struct net *net = read_pnet(&mrt->net);
592 	struct sk_buff *skb;
593 	struct nlmsgerr *e;
594 
595 	atomic_dec(&mrt->cache_resolve_queue_len);
596 
597 	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
598 		if (ip_hdr(skb)->version == 0) {
599 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
600 			nlh->nlmsg_type = NLMSG_ERROR;
601 			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
602 			skb_trim(skb, nlh->nlmsg_len);
603 			e = NLMSG_DATA(nlh);
604 			e->error = -ETIMEDOUT;
605 			memset(&e->msg, 0, sizeof(e->msg));
606 
607 			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
608 		} else
609 			kfree_skb(skb);
610 	}
611 
612 	ipmr_cache_free(c);
613 }
614 
615 
616 /* Timer process for the unresolved queue. */
617 
618 static void ipmr_expire_process(unsigned long arg)
619 {
620 	struct mr_table *mrt = (struct mr_table *)arg;
621 	unsigned long now;
622 	unsigned long expires;
623 	struct mfc_cache *c, *next;
624 
625 	if (!spin_trylock(&mfc_unres_lock)) {
626 		mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
627 		return;
628 	}
629 
630 	if (list_empty(&mrt->mfc_unres_queue))
631 		goto out;
632 
633 	now = jiffies;
634 	expires = 10*HZ;
635 
636 	list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
637 		if (time_after(c->mfc_un.unres.expires, now)) {
638 			unsigned long interval = c->mfc_un.unres.expires - now;
639 			if (interval < expires)
640 				expires = interval;
641 			continue;
642 		}
643 
644 		list_del(&c->list);
645 		ipmr_destroy_unres(mrt, c);
646 	}
647 
648 	if (!list_empty(&mrt->mfc_unres_queue))
649 		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
650 
651 out:
652 	spin_unlock(&mfc_unres_lock);
653 }
654 
655 /* Fill the oifs list. Called with mrt_lock held for writing. */
656 
657 static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
658 				   unsigned char *ttls)
659 {
660 	int vifi;
661 
662 	cache->mfc_un.res.minvif = MAXVIFS;
663 	cache->mfc_un.res.maxvif = 0;
664 	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
665 
666 	for (vifi = 0; vifi < mrt->maxvif; vifi++) {
667 		if (VIF_EXISTS(mrt, vifi) &&
668 		    ttls[vifi] && ttls[vifi] < 255) {
669 			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
670 			if (cache->mfc_un.res.minvif > vifi)
671 				cache->mfc_un.res.minvif = vifi;
672 			if (cache->mfc_un.res.maxvif <= vifi)
673 				cache->mfc_un.res.maxvif = vifi + 1;
674 		}
675 	}
676 }
677 
678 static int vif_add(struct net *net, struct mr_table *mrt,
679 		   struct vifctl *vifc, int mrtsock)
680 {
681 	int vifi = vifc->vifc_vifi;
682 	struct vif_device *v = &mrt->vif_table[vifi];
683 	struct net_device *dev;
684 	struct in_device *in_dev;
685 	int err;
686 
687 	/* Is vif busy ? */
688 	if (VIF_EXISTS(mrt, vifi))
689 		return -EADDRINUSE;
690 
691 	switch (vifc->vifc_flags) {
692 #ifdef CONFIG_IP_PIMSM
693 	case VIFF_REGISTER:
694 		/*
695 		 * Special Purpose VIF in PIM
696 		 * All the packets will be sent to the daemon
697 		 */
698 		if (mrt->mroute_reg_vif_num >= 0)
699 			return -EADDRINUSE;
700 		dev = ipmr_reg_vif(net, mrt);
701 		if (!dev)
702 			return -ENOBUFS;
703 		err = dev_set_allmulti(dev, 1);
704 		if (err) {
705 			unregister_netdevice(dev);
706 			dev_put(dev);
707 			return err;
708 		}
709 		break;
710 #endif
711 	case VIFF_TUNNEL:
712 		dev = ipmr_new_tunnel(net, vifc);
713 		if (!dev)
714 			return -ENOBUFS;
715 		err = dev_set_allmulti(dev, 1);
716 		if (err) {
717 			ipmr_del_tunnel(dev, vifc);
718 			dev_put(dev);
719 			return err;
720 		}
721 		break;
722 
723 	case VIFF_USE_IFINDEX:
724 	case 0:
725 		if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
726 			dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
727 			if (dev && dev->ip_ptr == NULL) {
728 				dev_put(dev);
729 				return -EADDRNOTAVAIL;
730 			}
731 		} else
732 			dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
733 
734 		if (!dev)
735 			return -EADDRNOTAVAIL;
736 		err = dev_set_allmulti(dev, 1);
737 		if (err) {
738 			dev_put(dev);
739 			return err;
740 		}
741 		break;
742 	default:
743 		return -EINVAL;
744 	}
745 
746 	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
747 		dev_put(dev);
748 		return -EADDRNOTAVAIL;
749 	}
750 	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
751 	ip_rt_multicast_event(in_dev);
752 
753 	/*
754 	 *	Fill in the VIF structures
755 	 */
756 	v->rate_limit = vifc->vifc_rate_limit;
757 	v->local = vifc->vifc_lcl_addr.s_addr;
758 	v->remote = vifc->vifc_rmt_addr.s_addr;
759 	v->flags = vifc->vifc_flags;
760 	if (!mrtsock)
761 		v->flags |= VIFF_STATIC;
762 	v->threshold = vifc->vifc_threshold;
763 	v->bytes_in = 0;
764 	v->bytes_out = 0;
765 	v->pkt_in = 0;
766 	v->pkt_out = 0;
767 	v->link = dev->ifindex;
768 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
769 		v->link = dev->iflink;
770 
771 	/* And finish update writing critical data */
772 	write_lock_bh(&mrt_lock);
773 	v->dev = dev;
774 #ifdef CONFIG_IP_PIMSM
775 	if (v->flags&VIFF_REGISTER)
776 		mrt->mroute_reg_vif_num = vifi;
777 #endif
778 	if (vifi+1 > mrt->maxvif)
779 		mrt->maxvif = vifi+1;
780 	write_unlock_bh(&mrt_lock);
781 	return 0;
782 }
783 
784 static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
785 					 __be32 origin,
786 					 __be32 mcastgrp)
787 {
788 	int line = MFC_HASH(mcastgrp, origin);
789 	struct mfc_cache *c;
790 
791 	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
792 		if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
793 			return c;
794 	}
795 	return NULL;
796 }
797 
798 /*
799  *	Allocate a multicast cache entry
800  */
801 static struct mfc_cache *ipmr_cache_alloc(void)
802 {
803 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
804 	if (c == NULL)
805 		return NULL;
806 	c->mfc_un.res.minvif = MAXVIFS;
807 	return c;
808 }
809 
810 static struct mfc_cache *ipmr_cache_alloc_unres(void)
811 {
812 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
813 	if (c == NULL)
814 		return NULL;
815 	skb_queue_head_init(&c->mfc_un.unres.unresolved);
816 	c->mfc_un.unres.expires = jiffies + 10*HZ;
817 	return c;
818 }
819 
820 /*
821  *	A cache entry has gone into a resolved state from queued
822  */
823 
824 static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
825 			       struct mfc_cache *uc, struct mfc_cache *c)
826 {
827 	struct sk_buff *skb;
828 	struct nlmsgerr *e;
829 
830 	/*
831 	 *	Play the pending entries through our router
832 	 */
833 
834 	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
835 		if (ip_hdr(skb)->version == 0) {
836 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
837 
838 			if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
839 				nlh->nlmsg_len = (skb_tail_pointer(skb) -
840 						  (u8 *)nlh);
841 			} else {
842 				nlh->nlmsg_type = NLMSG_ERROR;
843 				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
844 				skb_trim(skb, nlh->nlmsg_len);
845 				e = NLMSG_DATA(nlh);
846 				e->error = -EMSGSIZE;
847 				memset(&e->msg, 0, sizeof(e->msg));
848 			}
849 
850 			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
851 		} else
852 			ip_mr_forward(net, mrt, skb, c, 0);
853 	}
854 }
855 
856 /*
857  *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
858  *	expects the following bizarre scheme.
859  *
860  *	Called under mrt_lock.
861  */
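/* A hedged sketch (not code taken from mrouted itself) of what the
 * user-space side of this scheme looks like: the daemon read()s its raw
 * IGMP socket and reinterprets the start of the datagram as a struct
 * igmpmsg from <linux/mroute.h>.  The kernel leaves the overlaid
 * protocol byte (im_mbz) zero, which is how the daemon tells upcalls
 * apart from real IGMP packets:
 *
 *	char buf[8192];
 *	ssize_t n = read(mroute_fd, buf, sizeof(buf));	// mroute_fd: the MRT_INIT socket
 *	struct igmpmsg *im = (struct igmpmsg *)buf;
 *
 *	if (n >= (ssize_t)sizeof(*im) && im->im_mbz == 0) {
 *		if (im->im_msgtype == IGMPMSG_NOCACHE)
 *			add_mfc_entry(im->im_src, im->im_dst, im->im_vif);	// hypothetical helper
 *		else if (im->im_msgtype == IGMPMSG_WRONGVIF)
 *			handle_assert(im);					// hypothetical helper
 *	}
 */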
862 
863 static int ipmr_cache_report(struct mr_table *mrt,
864 			     struct sk_buff *pkt, vifi_t vifi, int assert)
865 {
866 	struct sk_buff *skb;
867 	const int ihl = ip_hdrlen(pkt);
868 	struct igmphdr *igmp;
869 	struct igmpmsg *msg;
870 	int ret;
871 
872 #ifdef CONFIG_IP_PIMSM
873 	if (assert == IGMPMSG_WHOLEPKT)
874 		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
875 	else
876 #endif
877 		skb = alloc_skb(128, GFP_ATOMIC);
878 
879 	if (!skb)
880 		return -ENOBUFS;
881 
882 #ifdef CONFIG_IP_PIMSM
883 	if (assert == IGMPMSG_WHOLEPKT) {
884 		/* Ugly, but we have no choice with this interface.
885 		   Duplicate old header, fix ihl, length etc.
886 		   And all this only to mangle msg->im_msgtype and
887 		   to set msg->im_mbz to "mbz" :-)
888 		 */
889 		skb_push(skb, sizeof(struct iphdr));
890 		skb_reset_network_header(skb);
891 		skb_reset_transport_header(skb);
892 		msg = (struct igmpmsg *)skb_network_header(skb);
893 		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
894 		msg->im_msgtype = IGMPMSG_WHOLEPKT;
895 		msg->im_mbz = 0;
896 		msg->im_vif = mrt->mroute_reg_vif_num;
897 		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
898 		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
899 					     sizeof(struct iphdr));
900 	} else
901 #endif
902 	{
903 
904 	/*
905 	 *	Copy the IP header
906 	 */
907 
908 	skb->network_header = skb->tail;
909 	skb_put(skb, ihl);
910 	skb_copy_to_linear_data(skb, pkt->data, ihl);
911 	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
912 	msg = (struct igmpmsg *)skb_network_header(skb);
913 	msg->im_vif = vifi;
914 	skb_dst_set(skb, dst_clone(skb_dst(pkt)));
915 
916 	/*
917 	 *	Add our header
918 	 */
919 
920 	igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
921 	igmp->type	=
922 	msg->im_msgtype = assert;
923 	igmp->code 	=	0;
924 	ip_hdr(skb)->tot_len = htons(skb->len);			/* Fix the length */
925 	skb->transport_header = skb->network_header;
926 	}
927 
928 	if (mrt->mroute_sk == NULL) {
929 		kfree_skb(skb);
930 		return -EINVAL;
931 	}
932 
933 	/*
934 	 *	Deliver to mrouted
935 	 */
936 	ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
937 	if (ret < 0) {
938 		if (net_ratelimit())
939 			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
940 		kfree_skb(skb);
941 	}
942 
943 	return ret;
944 }
945 
946 /*
947  *	Queue a packet for resolution. It gets a locked cache entry!
948  */
949 
950 static int
951 ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
952 {
953 	bool found = false;
954 	int err;
955 	struct mfc_cache *c;
956 	const struct iphdr *iph = ip_hdr(skb);
957 
958 	spin_lock_bh(&mfc_unres_lock);
959 	list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
960 		if (c->mfc_mcastgrp == iph->daddr &&
961 		    c->mfc_origin == iph->saddr) {
962 			found = true;
963 			break;
964 		}
965 	}
966 
967 	if (!found) {
968 		/*
969 		 *	Create a new entry if allowable
970 		 */
971 
972 		if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
973 		    (c = ipmr_cache_alloc_unres()) == NULL) {
974 			spin_unlock_bh(&mfc_unres_lock);
975 
976 			kfree_skb(skb);
977 			return -ENOBUFS;
978 		}
979 
980 		/*
981 		 *	Fill in the new cache entry
982 		 */
983 		c->mfc_parent	= -1;
984 		c->mfc_origin	= iph->saddr;
985 		c->mfc_mcastgrp	= iph->daddr;
986 
987 		/*
988 		 *	Reflect first query at mrouted.
989 		 */
990 		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
991 		if (err < 0) {
992 			/* If the report failed throw the cache entry
993 			   out - Brad Parker
994 			 */
995 			spin_unlock_bh(&mfc_unres_lock);
996 
997 			ipmr_cache_free(c);
998 			kfree_skb(skb);
999 			return err;
1000 		}
1001 
1002 		atomic_inc(&mrt->cache_resolve_queue_len);
1003 		list_add(&c->list, &mrt->mfc_unres_queue);
1004 
1005 		if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
1006 			mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
1007 	}
1008 
1009 	/*
1010 	 *	See if we can append the packet
1011 	 */
1012 	if (c->mfc_un.unres.unresolved.qlen>3) {
1013 		kfree_skb(skb);
1014 		err = -ENOBUFS;
1015 	} else {
1016 		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
1017 		err = 0;
1018 	}
1019 
1020 	spin_unlock_bh(&mfc_unres_lock);
1021 	return err;
1022 }
1023 
1024 /*
1025  *	MFC cache manipulation by user space mroute daemon
1026  */
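/* For reference, a hedged user-space sketch of how a daemon installs and
 * removes these entries with setsockopt() on its mroute socket (mroute_fd
 * below is assumed to be the raw IGMP socket that issued MRT_INIT; see
 * ip_mroute_setsockopt() further down):
 *
 *	struct mfcctl mc;
 *
 *	memset(&mc, 0, sizeof(mc));
 *	mc.mfcc_origin.s_addr   = inet_addr("10.0.0.1");	// S
 *	mc.mfcc_mcastgrp.s_addr = inet_addr("239.1.1.1");	// G
 *	mc.mfcc_parent          = 0;				// incoming vif
 *	mc.mfcc_ttls[1]         = 1;				// forward on vif 1 (TTL threshold 1)
 *	setsockopt(mroute_fd, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 *
 *	// MRT_DEL_MFC with the same origin/group removes the entry again.
 */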
1027 
1028 static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
1029 {
1030 	int line;
1031 	struct mfc_cache *c, *next;
1032 
1033 	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
1034 
1035 	list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
1036 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1037 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1038 			write_lock_bh(&mrt_lock);
1039 			list_del(&c->list);
1040 			write_unlock_bh(&mrt_lock);
1041 
1042 			ipmr_cache_free(c);
1043 			return 0;
1044 		}
1045 	}
1046 	return -ENOENT;
1047 }
1048 
1049 static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1050 			struct mfcctl *mfc, int mrtsock)
1051 {
1052 	bool found = false;
1053 	int line;
1054 	struct mfc_cache *uc, *c;
1055 
1056 	if (mfc->mfcc_parent >= MAXVIFS)
1057 		return -ENFILE;
1058 
1059 	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
1060 
1061 	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
1062 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1063 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1064 			found = true;
1065 			break;
1066 		}
1067 	}
1068 
1069 	if (found) {
1070 		write_lock_bh(&mrt_lock);
1071 		c->mfc_parent = mfc->mfcc_parent;
1072 		ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
1073 		if (!mrtsock)
1074 			c->mfc_flags |= MFC_STATIC;
1075 		write_unlock_bh(&mrt_lock);
1076 		return 0;
1077 	}
1078 
1079 	if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
1080 		return -EINVAL;
1081 
1082 	c = ipmr_cache_alloc();
1083 	if (c == NULL)
1084 		return -ENOMEM;
1085 
1086 	c->mfc_origin = mfc->mfcc_origin.s_addr;
1087 	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
1088 	c->mfc_parent = mfc->mfcc_parent;
1089 	ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
1090 	if (!mrtsock)
1091 		c->mfc_flags |= MFC_STATIC;
1092 
1093 	write_lock_bh(&mrt_lock);
1094 	list_add(&c->list, &mrt->mfc_cache_array[line]);
1095 	write_unlock_bh(&mrt_lock);
1096 
1097 	/*
1098 	 *	Check to see if we resolved a queued list. If so we
1099 	 *	need to send on the frames and tidy up.
1100 	 */
1101 	found = false;
1102 	spin_lock_bh(&mfc_unres_lock);
1103 	list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
1104 		if (uc->mfc_origin == c->mfc_origin &&
1105 		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
1106 			list_del(&uc->list);
1107 			atomic_dec(&mrt->cache_resolve_queue_len);
1108 			found = true;
1109 			break;
1110 		}
1111 	}
1112 	if (list_empty(&mrt->mfc_unres_queue))
1113 		del_timer(&mrt->ipmr_expire_timer);
1114 	spin_unlock_bh(&mfc_unres_lock);
1115 
1116 	if (found) {
1117 		ipmr_cache_resolve(net, mrt, uc, c);
1118 		ipmr_cache_free(uc);
1119 	}
1120 	return 0;
1121 }
1122 
1123 /*
1124  *	Close the multicast socket, and clear the vif tables etc
1125  */
1126 
1127 static void mroute_clean_tables(struct mr_table *mrt)
1128 {
1129 	int i;
1130 	LIST_HEAD(list);
1131 	struct mfc_cache *c, *next;
1132 
1133 	/*
1134 	 *	Shut down all active vif entries
1135 	 */
1136 	for (i = 0; i < mrt->maxvif; i++) {
1137 		if (!(mrt->vif_table[i].flags&VIFF_STATIC))
1138 			vif_delete(mrt, i, 0, &list);
1139 	}
1140 	unregister_netdevice_many(&list);
1141 
1142 	/*
1143 	 *	Wipe the cache
1144 	 */
1145 	for (i = 0; i < MFC_LINES; i++) {
1146 		list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
1147 			if (c->mfc_flags&MFC_STATIC)
1148 				continue;
1149 			write_lock_bh(&mrt_lock);
1150 			list_del(&c->list);
1151 			write_unlock_bh(&mrt_lock);
1152 
1153 			ipmr_cache_free(c);
1154 		}
1155 	}
1156 
1157 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
1158 		spin_lock_bh(&mfc_unres_lock);
1159 		list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
1160 			list_del(&c->list);
1161 			ipmr_destroy_unres(mrt, c);
1162 		}
1163 		spin_unlock_bh(&mfc_unres_lock);
1164 	}
1165 }
1166 
1167 static void mrtsock_destruct(struct sock *sk)
1168 {
1169 	struct net *net = sock_net(sk);
1170 	struct mr_table *mrt;
1171 
1172 	rtnl_lock();
1173 	ipmr_for_each_table(mrt, net) {
1174 		if (sk == mrt->mroute_sk) {
1175 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1176 
1177 			write_lock_bh(&mrt_lock);
1178 			mrt->mroute_sk = NULL;
1179 			write_unlock_bh(&mrt_lock);
1180 
1181 			mroute_clean_tables(mrt);
1182 		}
1183 	}
1184 	rtnl_unlock();
1185 }
1186 
1187 /*
1188  *	Socket options and virtual interface manipulation. The whole
1189  *	virtual interface system is a complete heap, but unfortunately
1190  *	that's how BSD mrouted happens to think. Maybe one day with a proper
1191  *	MOSPF/PIM router set up we can clean this up.
1192  */
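/* A hedged user-space sketch of the expected call sequence (option names
 * and struct vifctl come from <linux/mroute.h>; error handling omitted):
 *
 *	int one = 1;
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *
 *	setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one));	// become the mroute socket
 *
 *	struct vifctl vc;
 *	memset(&vc, 0, sizeof(vc));
 *	vc.vifc_vifi            = 0;					// vif index
 *	vc.vifc_threshold       = 1;					// minimum TTL to forward
 *	vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");		// local interface address
 *	setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *
 *	// ... MRT_ADD_MFC / MRT_DEL_MFC as sketched above ...
 *
 *	setsockopt(fd, IPPROTO_IP, MRT_DONE, &one, sizeof(one));	// tear everything down
 */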
1193 
1194 int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
1195 {
1196 	int ret;
1197 	struct vifctl vif;
1198 	struct mfcctl mfc;
1199 	struct net *net = sock_net(sk);
1200 	struct mr_table *mrt;
1201 
1202 	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1203 	if (mrt == NULL)
1204 		return -ENOENT;
1205 
1206 	if (optname != MRT_INIT) {
1207 		if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
1208 			return -EACCES;
1209 	}
1210 
1211 	switch (optname) {
1212 	case MRT_INIT:
1213 		if (sk->sk_type != SOCK_RAW ||
1214 		    inet_sk(sk)->inet_num != IPPROTO_IGMP)
1215 			return -EOPNOTSUPP;
1216 		if (optlen != sizeof(int))
1217 			return -ENOPROTOOPT;
1218 
1219 		rtnl_lock();
1220 		if (mrt->mroute_sk) {
1221 			rtnl_unlock();
1222 			return -EADDRINUSE;
1223 		}
1224 
1225 		ret = ip_ra_control(sk, 1, mrtsock_destruct);
1226 		if (ret == 0) {
1227 			write_lock_bh(&mrt_lock);
1228 			mrt->mroute_sk = sk;
1229 			write_unlock_bh(&mrt_lock);
1230 
1231 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1232 		}
1233 		rtnl_unlock();
1234 		return ret;
1235 	case MRT_DONE:
1236 		if (sk != mrt->mroute_sk)
1237 			return -EACCES;
1238 		return ip_ra_control(sk, 0, NULL);
1239 	case MRT_ADD_VIF:
1240 	case MRT_DEL_VIF:
1241 		if (optlen != sizeof(vif))
1242 			return -EINVAL;
1243 		if (copy_from_user(&vif, optval, sizeof(vif)))
1244 			return -EFAULT;
1245 		if (vif.vifc_vifi >= MAXVIFS)
1246 			return -ENFILE;
1247 		rtnl_lock();
1248 		if (optname == MRT_ADD_VIF) {
1249 			ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
1250 		} else {
1251 			ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
1252 		}
1253 		rtnl_unlock();
1254 		return ret;
1255 
1256 		/*
1257 		 *	Manipulate the forwarding caches. These live
1258 		 *	in a sort of kernel/user symbiosis.
1259 		 */
1260 	case MRT_ADD_MFC:
1261 	case MRT_DEL_MFC:
1262 		if (optlen != sizeof(mfc))
1263 			return -EINVAL;
1264 		if (copy_from_user(&mfc, optval, sizeof(mfc)))
1265 			return -EFAULT;
1266 		rtnl_lock();
1267 		if (optname == MRT_DEL_MFC)
1268 			ret = ipmr_mfc_delete(mrt, &mfc);
1269 		else
1270 			ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
1271 		rtnl_unlock();
1272 		return ret;
1273 		/*
1274 		 *	Control PIM assert.
1275 		 */
1276 	case MRT_ASSERT:
1277 	{
1278 		int v;
1279 		if (get_user(v,(int __user *)optval))
1280 			return -EFAULT;
1281 		mrt->mroute_do_assert = (v) ? 1 : 0;
1282 		return 0;
1283 	}
1284 #ifdef CONFIG_IP_PIMSM
1285 	case MRT_PIM:
1286 	{
1287 		int v;
1288 
1289 		if (get_user(v,(int __user *)optval))
1290 			return -EFAULT;
1291 		v = (v) ? 1 : 0;
1292 
1293 		rtnl_lock();
1294 		ret = 0;
1295 		if (v != mrt->mroute_do_pim) {
1296 			mrt->mroute_do_pim = v;
1297 			mrt->mroute_do_assert = v;
1298 		}
1299 		rtnl_unlock();
1300 		return ret;
1301 	}
1302 #endif
1303 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
1304 	case MRT_TABLE:
1305 	{
1306 		u32 v;
1307 
1308 		if (optlen != sizeof(u32))
1309 			return -EINVAL;
1310 		if (get_user(v, (u32 __user *)optval))
1311 			return -EFAULT;
1312 		if (sk == mrt->mroute_sk)
1313 			return -EBUSY;
1314 
1315 		rtnl_lock();
1316 		ret = 0;
1317 		if (!ipmr_new_table(net, v))
1318 			ret = -ENOMEM;
1319 		raw_sk(sk)->ipmr_table = v;
1320 		rtnl_unlock();
1321 		return ret;
1322 	}
1323 #endif
1324 	/*
1325 	 *	Spurious command, or MRT_VERSION which you cannot
1326 	 *	set.
1327 	 */
1328 	default:
1329 		return -ENOPROTOOPT;
1330 	}
1331 }
1332 
1333 /*
1334  *	Getsock opt support for the multicast routing system.
1335  */
1336 
1337 int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1338 {
1339 	int olr;
1340 	int val;
1341 	struct net *net = sock_net(sk);
1342 	struct mr_table *mrt;
1343 
1344 	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1345 	if (mrt == NULL)
1346 		return -ENOENT;
1347 
1348 	if (optname != MRT_VERSION &&
1349 #ifdef CONFIG_IP_PIMSM
1350 	   optname!=MRT_PIM &&
1351 #endif
1352 	   optname!=MRT_ASSERT)
1353 		return -ENOPROTOOPT;
1354 
1355 	if (get_user(olr, optlen))
1356 		return -EFAULT;
1357 
1358 	olr = min_t(unsigned int, olr, sizeof(int));
1359 	if (olr < 0)
1360 		return -EINVAL;
1361 
1362 	if (put_user(olr, optlen))
1363 		return -EFAULT;
1364 	if (optname == MRT_VERSION)
1365 		val = 0x0305;
1366 #ifdef CONFIG_IP_PIMSM
1367 	else if (optname == MRT_PIM)
1368 		val = mrt->mroute_do_pim;
1369 #endif
1370 	else
1371 		val = mrt->mroute_do_assert;
1372 	if (copy_to_user(optval, &val, olr))
1373 		return -EFAULT;
1374 	return 0;
1375 }
1376 
1377 /*
1378  *	The IP multicast ioctl support routines.
1379  */
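/* A hedged user-space sketch of the two counter queries handled below
 * (struct sioc_vif_req and struct sioc_sg_req come from <linux/mroute.h>;
 * mroute_fd is assumed to be the MRT_INIT socket):
 *
 *	struct sioc_vif_req vr = { .vifi = 0 };
 *	if (ioctl(mroute_fd, SIOCGETVIFCNT, &vr) == 0)
 *		printf("vif0: %lu pkts in, %lu pkts out\n", vr.icount, vr.ocount);
 *
 *	struct sioc_sg_req sr;
 *	memset(&sr, 0, sizeof(sr));
 *	sr.src.s_addr = inet_addr("10.0.0.1");
 *	sr.grp.s_addr = inet_addr("239.1.1.1");
 *	if (ioctl(mroute_fd, SIOCGETSGCNT, &sr) == 0)
 *		printf("(S,G): %lu packets, %lu bytes\n", sr.pktcnt, sr.bytecnt);
 */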
1380 
1381 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1382 {
1383 	struct sioc_sg_req sr;
1384 	struct sioc_vif_req vr;
1385 	struct vif_device *vif;
1386 	struct mfc_cache *c;
1387 	struct net *net = sock_net(sk);
1388 	struct mr_table *mrt;
1389 
1390 	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1391 	if (mrt == NULL)
1392 		return -ENOENT;
1393 
1394 	switch (cmd) {
1395 	case SIOCGETVIFCNT:
1396 		if (copy_from_user(&vr, arg, sizeof(vr)))
1397 			return -EFAULT;
1398 		if (vr.vifi >= mrt->maxvif)
1399 			return -EINVAL;
1400 		read_lock(&mrt_lock);
1401 		vif = &mrt->vif_table[vr.vifi];
1402 		if (VIF_EXISTS(mrt, vr.vifi)) {
1403 			vr.icount = vif->pkt_in;
1404 			vr.ocount = vif->pkt_out;
1405 			vr.ibytes = vif->bytes_in;
1406 			vr.obytes = vif->bytes_out;
1407 			read_unlock(&mrt_lock);
1408 
1409 			if (copy_to_user(arg, &vr, sizeof(vr)))
1410 				return -EFAULT;
1411 			return 0;
1412 		}
1413 		read_unlock(&mrt_lock);
1414 		return -EADDRNOTAVAIL;
1415 	case SIOCGETSGCNT:
1416 		if (copy_from_user(&sr, arg, sizeof(sr)))
1417 			return -EFAULT;
1418 
1419 		read_lock(&mrt_lock);
1420 		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1421 		if (c) {
1422 			sr.pktcnt = c->mfc_un.res.pkt;
1423 			sr.bytecnt = c->mfc_un.res.bytes;
1424 			sr.wrong_if = c->mfc_un.res.wrong_if;
1425 			read_unlock(&mrt_lock);
1426 
1427 			if (copy_to_user(arg, &sr, sizeof(sr)))
1428 				return -EFAULT;
1429 			return 0;
1430 		}
1431 		read_unlock(&mrt_lock);
1432 		return -EADDRNOTAVAIL;
1433 	default:
1434 		return -ENOIOCTLCMD;
1435 	}
1436 }
1437 
1438 
1439 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1440 {
1441 	struct net_device *dev = ptr;
1442 	struct net *net = dev_net(dev);
1443 	struct mr_table *mrt;
1444 	struct vif_device *v;
1445 	int ct;
1446 	LIST_HEAD(list);
1447 
1448 	if (event != NETDEV_UNREGISTER)
1449 		return NOTIFY_DONE;
1450 
1451 	ipmr_for_each_table(mrt, net) {
1452 		v = &mrt->vif_table[0];
1453 		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1454 			if (v->dev == dev)
1455 				vif_delete(mrt, ct, 1, &list);
1456 		}
1457 	}
1458 	unregister_netdevice_many(&list);
1459 	return NOTIFY_DONE;
1460 }
1461 
1462 
1463 static struct notifier_block ip_mr_notifier = {
1464 	.notifier_call = ipmr_device_event,
1465 };
1466 
1467 /*
1468  * 	Encapsulate a packet by attaching a valid IPIP header to it.
1469  *	This avoids tunnel drivers and other mess and gives us the speed so
1470  *	important for multicast video.
1471  */
1472 
1473 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1474 {
1475 	struct iphdr *iph;
1476 	struct iphdr *old_iph = ip_hdr(skb);
1477 
1478 	skb_push(skb, sizeof(struct iphdr));
1479 	skb->transport_header = skb->network_header;
1480 	skb_reset_network_header(skb);
1481 	iph = ip_hdr(skb);
1482 
1483 	iph->version	= 	4;
1484 	iph->tos	=	old_iph->tos;
1485 	iph->ttl	=	old_iph->ttl;
1486 	iph->frag_off	=	0;
1487 	iph->daddr	=	daddr;
1488 	iph->saddr	=	saddr;
1489 	iph->protocol	=	IPPROTO_IPIP;
1490 	iph->ihl	=	5;
1491 	iph->tot_len	=	htons(skb->len);
1492 	ip_select_ident(iph, skb_dst(skb), NULL);
1493 	ip_send_check(iph);
1494 
1495 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1496 	nf_reset(skb);
1497 }
1498 
1499 static inline int ipmr_forward_finish(struct sk_buff *skb)
1500 {
1501 	struct ip_options * opt	= &(IPCB(skb)->opt);
1502 
1503 	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1504 
1505 	if (unlikely(opt->optlen))
1506 		ip_forward_options(skb);
1507 
1508 	return dst_output(skb);
1509 }
1510 
1511 /*
1512  *	Processing handlers for ipmr_forward
1513  */
1514 
1515 static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1516 			    struct sk_buff *skb, struct mfc_cache *c, int vifi)
1517 {
1518 	const struct iphdr *iph = ip_hdr(skb);
1519 	struct vif_device *vif = &mrt->vif_table[vifi];
1520 	struct net_device *dev;
1521 	struct rtable *rt;
1522 	int    encap = 0;
1523 
1524 	if (vif->dev == NULL)
1525 		goto out_free;
1526 
1527 #ifdef CONFIG_IP_PIMSM
1528 	if (vif->flags & VIFF_REGISTER) {
1529 		vif->pkt_out++;
1530 		vif->bytes_out += skb->len;
1531 		vif->dev->stats.tx_bytes += skb->len;
1532 		vif->dev->stats.tx_packets++;
1533 		ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
1534 		goto out_free;
1535 	}
1536 #endif
1537 
1538 	if (vif->flags&VIFF_TUNNEL) {
1539 		struct flowi fl = { .oif = vif->link,
1540 				    .nl_u = { .ip4_u =
1541 					      { .daddr = vif->remote,
1542 						.saddr = vif->local,
1543 						.tos = RT_TOS(iph->tos) } },
1544 				    .proto = IPPROTO_IPIP };
1545 		if (ip_route_output_key(net, &rt, &fl))
1546 			goto out_free;
1547 		encap = sizeof(struct iphdr);
1548 	} else {
1549 		struct flowi fl = { .oif = vif->link,
1550 				    .nl_u = { .ip4_u =
1551 					      { .daddr = iph->daddr,
1552 						.tos = RT_TOS(iph->tos) } },
1553 				    .proto = IPPROTO_IPIP };
1554 		if (ip_route_output_key(net, &rt, &fl))
1555 			goto out_free;
1556 	}
1557 
1558 	dev = rt->dst.dev;
1559 
1560 	if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
1561 		/* Do not fragment multicasts. Alas, IPv4 does not
1562 		   allow us to send ICMP here, so such packets simply
1563 		   disappear into a black hole.
1564 		 */
1565 
1566 		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1567 		ip_rt_put(rt);
1568 		goto out_free;
1569 	}
1570 
1571 	encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
1572 
1573 	if (skb_cow(skb, encap)) {
1574 		ip_rt_put(rt);
1575 		goto out_free;
1576 	}
1577 
1578 	vif->pkt_out++;
1579 	vif->bytes_out += skb->len;
1580 
1581 	skb_dst_drop(skb);
1582 	skb_dst_set(skb, &rt->dst);
1583 	ip_decrease_ttl(ip_hdr(skb));
1584 
1585 	/* FIXME: forward and output firewalls used to be called here.
1586 	 * What do we do with netfilter? -- RR */
1587 	if (vif->flags & VIFF_TUNNEL) {
1588 		ip_encap(skb, vif->local, vif->remote);
1589 		/* FIXME: extra output firewall step used to be here. --RR */
1590 		vif->dev->stats.tx_packets++;
1591 		vif->dev->stats.tx_bytes += skb->len;
1592 	}
1593 
1594 	IPCB(skb)->flags |= IPSKB_FORWARDED;
1595 
1596 	/*
1597 	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1598 	 * not only before forwarding, but also after forwarding on all output
1599 	 * interfaces. Clearly, if the mrouter runs a multicast
1600 	 * program, that program should receive packets regardless of which
1601 	 * interface it joined on.
1602 	 * If we do not do this, the program would have to join on all
1603 	 * interfaces. On the other hand, a multihomed host (or a router that is
1604 	 * not an mrouter) cannot join on more than one interface - that would
1605 	 * result in receiving multiple copies of each packet.
1606 	 */
1607 	NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
1608 		ipmr_forward_finish);
1609 	return;
1610 
1611 out_free:
1612 	kfree_skb(skb);
1613 }
1614 
1615 static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
1616 {
1617 	int ct;
1618 
1619 	for (ct = mrt->maxvif-1; ct >= 0; ct--) {
1620 		if (mrt->vif_table[ct].dev == dev)
1621 			break;
1622 	}
1623 	return ct;
1624 }
1625 
1626 /* "local" means that we should preserve one skb (for local delivery) */
1627 
1628 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1629 			 struct sk_buff *skb, struct mfc_cache *cache,
1630 			 int local)
1631 {
1632 	int psend = -1;
1633 	int vif, ct;
1634 
1635 	vif = cache->mfc_parent;
1636 	cache->mfc_un.res.pkt++;
1637 	cache->mfc_un.res.bytes += skb->len;
1638 
1639 	/*
1640 	 * Wrong interface: drop packet and (maybe) send PIM assert.
1641 	 */
1642 	if (mrt->vif_table[vif].dev != skb->dev) {
1643 		int true_vifi;
1644 
1645 		if (skb_rtable(skb)->fl.iif == 0) {
1646 			/* It is our own packet, looped back.
1647 			   A very complicated situation...
1648 
1649 			   The best workaround until the routing daemons are
1650 			   fixed is not to redistribute a packet if it was
1651 			   sent through the wrong interface. This means that
1652 			   multicast applications WILL NOT work for
1653 			   (S,G) entries whose default multicast route points
1654 			   to the wrong oif. In any case, it is not a good
1655 			   idea to run multicast applications on a router.
1656 			 */
1657 			goto dont_forward;
1658 		}
1659 
1660 		cache->mfc_un.res.wrong_if++;
1661 		true_vifi = ipmr_find_vif(mrt, skb->dev);
1662 
1663 		if (true_vifi >= 0 && mrt->mroute_do_assert &&
1664 		    /* pimsm uses asserts when switching from RPT to SPT,
1665 		       so we cannot check that the packet arrived on an oif.
1666 		       That is bad, but otherwise we would need to move a pretty
1667 		       large chunk of pimd into the kernel. Ough... --ANK
1668 		     */
1669 		    (mrt->mroute_do_pim ||
1670 		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
1671 		    time_after(jiffies,
1672 			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1673 			cache->mfc_un.res.last_assert = jiffies;
1674 			ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
1675 		}
1676 		goto dont_forward;
1677 	}
1678 
1679 	mrt->vif_table[vif].pkt_in++;
1680 	mrt->vif_table[vif].bytes_in += skb->len;
1681 
1682 	/*
1683 	 *	Forward the frame
1684 	 */
1685 	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1686 		if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1687 			if (psend != -1) {
1688 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1689 				if (skb2)
1690 					ipmr_queue_xmit(net, mrt, skb2, cache,
1691 							psend);
1692 			}
1693 			psend = ct;
1694 		}
1695 	}
1696 	if (psend != -1) {
1697 		if (local) {
1698 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1699 			if (skb2)
1700 				ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1701 		} else {
1702 			ipmr_queue_xmit(net, mrt, skb, cache, psend);
1703 			return 0;
1704 		}
1705 	}
1706 
1707 dont_forward:
1708 	if (!local)
1709 		kfree_skb(skb);
1710 	return 0;
1711 }
1712 
1713 
1714 /*
1715  *	Multicast packets for forwarding arrive here
1716  */
1717 
1718 int ip_mr_input(struct sk_buff *skb)
1719 {
1720 	struct mfc_cache *cache;
1721 	struct net *net = dev_net(skb->dev);
1722 	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1723 	struct mr_table *mrt;
1724 	int err;
1725 
1726 	/* The packet is looped back after forwarding; it should not be
1727 	   forwarded a second time, but it can still be delivered locally.
1728 	 */
1729 	if (IPCB(skb)->flags&IPSKB_FORWARDED)
1730 		goto dont_forward;
1731 
1732 	err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
1733 	if (err < 0) {
1734 		kfree_skb(skb);
1735 		return err;
1736 	}
1737 
1738 	if (!local) {
1739 		    if (IPCB(skb)->opt.router_alert) {
1740 			    if (ip_call_ra_chain(skb))
1741 				    return 0;
1742 		    } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1743 			    /* IGMPv1 (and broken IGMPv2 implementations such as
1744 			       Cisco IOS <= 11.2(8)) do not put the router alert
1745 			       option in IGMP packets destined to routable
1746 			       groups. That is very bad, because it means
1747 			       that we can forward NO IGMP messages.
1748 			     */
1749 			    read_lock(&mrt_lock);
1750 			    if (mrt->mroute_sk) {
1751 				    nf_reset(skb);
1752 				    raw_rcv(mrt->mroute_sk, skb);
1753 				    read_unlock(&mrt_lock);
1754 				    return 0;
1755 			    }
1756 			    read_unlock(&mrt_lock);
1757 		    }
1758 	}
1759 
1760 	read_lock(&mrt_lock);
1761 	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1762 
1763 	/*
1764 	 *	No usable cache entry
1765 	 */
1766 	if (cache == NULL) {
1767 		int vif;
1768 
1769 		if (local) {
1770 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1771 			ip_local_deliver(skb);
1772 			if (skb2 == NULL) {
1773 				read_unlock(&mrt_lock);
1774 				return -ENOBUFS;
1775 			}
1776 			skb = skb2;
1777 		}
1778 
1779 		vif = ipmr_find_vif(mrt, skb->dev);
1780 		if (vif >= 0) {
1781 			int err2 = ipmr_cache_unresolved(mrt, vif, skb);
1782 			read_unlock(&mrt_lock);
1783 
1784 			return err2;
1785 		}
1786 		read_unlock(&mrt_lock);
1787 		kfree_skb(skb);
1788 		return -ENODEV;
1789 	}
1790 
1791 	ip_mr_forward(net, mrt, skb, cache, local);
1792 
1793 	read_unlock(&mrt_lock);
1794 
1795 	if (local)
1796 		return ip_local_deliver(skb);
1797 
1798 	return 0;
1799 
1800 dont_forward:
1801 	if (local)
1802 		return ip_local_deliver(skb);
1803 	kfree_skb(skb);
1804 	return 0;
1805 }
1806 
1807 #ifdef CONFIG_IP_PIMSM
1808 static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1809 		     unsigned int pimlen)
1810 {
1811 	struct net_device *reg_dev = NULL;
1812 	struct iphdr *encap;
1813 
1814 	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1815 	/*
1816 	   Check that:
1817 	   a. packet is really destined to a multicast group
1818 	   b. packet is not a NULL-REGISTER
1819 	   c. packet is not truncated
1820 	 */
1821 	if (!ipv4_is_multicast(encap->daddr) ||
1822 	    encap->tot_len == 0 ||
1823 	    ntohs(encap->tot_len) + pimlen > skb->len)
1824 		return 1;
1825 
1826 	read_lock(&mrt_lock);
1827 	if (mrt->mroute_reg_vif_num >= 0)
1828 		reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1829 	if (reg_dev)
1830 		dev_hold(reg_dev);
1831 	read_unlock(&mrt_lock);
1832 
1833 	if (reg_dev == NULL)
1834 		return 1;
1835 
1836 	skb->mac_header = skb->network_header;
1837 	skb_pull(skb, (u8*)encap - skb->data);
1838 	skb_reset_network_header(skb);
1839 	skb->protocol = htons(ETH_P_IP);
1840 	skb->ip_summed = 0;
1841 	skb->pkt_type = PACKET_HOST;
1842 
1843 	skb_tunnel_rx(skb, reg_dev);
1844 
1845 	netif_rx(skb);
1846 	dev_put(reg_dev);
1847 
1848 	return 0;
1849 }
1850 #endif
1851 
1852 #ifdef CONFIG_IP_PIMSM_V1
1853 /*
1854  * Handle IGMP messages of PIMv1
1855  */
1856 
1857 int pim_rcv_v1(struct sk_buff * skb)
1858 {
1859 	struct igmphdr *pim;
1860 	struct net *net = dev_net(skb->dev);
1861 	struct mr_table *mrt;
1862 
1863 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1864 		goto drop;
1865 
1866 	pim = igmp_hdr(skb);
1867 
1868 	if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1869 		goto drop;
1870 
1871 	if (!mrt->mroute_do_pim ||
1872 	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1873 		goto drop;
1874 
1875 	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1876 drop:
1877 		kfree_skb(skb);
1878 	}
1879 	return 0;
1880 }
1881 #endif
1882 
1883 #ifdef CONFIG_IP_PIMSM_V2
1884 static int pim_rcv(struct sk_buff * skb)
1885 {
1886 	struct pimreghdr *pim;
1887 	struct net *net = dev_net(skb->dev);
1888 	struct mr_table *mrt;
1889 
1890 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1891 		goto drop;
1892 
1893 	pim = (struct pimreghdr *)skb_transport_header(skb);
1894 	if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1895 	    (pim->flags&PIM_NULL_REGISTER) ||
1896 	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1897 	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1898 		goto drop;
1899 
1900 	if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1901 		goto drop;
1902 
1903 	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1904 drop:
1905 		kfree_skb(skb);
1906 	}
1907 	return 0;
1908 }
1909 #endif
1910 
1911 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
1912 			      struct mfc_cache *c, struct rtmsg *rtm)
1913 {
1914 	int ct;
1915 	struct rtnexthop *nhp;
1916 	u8 *b = skb_tail_pointer(skb);
1917 	struct rtattr *mp_head;
1918 
1919 	/* If cache is unresolved, don't try to parse IIF and OIF */
1920 	if (c->mfc_parent >= MAXVIFS)
1921 		return -ENOENT;
1922 
1923 	if (VIF_EXISTS(mrt, c->mfc_parent))
1924 		RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
1925 
1926 	mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1927 
1928 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1929 		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
1930 			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1931 				goto rtattr_failure;
1932 			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1933 			nhp->rtnh_flags = 0;
1934 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1935 			nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
1936 			nhp->rtnh_len = sizeof(*nhp);
1937 		}
1938 	}
1939 	mp_head->rta_type = RTA_MULTIPATH;
1940 	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1941 	rtm->rtm_type = RTN_MULTICAST;
1942 	return 1;
1943 
1944 rtattr_failure:
1945 	nlmsg_trim(skb, b);
1946 	return -EMSGSIZE;
1947 }
1948 
1949 int ipmr_get_route(struct net *net,
1950 		   struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1951 {
1952 	int err;
1953 	struct mr_table *mrt;
1954 	struct mfc_cache *cache;
1955 	struct rtable *rt = skb_rtable(skb);
1956 
1957 	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
1958 	if (mrt == NULL)
1959 		return -ENOENT;
1960 
1961 	read_lock(&mrt_lock);
1962 	cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
1963 
1964 	if (cache == NULL) {
1965 		struct sk_buff *skb2;
1966 		struct iphdr *iph;
1967 		struct net_device *dev;
1968 		int vif;
1969 
1970 		if (nowait) {
1971 			read_unlock(&mrt_lock);
1972 			return -EAGAIN;
1973 		}
1974 
1975 		dev = skb->dev;
1976 		if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
1977 			read_unlock(&mrt_lock);
1978 			return -ENODEV;
1979 		}
1980 		skb2 = skb_clone(skb, GFP_ATOMIC);
1981 		if (!skb2) {
1982 			read_unlock(&mrt_lock);
1983 			return -ENOMEM;
1984 		}
1985 
1986 		skb_push(skb2, sizeof(struct iphdr));
1987 		skb_reset_network_header(skb2);
1988 		iph = ip_hdr(skb2);
1989 		iph->ihl = sizeof(struct iphdr) >> 2;
1990 		iph->saddr = rt->rt_src;
1991 		iph->daddr = rt->rt_dst;
1992 		iph->version = 0;
1993 		err = ipmr_cache_unresolved(mrt, vif, skb2);
1994 		read_unlock(&mrt_lock);
1995 		return err;
1996 	}
1997 
1998 	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1999 		cache->mfc_flags |= MFC_NOTIFY;
2000 	err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
2001 	read_unlock(&mrt_lock);
2002 	return err;
2003 }
2004 
2005 static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2006 			    u32 pid, u32 seq, struct mfc_cache *c)
2007 {
2008 	struct nlmsghdr *nlh;
2009 	struct rtmsg *rtm;
2010 
2011 	nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2012 	if (nlh == NULL)
2013 		return -EMSGSIZE;
2014 
2015 	rtm = nlmsg_data(nlh);
2016 	rtm->rtm_family   = RTNL_FAMILY_IPMR;
2017 	rtm->rtm_dst_len  = 32;
2018 	rtm->rtm_src_len  = 32;
2019 	rtm->rtm_tos      = 0;
2020 	rtm->rtm_table    = mrt->id;
2021 	NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
2022 	rtm->rtm_type     = RTN_MULTICAST;
2023 	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
2024 	rtm->rtm_protocol = RTPROT_UNSPEC;
2025 	rtm->rtm_flags    = 0;
2026 
2027 	NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
2028 	NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
2029 
2030 	if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
2031 		goto nla_put_failure;
2032 
2033 	return nlmsg_end(skb, nlh);
2034 
2035 nla_put_failure:
2036 	nlmsg_cancel(skb, nlh);
2037 	return -EMSGSIZE;
2038 }
2039 
2040 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2041 {
2042 	struct net *net = sock_net(skb->sk);
2043 	struct mr_table *mrt;
2044 	struct mfc_cache *mfc;
2045 	unsigned int t = 0, s_t;
2046 	unsigned int h = 0, s_h;
2047 	unsigned int e = 0, s_e;
2048 
2049 	s_t = cb->args[0];
2050 	s_h = cb->args[1];
2051 	s_e = cb->args[2];
2052 
2053 	read_lock(&mrt_lock);
2054 	ipmr_for_each_table(mrt, net) {
2055 		if (t < s_t)
2056 			goto next_table;
2057 		if (t > s_t)
2058 			s_h = 0;
2059 		for (h = s_h; h < MFC_LINES; h++) {
2060 			list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
2061 				if (e < s_e)
2062 					goto next_entry;
2063 				if (ipmr_fill_mroute(mrt, skb,
2064 						     NETLINK_CB(cb->skb).pid,
2065 						     cb->nlh->nlmsg_seq,
2066 						     mfc) < 0)
2067 					goto done;
2068 next_entry:
2069 				e++;
2070 			}
2071 			e = s_e = 0;
2072 		}
2073 		s_h = 0;
2074 next_table:
2075 		t++;
2076 	}
2077 done:
2078 	read_unlock(&mrt_lock);
2079 
2080 	cb->args[2] = e;
2081 	cb->args[1] = h;
2082 	cb->args[0] = t;
2083 
2084 	return skb->len;
2085 }
2086 
2087 #ifdef CONFIG_PROC_FS
2088 /*
2089  *	The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
2090  */
2091 struct ipmr_vif_iter {
2092 	struct seq_net_private p;
2093 	struct mr_table *mrt;
2094 	int ct;
2095 };
2096 
2097 static struct vif_device *ipmr_vif_seq_idx(struct net *net,
2098 					   struct ipmr_vif_iter *iter,
2099 					   loff_t pos)
2100 {
2101 	struct mr_table *mrt = iter->mrt;
2102 
2103 	for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
2104 		if (!VIF_EXISTS(mrt, iter->ct))
2105 			continue;
2106 		if (pos-- == 0)
2107 			return &mrt->vif_table[iter->ct];
2108 	}
2109 	return NULL;
2110 }
2111 
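/*
 * The vif seq_file walk holds mrt_lock from ->start to ->stop so the
 * vif table cannot change underneath /proc/net/ip_mr_vif readers.
 */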
2112 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
2113 	__acquires(mrt_lock)
2114 {
2115 	struct ipmr_vif_iter *iter = seq->private;
2116 	struct net *net = seq_file_net(seq);
2117 	struct mr_table *mrt;
2118 
2119 	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2120 	if (mrt == NULL)
2121 		return ERR_PTR(-ENOENT);
2122 
2123 	iter->mrt = mrt;
2124 
2125 	read_lock(&mrt_lock);
2126 	return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
2127 		: SEQ_START_TOKEN;
2128 }
2129 
2130 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2131 {
2132 	struct ipmr_vif_iter *iter = seq->private;
2133 	struct net *net = seq_file_net(seq);
2134 	struct mr_table *mrt = iter->mrt;
2135 
2136 	++*pos;
2137 	if (v == SEQ_START_TOKEN)
2138 		return ipmr_vif_seq_idx(net, iter, 0);
2139 
2140 	while (++iter->ct < mrt->maxvif) {
2141 		if (!VIF_EXISTS(mrt, iter->ct))
2142 			continue;
2143 		return &mrt->vif_table[iter->ct];
2144 	}
2145 	return NULL;
2146 }
2147 
2148 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
2149 	__releases(mrt_lock)
2150 {
2151 	read_unlock(&mrt_lock);
2152 }
2153 
2154 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
2155 {
2156 	struct ipmr_vif_iter *iter = seq->private;
2157 	struct mr_table *mrt = iter->mrt;
2158 
2159 	if (v == SEQ_START_TOKEN) {
2160 		seq_puts(seq,
2161 			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
2162 	} else {
2163 		const struct vif_device *vif = v;
2164 		const char *name =  vif->dev ? vif->dev->name : "none";
2165 		const char *name = vif->dev ? vif->dev->name : "none";
2166 		seq_printf(seq,
2167 			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
2168 			   vif - mrt->vif_table,
2169 			   name, vif->bytes_in, vif->pkt_in,
2170 			   vif->bytes_out, vif->pkt_out,
2171 			   vif->flags, vif->local, vif->remote);
2172 	}
2173 	return 0;
2174 }
2175 
2176 static const struct seq_operations ipmr_vif_seq_ops = {
2177 	.start = ipmr_vif_seq_start,
2178 	.next  = ipmr_vif_seq_next,
2179 	.stop  = ipmr_vif_seq_stop,
2180 	.show  = ipmr_vif_seq_show,
2181 };
2182 
2183 static int ipmr_vif_open(struct inode *inode, struct file *file)
2184 {
2185 	return seq_open_net(inode, file, &ipmr_vif_seq_ops,
2186 			    sizeof(struct ipmr_vif_iter));
2187 }
2188 
2189 static const struct file_operations ipmr_vif_fops = {
2190 	.owner	 = THIS_MODULE,
2191 	.open    = ipmr_vif_open,
2192 	.read    = seq_read,
2193 	.llseek  = seq_lseek,
2194 	.release = seq_release_net,
2195 };
2196 
2197 struct ipmr_mfc_iter {
2198 	struct seq_net_private p;
2199 	struct mr_table *mrt;
2200 	struct list_head *cache;
2201 	int ct;
2202 };
2203 
2204 
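/*
 * The MFC walk visits the resolved cache first and then the unresolved
 * queue.  Note the lock hand-off: mrt_lock is held while it->cache
 * points into mfc_cache_array[] and mfc_unres_lock while it points at
 * mfc_unres_queue; ipmr_mfc_seq_stop() drops whichever lock matches
 * it->cache.
 */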
2205 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2206 					  struct ipmr_mfc_iter *it, loff_t pos)
2207 {
2208 	struct mr_table *mrt = it->mrt;
2209 	struct mfc_cache *mfc;
2210 
2211 	read_lock(&mrt_lock);
2212 	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
2213 		it->cache = &mrt->mfc_cache_array[it->ct];
2214 		list_for_each_entry(mfc, it->cache, list)
2215 			if (pos-- == 0)
2216 				return mfc;
2217 	}
2218 	read_unlock(&mrt_lock);
2219 
2220 	spin_lock_bh(&mfc_unres_lock);
2221 	it->cache = &mrt->mfc_unres_queue;
2222 	list_for_each_entry(mfc, it->cache, list)
2223 		if (pos-- == 0)
2224 			return mfc;
2225 	spin_unlock_bh(&mfc_unres_lock);
2226 
2227 	it->cache = NULL;
2228 	return NULL;
2229 }
2230 
2231 
2232 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
2233 {
2234 	struct ipmr_mfc_iter *it = seq->private;
2235 	struct net *net = seq_file_net(seq);
2236 	struct mr_table *mrt;
2237 
2238 	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2239 	if (mrt == NULL)
2240 		return ERR_PTR(-ENOENT);
2241 
2242 	it->mrt = mrt;
2243 	it->cache = NULL;
2244 	it->ct = 0;
2245 	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
2246 		: SEQ_START_TOKEN;
2247 }
2248 
2249 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2250 {
2251 	struct mfc_cache *mfc = v;
2252 	struct ipmr_mfc_iter *it = seq->private;
2253 	struct net *net = seq_file_net(seq);
2254 	struct mr_table *mrt = it->mrt;
2255 
2256 	++*pos;
2257 
2258 	if (v == SEQ_START_TOKEN)
2259 		return ipmr_mfc_seq_idx(net, seq->private, 0);
2260 
2261 	if (mfc->list.next != it->cache)
2262 		return list_entry(mfc->list.next, struct mfc_cache, list);
2263 
2264 	if (it->cache == &mrt->mfc_unres_queue)
2265 		goto end_of_list;
2266 
2267 	BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
2268 
2269 	while (++it->ct < MFC_LINES) {
2270 		it->cache = &mrt->mfc_cache_array[it->ct];
2271 		if (list_empty(it->cache))
2272 			continue;
2273 		return list_first_entry(it->cache, struct mfc_cache, list);
2274 	}
2275 
2276 	/* exhausted cache_array, show unresolved */
2277 	read_unlock(&mrt_lock);
2278 	it->cache = &mrt->mfc_unres_queue;
2279 	it->ct = 0;
2280 
2281 	spin_lock_bh(&mfc_unres_lock);
2282 	if (!list_empty(it->cache))
2283 		return list_first_entry(it->cache, struct mfc_cache, list);
2284 
2285  end_of_list:
2286 	spin_unlock_bh(&mfc_unres_lock);
2287 	it->cache = NULL;
2288 
2289 	return NULL;
2290 }
2291 
2292 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2293 {
2294 	struct ipmr_mfc_iter *it = seq->private;
2295 	struct mr_table *mrt = it->mrt;
2296 
2297 	if (it->cache == &mrt->mfc_unres_queue)
2298 		spin_unlock_bh(&mfc_unres_lock);
2299 	else if (it->cache == &mrt->mfc_cache_array[it->ct])
2300 		read_unlock(&mrt_lock);
2301 }
2302 
2303 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2304 {
2305 	int n;
2306 
2307 	if (v == SEQ_START_TOKEN) {
2308 		seq_puts(seq,
2309 		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
2310 	} else {
2311 		const struct mfc_cache *mfc = v;
2312 		const struct ipmr_mfc_iter *it = seq->private;
2313 		const struct mr_table *mrt = it->mrt;
2314 
2315 		seq_printf(seq, "%08X %08X %-3hd",
2316 			   (__force u32) mfc->mfc_mcastgrp,
2317 			   (__force u32) mfc->mfc_origin,
2318 			   mfc->mfc_parent);
2319 
2320 		if (it->cache != &mrt->mfc_unres_queue) {
2321 			seq_printf(seq, " %8lu %8lu %8lu",
2322 				   mfc->mfc_un.res.pkt,
2323 				   mfc->mfc_un.res.bytes,
2324 				   mfc->mfc_un.res.wrong_if);
2325 			for (n = mfc->mfc_un.res.minvif;
2326 			     n < mfc->mfc_un.res.maxvif; n++) {
2327 				if (VIF_EXISTS(mrt, n) &&
2328 				    mfc->mfc_un.res.ttls[n] < 255)
2329 					seq_printf(seq,
2330 					   " %2d:%-3d",
2331 					   n, mfc->mfc_un.res.ttls[n]);
2332 			}
2333 		} else {
2334 			/* unresolved mfc_caches don't contain
2335 			 * pkt, bytes and wrong_if values
2336 			 */
2337 			seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
2338 		}
2339 		seq_putc(seq, '\n');
2340 	}
2341 	return 0;
2342 }
2343 
2344 static const struct seq_operations ipmr_mfc_seq_ops = {
2345 	.start = ipmr_mfc_seq_start,
2346 	.next  = ipmr_mfc_seq_next,
2347 	.stop  = ipmr_mfc_seq_stop,
2348 	.show  = ipmr_mfc_seq_show,
2349 };
2350 
2351 static int ipmr_mfc_open(struct inode *inode, struct file *file)
2352 {
2353 	return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
2354 			    sizeof(struct ipmr_mfc_iter));
2355 }
2356 
2357 static const struct file_operations ipmr_mfc_fops = {
2358 	.owner	 = THIS_MODULE,
2359 	.open    = ipmr_mfc_open,
2360 	.read    = seq_read,
2361 	.llseek  = seq_lseek,
2362 	.release = seq_release_net,
2363 };
2364 #endif
2365 
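/*
 * PIMv2 register messages arrive as IP protocol IPPROTO_PIM; pim_rcv()
 * is hooked up via inet_add_protocol() in ip_mr_init() below.
 */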
2366 #ifdef CONFIG_IP_PIMSM_V2
2367 static const struct net_protocol pim_protocol = {
2368 	.handler	=	pim_rcv,
2369 	.netns_ok	=	1,
2370 };
2371 #endif
2372 
2373 
2374 /*
2375  *	Setup for IP multicast routing
2376  */
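/*
 * Per-namespace init: create the routing rules/tables and, when
 * CONFIG_PROC_FS is enabled, the /proc/net/ip_mr_vif and
 * /proc/net/ip_mr_cache entries; ipmr_net_exit() undoes this.
 */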
2377 static int __net_init ipmr_net_init(struct net *net)
2378 {
2379 	int err;
2380 
2381 	err = ipmr_rules_init(net);
2382 	if (err < 0)
2383 		goto fail;
2384 
2385 #ifdef CONFIG_PROC_FS
2386 	err = -ENOMEM;
2387 	if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
2388 		goto proc_vif_fail;
2389 	if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
2390 		goto proc_cache_fail;
2391 #endif
2392 	return 0;
2393 
2394 #ifdef CONFIG_PROC_FS
2395 proc_cache_fail:
2396 	proc_net_remove(net, "ip_mr_vif");
2397 proc_vif_fail:
2398 	ipmr_rules_exit(net);
2399 #endif
2400 fail:
2401 	return err;
2402 }
2403 
2404 static void __net_exit ipmr_net_exit(struct net *net)
2405 {
2406 #ifdef CONFIG_PROC_FS
2407 	proc_net_remove(net, "ip_mr_cache");
2408 	proc_net_remove(net, "ip_mr_vif");
2409 #endif
2410 	ipmr_rules_exit(net);
2411 }
2412 
2413 static struct pernet_operations ipmr_net_ops = {
2414 	.init = ipmr_net_init,
2415 	.exit = ipmr_net_exit,
2416 };
2417 
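/*
 * One-time setup at boot: allocate the mfc_cache slab, register the
 * per-namespace operations, the netdevice notifier, the optional PIM
 * protocol handler and the RTM_GETROUTE dump hook.  On failure
 * everything registered so far is torn down in reverse order.
 */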
2418 int __init ip_mr_init(void)
2419 {
2420 	int err;
2421 
2422 	mrt_cachep = kmem_cache_create("ip_mrt_cache",
2423 				       sizeof(struct mfc_cache),
2424 				       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2425 				       NULL);
2426 	if (!mrt_cachep)
2427 		return -ENOMEM;
2428 
2429 	err = register_pernet_subsys(&ipmr_net_ops);
2430 	if (err)
2431 		goto reg_pernet_fail;
2432 
2433 	err = register_netdevice_notifier(&ip_mr_notifier);
2434 	if (err)
2435 		goto reg_notif_fail;
2436 #ifdef CONFIG_IP_PIMSM_V2
2437 	if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
2438 		printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
2439 		err = -EAGAIN;
2440 		goto add_proto_fail;
2441 	}
2442 #endif
2443 	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
2444 	return 0;
2445 
2446 #ifdef CONFIG_IP_PIMSM_V2
2447 add_proto_fail:
2448 	unregister_netdevice_notifier(&ip_mr_notifier);
2449 #endif
2450 reg_notif_fail:
2451 	unregister_pernet_subsys(&ipmr_net_ops);
2452 reg_pernet_fail:
2453 	kmem_cache_destroy(mrt_cachep);
2454 	return err;
2455 }
2456