xref: /openbmc/linux/net/ipv4/route.c (revision 5a86bf34)
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	: 	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 * 		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
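
/*
 * A note on the defaults above (a sketch of the relationships, not new
 * behaviour): these tunables are exposed through the route sysctl table
 * later in this file, under /proc/sys/net/ipv4/route/.  For instance,
 * ip_rt_min_pmtu is 512 + 20 + 20 == 552 bytes: 512 bytes of payload
 * plus 20-byte IP and TCP headers, the classical floor used when
 * clamping learned path MTUs.
 */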

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
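
/*
 * A usage sketch: the table is indexed by the four TOS bits shifted right
 * by one, as the rt_tos2priority() helper in include/net/route.h does:
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos)>>1];
 *	}
 *
 * e.g. tos == 0x10 (IPTOS_LOWDELAY) gives index 8, i.e. TC_PRIO_INTERACTIVE.
 */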

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * picking an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct net *net = dev_net(dst->dev);
	struct inet_peer *peer;

	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
	if (peer) {
		iph->id = htons(inet_getid(peer, more));
		inet_putpeer(peer);
		return;
	}

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		rt_free(rt);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		rt_free(rt);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	u32 hval;

	hval = (__force u32) daddr;
	hval ^= (hval >> 11) ^ (hval >> 22);

	return hval & (FNHE_HASH_SIZE - 1);
}
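
/*
 * A worked example of the folding above (a sketch, assuming FNHE_HASH_SIZE
 * is 2048, i.e. an 11-bit index): for hval = 0xC0000201 (192.0.2.1 read as
 * a host-order u32 on a big-endian box),
 *
 *	hval >> 11 = 0x00180000,  hval >> 22 = 0x00000300
 *	hval ^= ...            -> 0xC0180501
 *	hval & 0x7FF           -> 0x501  (bucket 1281)
 *
 * so the high (network) bits of the address still influence the bucket
 * rather than being masked away.
 */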

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	unsigned int i;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = nh->nh_exceptions;
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		nh->nh_exceptions = hash;
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = max(1UL, expires);
		}
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;

		/* Exception created; mark the cached routes for the nexthop
		 * as stale, so that anyone caching them rechecks whether
		 * this exception applies.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
	return;
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (n) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, 0);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

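/*
 * A worked example of the backoff (a sketch, assuming HZ == 1000):
 * ip_rt_redirect_load is HZ/50 = 20 jiffies, so the n-th redirect to a
 * peer is suppressed until rate_last + (20 << n) jiffies have passed;
 * after ip_rt_redirect_number (9) ignored redirects we go silent, and
 * only start over after ip_rt_redirect_silence = (HZ/50) << 10 = 20480
 * jiffies (~20 s) without redirect-worthy packets from that peer.
 */
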
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (dst->dev->mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *dst;
	bool new = false;

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	rt = (struct rtable *) __sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !rt) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!__sk_dst_check(sk, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	dst = dst_check(&rt->dst, 0);
	if (!dst) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		__sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD by dst_free().
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
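
/*
 * Example: with no RTAX_ADVMSS metric set and a standard Ethernet device
 * (mtu == 1500), the advertised MSS is 1500 - 40 = 1460 bytes, i.e. the
 * MTU minus 20-byte IP and TCP headers, clamped between ip_rt_min_advmss
 * and 65535 - 40.
 */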

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	return min_t(unsigned int, mtu, IP_MAX_MTU);
}
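
/*
 * In short, the resolution order above is: a still-valid PMTU-exception
 * value first, then an explicit RTAX_MTU metric, then the device MTU
 * (clamped to 576 for locked-MTU routes that go via a gateway), never
 * exceeding IP_MAX_MTU.
 */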

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			return fnhe;
	}
	return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (!(rt->dst.flags & DST_NOCACHE)) {
			rcu_assign_pointer(*porig, rt);
			if (orig)
				rt_free(orig);
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig)
			rt_free(orig);
	} else
		ret = false;

	return ret;
}
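
/*
 * A sketch of the lock-free protocol above: cmpxchg() installs the new
 * route only if *p still holds the value we loaded; on success the
 * previous cached route (if any) is freed via RCU by rt_free().  On
 * failure the caller (rt_set_nexthop() below) marks the route
 * DST_NOCACHE and parks it on the uncached list instead.
 */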

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (!list_empty(&rt->rt_uncached)) {
		spin_lock_bh(&rt_uncached_lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&rt_uncached_lock);
	}
}

void rt_flush_dev(struct net_device *dev)
{
	if (!list_empty(&rt_uncached_list)) {
		struct net *net = dev_net(dev);
		struct rtable *rt;

		spin_lock_bh(&rt_uncached_lock);
		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&rt_uncached_lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			rt->dst.flags |= DST_NOCACHE;
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm, bool will_cache)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_genid	= rt_genid_ipv4(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_is_input= 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
		flags |= RTCF_DOREDIRECT;
		do_cache = false;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe != NULL)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);

		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif 	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for limited broadcast;
	 * I do not even know whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and to call it only once if daddr and/or saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  LOOPBACK_IFINDEX,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid_ipv4(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	if (do_cache) {
		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
			rth->dst.flags |= DST_NOCACHE;
			rt_add_uncached_list(rth);
		}
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	int res;

	rcu_read_lock();

	/* Multicast recognition logic was moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, a sort of
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_noref);

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use
		 * the default one, but do not use a gateway in this case.
		 * Yes, it is a hack.
		 */
1911 		if (fi && res->prefixlen < 4)
1912 			fi = NULL;
1913 	}
1914 
1915 	fnhe = NULL;
1916 	do_cache &= fi != NULL;
1917 	if (do_cache) {
1918 		struct rtable __rcu **prth;
1919 		struct fib_nh *nh = &FIB_RES_NH(*res);
1920 
1921 		fnhe = find_exception(nh, fl4->daddr);
1922 		if (fnhe)
1923 			prth = &fnhe->fnhe_rth_output;
1924 		else {
1925 			if (unlikely(fl4->flowi4_flags &
1926 				     FLOWI_FLAG_KNOWN_NH &&
1927 				     !(nh->nh_gw &&
1928 				       nh->nh_scope == RT_SCOPE_LINK))) {
1929 				do_cache = false;
1930 				goto add;
1931 			}
1932 			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1933 		}
1934 		rth = rcu_dereference(*prth);
1935 		if (rt_cache_valid(rth)) {
1936 			dst_hold(&rth->dst);
1937 			return rth;
1938 		}
1939 	}
1940 
1941 add:
1942 	rth = rt_dst_alloc(dev_out,
1943 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1944 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
1945 			   do_cache);
1946 	if (!rth)
1947 		return ERR_PTR(-ENOBUFS);
1948 
1949 	rth->dst.output = ip_output;
1950 
1951 	rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1952 	rth->rt_flags	= flags;
1953 	rth->rt_type	= type;
1954 	rth->rt_is_input = 0;
1955 	rth->rt_iif	= orig_oif ? : 0;
1956 	rth->rt_pmtu	= 0;
1957 	rth->rt_gateway = 0;
1958 	rth->rt_uses_gateway = 0;
1959 	INIT_LIST_HEAD(&rth->rt_uncached);
1960 
1961 	RT_CACHE_STAT_INC(out_slow_tot);
1962 
1963 	if (flags & RTCF_LOCAL)
1964 		rth->dst.input = ip_local_deliver;
1965 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1966 		if (flags & RTCF_LOCAL &&
1967 		    !(dev_out->flags & IFF_LOOPBACK)) {
1968 			rth->dst.output = ip_mc_output;
1969 			RT_CACHE_STAT_INC(out_slow_mc);
1970 		}
1971 #ifdef CONFIG_IP_MROUTE
1972 		if (type == RTN_MULTICAST) {
1973 			if (IN_DEV_MFORWARD(in_dev) &&
1974 			    !ipv4_is_local_multicast(fl4->daddr)) {
1975 				rth->dst.input = ip_mr_input;
1976 				rth->dst.output = ip_mc_output;
1977 			}
1978 		}
1979 #endif
1980 	}
1981 
1982 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1983 
1984 	return rth;
1985 }
1986 
1987 /*
1988  * Major route resolver routine.
1989  */
1990 
1991 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1992 {
1993 	struct net_device *dev_out = NULL;
1994 	__u8 tos = RT_FL_TOS(fl4);
1995 	unsigned int flags = 0;
1996 	struct fib_result res;
1997 	struct rtable *rth;
1998 	int orig_oif;
1999 
2000 	res.tclassid	= 0;
2001 	res.fi		= NULL;
2002 	res.table	= NULL;
2003 
2004 	orig_oif = fl4->flowi4_oif;
2005 
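	/* RT_FL_TOS() may carry the RTO_ONLINK flag in the tos byte:
	 * only the IPTOS_RT_MASK bits take part in the lookup, while
	 * RTO_ONLINK narrows the scope to the local link.
	 */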
2006 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2007 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2008 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2009 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2010 
2011 	rcu_read_lock();
2012 	if (fl4->saddr) {
2013 		rth = ERR_PTR(-EINVAL);
2014 		if (ipv4_is_multicast(fl4->saddr) ||
2015 		    ipv4_is_lbcast(fl4->saddr) ||
2016 		    ipv4_is_zeronet(fl4->saddr))
2017 			goto out;
2018 
2019 		/* I removed the check for oif == dev_out->oif here.
2020 		   It was wrong for two reasons:
2021 		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2022 		      is assigned to multiple interfaces.
2023 		   2. Moreover, we are allowed to send packets with the saddr
2024 		      of another iface. --ANK
2025 		 */
2026 
2027 		if (fl4->flowi4_oif == 0 &&
2028 		    (ipv4_is_multicast(fl4->daddr) ||
2029 		     ipv4_is_lbcast(fl4->daddr))) {
2030 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2031 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2032 			if (dev_out == NULL)
2033 				goto out;
2034 
2035 			/* Special hack: the user can direct multicasts
2036 			   and limited broadcast via the necessary interface
2037 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2038 			   This hack is not just for fun, it allows
2039 			   vic, vat and friends to work.
2040 			   They bind a socket to loopback, set the ttl to zero
2041 			   and expect that it will work.
2042 			   From the viewpoint of the routing cache they are broken,
2043 			   because we are not allowed to build a multicast path
2044 			   with a loopback source addr (look, the routing cache
2045 			   cannot know that the ttl is zero, so the packet
2046 			   will not leave this host and the route is valid).
2047 			   Luckily, this hack is a good workaround.
2048 			 */
2049 
2050 			fl4->flowi4_oif = dev_out->ifindex;
2051 			goto make_route;
2052 		}
2053 
2054 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2055 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2056 			if (!__ip_dev_find(net, fl4->saddr, false))
2057 				goto out;
2058 		}
2059 	}
2060 
2061 
2062 	if (fl4->flowi4_oif) {
2063 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2064 		rth = ERR_PTR(-ENODEV);
2065 		if (dev_out == NULL)
2066 			goto out;
2067 
2068 		/* RACE: Check return value of inet_select_addr instead. */
2069 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2070 			rth = ERR_PTR(-ENETUNREACH);
2071 			goto out;
2072 		}
2073 		if (ipv4_is_local_multicast(fl4->daddr) ||
2074 		    ipv4_is_lbcast(fl4->daddr)) {
2075 			if (!fl4->saddr)
2076 				fl4->saddr = inet_select_addr(dev_out, 0,
2077 							      RT_SCOPE_LINK);
2078 			goto make_route;
2079 		}
2080 		if (!fl4->saddr) {
2081 			if (ipv4_is_multicast(fl4->daddr))
2082 				fl4->saddr = inet_select_addr(dev_out, 0,
2083 							      fl4->flowi4_scope);
2084 			else if (!fl4->daddr)
2085 				fl4->saddr = inet_select_addr(dev_out, 0,
2086 							      RT_SCOPE_HOST);
2087 		}
2088 	}
2089 
2090 	if (!fl4->daddr) {
2091 		fl4->daddr = fl4->saddr;
2092 		if (!fl4->daddr)
2093 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2094 		dev_out = net->loopback_dev;
2095 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2096 		res.type = RTN_LOCAL;
2097 		flags |= RTCF_LOCAL;
2098 		goto make_route;
2099 	}
2100 
2101 	if (fib_lookup(net, fl4, &res)) {
2102 		res.fi = NULL;
2103 		res.table = NULL;
2104 		if (fl4->flowi4_oif) {
2105 			/* Apparently, the routing tables are wrong. Assume
2106 			   that the destination is on-link.
2107 
2108 			   WHY? DW.
2109 			   Because we are allowed to send to an iface
2110 			   even if it has NO routes and NO assigned
2111 			   addresses. When oif is specified, the routing
2112 			   tables are looked up with only one purpose:
2113 			   to catch whether the destination is gatewayed,
2114 			   rather than direct. Moreover, if MSG_DONTROUTE
2115 			   is set, we send the packet, ignoring both the
2116 			   routing tables and the ifaddr state. --ANK
2117 
2118 
2119 			   We could do this even if oif is unknown,
2120 			   as IPv6 likely does, but we do not.
2121 			 */
2122 
2123 			if (fl4->saddr == 0)
2124 				fl4->saddr = inet_select_addr(dev_out, 0,
2125 							      RT_SCOPE_LINK);
2126 			res.type = RTN_UNICAST;
2127 			goto make_route;
2128 		}
2129 		rth = ERR_PTR(-ENETUNREACH);
2130 		goto out;
2131 	}
2132 
2133 	if (res.type == RTN_LOCAL) {
2134 		if (!fl4->saddr) {
2135 			if (res.fi->fib_prefsrc)
2136 				fl4->saddr = res.fi->fib_prefsrc;
2137 			else
2138 				fl4->saddr = fl4->daddr;
2139 		}
2140 		dev_out = net->loopback_dev;
2141 		fl4->flowi4_oif = dev_out->ifindex;
2142 		flags |= RTCF_LOCAL;
2143 		goto make_route;
2144 	}
2145 
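	/* Several nexthops and no forced oif: balance across the
	 * nexthops.  Otherwise, if several default routes match a
	 * zero-length prefix, let fib_select_default() pick one.
	 */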
2146 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2147 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2148 		fib_select_multipath(&res);
2149 	else
2150 #endif
2151 	if (!res.prefixlen &&
2152 	    res.table->tb_num_default > 1 &&
2153 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2154 		fib_select_default(&res);
2155 
2156 	if (!fl4->saddr)
2157 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2158 
2159 	dev_out = FIB_RES_DEV(res);
2160 	fl4->flowi4_oif = dev_out->ifindex;
2161 
2162 
2163 make_route:
2164 	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2165 
2166 out:
2167 	rcu_read_unlock();
2168 	return rth;
2169 }
2170 EXPORT_SYMBOL_GPL(__ip_route_output_key);
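
/* Usage sketch (illustrative only): a caller fills in a flowi4 key and
 * asks the resolver for a route; daddr, saddr, tos and oif stand for
 * whatever the caller already knows about the flow.
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr = daddr;
 *	fl4.saddr = saddr;
 *	fl4.flowi4_tos = tos;
 *	fl4.flowi4_oif = oif;
 *	rt = __ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */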
2171 
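/* A "blackhole" dst looks like an ordinary route to its holder but
 * never revalidates (check returns NULL), ignores PMTU updates and
 * redirects, and never COWs metrics; see ipv4_blackhole_route() below.
 */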
2172 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2173 {
2174 	return NULL;
2175 }
2176 
2177 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2178 {
2179 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2180 
2181 	return mtu ? : dst->dev->mtu;
2182 }
2183 
2184 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2185 					  struct sk_buff *skb, u32 mtu)
2186 {
2187 }
2188 
2189 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2190 				       struct sk_buff *skb)
2191 {
2192 }
2193 
2194 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2195 					  unsigned long old)
2196 {
2197 	return NULL;
2198 }
2199 
2200 static struct dst_ops ipv4_dst_blackhole_ops = {
2201 	.family			=	AF_INET,
2202 	.protocol		=	cpu_to_be16(ETH_P_IP),
2203 	.check			=	ipv4_blackhole_dst_check,
2204 	.mtu			=	ipv4_blackhole_mtu,
2205 	.default_advmss		=	ipv4_default_advmss,
2206 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2207 	.redirect		=	ipv4_rt_blackhole_redirect,
2208 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2209 	.neigh_lookup		=	ipv4_neigh_lookup,
2210 };
2211 
2212 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2213 {
2214 	struct rtable *ort = (struct rtable *) dst_orig;
2215 	struct rtable *rt;
2216 
2217 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2218 	if (rt) {
2219 		struct dst_entry *new = &rt->dst;
2220 
2221 		new->__use = 1;
2222 		new->input = dst_discard;
2223 		new->output = dst_discard;
2224 
2225 		new->dev = ort->dst.dev;
2226 		if (new->dev)
2227 			dev_hold(new->dev);
2228 
2229 		rt->rt_is_input = ort->rt_is_input;
2230 		rt->rt_iif = ort->rt_iif;
2231 		rt->rt_pmtu = ort->rt_pmtu;
2232 
2233 		rt->rt_genid = rt_genid_ipv4(net);
2234 		rt->rt_flags = ort->rt_flags;
2235 		rt->rt_type = ort->rt_type;
2236 		rt->rt_gateway = ort->rt_gateway;
2237 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2238 
2239 		INIT_LIST_HEAD(&rt->rt_uncached);
2240 
2241 		dst_free(new);
2242 	}
2243 
2244 	dst_release(dst_orig);
2245 
2246 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2247 }
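
/* Usage sketch (illustrative only): xfrm_lookup() falls back to this
 * helper when a lookup may not block while IPsec SAs are still being
 * negotiated; the caller keeps a valid-looking dst whose output path
 * quietly discards packets:
 *
 *	dst = ipv4_blackhole_route(net, &rt->dst);
 *	if (!IS_ERR(dst))
 *		skb_dst_set(skb, dst);
 */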
2248 
2249 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2250 				    struct sock *sk)
2251 {
2252 	struct rtable *rt = __ip_route_output_key(net, flp4);
2253 
2254 	if (IS_ERR(rt))
2255 		return rt;
2256 
2257 	if (flp4->flowi4_proto)
2258 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2259 						   flowi4_to_flowi(flp4),
2260 						   sk, 0);
2261 
2262 	return rt;
2263 }
2264 EXPORT_SYMBOL_GPL(ip_route_output_flow);
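
/* Usage sketch (illustrative only): connection-oriented callers go
 * through this wrapper so that, for flows with a known protocol, any
 * matching xfrm policy is applied on top of the plain route:
 *
 *	fl4.flowi4_proto = IPPROTO_TCP;
 *	rt = ip_route_output_flow(net, &fl4, sk);
 */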
2265 
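/* Append an RTM_NEWROUTE message describing the route attached to
 * @skb (its dst entry) to @skb itself; inet_rtm_getroute() below uses
 * this to answer RTM_GETROUTE queries.
 */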
2266 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2267 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2268 			u32 seq, int event, int nowait, unsigned int flags)
2269 {
2270 	struct rtable *rt = skb_rtable(skb);
2271 	struct rtmsg *r;
2272 	struct nlmsghdr *nlh;
2273 	unsigned long expires = 0;
2274 	u32 error;
2275 	u32 metrics[RTAX_MAX];
2276 
2277 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2278 	if (nlh == NULL)
2279 		return -EMSGSIZE;
2280 
2281 	r = nlmsg_data(nlh);
2282 	r->rtm_family	 = AF_INET;
2283 	r->rtm_dst_len	= 32;
2284 	r->rtm_src_len	= 0;
2285 	r->rtm_tos	= fl4->flowi4_tos;
2286 	r->rtm_table	= RT_TABLE_MAIN;
2287 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2288 		goto nla_put_failure;
2289 	r->rtm_type	= rt->rt_type;
2290 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2291 	r->rtm_protocol = RTPROT_UNSPEC;
2292 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2293 	if (rt->rt_flags & RTCF_NOTIFY)
2294 		r->rtm_flags |= RTM_F_NOTIFY;
2295 
2296 	if (nla_put_be32(skb, RTA_DST, dst))
2297 		goto nla_put_failure;
2298 	if (src) {
2299 		r->rtm_src_len = 32;
2300 		if (nla_put_be32(skb, RTA_SRC, src))
2301 			goto nla_put_failure;
2302 	}
2303 	if (rt->dst.dev &&
2304 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2305 		goto nla_put_failure;
2306 #ifdef CONFIG_IP_ROUTE_CLASSID
2307 	if (rt->dst.tclassid &&
2308 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2309 		goto nla_put_failure;
2310 #endif
2311 	if (!rt_is_input_route(rt) &&
2312 	    fl4->saddr != src) {
2313 		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2314 			goto nla_put_failure;
2315 	}
2316 	if (rt->rt_uses_gateway &&
2317 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2318 		goto nla_put_failure;
2319 
2320 	expires = rt->dst.expires;
2321 	if (expires) {
2322 		unsigned long now = jiffies;
2323 
2324 		if (time_before(now, expires))
2325 			expires -= now;
2326 		else
2327 			expires = 0;
2328 	}
2329 
2330 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2331 	if (rt->rt_pmtu && expires)
2332 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2333 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2334 		goto nla_put_failure;
2335 
2336 	if (fl4->flowi4_mark &&
2337 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2338 		goto nla_put_failure;
2339 
2340 	error = rt->dst.error;
2341 
2342 	if (rt_is_input_route(rt)) {
2343 #ifdef CONFIG_IP_MROUTE
2344 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2345 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2346 			int err = ipmr_get_route(net, skb,
2347 						 fl4->saddr, fl4->daddr,
2348 						 r, nowait);
2349 			if (err <= 0) {
2350 				if (!nowait) {
2351 					if (err == 0)
2352 						return 0;
2353 					goto nla_put_failure;
2354 				} else {
2355 					if (err == -EMSGSIZE)
2356 						goto nla_put_failure;
2357 					error = err;
2358 				}
2359 			}
2360 		} else
2361 #endif
2362 			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2363 				goto nla_put_failure;
2364 	}
2365 
2366 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2367 		goto nla_put_failure;
2368 
2369 	return nlmsg_end(skb, nlh);
2370 
2371 nla_put_failure:
2372 	nlmsg_cancel(skb, nlh);
2373 	return -EMSGSIZE;
2374 }
2375 
2376 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2377 {
2378 	struct net *net = sock_net(in_skb->sk);
2379 	struct rtmsg *rtm;
2380 	struct nlattr *tb[RTA_MAX+1];
2381 	struct rtable *rt = NULL;
2382 	struct flowi4 fl4;
2383 	__be32 dst = 0;
2384 	__be32 src = 0;
2385 	u32 iif;
2386 	int err;
2387 	int mark;
2388 	struct sk_buff *skb;
2389 
2390 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2391 	if (err < 0)
2392 		goto errout;
2393 
2394 	rtm = nlmsg_data(nlh);
2395 
2396 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2397 	if (skb == NULL) {
2398 		err = -ENOBUFS;
2399 		goto errout;
2400 	}
2401 
2402 	/* Reserve room for dummy headers; this skb can pass
2403 	   through a good chunk of the routing engine.
2404 	 */
2405 	skb_reset_mac_header(skb);
2406 	skb_reset_network_header(skb);
2407 
2408 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2409 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2410 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2411 
2412 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2413 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2414 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2415 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2416 
2417 	memset(&fl4, 0, sizeof(fl4));
2418 	fl4.daddr = dst;
2419 	fl4.saddr = src;
2420 	fl4.flowi4_tos = rtm->rtm_tos;
2421 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2422 	fl4.flowi4_mark = mark;
2423 
2424 	if (iif) {
2425 		struct net_device *dev;
2426 
2427 		dev = __dev_get_by_index(net, iif);
2428 		if (dev == NULL) {
2429 			err = -ENODEV;
2430 			goto errout_free;
2431 		}
2432 
2433 		skb->protocol	= htons(ETH_P_IP);
2434 		skb->dev	= dev;
2435 		skb->mark	= mark;
2436 		local_bh_disable();
2437 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2438 		local_bh_enable();
2439 
2440 		rt = skb_rtable(skb);
2441 		if (err == 0 && rt->dst.error)
2442 			err = -rt->dst.error;
2443 	} else {
2444 		rt = ip_route_output_key(net, &fl4);
2445 
2446 		err = 0;
2447 		if (IS_ERR(rt))
2448 			err = PTR_ERR(rt);
2449 	}
2450 
2451 	if (err)
2452 		goto errout_free;
2453 
2454 	skb_dst_set(skb, &rt->dst);
2455 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2456 		rt->rt_flags |= RTCF_NOTIFY;
2457 
2458 	err = rt_fill_info(net, dst, src, &fl4, skb,
2459 			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2460 			   RTM_NEWROUTE, 0, 0);
2461 	if (err <= 0)
2462 		goto errout_free;
2463 
2464 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2465 errout:
2466 	return err;
2467 
2468 errout_free:
2469 	kfree_skb(skb);
2470 	goto errout;
2471 }
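
/* Userspace counterpart (an illustrative sketch, never compiled as part
 * of the kernel): roughly the request that "ip route get <dst>" sends
 * to the handler above.  The kernel answers with the RTM_NEWROUTE
 * message built by rt_fill_info().
 */
#if 0
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int route_get(__u32 dst_be32)
{
	struct {
		struct nlmsghdr	nlh;
		struct rtmsg	rtm;
		struct rtattr	rta;
		__u32		dst;
	} req;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = sizeof(req);
	req.nlh.nlmsg_type  = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family  = AF_INET;
	req.rta.rta_type    = RTA_DST;
	req.rta.rta_len     = RTA_LENGTH(sizeof(req.dst));
	req.dst             = dst_be32;	/* network byte order */

	if (send(fd, &req, sizeof(req), 0) < 0) {
		close(fd);
		return -1;
	}
	/* ... recv() and parse the RTM_NEWROUTE reply here ... */
	close(fd);
	return 0;
}
#endif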
2472 
2473 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2474 {
2475 	return skb->len;
2476 }
2477 
2478 void ip_rt_multicast_event(struct in_device *in_dev)
2479 {
2480 	rt_cache_flush(dev_net(in_dev->dev));
2481 }
2482 
2483 #ifdef CONFIG_SYSCTL
2484 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
2485 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2486 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2487 static int ip_rt_gc_elasticity __read_mostly	= 8;
2488 
2489 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2490 					void __user *buffer,
2491 					size_t *lenp, loff_t *ppos)
2492 {
2493 	struct net *net = (struct net *)__ctl->extra1;
2494 
2495 	if (write) {
2496 		rt_cache_flush(net);
2497 		fnhe_genid_bump(net);
2498 		return 0;
2499 	}
2500 
2501 	return -EINVAL;
2502 }
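
/* Administrative usage (illustrative): any write flushes the cache and
 * bumps the fnhe generation, invalidating cached exceptions:
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 */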
2503 
2504 static struct ctl_table ipv4_route_table[] = {
2505 	{
2506 		.procname	= "gc_thresh",
2507 		.data		= &ipv4_dst_ops.gc_thresh,
2508 		.maxlen		= sizeof(int),
2509 		.mode		= 0644,
2510 		.proc_handler	= proc_dointvec,
2511 	},
2512 	{
2513 		.procname	= "max_size",
2514 		.data		= &ip_rt_max_size,
2515 		.maxlen		= sizeof(int),
2516 		.mode		= 0644,
2517 		.proc_handler	= proc_dointvec,
2518 	},
2519 	{
2520 		/* Deprecated. Use gc_min_interval_ms. */
2521 
2522 		.procname	= "gc_min_interval",
2523 		.data		= &ip_rt_gc_min_interval,
2524 		.maxlen		= sizeof(int),
2525 		.mode		= 0644,
2526 		.proc_handler	= proc_dointvec_jiffies,
2527 	},
2528 	{
2529 		.procname	= "gc_min_interval_ms",
2530 		.data		= &ip_rt_gc_min_interval,
2531 		.maxlen		= sizeof(int),
2532 		.mode		= 0644,
2533 		.proc_handler	= proc_dointvec_ms_jiffies,
2534 	},
2535 	{
2536 		.procname	= "gc_timeout",
2537 		.data		= &ip_rt_gc_timeout,
2538 		.maxlen		= sizeof(int),
2539 		.mode		= 0644,
2540 		.proc_handler	= proc_dointvec_jiffies,
2541 	},
2542 	{
2543 		.procname	= "gc_interval",
2544 		.data		= &ip_rt_gc_interval,
2545 		.maxlen		= sizeof(int),
2546 		.mode		= 0644,
2547 		.proc_handler	= proc_dointvec_jiffies,
2548 	},
2549 	{
2550 		.procname	= "redirect_load",
2551 		.data		= &ip_rt_redirect_load,
2552 		.maxlen		= sizeof(int),
2553 		.mode		= 0644,
2554 		.proc_handler	= proc_dointvec,
2555 	},
2556 	{
2557 		.procname	= "redirect_number",
2558 		.data		= &ip_rt_redirect_number,
2559 		.maxlen		= sizeof(int),
2560 		.mode		= 0644,
2561 		.proc_handler	= proc_dointvec,
2562 	},
2563 	{
2564 		.procname	= "redirect_silence",
2565 		.data		= &ip_rt_redirect_silence,
2566 		.maxlen		= sizeof(int),
2567 		.mode		= 0644,
2568 		.proc_handler	= proc_dointvec,
2569 	},
2570 	{
2571 		.procname	= "error_cost",
2572 		.data		= &ip_rt_error_cost,
2573 		.maxlen		= sizeof(int),
2574 		.mode		= 0644,
2575 		.proc_handler	= proc_dointvec,
2576 	},
2577 	{
2578 		.procname	= "error_burst",
2579 		.data		= &ip_rt_error_burst,
2580 		.maxlen		= sizeof(int),
2581 		.mode		= 0644,
2582 		.proc_handler	= proc_dointvec,
2583 	},
2584 	{
2585 		.procname	= "gc_elasticity",
2586 		.data		= &ip_rt_gc_elasticity,
2587 		.maxlen		= sizeof(int),
2588 		.mode		= 0644,
2589 		.proc_handler	= proc_dointvec,
2590 	},
2591 	{
2592 		.procname	= "mtu_expires",
2593 		.data		= &ip_rt_mtu_expires,
2594 		.maxlen		= sizeof(int),
2595 		.mode		= 0644,
2596 		.proc_handler	= proc_dointvec_jiffies,
2597 	},
2598 	{
2599 		.procname	= "min_pmtu",
2600 		.data		= &ip_rt_min_pmtu,
2601 		.maxlen		= sizeof(int),
2602 		.mode		= 0644,
2603 		.proc_handler	= proc_dointvec,
2604 	},
2605 	{
2606 		.procname	= "min_adv_mss",
2607 		.data		= &ip_rt_min_advmss,
2608 		.maxlen		= sizeof(int),
2609 		.mode		= 0644,
2610 		.proc_handler	= proc_dointvec,
2611 	},
2612 	{ }
2613 };
2614 
2615 static struct ctl_table ipv4_route_flush_table[] = {
2616 	{
2617 		.procname	= "flush",
2618 		.maxlen		= sizeof(int),
2619 		.mode		= 0200,
2620 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2621 	},
2622 	{ },
2623 };
2624 
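/* The flush table is duplicated for every non-initial netns so that
 * each copy's extra1 can point back at its own struct net; in
 * unprivileged user namespaces the entry is hidden instead.
 */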
2625 static __net_init int sysctl_route_net_init(struct net *net)
2626 {
2627 	struct ctl_table *tbl;
2628 
2629 	tbl = ipv4_route_flush_table;
2630 	if (!net_eq(net, &init_net)) {
2631 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2632 		if (tbl == NULL)
2633 			goto err_dup;
2634 
2635 		/* Don't export sysctls to unprivileged users */
2636 		if (net->user_ns != &init_user_ns)
2637 			tbl[0].procname = NULL;
2638 	}
2639 	tbl[0].extra1 = net;
2640 
2641 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2642 	if (net->ipv4.route_hdr == NULL)
2643 		goto err_reg;
2644 	return 0;
2645 
2646 err_reg:
2647 	if (tbl != ipv4_route_flush_table)
2648 		kfree(tbl);
2649 err_dup:
2650 	return -ENOMEM;
2651 }
2652 
2653 static __net_exit void sysctl_route_net_exit(struct net *net)
2654 {
2655 	struct ctl_table *tbl;
2656 
2657 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2658 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2659 	BUG_ON(tbl == ipv4_route_flush_table);
2660 	kfree(tbl);
2661 }
2662 
2663 static __net_initdata struct pernet_operations sysctl_route_ops = {
2664 	.init = sysctl_route_net_init,
2665 	.exit = sysctl_route_net_exit,
2666 };
2667 #endif
2668 
2669 static __net_init int rt_genid_init(struct net *net)
2670 {
2671 	atomic_set(&net->ipv4.rt_genid, 0);
2672 	atomic_set(&net->fnhe_genid, 0);
2673 	get_random_bytes(&net->ipv4.dev_addr_genid,
2674 			 sizeof(net->ipv4.dev_addr_genid));
2675 	return 0;
2676 }
2677 
2678 static __net_initdata struct pernet_operations rt_genid_ops = {
2679 	.init = rt_genid_init,
2680 };
2681 
2682 static int __net_init ipv4_inetpeer_init(struct net *net)
2683 {
2684 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2685 
2686 	if (!bp)
2687 		return -ENOMEM;
2688 	inet_peer_base_init(bp);
2689 	net->ipv4.peers = bp;
2690 	return 0;
2691 }
2692 
2693 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2694 {
2695 	struct inet_peer_base *bp = net->ipv4.peers;
2696 
2697 	net->ipv4.peers = NULL;
2698 	inetpeer_invalidate_tree(bp);
2699 	kfree(bp);
2700 }
2701 
2702 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2703 	.init	=	ipv4_inetpeer_init,
2704 	.exit	=	ipv4_inetpeer_exit,
2705 };
2706 
2707 #ifdef CONFIG_IP_ROUTE_CLASSID
2708 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2709 #endif /* CONFIG_IP_ROUTE_CLASSID */
2710 
2711 int __init ip_rt_init(void)
2712 {
2713 	int rc = 0;
2714 
2715 #ifdef CONFIG_IP_ROUTE_CLASSID
2716 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2717 	if (!ip_rt_acct)
2718 		panic("IP: failed to allocate ip_rt_acct\n");
2719 #endif
2720 
2721 	ipv4_dst_ops.kmem_cachep =
2722 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2723 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2724 
2725 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2726 
2727 	if (dst_entries_init(&ipv4_dst_ops) < 0)
2728 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2729 
2730 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2731 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2732 
2733 	ipv4_dst_ops.gc_thresh = ~0;
2734 	ip_rt_max_size = INT_MAX;
2735 
2736 	devinet_init();
2737 	ip_fib_init();
2738 
2739 	if (ip_rt_proc_init())
2740 		pr_err("Unable to create route proc files\n");
2741 #ifdef CONFIG_XFRM
2742 	xfrm_init();
2743 	xfrm4_init();
2744 #endif
2745 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2746 
2747 #ifdef CONFIG_SYSCTL
2748 	register_pernet_subsys(&sysctl_route_ops);
2749 #endif
2750 	register_pernet_subsys(&rt_genid_ops);
2751 	register_pernet_subsys(&ipv4_inetpeer_ops);
2752 	return rc;
2753 }
2754 
2755 #ifdef CONFIG_SYSCTL
2756 /*
2757  * We really need to sanitize the damn ipv4 init order, then all
2758  * this nonsense will go away.
2759  */
2760 void __init ip_static_sysctl_init(void)
2761 {
2762 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2763 }
2764 #endif
2765