xref: /openbmc/linux/net/ipv4/route.c (revision 1c2f87c2)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111 
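/* Mask a flow's TOS down to the bits that matter for routing: the real
 * IPTOS_RT_MASK bits plus the legacy RTO_ONLINK flag, which requests an
 * on-link (RT_SCOPE_LINK) lookup; see __ip_route_output_key().
 */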
112 #define RT_FL_TOS(oldflp4) \
113 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114 
115 #define RT_GC_TIMEOUT (300*HZ)
116 
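/* Defaults for the ip_rt_* tunables exposed via sysctl; the time-based
 * values are in jiffies (HZ == one second).
 */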
117 static int ip_rt_max_size;
118 static int ip_rt_redirect_number __read_mostly	= 9;
119 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
120 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
121 static int ip_rt_error_cost __read_mostly	= HZ;
122 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
123 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
124 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
125 static int ip_rt_min_advmss __read_mostly	= 256;
126 
127 /*
128  *	Interface to generic destination cache.
129  */
130 
131 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
132 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
133 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
134 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
135 static void		 ipv4_link_failure(struct sk_buff *skb);
136 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
137 					   struct sk_buff *skb, u32 mtu);
138 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
139 					struct sk_buff *skb);
140 static void		ipv4_dst_destroy(struct dst_entry *dst);
141 
142 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
143 {
144 	WARN_ON(1);
145 	return NULL;
146 }
147 
148 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
149 					   struct sk_buff *skb,
150 					   const void *daddr);
151 
152 static struct dst_ops ipv4_dst_ops = {
153 	.family =		AF_INET,
154 	.protocol =		cpu_to_be16(ETH_P_IP),
155 	.check =		ipv4_dst_check,
156 	.default_advmss =	ipv4_default_advmss,
157 	.mtu =			ipv4_mtu,
158 	.cow_metrics =		ipv4_cow_metrics,
159 	.destroy =		ipv4_dst_destroy,
160 	.negative_advice =	ipv4_negative_advice,
161 	.link_failure =		ipv4_link_failure,
162 	.update_pmtu =		ip_rt_update_pmtu,
163 	.redirect =		ip_do_redirect,
164 	.local_out =		__ip_local_out,
165 	.neigh_lookup =		ipv4_neigh_lookup,
166 };
167 
168 #define ECN_OR_COST(class)	TC_PRIO_##class
169 
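/* Map the IPv4 TOS field to a traffic-control priority band.  The table is
 * indexed by the four TOS bits (IPTOS_TOS(tos) >> 1, see rt_tos2priority()),
 * and ECN_OR_COST() gives the ECN/MINCOST variant of each TOS class the
 * same band as the base class.
 */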
170 const __u8 ip_tos2prio[16] = {
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(BESTEFFORT),
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(BESTEFFORT),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK)
187 };
188 EXPORT_SYMBOL(ip_tos2prio);
189 
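/* Per-CPU route statistics, exported via /proc/net/stat/rt_cache. */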
190 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
191 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
192 
193 #ifdef CONFIG_PROC_FS
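/* The IPv4 routing cache itself is gone; /proc/net/rt_cache is kept for
 * compatibility and now only ever emits its header line.
 */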
194 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
195 {
196 	if (*pos)
197 		return NULL;
198 	return SEQ_START_TOKEN;
199 }
200 
201 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
202 {
203 	++*pos;
204 	return NULL;
205 }
206 
207 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
208 {
209 }
210 
211 static int rt_cache_seq_show(struct seq_file *seq, void *v)
212 {
213 	if (v == SEQ_START_TOKEN)
214 		seq_printf(seq, "%-127s\n",
215 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
216 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
217 			   "HHUptod\tSpecDst");
218 	return 0;
219 }
220 
221 static const struct seq_operations rt_cache_seq_ops = {
222 	.start  = rt_cache_seq_start,
223 	.next   = rt_cache_seq_next,
224 	.stop   = rt_cache_seq_stop,
225 	.show   = rt_cache_seq_show,
226 };
227 
228 static int rt_cache_seq_open(struct inode *inode, struct file *file)
229 {
230 	return seq_open(file, &rt_cache_seq_ops);
231 }
232 
233 static const struct file_operations rt_cache_seq_fops = {
234 	.owner	 = THIS_MODULE,
235 	.open	 = rt_cache_seq_open,
236 	.read	 = seq_read,
237 	.llseek	 = seq_lseek,
238 	.release = seq_release,
239 };
240 
241 
242 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
243 {
244 	int cpu;
245 
246 	if (*pos == 0)
247 		return SEQ_START_TOKEN;
248 
249 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
250 		if (!cpu_possible(cpu))
251 			continue;
252 		*pos = cpu+1;
253 		return &per_cpu(rt_cache_stat, cpu);
254 	}
255 	return NULL;
256 }
257 
258 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
259 {
260 	int cpu;
261 
262 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
263 		if (!cpu_possible(cpu))
264 			continue;
265 		*pos = cpu+1;
266 		return &per_cpu(rt_cache_stat, cpu);
267 	}
268 	return NULL;
269 
270 }
271 
272 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
273 {
274 
275 }
276 
277 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
278 {
279 	struct rt_cache_stat *st = v;
280 
281 	if (v == SEQ_START_TOKEN) {
282 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
283 		return 0;
284 	}
285 
286 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
287 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
288 		   dst_entries_get_slow(&ipv4_dst_ops),
289 		   0, /* st->in_hit */
290 		   st->in_slow_tot,
291 		   st->in_slow_mc,
292 		   st->in_no_route,
293 		   st->in_brd,
294 		   st->in_martian_dst,
295 		   st->in_martian_src,
296 
297 		   0, /* st->out_hit */
298 		   st->out_slow_tot,
299 		   st->out_slow_mc,
300 
301 		   0, /* st->gc_total */
302 		   0, /* st->gc_ignored */
303 		   0, /* st->gc_goal_miss */
304 		   0, /* st->gc_dst_overflow */
305 		   0, /* st->in_hlist_search */
306 		   0  /* st->out_hlist_search */
307 		);
308 	return 0;
309 }
310 
311 static const struct seq_operations rt_cpu_seq_ops = {
312 	.start  = rt_cpu_seq_start,
313 	.next   = rt_cpu_seq_next,
314 	.stop   = rt_cpu_seq_stop,
315 	.show   = rt_cpu_seq_show,
316 };
317 
318 
319 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
320 {
321 	return seq_open(file, &rt_cpu_seq_ops);
322 }
323 
324 static const struct file_operations rt_cpu_seq_fops = {
325 	.owner	 = THIS_MODULE,
326 	.open	 = rt_cpu_seq_open,
327 	.read	 = seq_read,
328 	.llseek	 = seq_lseek,
329 	.release = seq_release,
330 };
331 
332 #ifdef CONFIG_IP_ROUTE_CLASSID
333 static int rt_acct_proc_show(struct seq_file *m, void *v)
334 {
335 	struct ip_rt_acct *dst, *src;
336 	unsigned int i, j;
337 
338 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
339 	if (!dst)
340 		return -ENOMEM;
341 
342 	for_each_possible_cpu(i) {
343 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
344 		for (j = 0; j < 256; j++) {
345 			dst[j].o_bytes   += src[j].o_bytes;
346 			dst[j].o_packets += src[j].o_packets;
347 			dst[j].i_bytes   += src[j].i_bytes;
348 			dst[j].i_packets += src[j].i_packets;
349 		}
350 	}
351 
352 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
353 	kfree(dst);
354 	return 0;
355 }
356 
357 static int rt_acct_proc_open(struct inode *inode, struct file *file)
358 {
359 	return single_open(file, rt_acct_proc_show, NULL);
360 }
361 
362 static const struct file_operations rt_acct_proc_fops = {
363 	.owner		= THIS_MODULE,
364 	.open		= rt_acct_proc_open,
365 	.read		= seq_read,
366 	.llseek		= seq_lseek,
367 	.release	= single_release,
368 };
369 #endif
370 
371 static int __net_init ip_rt_do_proc_init(struct net *net)
372 {
373 	struct proc_dir_entry *pde;
374 
375 	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
376 			  &rt_cache_seq_fops);
377 	if (!pde)
378 		goto err1;
379 
380 	pde = proc_create("rt_cache", S_IRUGO,
381 			  net->proc_net_stat, &rt_cpu_seq_fops);
382 	if (!pde)
383 		goto err2;
384 
385 #ifdef CONFIG_IP_ROUTE_CLASSID
386 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
387 	if (!pde)
388 		goto err3;
389 #endif
390 	return 0;
391 
392 #ifdef CONFIG_IP_ROUTE_CLASSID
393 err3:
394 	remove_proc_entry("rt_cache", net->proc_net_stat);
395 #endif
396 err2:
397 	remove_proc_entry("rt_cache", net->proc_net);
398 err1:
399 	return -ENOMEM;
400 }
401 
402 static void __net_exit ip_rt_do_proc_exit(struct net *net)
403 {
404 	remove_proc_entry("rt_cache", net->proc_net_stat);
405 	remove_proc_entry("rt_cache", net->proc_net);
406 #ifdef CONFIG_IP_ROUTE_CLASSID
407 	remove_proc_entry("rt_acct", net->proc_net);
408 #endif
409 }
410 
411 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
412 	.init = ip_rt_do_proc_init,
413 	.exit = ip_rt_do_proc_exit,
414 };
415 
416 static int __init ip_rt_proc_init(void)
417 {
418 	return register_pernet_subsys(&ip_rt_proc_ops);
419 }
420 
421 #else
422 static inline int ip_rt_proc_init(void)
423 {
424 	return 0;
425 }
426 #endif /* CONFIG_PROC_FS */
427 
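/* Cached routes are invalidated by generation counting: rt_cache_flush()
 * bumps the per-namespace generation, and any route whose rt_genid no
 * longer matches is treated as expired.
 */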
428 static inline bool rt_is_expired(const struct rtable *rth)
429 {
430 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
431 }
432 
433 void rt_cache_flush(struct net *net)
434 {
435 	rt_genid_bump_ipv4(net);
436 }
437 
438 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
439 					   struct sk_buff *skb,
440 					   const void *daddr)
441 {
442 	struct net_device *dev = dst->dev;
443 	const __be32 *pkey = daddr;
444 	const struct rtable *rt;
445 	struct neighbour *n;
446 
447 	rt = (const struct rtable *) dst;
448 	if (rt->rt_gateway)
449 		pkey = (const __be32 *) &rt->rt_gateway;
450 	else if (skb)
451 		pkey = &ip_hdr(skb)->daddr;
452 
453 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
454 	if (n)
455 		return n;
456 	return neigh_create(&arp_tbl, pkey, dev);
457 }
458 
459 /*
460  * Peer allocation may fail only in serious out-of-memory conditions.  However,
461  * we can still generate some output.
462  * Random ID selection looks a bit dangerous because we have no way to
463  * guarantee that a selected ID stays unique for a reasonable period of time.
464  * But a broken packet identifier may be better than no packet at all.
465  */
466 static void ip_select_fb_ident(struct iphdr *iph)
467 {
468 	static DEFINE_SPINLOCK(ip_fb_id_lock);
469 	static u32 ip_fallback_id;
470 	u32 salt;
471 
472 	spin_lock_bh(&ip_fb_id_lock);
473 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
474 	iph->id = htons(salt & 0xFFFF);
475 	ip_fallback_id = salt;
476 	spin_unlock_bh(&ip_fb_id_lock);
477 }
478 
479 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
480 {
481 	struct net *net = dev_net(dst->dev);
482 	struct inet_peer *peer;
483 
484 	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
485 	if (peer) {
486 		iph->id = htons(inet_getid(peer, more));
487 		inet_putpeer(peer);
488 		return;
489 	}
490 
491 	ip_select_fb_ident(iph);
492 }
493 EXPORT_SYMBOL(__ip_select_ident);
494 
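/* Build an output flow key from an IPv4 header; when @sk is given, its
 * bound device, mark, TOS and protocol override the packet-derived values.
 */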
495 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
496 			     const struct iphdr *iph,
497 			     int oif, u8 tos,
498 			     u8 prot, u32 mark, int flow_flags)
499 {
500 	if (sk) {
501 		const struct inet_sock *inet = inet_sk(sk);
502 
503 		oif = sk->sk_bound_dev_if;
504 		mark = sk->sk_mark;
505 		tos = RT_CONN_FLAGS(sk);
506 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
507 	}
508 	flowi4_init_output(fl4, oif, mark, tos,
509 			   RT_SCOPE_UNIVERSE, prot,
510 			   flow_flags,
511 			   iph->daddr, iph->saddr, 0, 0);
512 }
513 
514 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
515 			       const struct sock *sk)
516 {
517 	const struct iphdr *iph = ip_hdr(skb);
518 	int oif = skb->dev->ifindex;
519 	u8 tos = RT_TOS(iph->tos);
520 	u8 prot = iph->protocol;
521 	u32 mark = skb->mark;
522 
523 	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
524 }
525 
526 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
527 {
528 	const struct inet_sock *inet = inet_sk(sk);
529 	const struct ip_options_rcu *inet_opt;
530 	__be32 daddr = inet->inet_daddr;
531 
532 	rcu_read_lock();
533 	inet_opt = rcu_dereference(inet->inet_opt);
534 	if (inet_opt && inet_opt->opt.srr)
535 		daddr = inet_opt->opt.faddr;
536 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
537 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
538 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
539 			   inet_sk_flowi_flags(sk),
540 			   daddr, inet->inet_saddr, 0, 0);
541 	rcu_read_unlock();
542 }
543 
544 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
545 				 const struct sk_buff *skb)
546 {
547 	if (skb)
548 		build_skb_flow_key(fl4, skb, sk);
549 	else
550 		build_sk_flow_key(fl4, sk);
551 }
552 
553 static inline void rt_free(struct rtable *rt)
554 {
555 	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
556 }
557 
558 static DEFINE_SPINLOCK(fnhe_lock);
559 
560 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
561 {
562 	struct rtable *rt;
563 
564 	rt = rcu_dereference(fnhe->fnhe_rth_input);
565 	if (rt) {
566 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
567 		rt_free(rt);
568 	}
569 	rt = rcu_dereference(fnhe->fnhe_rth_output);
570 	if (rt) {
571 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
572 		rt_free(rt);
573 	}
574 }
575 
576 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
577 {
578 	struct fib_nh_exception *fnhe, *oldest;
579 
580 	oldest = rcu_dereference(hash->chain);
581 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
582 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
583 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
584 			oldest = fnhe;
585 	}
586 	fnhe_flush_routes(oldest);
587 	return oldest;
588 }
589 
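/* Fold the destination address into FNHE_HASH_SIZE buckets by XORing
 * shifted copies of itself; cheap, and good enough for the small
 * per-nexthop exception tables.
 */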
590 static inline u32 fnhe_hashfun(__be32 daddr)
591 {
592 	u32 hval;
593 
594 	hval = (__force u32) daddr;
595 	hval ^= (hval >> 11) ^ (hval >> 22);
596 
597 	return hval & (FNHE_HASH_SIZE - 1);
598 }
599 
600 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
601 {
602 	rt->rt_pmtu = fnhe->fnhe_pmtu;
603 	rt->dst.expires = fnhe->fnhe_expires;
604 
605 	if (fnhe->fnhe_gw) {
606 		rt->rt_flags |= RTCF_REDIRECTED;
607 		rt->rt_gateway = fnhe->fnhe_gw;
608 		rt->rt_uses_gateway = 1;
609 	}
610 }
611 
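/* Record redirect and/or PMTU state learned for @daddr as an exception on
 * nexthop @nh, creating the hash table and entry on demand.  Chains deeper
 * than FNHE_RECLAIM_DEPTH recycle their stalest entry.  For an existing
 * entry the cached routes are updated in place; for a new one they are
 * marked DST_OBSOLETE_KILL so users re-validate against the exception.
 */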
612 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
613 				  u32 pmtu, unsigned long expires)
614 {
615 	struct fnhe_hash_bucket *hash;
616 	struct fib_nh_exception *fnhe;
617 	struct rtable *rt;
618 	unsigned int i;
619 	int depth;
620 	u32 hval = fnhe_hashfun(daddr);
621 
622 	spin_lock_bh(&fnhe_lock);
623 
624 	hash = nh->nh_exceptions;
625 	if (!hash) {
626 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
627 		if (!hash)
628 			goto out_unlock;
629 		nh->nh_exceptions = hash;
630 	}
631 
632 	hash += hval;
633 
634 	depth = 0;
635 	for (fnhe = rcu_dereference(hash->chain); fnhe;
636 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
637 		if (fnhe->fnhe_daddr == daddr)
638 			break;
639 		depth++;
640 	}
641 
642 	if (fnhe) {
643 		if (gw)
644 			fnhe->fnhe_gw = gw;
645 		if (pmtu) {
646 			fnhe->fnhe_pmtu = pmtu;
647 			fnhe->fnhe_expires = max(1UL, expires);
648 		}
649 		/* Update all cached dsts too */
650 		rt = rcu_dereference(fnhe->fnhe_rth_input);
651 		if (rt)
652 			fill_route_from_fnhe(rt, fnhe);
653 		rt = rcu_dereference(fnhe->fnhe_rth_output);
654 		if (rt)
655 			fill_route_from_fnhe(rt, fnhe);
656 	} else {
657 		if (depth > FNHE_RECLAIM_DEPTH)
658 			fnhe = fnhe_oldest(hash);
659 		else {
660 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
661 			if (!fnhe)
662 				goto out_unlock;
663 
664 			fnhe->fnhe_next = hash->chain;
665 			rcu_assign_pointer(hash->chain, fnhe);
666 		}
667 		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
668 		fnhe->fnhe_daddr = daddr;
669 		fnhe->fnhe_gw = gw;
670 		fnhe->fnhe_pmtu = pmtu;
671 		fnhe->fnhe_expires = expires;
672 
673 		/* Exception created; mark the cached routes for the nexthop
674 		 * stale, so that anyone caching them rechecks whether this
675 		 * exception applies.
676 		 */
677 		rt = rcu_dereference(nh->nh_rth_input);
678 		if (rt)
679 			rt->dst.obsolete = DST_OBSOLETE_KILL;
680 
681 		for_each_possible_cpu(i) {
682 			struct rtable __rcu **prt;
683 			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
684 			rt = rcu_dereference(*prt);
685 			if (rt)
686 				rt->dst.obsolete = DST_OBSOLETE_KILL;
687 		}
688 	}
689 
690 	fnhe->fnhe_stamp = jiffies;
691 
692 out_unlock:
693 	spin_unlock_bh(&fnhe_lock);
694 }
695 
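/* Process an ICMP redirect against @rt: sanity-check the advertised
 * gateway, record it as a nexthop exception, and optionally kill the
 * current route so the next lookup picks up the new gateway.
 */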
696 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
697 			     bool kill_route)
698 {
699 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
700 	__be32 old_gw = ip_hdr(skb)->saddr;
701 	struct net_device *dev = skb->dev;
702 	struct in_device *in_dev;
703 	struct fib_result res;
704 	struct neighbour *n;
705 	struct net *net;
706 
707 	switch (icmp_hdr(skb)->code & 7) {
708 	case ICMP_REDIR_NET:
709 	case ICMP_REDIR_NETTOS:
710 	case ICMP_REDIR_HOST:
711 	case ICMP_REDIR_HOSTTOS:
712 		break;
713 
714 	default:
715 		return;
716 	}
717 
718 	if (rt->rt_gateway != old_gw)
719 		return;
720 
721 	in_dev = __in_dev_get_rcu(dev);
722 	if (!in_dev)
723 		return;
724 
725 	net = dev_net(dev);
726 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
727 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
728 	    ipv4_is_zeronet(new_gw))
729 		goto reject_redirect;
730 
731 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
732 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
733 			goto reject_redirect;
734 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
735 			goto reject_redirect;
736 	} else {
737 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
738 			goto reject_redirect;
739 	}
740 
741 	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
742 	if (n) {
743 		if (!(n->nud_state & NUD_VALID)) {
744 			neigh_event_send(n, NULL);
745 		} else {
746 			if (fib_lookup(net, fl4, &res) == 0) {
747 				struct fib_nh *nh = &FIB_RES_NH(res);
748 
749 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
750 						      0, 0);
751 			}
752 			if (kill_route)
753 				rt->dst.obsolete = DST_OBSOLETE_KILL;
754 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
755 		}
756 		neigh_release(n);
757 	}
758 	return;
759 
760 reject_redirect:
761 #ifdef CONFIG_IP_ROUTE_VERBOSE
762 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
763 		const struct iphdr *iph = (const struct iphdr *) skb->data;
764 		__be32 daddr = iph->daddr;
765 		__be32 saddr = iph->saddr;
766 
767 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
768 				     "  Advised path = %pI4 -> %pI4\n",
769 				     &old_gw, dev->name, &new_gw,
770 				     &saddr, &daddr);
771 	}
772 #endif
773 	;
774 }
775 
776 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
777 {
778 	struct rtable *rt;
779 	struct flowi4 fl4;
780 	const struct iphdr *iph = (const struct iphdr *) skb->data;
781 	int oif = skb->dev->ifindex;
782 	u8 tos = RT_TOS(iph->tos);
783 	u8 prot = iph->protocol;
784 	u32 mark = skb->mark;
785 
786 	rt = (struct rtable *) dst;
787 
788 	__build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
789 	__ip_do_redirect(rt, skb, &fl4, true);
790 }
791 
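/* A socket is complaining about this cached route; drop it if it has been
 * obsoleted, redirected, or given an expiring PMTU, forcing a fresh lookup.
 */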
792 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
793 {
794 	struct rtable *rt = (struct rtable *)dst;
795 	struct dst_entry *ret = dst;
796 
797 	if (rt) {
798 		if (dst->obsolete > 0) {
799 			ip_rt_put(rt);
800 			ret = NULL;
801 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
802 			   rt->dst.expires) {
803 			ip_rt_put(rt);
804 			ret = NULL;
805 		}
806 	}
807 	return ret;
808 }
809 
810 /*
811  * Algorithm:
812  *	1. The first ip_rt_redirect_number redirects are sent
813  *	   with exponential backoff, then we stop sending them altogether,
814  *	   assuming that the host ignores our redirects.
815  *	2. If we did not see packets requiring redirects
816  *	   during ip_rt_redirect_silence, we assume that the host has
817  *	   forgotten the redirected route and start sending redirects again.
818  *
819  * This algorithm is much cheaper and more intelligent than dumb load limiting
820  * in icmp.c.
821  *
822  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
823  * and "frag. need" (breaks PMTU discovery) in icmp.c.
824  */
825 
826 void ip_rt_send_redirect(struct sk_buff *skb)
827 {
828 	struct rtable *rt = skb_rtable(skb);
829 	struct in_device *in_dev;
830 	struct inet_peer *peer;
831 	struct net *net;
832 	int log_martians;
833 
834 	rcu_read_lock();
835 	in_dev = __in_dev_get_rcu(rt->dst.dev);
836 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
837 		rcu_read_unlock();
838 		return;
839 	}
840 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
841 	rcu_read_unlock();
842 
843 	net = dev_net(rt->dst.dev);
844 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
845 	if (!peer) {
846 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
847 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
848 		return;
849 	}
850 
851 	/* No redirected packets during ip_rt_redirect_silence;
852 	 * reset the algorithm.
853 	 */
854 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
855 		peer->rate_tokens = 0;
856 
857 	/* Too many ignored redirects; do not send anything.
858 	 * Set peer->rate_last to the time of the last seen redirected packet.
859 	 */
860 	if (peer->rate_tokens >= ip_rt_redirect_number) {
861 		peer->rate_last = jiffies;
862 		goto out_put_peer;
863 	}
864 
865 	/* Check for load limit; set rate_last to the latest sent
866 	 * redirect.
867 	 */
868 	if (peer->rate_tokens == 0 ||
869 	    time_after(jiffies,
870 		       (peer->rate_last +
871 			(ip_rt_redirect_load << peer->rate_tokens)))) {
872 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
873 
874 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
875 		peer->rate_last = jiffies;
876 		++peer->rate_tokens;
877 #ifdef CONFIG_IP_ROUTE_VERBOSE
878 		if (log_martians &&
879 		    peer->rate_tokens == ip_rt_redirect_number)
880 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
881 					     &ip_hdr(skb)->saddr, inet_iif(skb),
882 					     &ip_hdr(skb)->daddr, &gw);
883 #endif
884 	}
885 out_put_peer:
886 	inet_putpeer(peer);
887 }
888 
889 static int ip_error(struct sk_buff *skb)
890 {
891 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
892 	struct rtable *rt = skb_rtable(skb);
893 	struct inet_peer *peer;
894 	unsigned long now;
895 	struct net *net;
896 	bool send;
897 	int code;
898 
899 	net = dev_net(rt->dst.dev);
900 	if (!IN_DEV_FORWARD(in_dev)) {
901 		switch (rt->dst.error) {
902 		case EHOSTUNREACH:
903 			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
904 			break;
905 
906 		case ENETUNREACH:
907 			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
908 			break;
909 		}
910 		goto out;
911 	}
912 
913 	switch (rt->dst.error) {
914 	case EINVAL:
915 	default:
916 		goto out;
917 	case EHOSTUNREACH:
918 		code = ICMP_HOST_UNREACH;
919 		break;
920 	case ENETUNREACH:
921 		code = ICMP_NET_UNREACH;
922 		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
923 		break;
924 	case EACCES:
925 		code = ICMP_PKT_FILTERED;
926 		break;
927 	}
928 
929 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
930 
931 	send = true;
932 	if (peer) {
933 		now = jiffies;
934 		peer->rate_tokens += now - peer->rate_last;
935 		if (peer->rate_tokens > ip_rt_error_burst)
936 			peer->rate_tokens = ip_rt_error_burst;
937 		peer->rate_last = now;
938 		if (peer->rate_tokens >= ip_rt_error_cost)
939 			peer->rate_tokens -= ip_rt_error_cost;
940 		else
941 			send = false;
942 		inet_putpeer(peer);
943 	}
944 	if (send)
945 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
946 
947 out:	kfree_skb(skb);
948 	return 0;
949 }
950 
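/* Lower the path MTU towards fl4->daddr, honouring a locked RTAX_MTU metric
 * and the ip_rt_min_pmtu floor, and remember the result as a nexthop
 * exception that expires after ip_rt_mtu_expires.
 */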
951 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
952 {
953 	struct dst_entry *dst = &rt->dst;
954 	struct fib_result res;
955 
956 	if (dst_metric_locked(dst, RTAX_MTU))
957 		return;
958 
959 	if (dst->dev->mtu < mtu)
960 		return;
961 
962 	if (mtu < ip_rt_min_pmtu)
963 		mtu = ip_rt_min_pmtu;
964 
965 	if (rt->rt_pmtu == mtu &&
966 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
967 		return;
968 
969 	rcu_read_lock();
970 	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
971 		struct fib_nh *nh = &FIB_RES_NH(res);
972 
973 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
974 				      jiffies + ip_rt_mtu_expires);
975 	}
976 	rcu_read_unlock();
977 }
978 
979 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
980 			      struct sk_buff *skb, u32 mtu)
981 {
982 	struct rtable *rt = (struct rtable *) dst;
983 	struct flowi4 fl4;
984 
985 	ip_rt_build_flow_key(&fl4, sk, skb);
986 	__ip_rt_update_pmtu(rt, &fl4, mtu);
987 }
988 
989 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
990 		      int oif, u32 mark, u8 protocol, int flow_flags)
991 {
992 	const struct iphdr *iph = (const struct iphdr *) skb->data;
993 	struct flowi4 fl4;
994 	struct rtable *rt;
995 
996 	__build_flow_key(&fl4, NULL, iph, oif,
997 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
998 	rt = __ip_route_output_key(net, &fl4);
999 	if (!IS_ERR(rt)) {
1000 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1001 		ip_rt_put(rt);
1002 	}
1003 }
1004 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1005 
1006 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1007 {
1008 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1009 	struct flowi4 fl4;
1010 	struct rtable *rt;
1011 
1012 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1013 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1014 	if (!IS_ERR(rt)) {
1015 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1016 		ip_rt_put(rt);
1017 	}
1018 }
1019 
1020 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1021 {
1022 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1023 	struct flowi4 fl4;
1024 	struct rtable *rt;
1025 	struct dst_entry *dst;
1026 	bool new = false;
1027 
1028 	bh_lock_sock(sk);
1029 
1030 	if (!ip_sk_accept_pmtu(sk))
1031 		goto out;
1032 
1033 	rt = (struct rtable *) __sk_dst_get(sk);
1034 
1035 	if (sock_owned_by_user(sk) || !rt) {
1036 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1037 		goto out;
1038 	}
1039 
1040 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1041 
1042 	if (!__sk_dst_check(sk, 0)) {
1043 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1044 		if (IS_ERR(rt))
1045 			goto out;
1046 
1047 		new = true;
1048 	}
1049 
1050 	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1051 
1052 	dst = dst_check(&rt->dst, 0);
1053 	if (!dst) {
1054 		if (new)
1055 			dst_release(&rt->dst);
1056 
1057 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1058 		if (IS_ERR(rt))
1059 			goto out;
1060 
1061 		new = true;
1062 	}
1063 
1064 	if (new)
1065 		__sk_dst_set(sk, &rt->dst);
1066 
1067 out:
1068 	bh_unlock_sock(sk);
1069 }
1070 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1071 
1072 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1073 		   int oif, u32 mark, u8 protocol, int flow_flags)
1074 {
1075 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1076 	struct flowi4 fl4;
1077 	struct rtable *rt;
1078 
1079 	__build_flow_key(&fl4, NULL, iph, oif,
1080 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1081 	rt = __ip_route_output_key(net, &fl4);
1082 	if (!IS_ERR(rt)) {
1083 		__ip_do_redirect(rt, skb, &fl4, false);
1084 		ip_rt_put(rt);
1085 	}
1086 }
1087 EXPORT_SYMBOL_GPL(ipv4_redirect);
1088 
1089 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1090 {
1091 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1092 	struct flowi4 fl4;
1093 	struct rtable *rt;
1094 
1095 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1096 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1097 	if (!IS_ERR(rt)) {
1098 		__ip_do_redirect(rt, skb, &fl4, false);
1099 		ip_rt_put(rt);
1100 	}
1101 }
1102 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1103 
1104 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1105 {
1106 	struct rtable *rt = (struct rtable *) dst;
1107 
1108 	/* All IPv4 dsts are created with ->obsolete set to the value
1109 	 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
1110 	 * down into this function.
1111 	 *
1112 	 * When a PMTU/redirect information update invalidates a route,
1113 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1114 	 * DST_OBSOLETE_DEAD by dst_free().
1115 	 */
1116 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1117 		return NULL;
1118 	return dst;
1119 }
1120 
1121 static void ipv4_link_failure(struct sk_buff *skb)
1122 {
1123 	struct rtable *rt;
1124 
1125 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1126 
1127 	rt = skb_rtable(skb);
1128 	if (rt)
1129 		dst_set_expires(&rt->dst, 0);
1130 }
1131 
1132 static int ip_rt_bug(struct sk_buff *skb)
1133 {
1134 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1135 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1136 		 skb->dev ? skb->dev->name : "?");
1137 	kfree_skb(skb);
1138 	WARN_ON(1);
1139 	return 0;
1140 }
1141 
1142 /*
1143    We do not cache the source address of the outgoing interface,
1144    because it is used only by the IP RR, TS and SRR options,
1145    so it is out of the fast path.
1146 
1147    BTW remember: "addr" is allowed to be unaligned
1148    in IP options!
1149  */
1150 
1151 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1152 {
1153 	__be32 src;
1154 
1155 	if (rt_is_output_route(rt))
1156 		src = ip_hdr(skb)->saddr;
1157 	else {
1158 		struct fib_result res;
1159 		struct flowi4 fl4;
1160 		struct iphdr *iph;
1161 
1162 		iph = ip_hdr(skb);
1163 
1164 		memset(&fl4, 0, sizeof(fl4));
1165 		fl4.daddr = iph->daddr;
1166 		fl4.saddr = iph->saddr;
1167 		fl4.flowi4_tos = RT_TOS(iph->tos);
1168 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1169 		fl4.flowi4_iif = skb->dev->ifindex;
1170 		fl4.flowi4_mark = skb->mark;
1171 
1172 		rcu_read_lock();
1173 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1174 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1175 		else
1176 			src = inet_select_addr(rt->dst.dev,
1177 					       rt_nexthop(rt, iph->daddr),
1178 					       RT_SCOPE_UNIVERSE);
1179 		rcu_read_unlock();
1180 	}
1181 	memcpy(addr, &src, 4);
1182 }
1183 
1184 #ifdef CONFIG_IP_ROUTE_CLASSID
1185 static void set_class_tag(struct rtable *rt, u32 tag)
1186 {
1187 	if (!(rt->dst.tclassid & 0xFFFF))
1188 		rt->dst.tclassid |= tag & 0xFFFF;
1189 	if (!(rt->dst.tclassid & 0xFFFF0000))
1190 		rt->dst.tclassid |= tag & 0xFFFF0000;
1191 }
1192 #endif
1193 
1194 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1195 {
1196 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1197 
1198 	if (advmss == 0) {
1199 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1200 			       ip_rt_min_advmss);
1201 		if (advmss > 65535 - 40)
1202 			advmss = 65535 - 40;
1203 	}
1204 	return advmss;
1205 }
1206 
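/* Effective path MTU: a still-valid learned rt_pmtu wins, then the RTAX_MTU
 * metric; otherwise fall back to the device MTU, clamped to 576 for
 * locked-MTU routes that use a gateway and to IP_MAX_MTU overall.
 */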
1207 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1208 {
1209 	const struct rtable *rt = (const struct rtable *) dst;
1210 	unsigned int mtu = rt->rt_pmtu;
1211 
1212 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1213 		mtu = dst_metric_raw(dst, RTAX_MTU);
1214 
1215 	if (mtu)
1216 		return mtu;
1217 
1218 	mtu = dst->dev->mtu;
1219 
1220 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1221 		if (rt->rt_uses_gateway && mtu > 576)
1222 			mtu = 576;
1223 	}
1224 
1225 	return min_t(unsigned int, mtu, IP_MAX_MTU);
1226 }
1227 
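/* Look up the nexthop exception matching @daddr, if any; called under RCU. */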
1228 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1229 {
1230 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1231 	struct fib_nh_exception *fnhe;
1232 	u32 hval;
1233 
1234 	if (!hash)
1235 		return NULL;
1236 
1237 	hval = fnhe_hashfun(daddr);
1238 
1239 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1240 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1241 		if (fnhe->fnhe_daddr == daddr)
1242 			return fnhe;
1243 	}
1244 	return NULL;
1245 }
1246 
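/* Bind @rt to a matching nexthop exception: copy the exception's PMTU and
 * gateway into the route and, unless the route is uncacheable, store it in
 * the exception's input or output slot.  Returns true if the route was
 * cached.  Serialized by fnhe_lock.
 */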
1247 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1248 			      __be32 daddr)
1249 {
1250 	bool ret = false;
1251 
1252 	spin_lock_bh(&fnhe_lock);
1253 
1254 	if (daddr == fnhe->fnhe_daddr) {
1255 		struct rtable __rcu **porig;
1256 		struct rtable *orig;
1257 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1258 
1259 		if (rt_is_input_route(rt))
1260 			porig = &fnhe->fnhe_rth_input;
1261 		else
1262 			porig = &fnhe->fnhe_rth_output;
1263 		orig = rcu_dereference(*porig);
1264 
1265 		if (fnhe->fnhe_genid != genid) {
1266 			fnhe->fnhe_genid = genid;
1267 			fnhe->fnhe_gw = 0;
1268 			fnhe->fnhe_pmtu = 0;
1269 			fnhe->fnhe_expires = 0;
1270 			fnhe_flush_routes(fnhe);
1271 			orig = NULL;
1272 		}
1273 		fill_route_from_fnhe(rt, fnhe);
1274 		if (!rt->rt_gateway)
1275 			rt->rt_gateway = daddr;
1276 
1277 		if (!(rt->dst.flags & DST_NOCACHE)) {
1278 			rcu_assign_pointer(*porig, rt);
1279 			if (orig)
1280 				rt_free(orig);
1281 			ret = true;
1282 		}
1283 
1284 		fnhe->fnhe_stamp = jiffies;
1285 	}
1286 	spin_unlock_bh(&fnhe_lock);
1287 
1288 	return ret;
1289 }
1290 
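/* Try to install @rt as the cached route of @nh: the shared input slot for
 * input routes, this CPU's output slot otherwise.  The lockless cmpxchg()
 * fails if another CPU replaced the slot first, and we return false so the
 * caller marks the route uncacheable.
 */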
1291 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1292 {
1293 	struct rtable *orig, *prev, **p;
1294 	bool ret = true;
1295 
1296 	if (rt_is_input_route(rt)) {
1297 		p = (struct rtable **)&nh->nh_rth_input;
1298 	} else {
1299 		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1300 	}
1301 	orig = *p;
1302 
1303 	prev = cmpxchg(p, orig, rt);
1304 	if (prev == orig) {
1305 		if (orig)
1306 			rt_free(orig);
1307 	} else
1308 		ret = false;
1309 
1310 	return ret;
1311 }
1312 
1313 static DEFINE_SPINLOCK(rt_uncached_lock);
1314 static LIST_HEAD(rt_uncached_list);
1315 
1316 static void rt_add_uncached_list(struct rtable *rt)
1317 {
1318 	spin_lock_bh(&rt_uncached_lock);
1319 	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1320 	spin_unlock_bh(&rt_uncached_lock);
1321 }
1322 
1323 static void ipv4_dst_destroy(struct dst_entry *dst)
1324 {
1325 	struct rtable *rt = (struct rtable *) dst;
1326 
1327 	if (!list_empty(&rt->rt_uncached)) {
1328 		spin_lock_bh(&rt_uncached_lock);
1329 		list_del(&rt->rt_uncached);
1330 		spin_unlock_bh(&rt_uncached_lock);
1331 	}
1332 }
1333 
1334 void rt_flush_dev(struct net_device *dev)
1335 {
1336 	if (!list_empty(&rt_uncached_list)) {
1337 		struct net *net = dev_net(dev);
1338 		struct rtable *rt;
1339 
1340 		spin_lock_bh(&rt_uncached_lock);
1341 		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1342 			if (rt->dst.dev != dev)
1343 				continue;
1344 			rt->dst.dev = net->loopback_dev;
1345 			dev_hold(rt->dst.dev);
1346 			dev_put(dev);
1347 		}
1348 		spin_unlock_bh(&rt_uncached_lock);
1349 	}
1350 }
1351 
1352 static bool rt_cache_valid(const struct rtable *rt)
1353 {
1354 	return	rt &&
1355 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1356 		!rt_is_expired(rt);
1357 }
1358 
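/* Finish initializing @rt from a FIB lookup result: inherit the gateway and
 * metrics from the nexthop, then try to cache the route in the matching
 * exception or nexthop slot; if that fails the route is flagged DST_NOCACHE
 * and tracked on rt_uncached_list instead.
 */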
1359 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1360 			   const struct fib_result *res,
1361 			   struct fib_nh_exception *fnhe,
1362 			   struct fib_info *fi, u16 type, u32 itag)
1363 {
1364 	bool cached = false;
1365 
1366 	if (fi) {
1367 		struct fib_nh *nh = &FIB_RES_NH(*res);
1368 
1369 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1370 			rt->rt_gateway = nh->nh_gw;
1371 			rt->rt_uses_gateway = 1;
1372 		}
1373 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1374 #ifdef CONFIG_IP_ROUTE_CLASSID
1375 		rt->dst.tclassid = nh->nh_tclassid;
1376 #endif
1377 		if (unlikely(fnhe))
1378 			cached = rt_bind_exception(rt, fnhe, daddr);
1379 		else if (!(rt->dst.flags & DST_NOCACHE))
1380 			cached = rt_cache_route(nh, rt);
1381 		if (unlikely(!cached)) {
1382 			/* Routes we intend to cache in nexthop exception or
1383 			 * FIB nexthop have the DST_NOCACHE bit clear.
1384 			 * However, if we are unsuccessful at storing this
1385 			 * route into the cache we really need to set it.
1386 			 */
1387 			rt->dst.flags |= DST_NOCACHE;
1388 			if (!rt->rt_gateway)
1389 				rt->rt_gateway = daddr;
1390 			rt_add_uncached_list(rt);
1391 		}
1392 	} else
1393 		rt_add_uncached_list(rt);
1394 
1395 #ifdef CONFIG_IP_ROUTE_CLASSID
1396 #ifdef CONFIG_IP_MULTIPLE_TABLES
1397 	set_class_tag(rt, res->tclassid);
1398 #endif
1399 	set_class_tag(rt, itag);
1400 #endif
1401 }
1402 
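/* Allocate a dst for an IPv4 route.  Routes we intend to cache are created
 * without DST_HOST/DST_NOCACHE so they can later live in a nexthop or
 * exception slot; see rt_set_nexthop().
 */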
1403 static struct rtable *rt_dst_alloc(struct net_device *dev,
1404 				   bool nopolicy, bool noxfrm, bool will_cache)
1405 {
1406 	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1407 			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1408 			 (nopolicy ? DST_NOPOLICY : 0) |
1409 			 (noxfrm ? DST_NOXFRM : 0));
1410 }
1411 
1412 /* called in rcu_read_lock() section */
1413 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1414 				u8 tos, struct net_device *dev, int our)
1415 {
1416 	struct rtable *rth;
1417 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1418 	u32 itag = 0;
1419 	int err;
1420 
1421 	/* Primary sanity checks. */
1422 
1423 	if (in_dev == NULL)
1424 		return -EINVAL;
1425 
1426 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1427 	    skb->protocol != htons(ETH_P_IP))
1428 		goto e_inval;
1429 
1430 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1431 		if (ipv4_is_loopback(saddr))
1432 			goto e_inval;
1433 
1434 	if (ipv4_is_zeronet(saddr)) {
1435 		if (!ipv4_is_local_multicast(daddr))
1436 			goto e_inval;
1437 	} else {
1438 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1439 					  in_dev, &itag);
1440 		if (err < 0)
1441 			goto e_err;
1442 	}
1443 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1444 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1445 	if (!rth)
1446 		goto e_nobufs;
1447 
1448 #ifdef CONFIG_IP_ROUTE_CLASSID
1449 	rth->dst.tclassid = itag;
1450 #endif
1451 	rth->dst.output = ip_rt_bug;
1452 
1453 	rth->rt_genid	= rt_genid_ipv4(dev_net(dev));
1454 	rth->rt_flags	= RTCF_MULTICAST;
1455 	rth->rt_type	= RTN_MULTICAST;
1456 	rth->rt_is_input= 1;
1457 	rth->rt_iif	= 0;
1458 	rth->rt_pmtu	= 0;
1459 	rth->rt_gateway	= 0;
1460 	rth->rt_uses_gateway = 0;
1461 	INIT_LIST_HEAD(&rth->rt_uncached);
1462 	if (our) {
1463 		rth->dst.input= ip_local_deliver;
1464 		rth->rt_flags |= RTCF_LOCAL;
1465 	}
1466 
1467 #ifdef CONFIG_IP_MROUTE
1468 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1469 		rth->dst.input = ip_mr_input;
1470 #endif
1471 	RT_CACHE_STAT_INC(in_slow_mc);
1472 
1473 	skb_dst_set(skb, &rth->dst);
1474 	return 0;
1475 
1476 e_nobufs:
1477 	return -ENOBUFS;
1478 e_inval:
1479 	return -EINVAL;
1480 e_err:
1481 	return err;
1482 }
1483 
1484 
1485 static void ip_handle_martian_source(struct net_device *dev,
1486 				     struct in_device *in_dev,
1487 				     struct sk_buff *skb,
1488 				     __be32 daddr,
1489 				     __be32 saddr)
1490 {
1491 	RT_CACHE_STAT_INC(in_martian_src);
1492 #ifdef CONFIG_IP_ROUTE_VERBOSE
1493 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1494 		/*
1495 		 *	RFC 1812 recommendation: if the source is martian,
1496 		 *	the only hint we have is the MAC header.
1497 		 */
1498 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1499 			&daddr, &saddr, dev->name);
1500 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1501 			print_hex_dump(KERN_WARNING, "ll header: ",
1502 				       DUMP_PREFIX_OFFSET, 16, 1,
1503 				       skb_mac_header(skb),
1504 				       dev->hard_header_len, true);
1505 		}
1506 	}
1507 #endif
1508 }
1509 
1510 /* called in rcu_read_lock() section */
1511 static int __mkroute_input(struct sk_buff *skb,
1512 			   const struct fib_result *res,
1513 			   struct in_device *in_dev,
1514 			   __be32 daddr, __be32 saddr, u32 tos)
1515 {
1516 	struct fib_nh_exception *fnhe;
1517 	struct rtable *rth;
1518 	int err;
1519 	struct in_device *out_dev;
1520 	unsigned int flags = 0;
1521 	bool do_cache;
1522 	u32 itag;
1523 
1524 	/* look up the output device's in_device (RCU-protected; no reference is taken) */
1525 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1526 	if (out_dev == NULL) {
1527 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1528 		return -EINVAL;
1529 	}
1530 
1531 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1532 				  in_dev->dev, in_dev, &itag);
1533 	if (err < 0) {
1534 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1535 					 saddr);
1536 
1537 		goto cleanup;
1538 	}
1539 
1540 	do_cache = res->fi && !itag;
1541 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1542 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1543 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1544 		flags |= RTCF_DOREDIRECT;
1545 		do_cache = false;
1546 	}
1547 
1548 	if (skb->protocol != htons(ETH_P_IP)) {
1549 		/* Not IP (i.e. ARP). Do not create a route if it is
1550 		 * invalid for proxy ARP. DNAT routes are always valid.
1551 		 *
1552 		 * The proxy ARP feature has been extended to allow ARP
1553 		 * replies back on the same interface, to support
1554 		 * private VLAN switch technologies. See arp.c.
1555 		 */
1556 		if (out_dev == in_dev &&
1557 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1558 			err = -EINVAL;
1559 			goto cleanup;
1560 		}
1561 	}
1562 
1563 	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1564 	if (do_cache) {
1565 		if (fnhe != NULL)
1566 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1567 		else
1568 			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1569 
1570 		if (rt_cache_valid(rth)) {
1571 			skb_dst_set_noref(skb, &rth->dst);
1572 			goto out;
1573 		}
1574 	}
1575 
1576 	rth = rt_dst_alloc(out_dev->dev,
1577 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1578 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1579 	if (!rth) {
1580 		err = -ENOBUFS;
1581 		goto cleanup;
1582 	}
1583 
1584 	rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1585 	rth->rt_flags = flags;
1586 	rth->rt_type = res->type;
1587 	rth->rt_is_input = 1;
1588 	rth->rt_iif 	= 0;
1589 	rth->rt_pmtu	= 0;
1590 	rth->rt_gateway	= 0;
1591 	rth->rt_uses_gateway = 0;
1592 	INIT_LIST_HEAD(&rth->rt_uncached);
1593 	RT_CACHE_STAT_INC(in_slow_tot);
1594 
1595 	rth->dst.input = ip_forward;
1596 	rth->dst.output = ip_output;
1597 
1598 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1599 	skb_dst_set(skb, &rth->dst);
1600 out:
1601 	err = 0;
1602  cleanup:
1603 	return err;
1604 }
1605 
1606 static int ip_mkroute_input(struct sk_buff *skb,
1607 			    struct fib_result *res,
1608 			    const struct flowi4 *fl4,
1609 			    struct in_device *in_dev,
1610 			    __be32 daddr, __be32 saddr, u32 tos)
1611 {
1612 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1613 	if (res->fi && res->fi->fib_nhs > 1)
1614 		fib_select_multipath(res);
1615 #endif
1616 
1617 	/* create a routing cache entry */
1618 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1619 }
1620 
1621 /*
1622  *	NOTE. We drop all packets that have a local source
1623  *	address, because every properly looped-back packet
1624  *	must already have the correct destination attached by the output routine.
1625  *
1626  *	This approach solves two big problems:
1627  *	1. Non-simplex devices are handled properly.
1628  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1629  *	Called with rcu_read_lock().
1630  */
1631 
1632 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1633 			       u8 tos, struct net_device *dev)
1634 {
1635 	struct fib_result res;
1636 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1637 	struct flowi4	fl4;
1638 	unsigned int	flags = 0;
1639 	u32		itag = 0;
1640 	struct rtable	*rth;
1641 	int		err = -EINVAL;
1642 	struct net    *net = dev_net(dev);
1643 	bool do_cache;
1644 
1645 	/* IP on this device is disabled. */
1646 
1647 	if (!in_dev)
1648 		goto out;
1649 
1650 	/* Check for the weirdest martians, which may not be detected
1651 	   by fib_lookup.
1652 	 */
1653 
1654 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1655 		goto martian_source;
1656 
1657 	res.fi = NULL;
1658 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1659 		goto brd_input;
1660 
1661 	/* Accept zero addresses only for limited broadcast;
1662 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1663 	 */
1664 	if (ipv4_is_zeronet(saddr))
1665 		goto martian_source;
1666 
1667 	if (ipv4_is_zeronet(daddr))
1668 		goto martian_destination;
1669 
1670 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1671 	 * more than once, and only when daddr and/or saddr is a loopback address.
1672 	 */
1673 	if (ipv4_is_loopback(daddr)) {
1674 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1675 			goto martian_destination;
1676 	} else if (ipv4_is_loopback(saddr)) {
1677 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1678 			goto martian_source;
1679 	}
1680 
1681 	/*
1682 	 *	Now we are ready to route the packet.
1683 	 */
1684 	fl4.flowi4_oif = 0;
1685 	fl4.flowi4_iif = dev->ifindex;
1686 	fl4.flowi4_mark = skb->mark;
1687 	fl4.flowi4_tos = tos;
1688 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1689 	fl4.daddr = daddr;
1690 	fl4.saddr = saddr;
1691 	err = fib_lookup(net, &fl4, &res);
1692 	if (err != 0) {
1693 		if (!IN_DEV_FORWARD(in_dev))
1694 			err = -EHOSTUNREACH;
1695 		goto no_route;
1696 	}
1697 
1698 	if (res.type == RTN_BROADCAST)
1699 		goto brd_input;
1700 
1701 	if (res.type == RTN_LOCAL) {
1702 		err = fib_validate_source(skb, saddr, daddr, tos,
1703 					  LOOPBACK_IFINDEX,
1704 					  dev, in_dev, &itag);
1705 		if (err < 0)
1706 			goto martian_source_keep_err;
1707 		goto local_input;
1708 	}
1709 
1710 	if (!IN_DEV_FORWARD(in_dev)) {
1711 		err = -EHOSTUNREACH;
1712 		goto no_route;
1713 	}
1714 	if (res.type != RTN_UNICAST)
1715 		goto martian_destination;
1716 
1717 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1718 out:	return err;
1719 
1720 brd_input:
1721 	if (skb->protocol != htons(ETH_P_IP))
1722 		goto e_inval;
1723 
1724 	if (!ipv4_is_zeronet(saddr)) {
1725 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1726 					  in_dev, &itag);
1727 		if (err < 0)
1728 			goto martian_source_keep_err;
1729 	}
1730 	flags |= RTCF_BROADCAST;
1731 	res.type = RTN_BROADCAST;
1732 	RT_CACHE_STAT_INC(in_brd);
1733 
1734 local_input:
1735 	do_cache = false;
1736 	if (res.fi) {
1737 		if (!itag) {
1738 			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1739 			if (rt_cache_valid(rth)) {
1740 				skb_dst_set_noref(skb, &rth->dst);
1741 				err = 0;
1742 				goto out;
1743 			}
1744 			do_cache = true;
1745 		}
1746 	}
1747 
1748 	rth = rt_dst_alloc(net->loopback_dev,
1749 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1750 	if (!rth)
1751 		goto e_nobufs;
1752 
1753 	rth->dst.input= ip_local_deliver;
1754 	rth->dst.output= ip_rt_bug;
1755 #ifdef CONFIG_IP_ROUTE_CLASSID
1756 	rth->dst.tclassid = itag;
1757 #endif
1758 
1759 	rth->rt_genid = rt_genid_ipv4(net);
1760 	rth->rt_flags 	= flags|RTCF_LOCAL;
1761 	rth->rt_type	= res.type;
1762 	rth->rt_is_input = 1;
1763 	rth->rt_iif	= 0;
1764 	rth->rt_pmtu	= 0;
1765 	rth->rt_gateway	= 0;
1766 	rth->rt_uses_gateway = 0;
1767 	INIT_LIST_HEAD(&rth->rt_uncached);
1768 	RT_CACHE_STAT_INC(in_slow_tot);
1769 	if (res.type == RTN_UNREACHABLE) {
1770 		rth->dst.input= ip_error;
1771 		rth->dst.error= -err;
1772 		rth->rt_flags 	&= ~RTCF_LOCAL;
1773 	}
1774 	if (do_cache) {
1775 		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1776 			rth->dst.flags |= DST_NOCACHE;
1777 			rt_add_uncached_list(rth);
1778 		}
1779 	}
1780 	skb_dst_set(skb, &rth->dst);
1781 	err = 0;
1782 	goto out;
1783 
1784 no_route:
1785 	RT_CACHE_STAT_INC(in_no_route);
1786 	res.type = RTN_UNREACHABLE;
1787 	if (err == -ESRCH)
1788 		err = -ENETUNREACH;
1789 	goto local_input;
1790 
1791 	/*
1792 	 *	Do not cache martian addresses: they should be logged (RFC1812)
1793 	 */
1794 martian_destination:
1795 	RT_CACHE_STAT_INC(in_martian_dst);
1796 #ifdef CONFIG_IP_ROUTE_VERBOSE
1797 	if (IN_DEV_LOG_MARTIANS(in_dev))
1798 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1799 				     &daddr, &saddr, dev->name);
1800 #endif
1801 
1802 e_inval:
1803 	err = -EINVAL;
1804 	goto out;
1805 
1806 e_nobufs:
1807 	err = -ENOBUFS;
1808 	goto out;
1809 
1810 martian_source:
1811 	err = -EINVAL;
1812 martian_source_keep_err:
1813 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1814 	goto out;
1815 }
1816 
1817 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1818 			 u8 tos, struct net_device *dev)
1819 {
1820 	int res;
1821 
1822 	rcu_read_lock();
1823 
1824 	/* Multicast recognition logic was moved from the route cache to here.
1825 	   The problem was that too many Ethernet cards have broken/missing
1826 	   hardware multicast filters :-( As a result, a host on a multicast
1827 	   network acquires a lot of useless route cache entries, e.g. for
1828 	   SDR messages from all over the world. Now we try to get rid of them.
1829 	   Really, provided the software IP multicast filter is organized
1830 	   reasonably (at least, hashed), it does not result in a slowdown
1831 	   compared with route cache reject entries.
1832 	   Note that multicast routers are not affected, because a
1833 	   route cache entry is created eventually.
1834 	 */
1835 	if (ipv4_is_multicast(daddr)) {
1836 		struct in_device *in_dev = __in_dev_get_rcu(dev);
1837 
1838 		if (in_dev) {
1839 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1840 						  ip_hdr(skb)->protocol);
1841 			if (our
1842 #ifdef CONFIG_IP_MROUTE
1843 				||
1844 			    (!ipv4_is_local_multicast(daddr) &&
1845 			     IN_DEV_MFORWARD(in_dev))
1846 #endif
1847 			   ) {
1848 				int res = ip_route_input_mc(skb, daddr, saddr,
1849 							    tos, dev, our);
1850 				rcu_read_unlock();
1851 				return res;
1852 			}
1853 		}
1854 		rcu_read_unlock();
1855 		return -EINVAL;
1856 	}
1857 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1858 	rcu_read_unlock();
1859 	return res;
1860 }
1861 EXPORT_SYMBOL(ip_route_input_noref);
1862 
1863 /* called with rcu_read_lock() */
1864 static struct rtable *__mkroute_output(const struct fib_result *res,
1865 				       const struct flowi4 *fl4, int orig_oif,
1866 				       struct net_device *dev_out,
1867 				       unsigned int flags)
1868 {
1869 	struct fib_info *fi = res->fi;
1870 	struct fib_nh_exception *fnhe;
1871 	struct in_device *in_dev;
1872 	u16 type = res->type;
1873 	struct rtable *rth;
1874 	bool do_cache;
1875 
1876 	in_dev = __in_dev_get_rcu(dev_out);
1877 	if (!in_dev)
1878 		return ERR_PTR(-EINVAL);
1879 
1880 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1881 		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1882 			return ERR_PTR(-EINVAL);
1883 
1884 	if (ipv4_is_lbcast(fl4->daddr))
1885 		type = RTN_BROADCAST;
1886 	else if (ipv4_is_multicast(fl4->daddr))
1887 		type = RTN_MULTICAST;
1888 	else if (ipv4_is_zeronet(fl4->daddr))
1889 		return ERR_PTR(-EINVAL);
1890 
1891 	if (dev_out->flags & IFF_LOOPBACK)
1892 		flags |= RTCF_LOCAL;
1893 
1894 	do_cache = true;
1895 	if (type == RTN_BROADCAST) {
1896 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
1897 		fi = NULL;
1898 	} else if (type == RTN_MULTICAST) {
1899 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
1900 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1901 				     fl4->flowi4_proto))
1902 			flags &= ~RTCF_LOCAL;
1903 		else
1904 			do_cache = false;
1905 		/* If a multicast route does not exist, use
1906 		 * the default one, but do not use a gateway in this case.
1907 		 * Yes, it is a hack.
1908 		 */
1909 		if (fi && res->prefixlen < 4)
1910 			fi = NULL;
1911 	}
1912 
1913 	fnhe = NULL;
1914 	do_cache &= fi != NULL;
1915 	if (do_cache) {
1916 		struct rtable __rcu **prth;
1917 		struct fib_nh *nh = &FIB_RES_NH(*res);
1918 
1919 		fnhe = find_exception(nh, fl4->daddr);
1920 		if (fnhe)
1921 			prth = &fnhe->fnhe_rth_output;
1922 		else {
1923 			if (unlikely(fl4->flowi4_flags &
1924 				     FLOWI_FLAG_KNOWN_NH &&
1925 				     !(nh->nh_gw &&
1926 				       nh->nh_scope == RT_SCOPE_LINK))) {
1927 				do_cache = false;
1928 				goto add;
1929 			}
1930 			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1931 		}
1932 		rth = rcu_dereference(*prth);
1933 		if (rt_cache_valid(rth)) {
1934 			dst_hold(&rth->dst);
1935 			return rth;
1936 		}
1937 	}
1938 
1939 add:
1940 	rth = rt_dst_alloc(dev_out,
1941 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1942 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
1943 			   do_cache);
1944 	if (!rth)
1945 		return ERR_PTR(-ENOBUFS);
1946 
1947 	rth->dst.output = ip_output;
1948 
1949 	rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1950 	rth->rt_flags	= flags;
1951 	rth->rt_type	= type;
1952 	rth->rt_is_input = 0;
1953 	rth->rt_iif	= orig_oif ? : 0;
1954 	rth->rt_pmtu	= 0;
1955 	rth->rt_gateway = 0;
1956 	rth->rt_uses_gateway = 0;
1957 	INIT_LIST_HEAD(&rth->rt_uncached);
1958 
1959 	RT_CACHE_STAT_INC(out_slow_tot);
1960 
1961 	if (flags & RTCF_LOCAL)
1962 		rth->dst.input = ip_local_deliver;
1963 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1964 		if (flags & RTCF_LOCAL &&
1965 		    !(dev_out->flags & IFF_LOOPBACK)) {
1966 			rth->dst.output = ip_mc_output;
1967 			RT_CACHE_STAT_INC(out_slow_mc);
1968 		}
1969 #ifdef CONFIG_IP_MROUTE
1970 		if (type == RTN_MULTICAST) {
1971 			if (IN_DEV_MFORWARD(in_dev) &&
1972 			    !ipv4_is_local_multicast(fl4->daddr)) {
1973 				rth->dst.input = ip_mr_input;
1974 				rth->dst.output = ip_mc_output;
1975 			}
1976 		}
1977 #endif
1978 	}
1979 
1980 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1981 
1982 	return rth;
1983 }
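/* Caching policy implemented by __mkroute_output() above: a usable
 * route is taken from a matching nexthop exception (fnhe_rth_output)
 * if find_exception() returns one for fl4->daddr, otherwise from the
 * per-cpu cache (nh_pcpu_rth_output).  Caching is skipped entirely for
 * broadcast (fi is cleared), for multicast that passes the local
 * membership filter, and for FLOWI_FLAG_KNOWN_NH flows whose nexthop
 * is not an on-link gateway; those cases allocate a one-off rtable at
 * the "add:" label instead.
 */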
1984 
1985 /*
1986  * Major route resolver routine.
1987  */
1988 
1989 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1990 {
1991 	struct net_device *dev_out = NULL;
1992 	__u8 tos = RT_FL_TOS(fl4);
1993 	unsigned int flags = 0;
1994 	struct fib_result res;
1995 	struct rtable *rth;
1996 	int orig_oif;
1997 
1998 	res.tclassid	= 0;
1999 	res.fi		= NULL;
2000 	res.table	= NULL;
2001 
2002 	orig_oif = fl4->flowi4_oif;
2003 
2004 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2005 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2006 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2007 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2008 
2009 	rcu_read_lock();
2010 	if (fl4->saddr) {
2011 		rth = ERR_PTR(-EINVAL);
2012 		if (ipv4_is_multicast(fl4->saddr) ||
2013 		    ipv4_is_lbcast(fl4->saddr) ||
2014 		    ipv4_is_zeronet(fl4->saddr))
2015 			goto out;
2016 
2017 		/* I removed the check for oif == dev_out->oif here.
2018 		   It was wrong for two reasons:
2019 		   1. ip_dev_find(net, saddr) can return the wrong iface if
2020 		      saddr is assigned to multiple interfaces.
2021 		   2. Moreover, we are allowed to send packets with the saddr
2022 		      of another iface. --ANK
2023 		 */
2024 
2025 		if (fl4->flowi4_oif == 0 &&
2026 		    (ipv4_is_multicast(fl4->daddr) ||
2027 		     ipv4_is_lbcast(fl4->daddr))) {
2028 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2029 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2030 			if (dev_out == NULL)
2031 				goto out;
2032 
2033 			/* Special hack: the user can direct multicasts and
2034 			   limited broadcast via the necessary interface
2035 			   without fiddling with IP_MULTICAST_IF or
2036 			   IP_PKTINFO.  This hack is not just for fun; it
2037 			   allows vic, vat and friends to work.  They bind a
2038 			   socket to loopback, set the ttl to zero and expect
2039 			   that it will work.  From the viewpoint of the
2040 			   routing cache they are broken, because we are not
2041 			   allowed to build a multicast path with a loopback
2042 			   source addr (the routing cache cannot know that
2043 			   the ttl is zero, so the packet will never leave
2044 			   this host and the route would be valid).  Luckily,
2045 			   this hack is a good workaround; a hedged userspace
2046 			   sketch follows this function. */
2047 
2048 			fl4->flowi4_oif = dev_out->ifindex;
2049 			goto make_route;
2050 		}
2051 
2052 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2053 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2054 			if (!__ip_dev_find(net, fl4->saddr, false))
2055 				goto out;
2056 		}
2057 	}
2058 
2059 
2060 	if (fl4->flowi4_oif) {
2061 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2062 		rth = ERR_PTR(-ENODEV);
2063 		if (dev_out == NULL)
2064 			goto out;
2065 
2066 		/* RACE: Check return value of inet_select_addr instead. */
2067 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2068 			rth = ERR_PTR(-ENETUNREACH);
2069 			goto out;
2070 		}
2071 		if (ipv4_is_local_multicast(fl4->daddr) ||
2072 		    ipv4_is_lbcast(fl4->daddr)) {
2073 			if (!fl4->saddr)
2074 				fl4->saddr = inet_select_addr(dev_out, 0,
2075 							      RT_SCOPE_LINK);
2076 			goto make_route;
2077 		}
2078 		if (!fl4->saddr) {
2079 			if (ipv4_is_multicast(fl4->daddr))
2080 				fl4->saddr = inet_select_addr(dev_out, 0,
2081 							      fl4->flowi4_scope);
2082 			else if (!fl4->daddr)
2083 				fl4->saddr = inet_select_addr(dev_out, 0,
2084 							      RT_SCOPE_HOST);
2085 		}
2086 	}
2087 
2088 	if (!fl4->daddr) {
2089 		fl4->daddr = fl4->saddr;
2090 		if (!fl4->daddr)
2091 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2092 		dev_out = net->loopback_dev;
2093 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2094 		res.type = RTN_LOCAL;
2095 		flags |= RTCF_LOCAL;
2096 		goto make_route;
2097 	}
2098 
2099 	if (fib_lookup(net, fl4, &res)) {
2100 		res.fi = NULL;
2101 		res.table = NULL;
2102 		if (fl4->flowi4_oif) {
2103 			/* Apparently, the routing tables are wrong.
2104 			   Assume that the destination is on-link.
2105 
2106 			   WHY? DW.
2107 			   Because we are allowed to send to an iface
2108 			   even if it has NO routes and NO assigned
2109 			   addresses. When oif is specified, the routing
2110 			   tables are looked up with only one purpose:
2111 			   to catch whether the destination is gatewayed
2112 			   rather than direct. Moreover, if MSG_DONTROUTE
2113 			   is set, we send the packet, ignoring both the
2114 			   routing tables and the ifaddr state. --ANK
2115 
2116 
2117 			   We could do this even if oif is unknown
2118 			   (as IPv6 likely does), but we do not.
2119 			 */
2120 
2121 			if (fl4->saddr == 0)
2122 				fl4->saddr = inet_select_addr(dev_out, 0,
2123 							      RT_SCOPE_LINK);
2124 			res.type = RTN_UNICAST;
2125 			goto make_route;
2126 		}
2127 		rth = ERR_PTR(-ENETUNREACH);
2128 		goto out;
2129 	}
2130 
2131 	if (res.type == RTN_LOCAL) {
2132 		if (!fl4->saddr) {
2133 			if (res.fi->fib_prefsrc)
2134 				fl4->saddr = res.fi->fib_prefsrc;
2135 			else
2136 				fl4->saddr = fl4->daddr;
2137 		}
2138 		dev_out = net->loopback_dev;
2139 		fl4->flowi4_oif = dev_out->ifindex;
2140 		flags |= RTCF_LOCAL;
2141 		goto make_route;
2142 	}
2143 
2144 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2145 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2146 		fib_select_multipath(&res);
2147 	else
2148 #endif
2149 	if (!res.prefixlen &&
2150 	    res.table->tb_num_default > 1 &&
2151 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2152 		fib_select_default(&res);
2153 
2154 	if (!fl4->saddr)
2155 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2156 
2157 	dev_out = FIB_RES_DEV(res);
2158 	fl4->flowi4_oif = dev_out->ifindex;
2159 
2160 
2161 make_route:
2162 	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2163 
2164 out:
2165 	rcu_read_unlock();
2166 	return rth;
2167 }
2168 EXPORT_SYMBOL_GPL(__ip_route_output_key);
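/* A hedged userspace sketch of the "special hack" documented inside
 * __ip_route_output_key() above: with no oif and a multicast (or
 * limited-broadcast) destination, binding the socket to an address of
 * the desired interface is enough to steer the packet out of that
 * interface, without IP_MULTICAST_IF or IP_PKTINFO.  The function name
 * and addresses are placeholders; error handling is elided.
 *
 *	#include <arpa/inet.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int send_mcast_via_saddr(void)
 *	{
 *		struct sockaddr_in src = { .sin_family = AF_INET };
 *		struct sockaddr_in dst = { .sin_family = AF_INET,
 *					   .sin_port = htons(5004) };
 *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *		inet_pton(AF_INET, "192.0.2.1", &src.sin_addr);
 *		inet_pton(AF_INET, "224.2.127.254", &dst.sin_addr);
 *		bind(fd, (struct sockaddr *)&src, sizeof(src));
 *		sendto(fd, "x", 1, 0, (struct sockaddr *)&dst, sizeof(dst));
 *		return close(fd);
 *	}
 */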
2169 
2170 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2171 {
2172 	return NULL;
2173 }
2174 
2175 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2176 {
2177 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2178 
2179 	return mtu ? : dst->dev->mtu;
2180 }
2181 
2182 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2183 					  struct sk_buff *skb, u32 mtu)
2184 {
2185 }
2186 
2187 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2188 				       struct sk_buff *skb)
2189 {
2190 }
2191 
2192 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2193 					  unsigned long old)
2194 {
2195 	return NULL;
2196 }
2197 
2198 static struct dst_ops ipv4_dst_blackhole_ops = {
2199 	.family			=	AF_INET,
2200 	.protocol		=	cpu_to_be16(ETH_P_IP),
2201 	.check			=	ipv4_blackhole_dst_check,
2202 	.mtu			=	ipv4_blackhole_mtu,
2203 	.default_advmss		=	ipv4_default_advmss,
2204 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2205 	.redirect		=	ipv4_rt_blackhole_redirect,
2206 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2207 	.neigh_lookup		=	ipv4_neigh_lookup,
2208 };
2209 
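/* Clone @dst_orig into a "blackhole" route whose input and output
 * handlers discard packets and whose ops (above) never report a valid
 * cached state.  The main user is xfrm_lookup(): while an IPsec SA is
 * still being negotiated, callers that cannot sleep get a route that
 * silently drops traffic instead of pinning the original dst.
 */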
2210 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2211 {
2212 	struct rtable *ort = (struct rtable *) dst_orig;
2213 	struct rtable *rt;
2214 
2215 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2216 	if (rt) {
2217 		struct dst_entry *new = &rt->dst;
2218 
2219 		new->__use = 1;
2220 		new->input = dst_discard;
2221 		new->output = dst_discard;
2222 
2223 		new->dev = ort->dst.dev;
2224 		if (new->dev)
2225 			dev_hold(new->dev);
2226 
2227 		rt->rt_is_input = ort->rt_is_input;
2228 		rt->rt_iif = ort->rt_iif;
2229 		rt->rt_pmtu = ort->rt_pmtu;
2230 
2231 		rt->rt_genid = rt_genid_ipv4(net);
2232 		rt->rt_flags = ort->rt_flags;
2233 		rt->rt_type = ort->rt_type;
2234 		rt->rt_gateway = ort->rt_gateway;
2235 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2236 
2237 		INIT_LIST_HEAD(&rt->rt_uncached);
2238 
2239 		dst_free(new);
2240 	}
2241 
2242 	dst_release(dst_orig);
2243 
2244 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2245 }
2246 
2247 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2248 				    struct sock *sk)
2249 {
2250 	struct rtable *rt = __ip_route_output_key(net, flp4);
2251 
2252 	if (IS_ERR(rt))
2253 		return rt;
2254 
2255 	if (flp4->flowi4_proto)
2256 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2257 						   flowi4_to_flowi(flp4),
2258 						   sk, 0);
2259 
2260 	return rt;
2261 }
2262 EXPORT_SYMBOL_GPL(ip_route_output_flow);
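/* A hedged, minimal sketch of a kernel caller of ip_route_output_flow(),
 * mirroring the hand-rolled flowi4 setup in inet_rtm_getroute() below;
 * example_output_route() and its parameters are hypothetical.
 */
static struct rtable *example_output_route(struct net *net, struct sock *sk,
					   __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.saddr = saddr;		/* may be 0; the resolver picks one */
	fl4.flowi4_proto = IPPROTO_UDP;	/* non-zero proto enables xfrm_lookup() */

	/* On success the caller owns a reference and must ip_rt_put() it. */
	return ip_route_output_flow(net, &fl4, sk);
}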
2263 
2264 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2265 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2266 			u32 seq, int event, int nowait, unsigned int flags)
2267 {
2268 	struct rtable *rt = skb_rtable(skb);
2269 	struct rtmsg *r;
2270 	struct nlmsghdr *nlh;
2271 	unsigned long expires = 0;
2272 	u32 error;
2273 	u32 metrics[RTAX_MAX];
2274 
2275 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2276 	if (nlh == NULL)
2277 		return -EMSGSIZE;
2278 
2279 	r = nlmsg_data(nlh);
2280 	r->rtm_family	 = AF_INET;
2281 	r->rtm_dst_len	= 32;
2282 	r->rtm_src_len	= 0;
2283 	r->rtm_tos	= fl4->flowi4_tos;
2284 	r->rtm_table	= RT_TABLE_MAIN;
2285 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2286 		goto nla_put_failure;
2287 	r->rtm_type	= rt->rt_type;
2288 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2289 	r->rtm_protocol = RTPROT_UNSPEC;
2290 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2291 	if (rt->rt_flags & RTCF_NOTIFY)
2292 		r->rtm_flags |= RTM_F_NOTIFY;
2293 
2294 	if (nla_put_be32(skb, RTA_DST, dst))
2295 		goto nla_put_failure;
2296 	if (src) {
2297 		r->rtm_src_len = 32;
2298 		if (nla_put_be32(skb, RTA_SRC, src))
2299 			goto nla_put_failure;
2300 	}
2301 	if (rt->dst.dev &&
2302 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2303 		goto nla_put_failure;
2304 #ifdef CONFIG_IP_ROUTE_CLASSID
2305 	if (rt->dst.tclassid &&
2306 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2307 		goto nla_put_failure;
2308 #endif
2309 	if (!rt_is_input_route(rt) &&
2310 	    fl4->saddr != src) {
2311 		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2312 			goto nla_put_failure;
2313 	}
2314 	if (rt->rt_uses_gateway &&
2315 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2316 		goto nla_put_failure;
2317 
2318 	expires = rt->dst.expires;
2319 	if (expires) {
2320 		unsigned long now = jiffies;
2321 
2322 		if (time_before(now, expires))
2323 			expires -= now;
2324 		else
2325 			expires = 0;
2326 	}
2327 
2328 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2329 	if (rt->rt_pmtu && expires)
2330 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2331 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2332 		goto nla_put_failure;
2333 
2334 	if (fl4->flowi4_mark &&
2335 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2336 		goto nla_put_failure;
2337 
2338 	error = rt->dst.error;
2339 
2340 	if (rt_is_input_route(rt)) {
2341 #ifdef CONFIG_IP_MROUTE
2342 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2343 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2344 			int err = ipmr_get_route(net, skb,
2345 						 fl4->saddr, fl4->daddr,
2346 						 r, nowait);
2347 			if (err <= 0) {
2348 				if (!nowait) {
2349 					if (err == 0)
2350 						return 0;
2351 					goto nla_put_failure;
2352 				} else {
2353 					if (err == -EMSGSIZE)
2354 						goto nla_put_failure;
2355 					error = err;
2356 				}
2357 			}
2358 		} else
2359 #endif
2360 			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2361 				goto nla_put_failure;
2362 	}
2363 
2364 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2365 		goto nla_put_failure;
2366 
2367 	return nlmsg_end(skb, nlh);
2368 
2369 nla_put_failure:
2370 	nlmsg_cancel(skb, nlh);
2371 	return -EMSGSIZE;
2372 }
2373 
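/* RTM_GETROUTE handler, i.e. the kernel side of "ip route get": it
 * resolves the requested flow through the input path (when RTA_IIF is
 * given) or the output path, then answers with a single RTM_NEWROUTE
 * message built by rt_fill_info() above.
 */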
2374 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2375 {
2376 	struct net *net = sock_net(in_skb->sk);
2377 	struct rtmsg *rtm;
2378 	struct nlattr *tb[RTA_MAX+1];
2379 	struct rtable *rt = NULL;
2380 	struct flowi4 fl4;
2381 	__be32 dst = 0;
2382 	__be32 src = 0;
2383 	u32 iif;
2384 	int err;
2385 	int mark;
2386 	struct sk_buff *skb;
2387 
2388 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2389 	if (err < 0)
2390 		goto errout;
2391 
2392 	rtm = nlmsg_data(nlh);
2393 
2394 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2395 	if (skb == NULL) {
2396 		err = -ENOBUFS;
2397 		goto errout;
2398 	}
2399 
2400 	/* Reserve room for dummy headers; this skb can pass
2401 	   through a good chunk of the routing engine.
2402 	 */
2403 	skb_reset_mac_header(skb);
2404 	skb_reset_network_header(skb);
2405 
2406 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2407 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2408 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2409 
2410 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2411 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2412 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2413 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2414 
2415 	memset(&fl4, 0, sizeof(fl4));
2416 	fl4.daddr = dst;
2417 	fl4.saddr = src;
2418 	fl4.flowi4_tos = rtm->rtm_tos;
2419 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2420 	fl4.flowi4_mark = mark;
2421 
2422 	if (iif) {
2423 		struct net_device *dev;
2424 
2425 		dev = __dev_get_by_index(net, iif);
2426 		if (dev == NULL) {
2427 			err = -ENODEV;
2428 			goto errout_free;
2429 		}
2430 
2431 		skb->protocol	= htons(ETH_P_IP);
2432 		skb->dev	= dev;
2433 		skb->mark	= mark;
2434 		local_bh_disable();
2435 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2436 		local_bh_enable();
2437 
2438 		rt = skb_rtable(skb);
2439 		if (err == 0 && rt->dst.error)
2440 			err = -rt->dst.error;
2441 	} else {
2442 		rt = ip_route_output_key(net, &fl4);
2443 
2444 		err = 0;
2445 		if (IS_ERR(rt))
2446 			err = PTR_ERR(rt);
2447 	}
2448 
2449 	if (err)
2450 		goto errout_free;
2451 
2452 	skb_dst_set(skb, &rt->dst);
2453 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2454 		rt->rt_flags |= RTCF_NOTIFY;
2455 
2456 	err = rt_fill_info(net, dst, src, &fl4, skb,
2457 			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2458 			   RTM_NEWROUTE, 0, 0);
2459 	if (err <= 0)
2460 		goto errout_free;
2461 
2462 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2463 errout:
2464 	return err;
2465 
2466 errout_free:
2467 	kfree_skb(skb);
2468 	goto errout;
2469 }
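/* A hedged userspace sketch of the RTM_GETROUTE request that
 * inet_rtm_getroute() answers; the destination is a placeholder and
 * error handling is elided.
 *
 *	#include <arpa/inet.h>
 *	#include <linux/rtnetlink.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int route_get(const char *dst)
 *	{
 *		struct {
 *			struct nlmsghdr nlh;
 *			struct rtmsg rtm;
 *			char attrs[RTA_SPACE(4)];
 *		} req;
 *		struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
 *		struct rtattr *rta;
 *		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *		memset(&req, 0, sizeof(req));
 *		req.nlh.nlmsg_len = NLMSG_SPACE(sizeof(req.rtm)) + RTA_SPACE(4);
 *		req.nlh.nlmsg_type = RTM_GETROUTE;
 *		req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *		req.rtm.rtm_family = AF_INET;
 *		rta = (struct rtattr *)((char *)&req +
 *					NLMSG_SPACE(sizeof(req.rtm)));
 *		rta->rta_type = RTA_DST;
 *		rta->rta_len = RTA_LENGTH(4);
 *		inet_pton(AF_INET, dst, RTA_DATA(rta));
 *		sendto(fd, &req, req.nlh.nlmsg_len, 0,
 *		       (struct sockaddr *)&sa, sizeof(sa));
 *		// then recv() the RTM_NEWROUTE reply built above
 *		return close(fd);
 *	}
 */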
2470 
2471 void ip_rt_multicast_event(struct in_device *in_dev)
2472 {
2473 	rt_cache_flush(dev_net(in_dev->dev));
2474 }
2475 
2476 #ifdef CONFIG_SYSCTL
2477 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
2478 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2479 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2480 static int ip_rt_gc_elasticity __read_mostly	= 8;
2481 
2482 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2483 					void __user *buffer,
2484 					size_t *lenp, loff_t *ppos)
2485 {
2486 	struct net *net = (struct net *)__ctl->extra1;
2487 
2488 	if (write) {
2489 		rt_cache_flush(net);
2490 		fnhe_genid_bump(net);
2491 		return 0;
2492 	}
2493 
2494 	return -EINVAL;
2495 }
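/* Any write to net.ipv4.route.flush invalidates the cache; the value
 * written is ignored, and reads are rejected.  A hedged userspace
 * sketch (equivalent to "sysctl -w net.ipv4.route.flush=1"):
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int flush_route_cache(void)
 *	{
 *		int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		write(fd, "1", 1);
 *		return close(fd);
 *	}
 */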
2496 
2497 static struct ctl_table ipv4_route_table[] = {
2498 	{
2499 		.procname	= "gc_thresh",
2500 		.data		= &ipv4_dst_ops.gc_thresh,
2501 		.maxlen		= sizeof(int),
2502 		.mode		= 0644,
2503 		.proc_handler	= proc_dointvec,
2504 	},
2505 	{
2506 		.procname	= "max_size",
2507 		.data		= &ip_rt_max_size,
2508 		.maxlen		= sizeof(int),
2509 		.mode		= 0644,
2510 		.proc_handler	= proc_dointvec,
2511 	},
2512 	{
2513 		/*  Deprecated. Use gc_min_interval_ms */
2514 
2515 		.procname	= "gc_min_interval",
2516 		.data		= &ip_rt_gc_min_interval,
2517 		.maxlen		= sizeof(int),
2518 		.mode		= 0644,
2519 		.proc_handler	= proc_dointvec_jiffies,
2520 	},
2521 	{
2522 		.procname	= "gc_min_interval_ms",
2523 		.data		= &ip_rt_gc_min_interval,
2524 		.maxlen		= sizeof(int),
2525 		.mode		= 0644,
2526 		.proc_handler	= proc_dointvec_ms_jiffies,
2527 	},
2528 	{
2529 		.procname	= "gc_timeout",
2530 		.data		= &ip_rt_gc_timeout,
2531 		.maxlen		= sizeof(int),
2532 		.mode		= 0644,
2533 		.proc_handler	= proc_dointvec_jiffies,
2534 	},
2535 	{
2536 		.procname	= "gc_interval",
2537 		.data		= &ip_rt_gc_interval,
2538 		.maxlen		= sizeof(int),
2539 		.mode		= 0644,
2540 		.proc_handler	= proc_dointvec_jiffies,
2541 	},
2542 	{
2543 		.procname	= "redirect_load",
2544 		.data		= &ip_rt_redirect_load,
2545 		.maxlen		= sizeof(int),
2546 		.mode		= 0644,
2547 		.proc_handler	= proc_dointvec,
2548 	},
2549 	{
2550 		.procname	= "redirect_number",
2551 		.data		= &ip_rt_redirect_number,
2552 		.maxlen		= sizeof(int),
2553 		.mode		= 0644,
2554 		.proc_handler	= proc_dointvec,
2555 	},
2556 	{
2557 		.procname	= "redirect_silence",
2558 		.data		= &ip_rt_redirect_silence,
2559 		.maxlen		= sizeof(int),
2560 		.mode		= 0644,
2561 		.proc_handler	= proc_dointvec,
2562 	},
2563 	{
2564 		.procname	= "error_cost",
2565 		.data		= &ip_rt_error_cost,
2566 		.maxlen		= sizeof(int),
2567 		.mode		= 0644,
2568 		.proc_handler	= proc_dointvec,
2569 	},
2570 	{
2571 		.procname	= "error_burst",
2572 		.data		= &ip_rt_error_burst,
2573 		.maxlen		= sizeof(int),
2574 		.mode		= 0644,
2575 		.proc_handler	= proc_dointvec,
2576 	},
2577 	{
2578 		.procname	= "gc_elasticity",
2579 		.data		= &ip_rt_gc_elasticity,
2580 		.maxlen		= sizeof(int),
2581 		.mode		= 0644,
2582 		.proc_handler	= proc_dointvec,
2583 	},
2584 	{
2585 		.procname	= "mtu_expires",
2586 		.data		= &ip_rt_mtu_expires,
2587 		.maxlen		= sizeof(int),
2588 		.mode		= 0644,
2589 		.proc_handler	= proc_dointvec_jiffies,
2590 	},
2591 	{
2592 		.procname	= "min_pmtu",
2593 		.data		= &ip_rt_min_pmtu,
2594 		.maxlen		= sizeof(int),
2595 		.mode		= 0644,
2596 		.proc_handler	= proc_dointvec,
2597 	},
2598 	{
2599 		.procname	= "min_adv_mss",
2600 		.data		= &ip_rt_min_advmss,
2601 		.maxlen		= sizeof(int),
2602 		.mode		= 0644,
2603 		.proc_handler	= proc_dointvec,
2604 	},
2605 	{ }
2606 };
2607 
2608 static struct ctl_table ipv4_route_flush_table[] = {
2609 	{
2610 		.procname	= "flush",
2611 		.maxlen		= sizeof(int),
2612 		.mode		= 0200,
2613 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2614 	},
2615 	{ },
2616 };
2617 
2618 static __net_init int sysctl_route_net_init(struct net *net)
2619 {
2620 	struct ctl_table *tbl;
2621 
2622 	tbl = ipv4_route_flush_table;
2623 	if (!net_eq(net, &init_net)) {
2624 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2625 		if (tbl == NULL)
2626 			goto err_dup;
2627 
2628 		/* Don't export sysctls to unprivileged users */
2629 		if (net->user_ns != &init_user_ns)
2630 			tbl[0].procname = NULL;
2631 	}
2632 	tbl[0].extra1 = net;
2633 
2634 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2635 	if (net->ipv4.route_hdr == NULL)
2636 		goto err_reg;
2637 	return 0;
2638 
2639 err_reg:
2640 	if (tbl != ipv4_route_flush_table)
2641 		kfree(tbl);
2642 err_dup:
2643 	return -ENOMEM;
2644 }
2645 
2646 static __net_exit void sysctl_route_net_exit(struct net *net)
2647 {
2648 	struct ctl_table *tbl;
2649 
2650 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2651 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2652 	BUG_ON(tbl == ipv4_route_flush_table);
2653 	kfree(tbl);
2654 }
2655 
2656 static __net_initdata struct pernet_operations sysctl_route_ops = {
2657 	.init = sysctl_route_net_init,
2658 	.exit = sysctl_route_net_exit,
2659 };
2660 #endif
2661 
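/* Per-netns generation counters: bumping rt_genid (rt_cache_flush())
 * instantly invalidates every cached rtable, since validity checks
 * compare against the stamp written at allocation time, and fnhe_genid
 * does the same for nexthop exceptions.  dev_addr_genid is merely
 * seeded with random bytes here.
 */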
2662 static __net_init int rt_genid_init(struct net *net)
2663 {
2664 	atomic_set(&net->ipv4.rt_genid, 0);
2665 	atomic_set(&net->fnhe_genid, 0);
2666 	get_random_bytes(&net->ipv4.dev_addr_genid,
2667 			 sizeof(net->ipv4.dev_addr_genid));
2668 	return 0;
2669 }
2670 
2671 static __net_initdata struct pernet_operations rt_genid_ops = {
2672 	.init = rt_genid_init,
2673 };
2674 
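/* Per-netns inetpeer base: a tree of long-lived, per-remote-address
 * state (ICMP rate limiting and similar) that outlives any individual
 * route.
 */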
2675 static int __net_init ipv4_inetpeer_init(struct net *net)
2676 {
2677 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2678 
2679 	if (!bp)
2680 		return -ENOMEM;
2681 	inet_peer_base_init(bp);
2682 	net->ipv4.peers = bp;
2683 	return 0;
2684 }
2685 
2686 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2687 {
2688 	struct inet_peer_base *bp = net->ipv4.peers;
2689 
2690 	net->ipv4.peers = NULL;
2691 	inetpeer_invalidate_tree(bp);
2692 	kfree(bp);
2693 }
2694 
2695 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2696 	.init	=	ipv4_inetpeer_init,
2697 	.exit	=	ipv4_inetpeer_exit,
2698 };
2699 
2700 #ifdef CONFIG_IP_ROUTE_CLASSID
2701 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2702 #endif /* CONFIG_IP_ROUTE_CLASSID */
2703 
2704 int __init ip_rt_init(void)
2705 {
2706 	int rc = 0;
2707 
2708 #ifdef CONFIG_IP_ROUTE_CLASSID
2709 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2710 	if (!ip_rt_acct)
2711 		panic("IP: failed to allocate ip_rt_acct\n");
2712 #endif
2713 
2714 	ipv4_dst_ops.kmem_cachep =
2715 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2716 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2717 
2718 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2719 
2720 	if (dst_entries_init(&ipv4_dst_ops) < 0)
2721 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2722 
2723 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2724 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2725 
2726 	ipv4_dst_ops.gc_thresh = ~0;
2727 	ip_rt_max_size = INT_MAX;
2728 
2729 	devinet_init();
2730 	ip_fib_init();
2731 
2732 	if (ip_rt_proc_init())
2733 		pr_err("Unable to create route proc files\n");
2734 #ifdef CONFIG_XFRM
2735 	xfrm_init();
2736 	xfrm4_init();
2737 #endif
2738 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2739 
2740 #ifdef CONFIG_SYSCTL
2741 	register_pernet_subsys(&sysctl_route_ops);
2742 #endif
2743 	register_pernet_subsys(&rt_genid_ops);
2744 	register_pernet_subsys(&ipv4_inetpeer_ops);
2745 	return rc;
2746 }
2747 
2748 #ifdef CONFIG_SYSCTL
2749 /*
2750  * We really need to sanitize the damn ipv4 init order, then all
2751  * this nonsense will go away.
2752  */
2753 void __init ip_static_sysctl_init(void)
2754 {
2755 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2756 }
2757 #endif
2758