xref: /openbmc/linux/net/ipv4/route.c (revision 7b6d864b)
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
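
/* Illustration: the table is indexed by the legacy TOS bits shifted right
 * by one, as done by rt_tos2priority() in include/net/route.h.  A minimal
 * sketch of that mapping (the helper itself lives in the header):
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * e.g. tos = IPTOS_LOWDELAY (0x10) indexes slot 8 and yields
 * TC_PRIO_INTERACTIVE.
 */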

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
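
/* These per-cpu counters are exported as a single header line plus one
 * hex-formatted row per possible CPU; reading /proc/net/stat/rt_cache
 * (e.g. with a plain `cat`, or the lnstat tool) is the usual way to
 * inspect them.
 */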

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct net *net = dev_net(dst->dev);
	struct inet_peer *peer;

	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
	if (peer) {
		iph->id = htons(inet_getid(peer, more));
		inet_putpeer(peer);
		return;
	}

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;
	struct rtable *orig;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	orig = rcu_dereference(oldest->fnhe_rth);
	if (orig) {
		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
		rt_free(orig);
	}
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	u32 hval;

	hval = (__force u32) daddr;
	hval ^= (hval >> 11) ^ (hval >> 22);

	return hval & (FNHE_HASH_SIZE - 1);
}
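
/* A worked example of the hash above, assuming FNHE_HASH_SIZE is 2048
 * (mask 0x7ff): for a raw daddr value of 0xc0a80001,
 *
 *	hval = 0xc0a80001 ^ (0xc0a80001 >> 11) ^ (0xc0a80001 >> 22)
 *	     = 0xc0a80001 ^ 0x00181500 ^ 0x00000302
 *	     = 0xc0b01603
 *
 * and 0xc0b01603 & 0x7ff selects bucket 0x603.
 */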

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = nh->nh_exceptions;
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		nh->nh_exceptions = hash;
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = expires;
		}
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
	return;
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (n) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, 0);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   has forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
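
/* A worked example with the defaults defined above
 * (ip_rt_redirect_number = 9, ip_rt_redirect_load = HZ/50): once a
 * redirect has been sent, the next one is allowed only after
 * rate_last + (HZ/50 << rate_tokens), so the required gaps grow
 * 40ms, 80ms, ... up to roughly 5.12s at the 9th redirect, after which
 * nothing is sent until ip_rt_redirect_silence (about 20.5s) passes
 * with no redirect-worthy traffic.
 */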

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set peer->rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (dst->dev->mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (!rt->rt_pmtu) {
		dst->obsolete = DST_OBSOLETE_KILL;
	} else {
		rt->rt_pmtu = mtu;
		dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
	}

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
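
/* A typical caller is a tunnel's ICMP error handler; a minimal hedged
 * sketch (hypothetical handler, real callers live e.g. in
 * net/ipv4/ipip.c), where "info" is the MTU carried in the ICMP
 * FRAG_NEEDED message:
 *
 *	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH &&
 *	    icmp_hdr(skb)->code == ICMP_FRAG_NEEDED)
 *		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
 *				 0, 0, IPPROTO_IPIP, 0);
 */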

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *dst;
	bool new = false;

	bh_lock_sock(sk);
	rt = (struct rtable *) __sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !rt) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!__sk_dst_check(sk, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	dst = dst_check(&rt->dst, 0);
	if (!dst) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		__sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
	 * down into this function.
	 *
	 * When a PMTU/redirect information update invalidates a
	 * route, this is indicated by setting obsolete to
	 * DST_OBSOLETE_KILL.
	 */
	if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it stays out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
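
/* In short, the precedence implemented above is: an unexpired learned
 * PMTU wins, then an explicit RTAX_MTU metric, and only then the device
 * MTU (clamped to 576 when the MTU metric is locked on a gatewayed
 * route, and capped at IP_MAX_MTU).
 */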

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			return fnhe;
	}
	return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
		if (orig && rt_is_expired(orig)) {
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
		}
		if (fnhe->fnhe_pmtu) {
			unsigned long expires = fnhe->fnhe_expires;
			unsigned long diff = expires - jiffies;

			if (time_before(jiffies, expires)) {
				rt->rt_pmtu = fnhe->fnhe_pmtu;
				dst_set_expires(&rt->dst, diff);
			}
		}
		if (fnhe->fnhe_gw) {
			rt->rt_flags |= RTCF_REDIRECTED;
			rt->rt_gateway = fnhe->fnhe_gw;
			rt->rt_uses_gateway = 1;
		} else if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		rcu_assign_pointer(fnhe->fnhe_rth, rt);
		if (orig)
			rt_free(orig);

		fnhe->fnhe_stamp = jiffies;
		ret = true;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig)
			rt_free(orig);
	} else
		ret = false;

	return ret;
}

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (!list_empty(&rt->rt_uncached)) {
		spin_lock_bh(&rt_uncached_lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&rt_uncached_lock);
	}
}

void rt_flush_dev(struct net_device *dev)
{
	if (!list_empty(&rt_uncached_list)) {
		struct net *net = dev_net(dev);
		struct rtable *rt;

		spin_lock_bh(&rt_uncached_lock);
		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&rt_uncached_lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in the nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache, we really need to set it.
			 */
			rt->dst.flags |= DST_NOCACHE;
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm, bool will_cache)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_is_input= 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
		flags |= RTCF_DOREDIRECT;
		do_cache = false;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	if (do_cache) {
		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif 	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for the limited broadcast;
	 * I do not even know whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and calls it at most once when daddr and/or saddr is a loopback address.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  LOOPBACK_IFINDEX,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	if (do_cache)
		rt_cache_route(&FIB_RES_NH(res), rth);
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	int res;

	rcu_read_lock();

	/* Multicast recognition logic is moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, e.g. from
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a route
	   cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_noref);
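
/* The canonical caller is the receive path; a minimal sketch of how
 * ip_rcv_finish() uses it (assumed shape, see net/ipv4/ip_input.c):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 */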

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use
		 * the default one, but do not use a gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe)
			prth = &fnhe->fnhe_rth;
		else {
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_is_input = 0;
	rth->rt_iif	= orig_oif ? : 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

	return rth;
}
1929 
1930 /*
1931  * Major route resolver routine.
1932  */
1933 
1934 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1935 {
1936 	struct net_device *dev_out = NULL;
1937 	__u8 tos = RT_FL_TOS(fl4);
1938 	unsigned int flags = 0;
1939 	struct fib_result res;
1940 	struct rtable *rth;
1941 	int orig_oif;
1942 
1943 	res.tclassid	= 0;
1944 	res.fi		= NULL;
1945 	res.table	= NULL;
1946 
1947 	orig_oif = fl4->flowi4_oif;
1948 
1949 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
1950 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1951 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1952 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1953 
1954 	rcu_read_lock();
1955 	if (fl4->saddr) {
1956 		rth = ERR_PTR(-EINVAL);
1957 		if (ipv4_is_multicast(fl4->saddr) ||
1958 		    ipv4_is_lbcast(fl4->saddr) ||
1959 		    ipv4_is_zeronet(fl4->saddr))
1960 			goto out;
1961 
		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface
		      if saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with a
		      saddr of another iface. --ANK
		 */
1969 
1970 		if (fl4->flowi4_oif == 0 &&
1971 		    (ipv4_is_multicast(fl4->daddr) ||
1972 		     ipv4_is_lbcast(fl4->daddr))) {
1973 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1974 			dev_out = __ip_dev_find(net, fl4->saddr, false);
1975 			if (dev_out == NULL)
1976 				goto out;
1977 
			/* Special hack: the user can direct multicasts
			   and limited broadcasts via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun; it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set the ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source address
			   (the routing cache cannot know that the ttl is
			   zero, so the packet will not leave this host and
			   the route is valid).
			   Luckily, this hack is a good workaround.
			 */
1992 
1993 			fl4->flowi4_oif = dev_out->ifindex;
1994 			goto make_route;
1995 		}
1996 
1997 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1998 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1999 			if (!__ip_dev_find(net, fl4->saddr, false))
2000 				goto out;
2001 		}
2002 	}
2003 
2004 
2005 	if (fl4->flowi4_oif) {
2006 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2007 		rth = ERR_PTR(-ENODEV);
2008 		if (dev_out == NULL)
2009 			goto out;
2010 
2011 		/* RACE: Check return value of inet_select_addr instead. */
2012 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2013 			rth = ERR_PTR(-ENETUNREACH);
2014 			goto out;
2015 		}
2016 		if (ipv4_is_local_multicast(fl4->daddr) ||
2017 		    ipv4_is_lbcast(fl4->daddr)) {
2018 			if (!fl4->saddr)
2019 				fl4->saddr = inet_select_addr(dev_out, 0,
2020 							      RT_SCOPE_LINK);
2021 			goto make_route;
2022 		}
		if (!fl4->saddr) {
2024 			if (ipv4_is_multicast(fl4->daddr))
2025 				fl4->saddr = inet_select_addr(dev_out, 0,
2026 							      fl4->flowi4_scope);
2027 			else if (!fl4->daddr)
2028 				fl4->saddr = inet_select_addr(dev_out, 0,
2029 							      RT_SCOPE_HOST);
2030 		}
2031 	}
2032 
2033 	if (!fl4->daddr) {
2034 		fl4->daddr = fl4->saddr;
2035 		if (!fl4->daddr)
2036 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2037 		dev_out = net->loopback_dev;
2038 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2039 		res.type = RTN_LOCAL;
2040 		flags |= RTCF_LOCAL;
2041 		goto make_route;
2042 	}
2043 
2044 	if (fib_lookup(net, fl4, &res)) {
2045 		res.fi = NULL;
2046 		res.table = NULL;
2047 		if (fl4->flowi4_oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, the routing
			   tables are looked up with only one purpose:
			   to catch whether the destination is gatewayed
			   rather than direct. Moreover, if MSG_DONTROUTE
			   is set, we send the packet, ignoring both the
			   routing tables and the ifaddr state. --ANK

			   We could do this even if oif is unknown,
			   as IPv6 likely does, but we do not.
			 */
2065 
2066 			if (fl4->saddr == 0)
2067 				fl4->saddr = inet_select_addr(dev_out, 0,
2068 							      RT_SCOPE_LINK);
2069 			res.type = RTN_UNICAST;
2070 			goto make_route;
2071 		}
2072 		rth = ERR_PTR(-ENETUNREACH);
2073 		goto out;
2074 	}
2075 
2076 	if (res.type == RTN_LOCAL) {
2077 		if (!fl4->saddr) {
2078 			if (res.fi->fib_prefsrc)
2079 				fl4->saddr = res.fi->fib_prefsrc;
2080 			else
2081 				fl4->saddr = fl4->daddr;
2082 		}
2083 		dev_out = net->loopback_dev;
2084 		fl4->flowi4_oif = dev_out->ifindex;
2085 		flags |= RTCF_LOCAL;
2086 		goto make_route;
2087 	}
2088 
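	/* With multiple nexthops and no oif pinned by the caller, pick a
	 * nexthop via multipath selection; otherwise, for a zero-length
	 * prefix when the table holds more than one default route,
	 * choose between the defaults.
	 */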
2089 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2090 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2091 		fib_select_multipath(&res);
2092 	else
2093 #endif
2094 	if (!res.prefixlen &&
2095 	    res.table->tb_num_default > 1 &&
2096 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2097 		fib_select_default(&res);
2098 
2099 	if (!fl4->saddr)
2100 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2101 
2102 	dev_out = FIB_RES_DEV(res);
2103 	fl4->flowi4_oif = dev_out->ifindex;
2104 
2105 
2106 make_route:
2107 	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2108 
2109 out:
2110 	rcu_read_unlock();
2111 	return rth;
2112 }
2113 EXPORT_SYMBOL_GPL(__ip_route_output_key);
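
/*
 * Typical output-lookup usage (an illustrative sketch, not code from
 * this file; "net", "dst_ip" and "src_ip" are placeholders):
 *
 *	struct flowi4 fl4 = {
 *		.daddr = dst_ip,
 *		.saddr = src_ip,
 *	};
 *	struct rtable *rt = __ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	... use rt->dst ...
 *	ip_rt_put(rt);
 *
 * On success the flowi4 is updated in place (e.g. saddr and oif are
 * filled in), which is why callers pass it by pointer.
 */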
2114 
2115 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2116 {
2117 	return NULL;
2118 }
2119 
2120 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2121 {
2122 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2123 
2124 	return mtu ? : dst->dev->mtu;
2125 }
2126 
2127 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2128 					  struct sk_buff *skb, u32 mtu)
2129 {
2130 }
2131 
2132 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2133 				       struct sk_buff *skb)
2134 {
2135 }
2136 
2137 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2138 					  unsigned long old)
2139 {
2140 	return NULL;
2141 }
2142 
2143 static struct dst_ops ipv4_dst_blackhole_ops = {
2144 	.family			=	AF_INET,
2145 	.protocol		=	cpu_to_be16(ETH_P_IP),
2146 	.check			=	ipv4_blackhole_dst_check,
2147 	.mtu			=	ipv4_blackhole_mtu,
2148 	.default_advmss		=	ipv4_default_advmss,
2149 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2150 	.redirect		=	ipv4_rt_blackhole_redirect,
2151 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2152 	.neigh_lookup		=	ipv4_neigh_lookup,
2153 };
2154 
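/*
 * A blackhole route mirrors an existing route but discards everything
 * sent through it: the dst_ops above are all no-ops, and input/output
 * are wired to dst_discard below.  xfrm uses this when packets must be
 * held back rather than transmitted (e.g. while an SA is resolved).
 */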
2155 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2156 {
2157 	struct rtable *ort = (struct rtable *) dst_orig;
2158 	struct rtable *rt;
2159 
2160 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2161 	if (rt) {
2162 		struct dst_entry *new = &rt->dst;
2163 
2164 		new->__use = 1;
2165 		new->input = dst_discard;
2166 		new->output = dst_discard;
2167 
2168 		new->dev = ort->dst.dev;
2169 		if (new->dev)
2170 			dev_hold(new->dev);
2171 
2172 		rt->rt_is_input = ort->rt_is_input;
2173 		rt->rt_iif = ort->rt_iif;
2174 		rt->rt_pmtu = ort->rt_pmtu;
2175 
2176 		rt->rt_genid = rt_genid(net);
2177 		rt->rt_flags = ort->rt_flags;
2178 		rt->rt_type = ort->rt_type;
2179 		rt->rt_gateway = ort->rt_gateway;
2180 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2181 
2182 		INIT_LIST_HEAD(&rt->rt_uncached);
2183 
2184 		dst_free(new);
2185 	}
2186 
2187 	dst_release(dst_orig);
2188 
2189 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2190 }
2191 
2192 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2193 				    struct sock *sk)
2194 {
2195 	struct rtable *rt = __ip_route_output_key(net, flp4);
2196 
2197 	if (IS_ERR(rt))
2198 		return rt;
2199 
2200 	if (flp4->flowi4_proto)
2201 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2202 						   flowi4_to_flowi(flp4),
2203 						   sk, 0);
2204 
2205 	return rt;
2206 }
2207 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2208 
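/*
 * Fill one RTM_NEWROUTE netlink message describing the route attached
 * to @skb.  Returns a non-negative length on success, or -EMSGSIZE if
 * the skb ran out of room.
 */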
2209 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2210 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2211 			u32 seq, int event, int nowait, unsigned int flags)
2212 {
2213 	struct rtable *rt = skb_rtable(skb);
2214 	struct rtmsg *r;
2215 	struct nlmsghdr *nlh;
2216 	unsigned long expires = 0;
2217 	u32 error;
2218 	u32 metrics[RTAX_MAX];
2219 
2220 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2221 	if (nlh == NULL)
2222 		return -EMSGSIZE;
2223 
2224 	r = nlmsg_data(nlh);
2225 	r->rtm_family	 = AF_INET;
2226 	r->rtm_dst_len	= 32;
2227 	r->rtm_src_len	= 0;
2228 	r->rtm_tos	= fl4->flowi4_tos;
2229 	r->rtm_table	= RT_TABLE_MAIN;
2230 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2231 		goto nla_put_failure;
2232 	r->rtm_type	= rt->rt_type;
2233 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2234 	r->rtm_protocol = RTPROT_UNSPEC;
2235 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2236 	if (rt->rt_flags & RTCF_NOTIFY)
2237 		r->rtm_flags |= RTM_F_NOTIFY;
2238 
2239 	if (nla_put_be32(skb, RTA_DST, dst))
2240 		goto nla_put_failure;
2241 	if (src) {
2242 		r->rtm_src_len = 32;
2243 		if (nla_put_be32(skb, RTA_SRC, src))
2244 			goto nla_put_failure;
2245 	}
2246 	if (rt->dst.dev &&
2247 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2248 		goto nla_put_failure;
2249 #ifdef CONFIG_IP_ROUTE_CLASSID
2250 	if (rt->dst.tclassid &&
2251 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2252 		goto nla_put_failure;
2253 #endif
2254 	if (!rt_is_input_route(rt) &&
2255 	    fl4->saddr != src) {
2256 		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2257 			goto nla_put_failure;
2258 	}
2259 	if (rt->rt_uses_gateway &&
2260 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2261 		goto nla_put_failure;
2262 
2263 	expires = rt->dst.expires;
2264 	if (expires) {
2265 		unsigned long now = jiffies;
2266 
2267 		if (time_before(now, expires))
2268 			expires -= now;
2269 		else
2270 			expires = 0;
2271 	}
2272 
2273 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2274 	if (rt->rt_pmtu && expires)
2275 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2276 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2277 		goto nla_put_failure;
2278 
2279 	if (fl4->flowi4_mark &&
2280 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2281 		goto nla_put_failure;
2282 
2283 	error = rt->dst.error;
2284 
2285 	if (rt_is_input_route(rt)) {
2286 #ifdef CONFIG_IP_MROUTE
2287 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2288 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2289 			int err = ipmr_get_route(net, skb,
2290 						 fl4->saddr, fl4->daddr,
2291 						 r, nowait);
2292 			if (err <= 0) {
2293 				if (!nowait) {
2294 					if (err == 0)
2295 						return 0;
2296 					goto nla_put_failure;
2297 				} else {
2298 					if (err == -EMSGSIZE)
2299 						goto nla_put_failure;
2300 					error = err;
2301 				}
2302 			}
2303 		} else
2304 #endif
2305 			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2306 				goto nla_put_failure;
2307 	}
2308 
2309 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2310 		goto nla_put_failure;
2311 
2312 	return nlmsg_end(skb, nlh);
2313 
2314 nla_put_failure:
2315 	nlmsg_cancel(skb, nlh);
2316 	return -EMSGSIZE;
2317 }
2318 
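/*
 * RTM_GETROUTE handler: resolve a single route for userspace (this is
 * what "ip route get" talks to).  With RTA_IIF the kernel simulates a
 * packet arriving on that interface via ip_route_input(); otherwise it
 * performs a normal output lookup.  The result is reported back with
 * rt_fill_info().
 */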
2319 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2320 {
2321 	struct net *net = sock_net(in_skb->sk);
2322 	struct rtmsg *rtm;
2323 	struct nlattr *tb[RTA_MAX+1];
2324 	struct rtable *rt = NULL;
2325 	struct flowi4 fl4;
2326 	__be32 dst = 0;
2327 	__be32 src = 0;
2328 	u32 iif;
2329 	int err;
2330 	int mark;
2331 	struct sk_buff *skb;
2332 
2333 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2334 	if (err < 0)
2335 		goto errout;
2336 
2337 	rtm = nlmsg_data(nlh);
2338 
2339 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2340 	if (skb == NULL) {
2341 		err = -ENOBUFS;
2342 		goto errout;
2343 	}
2344 
	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
2348 	skb_reset_mac_header(skb);
2349 	skb_reset_network_header(skb);
2350 
2351 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2352 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2353 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2354 
2355 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2356 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2357 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2358 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2359 
2360 	memset(&fl4, 0, sizeof(fl4));
2361 	fl4.daddr = dst;
2362 	fl4.saddr = src;
2363 	fl4.flowi4_tos = rtm->rtm_tos;
2364 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2365 	fl4.flowi4_mark = mark;
2366 
2367 	if (iif) {
2368 		struct net_device *dev;
2369 
2370 		dev = __dev_get_by_index(net, iif);
2371 		if (dev == NULL) {
2372 			err = -ENODEV;
2373 			goto errout_free;
2374 		}
2375 
2376 		skb->protocol	= htons(ETH_P_IP);
2377 		skb->dev	= dev;
2378 		skb->mark	= mark;
2379 		local_bh_disable();
2380 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2381 		local_bh_enable();
2382 
2383 		rt = skb_rtable(skb);
2384 		if (err == 0 && rt->dst.error)
2385 			err = -rt->dst.error;
2386 	} else {
2387 		rt = ip_route_output_key(net, &fl4);
2388 
2389 		err = 0;
2390 		if (IS_ERR(rt))
2391 			err = PTR_ERR(rt);
2392 	}
2393 
2394 	if (err)
2395 		goto errout_free;
2396 
2397 	skb_dst_set(skb, &rt->dst);
2398 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2399 		rt->rt_flags |= RTCF_NOTIFY;
2400 
2401 	err = rt_fill_info(net, dst, src, &fl4, skb,
2402 			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2403 			   RTM_NEWROUTE, 0, 0);
2404 	if (err <= 0)
2405 		goto errout_free;
2406 
2407 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2408 errout:
2409 	return err;
2410 
2411 errout_free:
2412 	kfree_skb(skb);
2413 	goto errout;
2414 }
2415 
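/*
 * Nothing is dumped here any more; with the old route cache gone,
 * returning skb->len just tells the netlink dumper we are done.
 */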
2416 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2417 {
2418 	return skb->len;
2419 }
2420 
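/* A device's multicast configuration changed; cached routes may now be
 * stale, so flush them for the device's namespace.
 */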
2421 void ip_rt_multicast_event(struct in_device *in_dev)
2422 {
2423 	rt_cache_flush(dev_net(in_dev->dev));
2424 }
2425 
2426 #ifdef CONFIG_SYSCTL
2427 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
2428 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2429 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2430 static int ip_rt_gc_elasticity __read_mostly	= 8;
2431 
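/*
 * Handler for /proc/sys/net/ipv4/route/flush.  The file is write-only
 * (mode 0200 below); writing anything to it, e.g.
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the namespace's cached routes.  Reads fail with -EINVAL.
 */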
2432 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2433 					void __user *buffer,
2434 					size_t *lenp, loff_t *ppos)
2435 {
2436 	if (write) {
2437 		rt_cache_flush((struct net *)__ctl->extra1);
2438 		return 0;
2439 	}
2440 
2441 	return -EINVAL;
2442 }
2443 
2444 static ctl_table ipv4_route_table[] = {
2445 	{
2446 		.procname	= "gc_thresh",
2447 		.data		= &ipv4_dst_ops.gc_thresh,
2448 		.maxlen		= sizeof(int),
2449 		.mode		= 0644,
2450 		.proc_handler	= proc_dointvec,
2451 	},
2452 	{
2453 		.procname	= "max_size",
2454 		.data		= &ip_rt_max_size,
2455 		.maxlen		= sizeof(int),
2456 		.mode		= 0644,
2457 		.proc_handler	= proc_dointvec,
2458 	},
2459 	{
		/* Deprecated. Use gc_min_interval_ms. */
2462 		.procname	= "gc_min_interval",
2463 		.data		= &ip_rt_gc_min_interval,
2464 		.maxlen		= sizeof(int),
2465 		.mode		= 0644,
2466 		.proc_handler	= proc_dointvec_jiffies,
2467 	},
2468 	{
2469 		.procname	= "gc_min_interval_ms",
2470 		.data		= &ip_rt_gc_min_interval,
2471 		.maxlen		= sizeof(int),
2472 		.mode		= 0644,
2473 		.proc_handler	= proc_dointvec_ms_jiffies,
2474 	},
2475 	{
2476 		.procname	= "gc_timeout",
2477 		.data		= &ip_rt_gc_timeout,
2478 		.maxlen		= sizeof(int),
2479 		.mode		= 0644,
2480 		.proc_handler	= proc_dointvec_jiffies,
2481 	},
2482 	{
2483 		.procname	= "gc_interval",
2484 		.data		= &ip_rt_gc_interval,
2485 		.maxlen		= sizeof(int),
2486 		.mode		= 0644,
2487 		.proc_handler	= proc_dointvec_jiffies,
2488 	},
2489 	{
2490 		.procname	= "redirect_load",
2491 		.data		= &ip_rt_redirect_load,
2492 		.maxlen		= sizeof(int),
2493 		.mode		= 0644,
2494 		.proc_handler	= proc_dointvec,
2495 	},
2496 	{
2497 		.procname	= "redirect_number",
2498 		.data		= &ip_rt_redirect_number,
2499 		.maxlen		= sizeof(int),
2500 		.mode		= 0644,
2501 		.proc_handler	= proc_dointvec,
2502 	},
2503 	{
2504 		.procname	= "redirect_silence",
2505 		.data		= &ip_rt_redirect_silence,
2506 		.maxlen		= sizeof(int),
2507 		.mode		= 0644,
2508 		.proc_handler	= proc_dointvec,
2509 	},
2510 	{
2511 		.procname	= "error_cost",
2512 		.data		= &ip_rt_error_cost,
2513 		.maxlen		= sizeof(int),
2514 		.mode		= 0644,
2515 		.proc_handler	= proc_dointvec,
2516 	},
2517 	{
2518 		.procname	= "error_burst",
2519 		.data		= &ip_rt_error_burst,
2520 		.maxlen		= sizeof(int),
2521 		.mode		= 0644,
2522 		.proc_handler	= proc_dointvec,
2523 	},
2524 	{
2525 		.procname	= "gc_elasticity",
2526 		.data		= &ip_rt_gc_elasticity,
2527 		.maxlen		= sizeof(int),
2528 		.mode		= 0644,
2529 		.proc_handler	= proc_dointvec,
2530 	},
2531 	{
2532 		.procname	= "mtu_expires",
2533 		.data		= &ip_rt_mtu_expires,
2534 		.maxlen		= sizeof(int),
2535 		.mode		= 0644,
2536 		.proc_handler	= proc_dointvec_jiffies,
2537 	},
2538 	{
2539 		.procname	= "min_pmtu",
2540 		.data		= &ip_rt_min_pmtu,
2541 		.maxlen		= sizeof(int),
2542 		.mode		= 0644,
2543 		.proc_handler	= proc_dointvec,
2544 	},
2545 	{
2546 		.procname	= "min_adv_mss",
2547 		.data		= &ip_rt_min_advmss,
2548 		.maxlen		= sizeof(int),
2549 		.mode		= 0644,
2550 		.proc_handler	= proc_dointvec,
2551 	},
2552 	{ }
2553 };
2554 
2555 static struct ctl_table ipv4_route_flush_table[] = {
2556 	{
2557 		.procname	= "flush",
2558 		.maxlen		= sizeof(int),
2559 		.mode		= 0200,
2560 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2561 	},
2562 	{ },
2563 };
2564 
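/*
 * Register the flush sysctl per network namespace.  Non-init
 * namespaces get their own copy of the table so extra1 can point at
 * the right struct net, and the entry is hidden from namespaces not
 * owned by init_user_ns.
 */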
2565 static __net_init int sysctl_route_net_init(struct net *net)
2566 {
2567 	struct ctl_table *tbl;
2568 
2569 	tbl = ipv4_route_flush_table;
2570 	if (!net_eq(net, &init_net)) {
2571 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2572 		if (tbl == NULL)
2573 			goto err_dup;
2574 
2575 		/* Don't export sysctls to unprivileged users */
2576 		if (net->user_ns != &init_user_ns)
2577 			tbl[0].procname = NULL;
2578 	}
2579 	tbl[0].extra1 = net;
2580 
2581 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2582 	if (net->ipv4.route_hdr == NULL)
2583 		goto err_reg;
2584 	return 0;
2585 
2586 err_reg:
2587 	if (tbl != ipv4_route_flush_table)
2588 		kfree(tbl);
2589 err_dup:
2590 	return -ENOMEM;
2591 }
2592 
2593 static __net_exit void sysctl_route_net_exit(struct net *net)
2594 {
2595 	struct ctl_table *tbl;
2596 
2597 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2598 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2599 	BUG_ON(tbl == ipv4_route_flush_table);
2600 	kfree(tbl);
2601 }
2602 
2603 static __net_initdata struct pernet_operations sysctl_route_ops = {
2604 	.init = sysctl_route_net_init,
2605 	.exit = sysctl_route_net_exit,
2606 };
2607 #endif
2608 
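/*
 * Per-namespace generation counters: bumping rt_genid (as
 * rt_cache_flush() does) makes every cached route test as expired,
 * and dev_addr_genid is seeded randomly for each namespace.
 */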
2609 static __net_init int rt_genid_init(struct net *net)
2610 {
2611 	atomic_set(&net->rt_genid, 0);
2612 	get_random_bytes(&net->ipv4.dev_addr_genid,
2613 			 sizeof(net->ipv4.dev_addr_genid));
2614 	return 0;
2615 }
2616 
2617 static __net_initdata struct pernet_operations rt_genid_ops = {
2618 	.init = rt_genid_init,
2619 };
2620 
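/*
 * Each namespace keeps its own inetpeer base.  Peers track long-lived
 * state about remote IPv4 addresses (e.g. ICMP rate limiting).
 */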
2621 static int __net_init ipv4_inetpeer_init(struct net *net)
2622 {
2623 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2624 
2625 	if (!bp)
2626 		return -ENOMEM;
2627 	inet_peer_base_init(bp);
2628 	net->ipv4.peers = bp;
2629 	return 0;
2630 }
2631 
2632 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2633 {
2634 	struct inet_peer_base *bp = net->ipv4.peers;
2635 
2636 	net->ipv4.peers = NULL;
2637 	inetpeer_invalidate_tree(bp);
2638 	kfree(bp);
2639 }
2640 
2641 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2642 	.init	=	ipv4_inetpeer_init,
2643 	.exit	=	ipv4_inetpeer_exit,
2644 };
2645 
2646 #ifdef CONFIG_IP_ROUTE_CLASSID
2647 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2648 #endif /* CONFIG_IP_ROUTE_CLASSID */
2649 
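/*
 * Subsystem init: create the dst kmem cache and entry counters, bring
 * up devinet and the FIB, register the RTM_GETROUTE handler and the
 * pernet subsystems above.  gc_thresh and ip_rt_max_size are set so
 * they effectively never limit allocation, now that the old routing
 * cache is gone.
 */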
2650 int __init ip_rt_init(void)
2651 {
2652 	int rc = 0;
2653 
2654 #ifdef CONFIG_IP_ROUTE_CLASSID
2655 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2656 	if (!ip_rt_acct)
2657 		panic("IP: failed to allocate ip_rt_acct\n");
2658 #endif
2659 
2660 	ipv4_dst_ops.kmem_cachep =
2661 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2662 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2663 
2664 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2665 
2666 	if (dst_entries_init(&ipv4_dst_ops) < 0)
2667 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2668 
2669 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2670 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2671 
2672 	ipv4_dst_ops.gc_thresh = ~0;
2673 	ip_rt_max_size = INT_MAX;
2674 
2675 	devinet_init();
2676 	ip_fib_init();
2677 
2678 	if (ip_rt_proc_init())
2679 		pr_err("Unable to create route proc files\n");
2680 #ifdef CONFIG_XFRM
2681 	xfrm_init();
2682 	xfrm4_init();
2683 #endif
2684 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2685 
2686 #ifdef CONFIG_SYSCTL
2687 	register_pernet_subsys(&sysctl_route_ops);
2688 #endif
2689 	register_pernet_subsys(&rt_genid_ops);
2690 	register_pernet_subsys(&ipv4_inetpeer_ops);
2691 	return rc;
2692 }
2693 
2694 #ifdef CONFIG_SYSCTL
2695 /*
2696  * We really need to sanitize the damn ipv4 init order, then all
2697  * this nonsense will go away.
2698  */
2699 void __init ip_static_sysctl_init(void)
2700 {
2701 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2702 }
2703 #endif
2704