xref: /openbmc/linux/net/ipv4/route.c (revision 1bff292e)
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

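/* Worked arithmetic for some of the defaults above (illustrative only):
 * ip_rt_min_pmtu is 512 + 20 + 20 = 552 bytes, i.e. room for 512 bytes of
 * payload plus a 20-byte IPv4 header and a 20-byte TCP header, so a learned
 * PMTU is never clamped below 552.  ip_rt_mtu_expires is 10 * 60 * HZ
 * jiffies, so a learned PMTU is re-probed after ten minutes regardless of
 * the HZ setting.
 */
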
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
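
/* Illustrative sketch: how a TOS byte is turned into a queue priority with
 * this table.  The helper below mirrors rt_tos2priority() from
 * include/net/route.h; example_tos2prio() itself is hypothetical and shown
 * only for clarity:
 *
 *	static inline __u8 example_tos2prio(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * E.g. IPTOS_LOWDELAY (0x10) gives index (0x10 & 0x1e) >> 1 = 8, which maps
 * to TC_PRIO_INTERACTIVE; the 0x02 bit (historically IPTOS_MINCOST, now one
 * of the ECN bits) selects the adjacent ECN_OR_COST() entry instead.
 */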

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
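
/* The seq_file ops above back /proc/net/stat/rt_cache: *pos encodes
 * "CPU index + 1" (position 0 is the header line), and start/next walk
 * only the possible CPUs, so e.g. "cat /proc/net/stat/rt_cache" prints
 * one hex-formatted stats line per possible CPU after the header.
 */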

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump(net);
}
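
/* Note: a flush is O(1).  rt_genid_bump() only increments the per-netns
 * generation counter that rt_genid() reads, so every rtable created before
 * the bump now fails rt_is_expired() and is discarded lazily the next time
 * ipv4_dst_check() looks at it; nothing walks or frees entries eagerly.
 */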

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct net *net = dev_net(dst->dev);
	struct inet_peer *peer;

	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
	if (peer) {
		iph->id = htons(inet_getid(peer, more));
		inet_putpeer(peer);
		return;
	}

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;
	struct rtable *orig;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	orig = rcu_dereference(oldest->fnhe_rth);
	if (orig) {
		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
		rt_free(orig);
	}
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	u32 hval;

	hval = (__force u32) daddr;
	hval ^= (hval >> 11) ^ (hval >> 22);

	return hval & (FNHE_HASH_SIZE - 1);
}
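
/* A sketch of the fold above, assuming FNHE_HASH_SIZE is a power of two
 * (1 << 11, matching the 11/22-bit shifts): for hval = 0xc0a80001,
 *
 *	hval ^ (hval >> 11) ^ (hval >> 22)
 *	  = 0xc0a80001 ^ 0x00181500 ^ 0x00000302 = 0xc0b01603
 *
 * and 0xc0b01603 & (FNHE_HASH_SIZE - 1) = 0x603.  The shifts fold the
 * high-order (network) bits of daddr into the low bits kept by the mask,
 * so hosts that differ only in their upper address bits still spread
 * across buckets.
 */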

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = nh->nh_exceptions;
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		nh->nh_exceptions = hash;
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = expires;
		}
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
	return;
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (n) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, 0);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;

	rt = (struct rtable *) dst;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them altogether,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. needed" (breaks PMTU discovery) in icmp.c.
 */
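
/* Worked example of the schedule above, assuming HZ = 1000 and the default
 * tunables: ip_rt_redirect_load = HZ / 50 = 20 jiffies, so after the first
 * redirect the gaps grow as 20 << 1, 20 << 2, ... jiffies (40 ms, 80 ms,
 * 160 ms, ... up to roughly 10 s).  Once rate_tokens reaches
 * ip_rt_redirect_number (9) we go silent, and only a quiet period of
 * ip_rt_redirect_silence = (HZ / 50) << 10 = 20480 jiffies (~20 s) resets
 * rate_tokens to zero.
 */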

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything and
	 * set peer->rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
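
/* The peer block above is a classic token bucket, sketched here with the
 * defaults: rate_tokens accrues one token per jiffy elapsed since rate_last,
 * capped at ip_rt_error_burst (5 * HZ), and each ICMP error sent costs
 * ip_rt_error_cost (HZ) tokens.  Net effect: bursts of up to five
 * destination-unreachable messages per source, sustaining one per second.
 */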

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst->dev->mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (!rt->rt_pmtu) {
		dst->obsolete = DST_OBSOLETE_KILL;
	} else {
		rt->rt_pmtu = mtu;
		dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
	}

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPv4 dsts are created with ->obsolete set to
	 * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a
	 * route, this is indicated by setting obsolete to
	 * DST_OBSOLETE_KILL.
	 */
	if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
		return NULL;
	return dst;
}
1028 
1029 static void ipv4_link_failure(struct sk_buff *skb)
1030 {
1031 	struct rtable *rt;
1032 
1033 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1034 
1035 	rt = skb_rtable(skb);
1036 	if (rt)
1037 		dst_set_expires(&rt->dst, 0);
1038 }
1039 
1040 static int ip_rt_bug(struct sk_buff *skb)
1041 {
1042 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1043 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1044 		 skb->dev ? skb->dev->name : "?");
1045 	kfree_skb(skb);
1046 	WARN_ON(1);
1047 	return 0;
1048 }

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
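
/* E.g. with no RTAX_ADVMSS metric and a 1500-byte MTU, this yields
 * 1500 - 40 = 1460 bytes (MTU minus 20-byte IPv4 and TCP headers),
 * bounded below by ip_rt_min_advmss (256) and above by 65535 - 40 = 65495.
 */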

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
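
/* Resolution order above: a cached, unexpired rt_pmtu wins; otherwise the
 * RTAX_MTU metric; otherwise the device MTU.  The 576-byte clamp for locked
 * metrics on gatewayed routes echoes the classic RFC 1122 default for
 * non-local destinations, and everything is capped at IP_MAX_MTU (0xFFF0).
 */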

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			return fnhe;
	}
	return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
		if (orig && rt_is_expired(orig)) {
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
		}
		if (fnhe->fnhe_pmtu) {
			unsigned long expires = fnhe->fnhe_expires;
			unsigned long diff = expires - jiffies;

			if (time_before(jiffies, expires)) {
				rt->rt_pmtu = fnhe->fnhe_pmtu;
				dst_set_expires(&rt->dst, diff);
			}
		}
		if (fnhe->fnhe_gw) {
			rt->rt_flags |= RTCF_REDIRECTED;
			rt->rt_gateway = fnhe->fnhe_gw;
			rt->rt_uses_gateway = 1;
		} else if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		rcu_assign_pointer(fnhe->fnhe_rth, rt);
		if (orig)
			rt_free(orig);

		fnhe->fnhe_stamp = jiffies;
		ret = true;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig)
			rt_free(orig);
	} else
		ret = false;

	return ret;
}
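
/* rt_cache_route() publishes the new route with a lock-free cmpxchg: only
 * the writer that still sees the old value installs its rtable (and frees
 * the previous one via RCU); a loser returns false and the caller,
 * rt_set_nexthop(), then marks the route DST_NOCACHE and parks it on the
 * uncached list instead.
 */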

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (!list_empty(&rt->rt_uncached)) {
		spin_lock_bh(&rt_uncached_lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&rt_uncached_lock);
	}
}

void rt_flush_dev(struct net_device *dev)
{
	if (!list_empty(&rt_uncached_list)) {
		struct net *net = dev_net(dev);
		struct rtable *rt;

		spin_lock_bh(&rt_uncached_lock);
		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&rt_uncached_lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in the nexthop exception
			 * table or the FIB nexthop have the DST_NOCACHE bit
			 * clear.  However, if we are unsuccessful at storing
			 * this route into the cache we really need to set it.
			 */
			rt->dst.flags |= DST_NOCACHE;
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm, bool will_cache)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (our) {
		rth->dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC 1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
		flags |= RTCF_DOREDIRECT;
		do_cache = false;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP).  Do not create a route if it is
		 * invalid for proxy arp.  DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies.  See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	if (do_cache) {
		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	Called with rcu_read_lock().
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	 * by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for limited broadcast;
	 * I do not even know whether to fix this or not.  Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling
	 * IN_DEV_NET_ROUTE_LOCALNET() at all, and calls it at most once
	 * when daddr and/or saddr is a loopback address.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  LOOPBACK_IFINDEX,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input = ip_local_deliver;
	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid(net);
	rth->rt_flags	= flags | RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	if (do_cache)
		rt_cache_route(&FIB_RES_NH(res), rth);
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC 1812).
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	int res;

	rcu_read_lock();

	/* Multicast recognition logic was moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, e.g. for
	   SDR messages from all over the world.  Now we try to get rid of
	   them.  Really, provided the software IP multicast filter is
	   organized reasonably (at least, hashed), it does not result in a
	   slowdown compared with route cache reject entries.
	   Note that multicast routers are not affected, because a route
	   cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_noref);

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use
		 * the default one, but do not use a gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe)
			prth = &fnhe->fnhe_rth;
		else {
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_is_input = 0;
	rth->rt_iif	= orig_oif ? : 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

	return rth;
}
1878 
1879 /*
1880  * Major route resolver routine.
1881  */
1882 
1883 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1884 {
1885 	struct net_device *dev_out = NULL;
1886 	__u8 tos = RT_FL_TOS(fl4);
1887 	unsigned int flags = 0;
1888 	struct fib_result res;
1889 	struct rtable *rth;
1890 	int orig_oif;
1891 
1892 	res.tclassid	= 0;
1893 	res.fi		= NULL;
1894 	res.table	= NULL;
1895 
1896 	orig_oif = fl4->flowi4_oif;
1897 
1898 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
1899 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1900 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1901 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1902 
1903 	rcu_read_lock();
1904 	if (fl4->saddr) {
1905 		rth = ERR_PTR(-EINVAL);
1906 		if (ipv4_is_multicast(fl4->saddr) ||
1907 		    ipv4_is_lbcast(fl4->saddr) ||
1908 		    ipv4_is_zeronet(fl4->saddr))
1909 			goto out;
1910 
1911 		/* I removed check for oif == dev_out->oif here.
1912 		   It was wrong for two reasons:
1913 		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1914 		      is assigned to multiple interfaces.
1915 		   2. Moreover, we are allowed to send packets with saddr
1916 		      of another iface. --ANK
1917 		 */
1918 
1919 		if (fl4->flowi4_oif == 0 &&
1920 		    (ipv4_is_multicast(fl4->daddr) ||
1921 		     ipv4_is_lbcast(fl4->daddr))) {
1922 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1923 			dev_out = __ip_dev_find(net, fl4->saddr, false);
1924 			if (dev_out == NULL)
1925 				goto out;
1926 
1927 			/* Special hack: the user can direct multicasts
1928 			   and limited broadcast via the necessary interface
1929 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1930 			   This hack is not just for fun; it allows
1931 			   vic, vat and friends to work.
1932 			   They bind a socket to loopback, set the ttl to zero
1933 			   and expect that it will work.
1934 			   From the viewpoint of the routing cache they are
1935 			   broken: we are not allowed to build a multicast
1936 			   path with a loopback source addr (the routing cache
1937 			   cannot know that the ttl is zero, and hence that the
1938 			   packet will never leave this host and the route is
1939 			   in fact valid). Luckily, this hack is a good workaround.
1940 			 */
1941 
1942 			fl4->flowi4_oif = dev_out->ifindex;
1943 			goto make_route;
1944 		}
1945 
1946 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1947 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1948 			if (!__ip_dev_find(net, fl4->saddr, false))
1949 				goto out;
1950 		}
1951 	}
1952 
1953 
1954 	if (fl4->flowi4_oif) {
1955 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1956 		rth = ERR_PTR(-ENODEV);
1957 		if (dev_out == NULL)
1958 			goto out;
1959 
1960 		/* RACE: Check return value of inet_select_addr instead. */
1961 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1962 			rth = ERR_PTR(-ENETUNREACH);
1963 			goto out;
1964 		}
1965 		if (ipv4_is_local_multicast(fl4->daddr) ||
1966 		    ipv4_is_lbcast(fl4->daddr)) {
1967 			if (!fl4->saddr)
1968 				fl4->saddr = inet_select_addr(dev_out, 0,
1969 							      RT_SCOPE_LINK);
1970 			goto make_route;
1971 		}
1972 		if (fl4->saddr) {
1973 			if (ipv4_is_multicast(fl4->daddr))
1974 				fl4->saddr = inet_select_addr(dev_out, 0,
1975 							      fl4->flowi4_scope);
1976 			else if (!fl4->daddr)
1977 				fl4->saddr = inet_select_addr(dev_out, 0,
1978 							      RT_SCOPE_HOST);
1979 		}
1980 	}
1981 
1982 	if (!fl4->daddr) {
1983 		fl4->daddr = fl4->saddr;
1984 		if (!fl4->daddr)
1985 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1986 		dev_out = net->loopback_dev;
1987 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
1988 		res.type = RTN_LOCAL;
1989 		flags |= RTCF_LOCAL;
1990 		goto make_route;
1991 	}
1992 
1993 	if (fib_lookup(net, fl4, &res)) {
1994 		res.fi = NULL;
1995 		res.table = NULL;
1996 		if (fl4->flowi4_oif) {
1997 			/* Apparently, the routing tables are wrong. Assume
1998 			   that the destination is on-link.
1999 
2000 			   WHY? DW.
2001 			   Because we are allowed to send to an iface
2002 			   even if it has NO routes and NO assigned
2003 			   addresses. When oif is specified, the routing
2004 			   tables are looked up with only one purpose:
2005 			   to check whether the destination is gatewayed
2006 			   rather than direct. Moreover, if MSG_DONTROUTE
2007 			   is set, we send the packet, ignoring both the
2008 			   routing tables and the ifaddr state. --ANK
2009 
2010 
2011 			   We could do this even if oif is unknown
2012 			   (IPv6 likely does), but we do not.
2013 			 */
2014 
2015 			if (fl4->saddr == 0)
2016 				fl4->saddr = inet_select_addr(dev_out, 0,
2017 							      RT_SCOPE_LINK);
2018 			res.type = RTN_UNICAST;
2019 			goto make_route;
2020 		}
2021 		rth = ERR_PTR(-ENETUNREACH);
2022 		goto out;
2023 	}
2024 
2025 	if (res.type == RTN_LOCAL) {
2026 		if (!fl4->saddr) {
2027 			if (res.fi->fib_prefsrc)
2028 				fl4->saddr = res.fi->fib_prefsrc;
2029 			else
2030 				fl4->saddr = fl4->daddr;
2031 		}
2032 		dev_out = net->loopback_dev;
2033 		fl4->flowi4_oif = dev_out->ifindex;
2034 		flags |= RTCF_LOCAL;
2035 		goto make_route;
2036 	}
2037 
2038 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2039 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2040 		fib_select_multipath(&res);
2041 	else
2042 #endif
2043 	if (!res.prefixlen &&
2044 	    res.table->tb_num_default > 1 &&
2045 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2046 		fib_select_default(&res);
2047 
2048 	if (!fl4->saddr)
2049 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2050 
2051 	dev_out = FIB_RES_DEV(res);
2052 	fl4->flowi4_oif = dev_out->ifindex;
2053 
2054 
2055 make_route:
2056 	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2057 
2058 out:
2059 	rcu_read_unlock();
2060 	return rth;
2061 }
2062 EXPORT_SYMBOL_GPL(__ip_route_output_key);
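
/* Illustrative sketch (editorial, not in the original source): a minimal
 * in-kernel caller of __ip_route_output_key(). The function name and the
 * choice of leaving every flowi4 field but daddr zeroed are assumptions
 * made for the example.
 */
static struct rtable *example_output_route(struct net *net, __be32 daddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;	/* let the resolver pick saddr and oif */

	rt = __ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))		/* e.g. ERR_PTR(-ENETUNREACH) from above */
		return rt;

	/* On success fl4.saddr and fl4.flowi4_oif have been filled in;
	 * the caller owns a reference and must drop it with ip_rt_put().
	 */
	return rt;
}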
2063 
2064 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2065 {
2066 	return NULL;
2067 }
2068 
2069 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2070 {
2071 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2072 
2073 	return mtu ? : dst->dev->mtu;
2074 }
2075 
2076 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2077 					  struct sk_buff *skb, u32 mtu)
2078 {
2079 }
2080 
2081 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2082 				       struct sk_buff *skb)
2083 {
2084 }
2085 
2086 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2087 					  unsigned long old)
2088 {
2089 	return NULL;
2090 }
2091 
2092 static struct dst_ops ipv4_dst_blackhole_ops = {
2093 	.family			=	AF_INET,
2094 	.protocol		=	cpu_to_be16(ETH_P_IP),
2095 	.check			=	ipv4_blackhole_dst_check,
2096 	.mtu			=	ipv4_blackhole_mtu,
2097 	.default_advmss		=	ipv4_default_advmss,
2098 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2099 	.redirect		=	ipv4_rt_blackhole_redirect,
2100 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2101 	.neigh_lookup		=	ipv4_neigh_lookup,
2102 };
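
/* Clarifying note (editorial, not in the original source): the callbacks
 * above make a blackhole dst effectively immutable -- update_pmtu and
 * redirect are no-ops, cow_metrics returns NULL so metrics are never
 * written, and check returns NULL so any validation forces the caller to
 * re-resolve.
 */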
2103 
2104 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2105 {
2106 	struct rtable *ort = (struct rtable *) dst_orig;
2107 	struct rtable *rt;
2108 
2109 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2110 	if (rt) {
2111 		struct dst_entry *new = &rt->dst;
2112 
2113 		new->__use = 1;
2114 		new->input = dst_discard;
2115 		new->output = dst_discard;
2116 
2117 		new->dev = ort->dst.dev;
2118 		if (new->dev)
2119 			dev_hold(new->dev);
2120 
2121 		rt->rt_is_input = ort->rt_is_input;
2122 		rt->rt_iif = ort->rt_iif;
2123 		rt->rt_pmtu = ort->rt_pmtu;
2124 
2125 		rt->rt_genid = rt_genid(net);
2126 		rt->rt_flags = ort->rt_flags;
2127 		rt->rt_type = ort->rt_type;
2128 		rt->rt_gateway = ort->rt_gateway;
2129 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2130 
2131 		INIT_LIST_HEAD(&rt->rt_uncached);
2132 
2133 		dst_free(new);
2134 	}
2135 
2136 	dst_release(dst_orig);
2137 
2138 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2139 }
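
/* Clarifying note (editorial, not in the original source): the copy built
 * above preserves the original route's addressing fields but discards all
 * traffic via dst_discard; it is used (e.g. by the xfrm layer) when a
 * lookup must hand back a route object that is never allowed to transmit.
 */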
2140 
2141 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2142 				    struct sock *sk)
2143 {
2144 	struct rtable *rt = __ip_route_output_key(net, flp4);
2145 
2146 	if (IS_ERR(rt))
2147 		return rt;
2148 
2149 	if (flp4->flowi4_proto)
2150 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2151 						   flowi4_to_flowi(flp4),
2152 						   sk, 0);
2153 
2154 	return rt;
2155 }
2156 EXPORT_SYMBOL_GPL(ip_route_output_flow);
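
/* Illustrative sketch (editorial, not in the original source): resolving
 * a route for a concrete UDP flow, so that the xfrm_lookup() call above
 * can match protocol- and port-specific policies. The function name,
 * addresses and ports are made up for the example.
 */
static int example_udp_output_route(struct net *net, struct sock *sk)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr	 = htonl(0xc0000201);	/* 192.0.2.1, TEST-NET-1 */
	fl4.flowi4_proto = IPPROTO_UDP;
	fl4.fl4_dport	 = htons(53);
	fl4.fl4_sport	 = htons(33333);

	rt = ip_route_output_flow(net, &fl4, sk);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	ip_rt_put(rt);		/* sketch only: drop the reference again */
	return 0;
}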
2157 
2158 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2159 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2160 			u32 seq, int event, int nowait, unsigned int flags)
2161 {
2162 	struct rtable *rt = skb_rtable(skb);
2163 	struct rtmsg *r;
2164 	struct nlmsghdr *nlh;
2165 	unsigned long expires = 0;
2166 	u32 error;
2167 	u32 metrics[RTAX_MAX];
2168 
2169 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2170 	if (nlh == NULL)
2171 		return -EMSGSIZE;
2172 
2173 	r = nlmsg_data(nlh);
2174 	r->rtm_family	 = AF_INET;
2175 	r->rtm_dst_len	= 32;
2176 	r->rtm_src_len	= 0;
2177 	r->rtm_tos	= fl4->flowi4_tos;
2178 	r->rtm_table	= RT_TABLE_MAIN;
2179 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2180 		goto nla_put_failure;
2181 	r->rtm_type	= rt->rt_type;
2182 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2183 	r->rtm_protocol = RTPROT_UNSPEC;
2184 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2185 	if (rt->rt_flags & RTCF_NOTIFY)
2186 		r->rtm_flags |= RTM_F_NOTIFY;
2187 
2188 	if (nla_put_be32(skb, RTA_DST, dst))
2189 		goto nla_put_failure;
2190 	if (src) {
2191 		r->rtm_src_len = 32;
2192 		if (nla_put_be32(skb, RTA_SRC, src))
2193 			goto nla_put_failure;
2194 	}
2195 	if (rt->dst.dev &&
2196 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2197 		goto nla_put_failure;
2198 #ifdef CONFIG_IP_ROUTE_CLASSID
2199 	if (rt->dst.tclassid &&
2200 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2201 		goto nla_put_failure;
2202 #endif
2203 	if (!rt_is_input_route(rt) &&
2204 	    fl4->saddr != src) {
2205 		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2206 			goto nla_put_failure;
2207 	}
2208 	if (rt->rt_uses_gateway &&
2209 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2210 		goto nla_put_failure;
2211 
2212 	expires = rt->dst.expires;
2213 	if (expires) {
2214 		unsigned long now = jiffies;
2215 
2216 		if (time_before(now, expires))
2217 			expires -= now;
2218 		else
2219 			expires = 0;
2220 	}
2221 
2222 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2223 	if (rt->rt_pmtu && expires)
2224 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2225 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2226 		goto nla_put_failure;
2227 
2228 	if (fl4->flowi4_mark &&
2229 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2230 		goto nla_put_failure;
2231 
2232 	error = rt->dst.error;
2233 
2234 	if (rt_is_input_route(rt)) {
2235 		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2236 			goto nla_put_failure;
2237 	}
2238 
2239 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2240 		goto nla_put_failure;
2241 
2242 	return nlmsg_end(skb, nlh);
2243 
2244 nla_put_failure:
2245 	nlmsg_cancel(skb, nlh);
2246 	return -EMSGSIZE;
2247 }
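
/* Clarifying note (editorial, not in the original source): the reply
 * assembled above is an rtmsg header followed, as applicable, by
 * RTA_TABLE, RTA_DST, RTA_SRC, RTA_OIF, RTA_FLOW, RTA_PREFSRC,
 * RTA_GATEWAY, RTA_METRICS, RTA_MARK and RTA_IIF attributes, closed by
 * the rtnl_put_cacheinfo() expiry/error block.
 */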
2248 
2249 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2250 {
2251 	struct net *net = sock_net(in_skb->sk);
2252 	struct rtmsg *rtm;
2253 	struct nlattr *tb[RTA_MAX+1];
2254 	struct rtable *rt = NULL;
2255 	struct flowi4 fl4;
2256 	__be32 dst = 0;
2257 	__be32 src = 0;
2258 	u32 iif;
2259 	int err;
2260 	int mark;
2261 	struct sk_buff *skb;
2262 
2263 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2264 	if (err < 0)
2265 		goto errout;
2266 
2267 	rtm = nlmsg_data(nlh);
2268 
2269 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2270 	if (skb == NULL) {
2271 		err = -ENOBUFS;
2272 		goto errout;
2273 	}
2274 
2275 	/* Reserve room for dummy headers; this skb can pass
2276 	   through a good chunk of the routing engine.
2277 	 */
2278 	skb_reset_mac_header(skb);
2279 	skb_reset_network_header(skb);
2280 
2281 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2282 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2283 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2284 
2285 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2286 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2287 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2288 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2289 
2290 	memset(&fl4, 0, sizeof(fl4));
2291 	fl4.daddr = dst;
2292 	fl4.saddr = src;
2293 	fl4.flowi4_tos = rtm->rtm_tos;
2294 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2295 	fl4.flowi4_mark = mark;
2296 
2297 	if (iif) {
2298 		struct net_device *dev;
2299 
2300 		dev = __dev_get_by_index(net, iif);
2301 		if (dev == NULL) {
2302 			err = -ENODEV;
2303 			goto errout_free;
2304 		}
2305 
2306 		skb->protocol	= htons(ETH_P_IP);
2307 		skb->dev	= dev;
2308 		skb->mark	= mark;
2309 		local_bh_disable();
2310 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2311 		local_bh_enable();
2312 
2313 		rt = skb_rtable(skb);
2314 		if (err == 0 && rt->dst.error)
2315 			err = -rt->dst.error;
2316 	} else {
2317 		rt = ip_route_output_key(net, &fl4);
2318 
2319 		err = 0;
2320 		if (IS_ERR(rt))
2321 			err = PTR_ERR(rt);
2322 	}
2323 
2324 	if (err)
2325 		goto errout_free;
2326 
2327 	skb_dst_set(skb, &rt->dst);
2328 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2329 		rt->rt_flags |= RTCF_NOTIFY;
2330 
2331 	err = rt_fill_info(net, dst, src, &fl4, skb,
2332 			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2333 			   RTM_NEWROUTE, 0, 0);
2334 	if (err <= 0)
2335 		goto errout_free;
2336 
2337 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2338 errout:
2339 	return err;
2340 
2341 errout_free:
2342 	kfree_skb(skb);
2343 	goto errout;
2344 }
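
/* Clarifying note (editorial, not in the original source): this handler
 * is what services, for example,
 *
 *	$ ip route get 192.0.2.1
 *
 * iproute2 sends an RTM_GETROUTE request; the kernel resolves it through
 * ip_route_input() or ip_route_output_key() and replies with the
 * RTM_NEWROUTE message built by rt_fill_info() above.
 */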
2345 
2346 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2347 {
2348 	return skb->len;
2349 }
2350 
2351 void ip_rt_multicast_event(struct in_device *in_dev)
2352 {
2353 	rt_cache_flush(dev_net(in_dev->dev));
2354 }
2355 
2356 #ifdef CONFIG_SYSCTL
2357 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2358 					void __user *buffer,
2359 					size_t *lenp, loff_t *ppos)
2360 {
2361 	if (write) {
2362 		rt_cache_flush((struct net *)__ctl->extra1);
2363 		return 0;
2364 	}
2365 
2366 	return -EINVAL;
2367 }
2368 
2369 static struct ctl_table ipv4_route_table[] = {
2370 	{
2371 		.procname	= "gc_thresh",
2372 		.data		= &ipv4_dst_ops.gc_thresh,
2373 		.maxlen		= sizeof(int),
2374 		.mode		= 0644,
2375 		.proc_handler	= proc_dointvec,
2376 	},
2377 	{
2378 		.procname	= "max_size",
2379 		.data		= &ip_rt_max_size,
2380 		.maxlen		= sizeof(int),
2381 		.mode		= 0644,
2382 		.proc_handler	= proc_dointvec,
2383 	},
2384 	{
2385 		/* Deprecated. Use gc_min_interval_ms */
2386 
2387 		.procname	= "gc_min_interval",
2388 		.data		= &ip_rt_gc_min_interval,
2389 		.maxlen		= sizeof(int),
2390 		.mode		= 0644,
2391 		.proc_handler	= proc_dointvec_jiffies,
2392 	},
2393 	{
2394 		.procname	= "gc_min_interval_ms",
2395 		.data		= &ip_rt_gc_min_interval,
2396 		.maxlen		= sizeof(int),
2397 		.mode		= 0644,
2398 		.proc_handler	= proc_dointvec_ms_jiffies,
2399 	},
2400 	{
2401 		.procname	= "gc_timeout",
2402 		.data		= &ip_rt_gc_timeout,
2403 		.maxlen		= sizeof(int),
2404 		.mode		= 0644,
2405 		.proc_handler	= proc_dointvec_jiffies,
2406 	},
2407 	{
2408 		.procname	= "gc_interval",
2409 		.data		= &ip_rt_gc_interval,
2410 		.maxlen		= sizeof(int),
2411 		.mode		= 0644,
2412 		.proc_handler	= proc_dointvec_jiffies,
2413 	},
2414 	{
2415 		.procname	= "redirect_load",
2416 		.data		= &ip_rt_redirect_load,
2417 		.maxlen		= sizeof(int),
2418 		.mode		= 0644,
2419 		.proc_handler	= proc_dointvec,
2420 	},
2421 	{
2422 		.procname	= "redirect_number",
2423 		.data		= &ip_rt_redirect_number,
2424 		.maxlen		= sizeof(int),
2425 		.mode		= 0644,
2426 		.proc_handler	= proc_dointvec,
2427 	},
2428 	{
2429 		.procname	= "redirect_silence",
2430 		.data		= &ip_rt_redirect_silence,
2431 		.maxlen		= sizeof(int),
2432 		.mode		= 0644,
2433 		.proc_handler	= proc_dointvec,
2434 	},
2435 	{
2436 		.procname	= "error_cost",
2437 		.data		= &ip_rt_error_cost,
2438 		.maxlen		= sizeof(int),
2439 		.mode		= 0644,
2440 		.proc_handler	= proc_dointvec,
2441 	},
2442 	{
2443 		.procname	= "error_burst",
2444 		.data		= &ip_rt_error_burst,
2445 		.maxlen		= sizeof(int),
2446 		.mode		= 0644,
2447 		.proc_handler	= proc_dointvec,
2448 	},
2449 	{
2450 		.procname	= "gc_elasticity",
2451 		.data		= &ip_rt_gc_elasticity,
2452 		.maxlen		= sizeof(int),
2453 		.mode		= 0644,
2454 		.proc_handler	= proc_dointvec,
2455 	},
2456 	{
2457 		.procname	= "mtu_expires",
2458 		.data		= &ip_rt_mtu_expires,
2459 		.maxlen		= sizeof(int),
2460 		.mode		= 0644,
2461 		.proc_handler	= proc_dointvec_jiffies,
2462 	},
2463 	{
2464 		.procname	= "min_pmtu",
2465 		.data		= &ip_rt_min_pmtu,
2466 		.maxlen		= sizeof(int),
2467 		.mode		= 0644,
2468 		.proc_handler	= proc_dointvec,
2469 	},
2470 	{
2471 		.procname	= "min_adv_mss",
2472 		.data		= &ip_rt_min_advmss,
2473 		.maxlen		= sizeof(int),
2474 		.mode		= 0644,
2475 		.proc_handler	= proc_dointvec,
2476 	},
2477 	{ }
2478 };
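
/* Clarifying note (editorial, not in the original source): each entry
 * above surfaces as /proc/sys/net/ipv4/route/<procname>, e.g.
 *
 *	$ sysctl net.ipv4.route.gc_thresh
 *	$ echo 600 > /proc/sys/net/ipv4/route/mtu_expires
 *
 * (registered for init_net by ip_static_sysctl_init() at the bottom of
 * this file).
 */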
2479 
2480 static struct ctl_table ipv4_route_flush_table[] = {
2481 	{
2482 		.procname	= "flush",
2483 		.maxlen		= sizeof(int),
2484 		.mode		= 0200,
2485 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2486 	},
2487 	{ },
2488 };
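
/* Clarifying note (editorial, not in the original source): writing any
 * value to the per-netns /proc/sys/net/ipv4/route/flush file, e.g.
 *
 *	$ echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * ends up in ipv4_sysctl_rtcache_flush() above; reads fail with -EINVAL
 * because the file is write-only (mode 0200).
 */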
2489 
2490 static __net_init int sysctl_route_net_init(struct net *net)
2491 {
2492 	struct ctl_table *tbl;
2493 
2494 	tbl = ipv4_route_flush_table;
2495 	if (!net_eq(net, &init_net)) {
2496 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2497 		if (tbl == NULL)
2498 			goto err_dup;
2499 	}
2500 	tbl[0].extra1 = net;
2501 
2502 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2503 	if (net->ipv4.route_hdr == NULL)
2504 		goto err_reg;
2505 	return 0;
2506 
2507 err_reg:
2508 	if (tbl != ipv4_route_flush_table)
2509 		kfree(tbl);
2510 err_dup:
2511 	return -ENOMEM;
2512 }
2513 
2514 static __net_exit void sysctl_route_net_exit(struct net *net)
2515 {
2516 	struct ctl_table *tbl;
2517 
2518 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2519 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2520 	BUG_ON(tbl == ipv4_route_flush_table);
2521 	kfree(tbl);
2522 }
2523 
2524 static __net_initdata struct pernet_operations sysctl_route_ops = {
2525 	.init = sysctl_route_net_init,
2526 	.exit = sysctl_route_net_exit,
2527 };
2528 #endif
2529 
2530 static __net_init int rt_genid_init(struct net *net)
2531 {
2532 	atomic_set(&net->rt_genid, 0);
2533 	get_random_bytes(&net->ipv4.dev_addr_genid,
2534 			 sizeof(net->ipv4.dev_addr_genid));
2535 	return 0;
2536 }
2537 
2538 static __net_initdata struct pernet_operations rt_genid_ops = {
2539 	.init = rt_genid_init,
2540 };
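
/* Clarifying note (editorial, not in the original source): every rtable
 * built in this file records the generation counter seeded here (see the
 * rt_genid assignments above), so rt_cache_flush() can invalidate all
 * cached routes in a netns simply by bumping net->rt_genid.
 */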
2541 
2542 static int __net_init ipv4_inetpeer_init(struct net *net)
2543 {
2544 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2545 
2546 	if (!bp)
2547 		return -ENOMEM;
2548 	inet_peer_base_init(bp);
2549 	net->ipv4.peers = bp;
2550 	return 0;
2551 }
2552 
2553 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2554 {
2555 	struct inet_peer_base *bp = net->ipv4.peers;
2556 
2557 	net->ipv4.peers = NULL;
2558 	inetpeer_invalidate_tree(bp);
2559 	kfree(bp);
2560 }
2561 
2562 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2563 	.init	=	ipv4_inetpeer_init,
2564 	.exit	=	ipv4_inetpeer_exit,
2565 };
2566 
2567 #ifdef CONFIG_IP_ROUTE_CLASSID
2568 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2569 #endif /* CONFIG_IP_ROUTE_CLASSID */
2570 
2571 int __init ip_rt_init(void)
2572 {
2573 	int rc = 0;
2574 
2575 #ifdef CONFIG_IP_ROUTE_CLASSID
2576 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2577 	if (!ip_rt_acct)
2578 		panic("IP: failed to allocate ip_rt_acct\n");
2579 #endif
2580 
2581 	ipv4_dst_ops.kmem_cachep =
2582 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2583 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2584 
2585 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2586 
2587 	if (dst_entries_init(&ipv4_dst_ops) < 0)
2588 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2589 
2590 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2591 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2592 
2593 	ipv4_dst_ops.gc_thresh = ~0;
2594 	ip_rt_max_size = INT_MAX;
2595 
2596 	devinet_init();
2597 	ip_fib_init();
2598 
2599 	if (ip_rt_proc_init())
2600 		pr_err("Unable to create route proc files\n");
2601 #ifdef CONFIG_XFRM
2602 	xfrm_init();
2603 	xfrm4_init();
2604 #endif
2605 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2606 
2607 #ifdef CONFIG_SYSCTL
2608 	register_pernet_subsys(&sysctl_route_ops);
2609 #endif
2610 	register_pernet_subsys(&rt_genid_ops);
2611 	register_pernet_subsys(&ipv4_inetpeer_ops);
2612 	return rc;
2613 }
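
/* Clarifying note (editorial, not in the original source): ip_rt_init()
 * runs once at boot from ip_init() during inet initialization; the
 * pernet operations it registers then re-run their init hooks for each
 * network namespace created afterwards.
 */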
2614 
2615 #ifdef CONFIG_SYSCTL
2616 /*
2617  * We really need to sanitize the damn ipv4 init order, then all
2618  * this nonsense will go away.
2619  */
2620 void __init ip_static_sysctl_init(void)
2621 {
2622 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2623 }
2624 #endif
2625