xref: /openbmc/linux/net/ipv4/route.c (revision 0a73d21e)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116 
117 #include "fib_lookup.h"
118 
119 #define RT_FL_TOS(oldflp4) \
120 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
121 
122 #define RT_GC_TIMEOUT (300*HZ)
123 
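/* Defaults for the tunables below.  Most of these are only boot-time
 * defaults: they can be changed at run time through the sysctls
 * registered later in this file under /proc/sys/net/ipv4/route/
 * (redirect_load, error_burst, mtu_expires, min_pmtu, ...).
 */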
124 static int ip_rt_max_size;
125 static int ip_rt_redirect_number __read_mostly	= 9;
126 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly	= HZ;
129 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
130 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
131 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly	= 256;
133 
134 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
135 
136 /*
137  *	Interface to generic destination cache.
138  */
139 
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
142 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144 static void		 ipv4_link_failure(struct sk_buff *skb);
145 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
146 					   struct sk_buff *skb, u32 mtu);
147 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
148 					struct sk_buff *skb);
149 static void		ipv4_dst_destroy(struct dst_entry *dst);
150 
151 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
152 {
153 	WARN_ON(1);
154 	return NULL;
155 }
156 
157 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
158 					   struct sk_buff *skb,
159 					   const void *daddr);
160 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
161 
162 static struct dst_ops ipv4_dst_ops = {
163 	.family =		AF_INET,
164 	.check =		ipv4_dst_check,
165 	.default_advmss =	ipv4_default_advmss,
166 	.mtu =			ipv4_mtu,
167 	.cow_metrics =		ipv4_cow_metrics,
168 	.destroy =		ipv4_dst_destroy,
169 	.negative_advice =	ipv4_negative_advice,
170 	.link_failure =		ipv4_link_failure,
171 	.update_pmtu =		ip_rt_update_pmtu,
172 	.redirect =		ip_do_redirect,
173 	.local_out =		__ip_local_out,
174 	.neigh_lookup =		ipv4_neigh_lookup,
175 	.confirm_neigh =	ipv4_confirm_neigh,
176 };
177 
178 #define ECN_OR_COST(class)	TC_PRIO_##class
179 
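/* Map the IPv4 TOS field to a traffic-control queueing priority.
 * A rough sketch of how the table is used (see rt_tos2priority()):
 *
 *	prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * e.g. IPTOS_LOWDELAY (0x10) selects index 8 -> TC_PRIO_INTERACTIVE,
 * and IPTOS_THROUGHPUT (0x08) selects index 4 -> TC_PRIO_BULK.
 */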
180 const __u8 ip_tos2prio[16] = {
181 	TC_PRIO_BESTEFFORT,
182 	ECN_OR_COST(BESTEFFORT),
183 	TC_PRIO_BESTEFFORT,
184 	ECN_OR_COST(BESTEFFORT),
185 	TC_PRIO_BULK,
186 	ECN_OR_COST(BULK),
187 	TC_PRIO_BULK,
188 	ECN_OR_COST(BULK),
189 	TC_PRIO_INTERACTIVE,
190 	ECN_OR_COST(INTERACTIVE),
191 	TC_PRIO_INTERACTIVE,
192 	ECN_OR_COST(INTERACTIVE),
193 	TC_PRIO_INTERACTIVE_BULK,
194 	ECN_OR_COST(INTERACTIVE_BULK),
195 	TC_PRIO_INTERACTIVE_BULK,
196 	ECN_OR_COST(INTERACTIVE_BULK)
197 };
198 EXPORT_SYMBOL(ip_tos2prio);
199 
200 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
201 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
202 
203 #ifdef CONFIG_PROC_FS
204 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
205 {
206 	if (*pos)
207 		return NULL;
208 	return SEQ_START_TOKEN;
209 }
210 
211 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
212 {
213 	++*pos;
214 	return NULL;
215 }
216 
217 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
218 {
219 }
220 
221 static int rt_cache_seq_show(struct seq_file *seq, void *v)
222 {
223 	if (v == SEQ_START_TOKEN)
224 		seq_printf(seq, "%-127s\n",
225 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
226 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
227 			   "HHUptod\tSpecDst");
228 	return 0;
229 }
230 
231 static const struct seq_operations rt_cache_seq_ops = {
232 	.start  = rt_cache_seq_start,
233 	.next   = rt_cache_seq_next,
234 	.stop   = rt_cache_seq_stop,
235 	.show   = rt_cache_seq_show,
236 };
237 
238 static int rt_cache_seq_open(struct inode *inode, struct file *file)
239 {
240 	return seq_open(file, &rt_cache_seq_ops);
241 }
242 
243 static const struct file_operations rt_cache_seq_fops = {
244 	.open	 = rt_cache_seq_open,
245 	.read	 = seq_read,
246 	.llseek	 = seq_lseek,
247 	.release = seq_release,
248 };
249 
250 
251 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
252 {
253 	int cpu;
254 
255 	if (*pos == 0)
256 		return SEQ_START_TOKEN;
257 
258 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
259 		if (!cpu_possible(cpu))
260 			continue;
261 		*pos = cpu+1;
262 		return &per_cpu(rt_cache_stat, cpu);
263 	}
264 	return NULL;
265 }
266 
267 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
268 {
269 	int cpu;
270 
271 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
272 		if (!cpu_possible(cpu))
273 			continue;
274 		*pos = cpu+1;
275 		return &per_cpu(rt_cache_stat, cpu);
276 	}
277 	return NULL;
278 
279 }
280 
281 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
282 {
283 
284 }
285 
286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287 {
288 	struct rt_cache_stat *st = v;
289 
290 	if (v == SEQ_START_TOKEN) {
291 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
292 		return 0;
293 	}
294 
295 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
296 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
297 		   dst_entries_get_slow(&ipv4_dst_ops),
298 		   0, /* st->in_hit */
299 		   st->in_slow_tot,
300 		   st->in_slow_mc,
301 		   st->in_no_route,
302 		   st->in_brd,
303 		   st->in_martian_dst,
304 		   st->in_martian_src,
305 
306 		   0, /* st->out_hit */
307 		   st->out_slow_tot,
308 		   st->out_slow_mc,
309 
310 		   0, /* st->gc_total */
311 		   0, /* st->gc_ignored */
312 		   0, /* st->gc_goal_miss */
313 		   0, /* st->gc_dst_overflow */
314 		   0, /* st->in_hlist_search */
315 		   0  /* st->out_hlist_search */
316 		);
317 	return 0;
318 }
319 
320 static const struct seq_operations rt_cpu_seq_ops = {
321 	.start  = rt_cpu_seq_start,
322 	.next   = rt_cpu_seq_next,
323 	.stop   = rt_cpu_seq_stop,
324 	.show   = rt_cpu_seq_show,
325 };
326 
327 
328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329 {
330 	return seq_open(file, &rt_cpu_seq_ops);
331 }
332 
333 static const struct file_operations rt_cpu_seq_fops = {
334 	.open	 = rt_cpu_seq_open,
335 	.read	 = seq_read,
336 	.llseek	 = seq_lseek,
337 	.release = seq_release,
338 };
339 
340 #ifdef CONFIG_IP_ROUTE_CLASSID
341 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 {
343 	struct ip_rt_acct *dst, *src;
344 	unsigned int i, j;
345 
346 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 	if (!dst)
348 		return -ENOMEM;
349 
350 	for_each_possible_cpu(i) {
351 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352 		for (j = 0; j < 256; j++) {
353 			dst[j].o_bytes   += src[j].o_bytes;
354 			dst[j].o_packets += src[j].o_packets;
355 			dst[j].i_bytes   += src[j].i_bytes;
356 			dst[j].i_packets += src[j].i_packets;
357 		}
358 	}
359 
360 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361 	kfree(dst);
362 	return 0;
363 }
364 
365 static int rt_acct_proc_open(struct inode *inode, struct file *file)
366 {
367 	return single_open(file, rt_acct_proc_show, NULL);
368 }
369 
370 static const struct file_operations rt_acct_proc_fops = {
371 	.open		= rt_acct_proc_open,
372 	.read		= seq_read,
373 	.llseek		= seq_lseek,
374 	.release	= single_release,
375 };
376 #endif
377 
378 static int __net_init ip_rt_do_proc_init(struct net *net)
379 {
380 	struct proc_dir_entry *pde;
381 
382 	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
383 			  &rt_cache_seq_fops);
384 	if (!pde)
385 		goto err1;
386 
387 	pde = proc_create("rt_cache", S_IRUGO,
388 			  net->proc_net_stat, &rt_cpu_seq_fops);
389 	if (!pde)
390 		goto err2;
391 
392 #ifdef CONFIG_IP_ROUTE_CLASSID
393 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
394 	if (!pde)
395 		goto err3;
396 #endif
397 	return 0;
398 
399 #ifdef CONFIG_IP_ROUTE_CLASSID
400 err3:
401 	remove_proc_entry("rt_cache", net->proc_net_stat);
402 #endif
403 err2:
404 	remove_proc_entry("rt_cache", net->proc_net);
405 err1:
406 	return -ENOMEM;
407 }
408 
409 static void __net_exit ip_rt_do_proc_exit(struct net *net)
410 {
411 	remove_proc_entry("rt_cache", net->proc_net_stat);
412 	remove_proc_entry("rt_cache", net->proc_net);
413 #ifdef CONFIG_IP_ROUTE_CLASSID
414 	remove_proc_entry("rt_acct", net->proc_net);
415 #endif
416 }
417 
418 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
419 	.init = ip_rt_do_proc_init,
420 	.exit = ip_rt_do_proc_exit,
421 };
422 
423 static int __init ip_rt_proc_init(void)
424 {
425 	return register_pernet_subsys(&ip_rt_proc_ops);
426 }
427 
428 #else
429 static inline int ip_rt_proc_init(void)
430 {
431 	return 0;
432 }
433 #endif /* CONFIG_PROC_FS */
434 
435 static inline bool rt_is_expired(const struct rtable *rth)
436 {
437 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
438 }
439 
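/* With the old routing cache gone, a "flush" simply bumps the per-netns
 * IPv4 route generation id; dsts still cached in nexthops and nexthop
 * exceptions then fail rt_is_expired() and are rebuilt on next use.
 */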
440 void rt_cache_flush(struct net *net)
441 {
442 	rt_genid_bump_ipv4(net);
443 }
444 
445 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
446 					   struct sk_buff *skb,
447 					   const void *daddr)
448 {
449 	struct net_device *dev = dst->dev;
450 	const __be32 *pkey = daddr;
451 	const struct rtable *rt;
452 	struct neighbour *n;
453 
454 	rt = (const struct rtable *) dst;
455 	if (rt->rt_gateway)
456 		pkey = (const __be32 *) &rt->rt_gateway;
457 	else if (skb)
458 		pkey = &ip_hdr(skb)->daddr;
459 
460 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
461 	if (n)
462 		return n;
463 	return neigh_create(&arp_tbl, pkey, dev);
464 }
465 
466 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
467 {
468 	struct net_device *dev = dst->dev;
469 	const __be32 *pkey = daddr;
470 	const struct rtable *rt;
471 
472 	rt = (const struct rtable *)dst;
473 	if (rt->rt_gateway)
474 		pkey = (const __be32 *)&rt->rt_gateway;
475 	else if (!daddr ||
476 		 (rt->rt_flags &
477 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
478 		return;
479 
480 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
481 }
482 
483 #define IP_IDENTS_SZ 2048u
484 
485 static atomic_t *ip_idents __read_mostly;
486 static u32 *ip_tstamps __read_mostly;
487 
488 /* In order to protect privacy, we add a perturbation to identifiers
489  * if one generator is seldom used. This makes it hard for an attacker
490  * to infer how many packets were sent between two points in time.
491  */
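/* Rough sketch of the scheme: each of the IP_IDENTS_SZ buckets keeps an
 * ID counter plus the jiffies stamp of its last use.  If a bucket has
 * been idle for N jiffies, the next reservation also jumps the counter
 * forward by a random value in [0, N), so the difference between two
 * observed IDs no longer reveals how many packets were sent in between.
 */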
492 u32 ip_idents_reserve(u32 hash, int segs)
493 {
494 	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
495 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
496 	u32 old = READ_ONCE(*p_tstamp);
497 	u32 now = (u32)jiffies;
498 	u32 new, delta = 0;
499 
500 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
501 		delta = prandom_u32_max(now - old);
502 
503 	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
504 	do {
505 		old = (u32)atomic_read(p_id);
506 		new = old + delta + segs;
507 	} while (atomic_cmpxchg(p_id, old, new) != old);
508 
509 	return new - segs;
510 }
511 EXPORT_SYMBOL(ip_idents_reserve);
512 
513 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
514 {
515 	static u32 ip_idents_hashrnd __read_mostly;
516 	u32 hash, id;
517 
518 	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
519 
520 	hash = jhash_3words((__force u32)iph->daddr,
521 			    (__force u32)iph->saddr,
522 			    iph->protocol ^ net_hash_mix(net),
523 			    ip_idents_hashrnd);
524 	id = ip_idents_reserve(hash, segs);
525 	iph->id = htons(id);
526 }
527 EXPORT_SYMBOL(__ip_select_ident);
528 
529 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
530 			     const struct sock *sk,
531 			     const struct iphdr *iph,
532 			     int oif, u8 tos,
533 			     u8 prot, u32 mark, int flow_flags)
534 {
535 	if (sk) {
536 		const struct inet_sock *inet = inet_sk(sk);
537 
538 		oif = sk->sk_bound_dev_if;
539 		mark = sk->sk_mark;
540 		tos = RT_CONN_FLAGS(sk);
541 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
542 	}
543 	flowi4_init_output(fl4, oif, mark, tos,
544 			   RT_SCOPE_UNIVERSE, prot,
545 			   flow_flags,
546 			   iph->daddr, iph->saddr, 0, 0,
547 			   sock_net_uid(net, sk));
548 }
549 
550 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
551 			       const struct sock *sk)
552 {
553 	const struct net *net = dev_net(skb->dev);
554 	const struct iphdr *iph = ip_hdr(skb);
555 	int oif = skb->dev->ifindex;
556 	u8 tos = RT_TOS(iph->tos);
557 	u8 prot = iph->protocol;
558 	u32 mark = skb->mark;
559 
560 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
561 }
562 
563 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
564 {
565 	const struct inet_sock *inet = inet_sk(sk);
566 	const struct ip_options_rcu *inet_opt;
567 	__be32 daddr = inet->inet_daddr;
568 
569 	rcu_read_lock();
570 	inet_opt = rcu_dereference(inet->inet_opt);
571 	if (inet_opt && inet_opt->opt.srr)
572 		daddr = inet_opt->opt.faddr;
573 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
574 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
575 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
576 			   inet_sk_flowi_flags(sk),
577 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
578 	rcu_read_unlock();
579 }
580 
581 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
582 				 const struct sk_buff *skb)
583 {
584 	if (skb)
585 		build_skb_flow_key(fl4, skb, sk);
586 	else
587 		build_sk_flow_key(fl4, sk);
588 }
589 
590 static DEFINE_SPINLOCK(fnhe_lock);
591 
592 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
593 {
594 	struct rtable *rt;
595 
596 	rt = rcu_dereference(fnhe->fnhe_rth_input);
597 	if (rt) {
598 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
599 		dst_dev_put(&rt->dst);
600 		dst_release(&rt->dst);
601 	}
602 	rt = rcu_dereference(fnhe->fnhe_rth_output);
603 	if (rt) {
604 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
605 		dst_dev_put(&rt->dst);
606 		dst_release(&rt->dst);
607 	}
608 }
609 
610 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
611 {
612 	struct fib_nh_exception *fnhe, *oldest;
613 
614 	oldest = rcu_dereference(hash->chain);
615 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
616 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
617 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
618 			oldest = fnhe;
619 	}
620 	fnhe_flush_routes(oldest);
621 	return oldest;
622 }
623 
624 static inline u32 fnhe_hashfun(__be32 daddr)
625 {
626 	static u32 fnhe_hashrnd __read_mostly;
627 	u32 hval;
628 
629 	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
630 	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
631 	return hash_32(hval, FNHE_HASH_SHIFT);
632 }
633 
634 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
635 {
636 	rt->rt_pmtu = fnhe->fnhe_pmtu;
637 	rt->dst.expires = fnhe->fnhe_expires;
638 
639 	if (fnhe->fnhe_gw) {
640 		rt->rt_flags |= RTCF_REDIRECTED;
641 		rt->rt_gateway = fnhe->fnhe_gw;
642 		rt->rt_uses_gateway = 1;
643 	}
644 }
645 
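/* Next-hop exceptions (fnhe) record per-destination state learned from
 * ICMP redirects and PMTU updates.  They hang off the FIB nexthop in a
 * small hash table keyed by destination address; once a chain grows
 * beyond FNHE_RECLAIM_DEPTH the oldest entry is recycled rather than
 * allocating a new one.
 */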
646 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
647 				  u32 pmtu, unsigned long expires)
648 {
649 	struct fnhe_hash_bucket *hash;
650 	struct fib_nh_exception *fnhe;
651 	struct rtable *rt;
652 	u32 genid, hval;
653 	unsigned int i;
654 	int depth;
655 
656 	genid = fnhe_genid(dev_net(nh->nh_dev));
657 	hval = fnhe_hashfun(daddr);
658 
659 	spin_lock_bh(&fnhe_lock);
660 
661 	hash = rcu_dereference(nh->nh_exceptions);
662 	if (!hash) {
663 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
664 		if (!hash)
665 			goto out_unlock;
666 		rcu_assign_pointer(nh->nh_exceptions, hash);
667 	}
668 
669 	hash += hval;
670 
671 	depth = 0;
672 	for (fnhe = rcu_dereference(hash->chain); fnhe;
673 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
674 		if (fnhe->fnhe_daddr == daddr)
675 			break;
676 		depth++;
677 	}
678 
679 	if (fnhe) {
680 		if (fnhe->fnhe_genid != genid)
681 			fnhe->fnhe_genid = genid;
682 		if (gw)
683 			fnhe->fnhe_gw = gw;
684 		if (pmtu)
685 			fnhe->fnhe_pmtu = pmtu;
686 		fnhe->fnhe_expires = max(1UL, expires);
687 		/* Update all cached dsts too */
688 		rt = rcu_dereference(fnhe->fnhe_rth_input);
689 		if (rt)
690 			fill_route_from_fnhe(rt, fnhe);
691 		rt = rcu_dereference(fnhe->fnhe_rth_output);
692 		if (rt)
693 			fill_route_from_fnhe(rt, fnhe);
694 	} else {
695 		if (depth > FNHE_RECLAIM_DEPTH)
696 			fnhe = fnhe_oldest(hash);
697 		else {
698 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
699 			if (!fnhe)
700 				goto out_unlock;
701 
702 			fnhe->fnhe_next = hash->chain;
703 			rcu_assign_pointer(hash->chain, fnhe);
704 		}
705 		fnhe->fnhe_genid = genid;
706 		fnhe->fnhe_daddr = daddr;
707 		fnhe->fnhe_gw = gw;
708 		fnhe->fnhe_pmtu = pmtu;
709 		fnhe->fnhe_expires = expires;
710 
711 		/* Exception created; mark the cached routes for the nexthop
712 		 * stale, so anyone caching them rechecks whether this exception
713 		 * applies.
714 		 */
715 		rt = rcu_dereference(nh->nh_rth_input);
716 		if (rt)
717 			rt->dst.obsolete = DST_OBSOLETE_KILL;
718 
719 		for_each_possible_cpu(i) {
720 			struct rtable __rcu **prt;
721 			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
722 			rt = rcu_dereference(*prt);
723 			if (rt)
724 				rt->dst.obsolete = DST_OBSOLETE_KILL;
725 		}
726 	}
727 
728 	fnhe->fnhe_stamp = jiffies;
729 
730 out_unlock:
731 	spin_unlock_bh(&fnhe_lock);
732 }
733 
734 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
735 			     bool kill_route)
736 {
737 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
738 	__be32 old_gw = ip_hdr(skb)->saddr;
739 	struct net_device *dev = skb->dev;
740 	struct in_device *in_dev;
741 	struct fib_result res;
742 	struct neighbour *n;
743 	struct net *net;
744 
745 	switch (icmp_hdr(skb)->code & 7) {
746 	case ICMP_REDIR_NET:
747 	case ICMP_REDIR_NETTOS:
748 	case ICMP_REDIR_HOST:
749 	case ICMP_REDIR_HOSTTOS:
750 		break;
751 
752 	default:
753 		return;
754 	}
755 
756 	if (rt->rt_gateway != old_gw)
757 		return;
758 
759 	in_dev = __in_dev_get_rcu(dev);
760 	if (!in_dev)
761 		return;
762 
763 	net = dev_net(dev);
764 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
765 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
766 	    ipv4_is_zeronet(new_gw))
767 		goto reject_redirect;
768 
769 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
770 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
771 			goto reject_redirect;
772 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
773 			goto reject_redirect;
774 	} else {
775 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
776 			goto reject_redirect;
777 	}
778 
779 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
780 	if (!n)
781 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
782 	if (!IS_ERR(n)) {
783 		if (!(n->nud_state & NUD_VALID)) {
784 			neigh_event_send(n, NULL);
785 		} else {
786 			if (fib_lookup(net, fl4, &res, 0) == 0) {
787 				struct fib_nh *nh = &FIB_RES_NH(res);
788 
789 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
790 						0, jiffies + ip_rt_gc_timeout);
791 			}
792 			if (kill_route)
793 				rt->dst.obsolete = DST_OBSOLETE_KILL;
794 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
795 		}
796 		neigh_release(n);
797 	}
798 	return;
799 
800 reject_redirect:
801 #ifdef CONFIG_IP_ROUTE_VERBOSE
802 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
803 		const struct iphdr *iph = (const struct iphdr *) skb->data;
804 		__be32 daddr = iph->daddr;
805 		__be32 saddr = iph->saddr;
806 
807 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
808 				     "  Advised path = %pI4 -> %pI4\n",
809 				     &old_gw, dev->name, &new_gw,
810 				     &saddr, &daddr);
811 	}
812 #endif
813 	;
814 }
815 
816 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
817 {
818 	struct rtable *rt;
819 	struct flowi4 fl4;
820 	const struct iphdr *iph = (const struct iphdr *) skb->data;
821 	struct net *net = dev_net(skb->dev);
822 	int oif = skb->dev->ifindex;
823 	u8 tos = RT_TOS(iph->tos);
824 	u8 prot = iph->protocol;
825 	u32 mark = skb->mark;
826 
827 	rt = (struct rtable *) dst;
828 
829 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
830 	__ip_do_redirect(rt, skb, &fl4, true);
831 }
832 
833 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
834 {
835 	struct rtable *rt = (struct rtable *)dst;
836 	struct dst_entry *ret = dst;
837 
838 	if (rt) {
839 		if (dst->obsolete > 0) {
840 			ip_rt_put(rt);
841 			ret = NULL;
842 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
843 			   rt->dst.expires) {
844 			ip_rt_put(rt);
845 			ret = NULL;
846 		}
847 	}
848 	return ret;
849 }
850 
851 /*
852  * Algorithm:
853  *	1. The first ip_rt_redirect_number redirects are sent
854  *	   with exponential backoff, then we stop sending them altogether,
855  *	   assuming that the host ignores our redirects.
856  *	2. If we did not see packets requiring redirects
857  *	   during ip_rt_redirect_silence, we assume that the host
858  *	   forgot the redirected route and start sending redirects again.
859  *
860  * This algorithm is much cheaper and more intelligent than dumb load limiting
861  * in icmp.c.
862  *
863  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
864  * and "frag. need" (breaks PMTU discovery) in icmp.c.
865  */
866 
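/* With the defaults above this roughly means: after the first redirect,
 * each further one is delayed by at least ip_rt_redirect_load << rate_tokens
 * (40ms, 80ms, 160ms, ... with HZ-based defaults), we give up after
 * ip_rt_redirect_number (9) redirects, and ~20s of silence
 * (ip_rt_redirect_silence) resets the counter.
 */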
867 void ip_rt_send_redirect(struct sk_buff *skb)
868 {
869 	struct rtable *rt = skb_rtable(skb);
870 	struct in_device *in_dev;
871 	struct inet_peer *peer;
872 	struct net *net;
873 	int log_martians;
874 	int vif;
875 
876 	rcu_read_lock();
877 	in_dev = __in_dev_get_rcu(rt->dst.dev);
878 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
879 		rcu_read_unlock();
880 		return;
881 	}
882 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
883 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
884 	rcu_read_unlock();
885 
886 	net = dev_net(rt->dst.dev);
887 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
888 	if (!peer) {
889 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
890 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
891 		return;
892 	}
893 
894 	/* No redirected packets during ip_rt_redirect_silence;
895 	 * reset the algorithm.
896 	 */
897 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
898 		peer->rate_tokens = 0;
899 
900 	/* Too many ignored redirects; do not send anything, just
901 	 * update peer->rate_last to the time of the last seen redirected packet.
902 	 */
903 	if (peer->rate_tokens >= ip_rt_redirect_number) {
904 		peer->rate_last = jiffies;
905 		goto out_put_peer;
906 	}
907 
908 	/* Check for load limit; set rate_last to the latest sent
909 	 * redirect.
910 	 */
911 	if (peer->rate_tokens == 0 ||
912 	    time_after(jiffies,
913 		       (peer->rate_last +
914 			(ip_rt_redirect_load << peer->rate_tokens)))) {
915 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
916 
917 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
918 		peer->rate_last = jiffies;
919 		++peer->rate_tokens;
920 #ifdef CONFIG_IP_ROUTE_VERBOSE
921 		if (log_martians &&
922 		    peer->rate_tokens == ip_rt_redirect_number)
923 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
924 					     &ip_hdr(skb)->saddr, inet_iif(skb),
925 					     &ip_hdr(skb)->daddr, &gw);
926 #endif
927 	}
928 out_put_peer:
929 	inet_putpeer(peer);
930 }
931 
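/* ICMP errors sent from here are rate-limited per source using a token
 * bucket kept in the inet_peer: tokens accumulate one per jiffy up to
 * ip_rt_error_burst (5*HZ) and each error costs ip_rt_error_cost (HZ),
 * i.e. roughly a burst of five errors and then about one per second.
 */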
932 static int ip_error(struct sk_buff *skb)
933 {
934 	struct rtable *rt = skb_rtable(skb);
935 	struct net_device *dev = skb->dev;
936 	struct in_device *in_dev;
937 	struct inet_peer *peer;
938 	unsigned long now;
939 	struct net *net;
940 	bool send;
941 	int code;
942 
943 	if (netif_is_l3_master(skb->dev)) {
944 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
945 		if (!dev)
946 			goto out;
947 	}
948 
949 	in_dev = __in_dev_get_rcu(dev);
950 
951 	/* IP on this device is disabled. */
952 	if (!in_dev)
953 		goto out;
954 
955 	net = dev_net(rt->dst.dev);
956 	if (!IN_DEV_FORWARD(in_dev)) {
957 		switch (rt->dst.error) {
958 		case EHOSTUNREACH:
959 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
960 			break;
961 
962 		case ENETUNREACH:
963 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
964 			break;
965 		}
966 		goto out;
967 	}
968 
969 	switch (rt->dst.error) {
970 	case EINVAL:
971 	default:
972 		goto out;
973 	case EHOSTUNREACH:
974 		code = ICMP_HOST_UNREACH;
975 		break;
976 	case ENETUNREACH:
977 		code = ICMP_NET_UNREACH;
978 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
979 		break;
980 	case EACCES:
981 		code = ICMP_PKT_FILTERED;
982 		break;
983 	}
984 
985 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
986 			       l3mdev_master_ifindex(skb->dev), 1);
987 
988 	send = true;
989 	if (peer) {
990 		now = jiffies;
991 		peer->rate_tokens += now - peer->rate_last;
992 		if (peer->rate_tokens > ip_rt_error_burst)
993 			peer->rate_tokens = ip_rt_error_burst;
994 		peer->rate_last = now;
995 		if (peer->rate_tokens >= ip_rt_error_cost)
996 			peer->rate_tokens -= ip_rt_error_cost;
997 		else
998 			send = false;
999 		inet_putpeer(peer);
1000 	}
1001 	if (send)
1002 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1003 
1004 out:	kfree_skb(skb);
1005 	return 0;
1006 }
1007 
1008 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1009 {
1010 	struct dst_entry *dst = &rt->dst;
1011 	struct fib_result res;
1012 
1013 	if (dst_metric_locked(dst, RTAX_MTU))
1014 		return;
1015 
1016 	if (ipv4_mtu(dst) < mtu)
1017 		return;
1018 
1019 	if (mtu < ip_rt_min_pmtu)
1020 		mtu = ip_rt_min_pmtu;
1021 
1022 	if (rt->rt_pmtu == mtu &&
1023 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1024 		return;
1025 
1026 	rcu_read_lock();
1027 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1028 		struct fib_nh *nh = &FIB_RES_NH(res);
1029 
1030 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1031 				      jiffies + ip_rt_mtu_expires);
1032 	}
1033 	rcu_read_unlock();
1034 }
1035 
1036 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1037 			      struct sk_buff *skb, u32 mtu)
1038 {
1039 	struct rtable *rt = (struct rtable *) dst;
1040 	struct flowi4 fl4;
1041 
1042 	ip_rt_build_flow_key(&fl4, sk, skb);
1043 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1044 }
1045 
1046 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1047 		      int oif, u32 mark, u8 protocol, int flow_flags)
1048 {
1049 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1050 	struct flowi4 fl4;
1051 	struct rtable *rt;
1052 
1053 	if (!mark)
1054 		mark = IP4_REPLY_MARK(net, skb->mark);
1055 
1056 	__build_flow_key(net, &fl4, NULL, iph, oif,
1057 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1058 	rt = __ip_route_output_key(net, &fl4);
1059 	if (!IS_ERR(rt)) {
1060 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1061 		ip_rt_put(rt);
1062 	}
1063 }
1064 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1065 
1066 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1067 {
1068 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1069 	struct flowi4 fl4;
1070 	struct rtable *rt;
1071 
1072 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1073 
1074 	if (!fl4.flowi4_mark)
1075 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1076 
1077 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1078 	if (!IS_ERR(rt)) {
1079 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1080 		ip_rt_put(rt);
1081 	}
1082 }
1083 
1084 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1085 {
1086 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1087 	struct flowi4 fl4;
1088 	struct rtable *rt;
1089 	struct dst_entry *odst = NULL;
1090 	bool new = false;
1091 	struct net *net = sock_net(sk);
1092 
1093 	bh_lock_sock(sk);
1094 
1095 	if (!ip_sk_accept_pmtu(sk))
1096 		goto out;
1097 
1098 	odst = sk_dst_get(sk);
1099 
1100 	if (sock_owned_by_user(sk) || !odst) {
1101 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1102 		goto out;
1103 	}
1104 
1105 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1106 
1107 	rt = (struct rtable *)odst;
1108 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1109 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1110 		if (IS_ERR(rt))
1111 			goto out;
1112 
1113 		new = true;
1114 	}
1115 
1116 	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1117 
1118 	if (!dst_check(&rt->dst, 0)) {
1119 		if (new)
1120 			dst_release(&rt->dst);
1121 
1122 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1123 		if (IS_ERR(rt))
1124 			goto out;
1125 
1126 		new = true;
1127 	}
1128 
1129 	if (new)
1130 		sk_dst_set(sk, &rt->dst);
1131 
1132 out:
1133 	bh_unlock_sock(sk);
1134 	dst_release(odst);
1135 }
1136 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1137 
1138 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1139 		   int oif, u32 mark, u8 protocol, int flow_flags)
1140 {
1141 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1142 	struct flowi4 fl4;
1143 	struct rtable *rt;
1144 
1145 	__build_flow_key(net, &fl4, NULL, iph, oif,
1146 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1147 	rt = __ip_route_output_key(net, &fl4);
1148 	if (!IS_ERR(rt)) {
1149 		__ip_do_redirect(rt, skb, &fl4, false);
1150 		ip_rt_put(rt);
1151 	}
1152 }
1153 EXPORT_SYMBOL_GPL(ipv4_redirect);
1154 
1155 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1156 {
1157 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1158 	struct flowi4 fl4;
1159 	struct rtable *rt;
1160 	struct net *net = sock_net(sk);
1161 
1162 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1163 	rt = __ip_route_output_key(net, &fl4);
1164 	if (!IS_ERR(rt)) {
1165 		__ip_do_redirect(rt, skb, &fl4, false);
1166 		ip_rt_put(rt);
1167 	}
1168 }
1169 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1170 
1171 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1172 {
1173 	struct rtable *rt = (struct rtable *) dst;
1174 
1175 	/* All IPV4 dsts are created with ->obsolete set to the value
1176 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1177 	 * into this function always.
1178 	 *
1179 	 * When a PMTU/redirect information update invalidates a route,
1180 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1181 	 * DST_OBSOLETE_DEAD by dst_free().
1182 	 */
1183 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1184 		return NULL;
1185 	return dst;
1186 }
1187 
1188 static void ipv4_link_failure(struct sk_buff *skb)
1189 {
1190 	struct rtable *rt;
1191 
1192 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1193 
1194 	rt = skb_rtable(skb);
1195 	if (rt)
1196 		dst_set_expires(&rt->dst, 0);
1197 }
1198 
1199 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1200 {
1201 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1202 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1203 		 skb->dev ? skb->dev->name : "?");
1204 	kfree_skb(skb);
1205 	WARN_ON(1);
1206 	return 0;
1207 }
1208 
1209 /*
1210    We do not cache the source address of the outgoing interface,
1211    because it is used only by the IP RR, TS and SRR options,
1212    so it is out of the fast path.
1213 
1214    BTW remember: "addr" is allowed to be unaligned
1215    in IP options!
1216  */
1217 
1218 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1219 {
1220 	__be32 src;
1221 
1222 	if (rt_is_output_route(rt))
1223 		src = ip_hdr(skb)->saddr;
1224 	else {
1225 		struct fib_result res;
1226 		struct flowi4 fl4;
1227 		struct iphdr *iph;
1228 
1229 		iph = ip_hdr(skb);
1230 
1231 		memset(&fl4, 0, sizeof(fl4));
1232 		fl4.daddr = iph->daddr;
1233 		fl4.saddr = iph->saddr;
1234 		fl4.flowi4_tos = RT_TOS(iph->tos);
1235 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1236 		fl4.flowi4_iif = skb->dev->ifindex;
1237 		fl4.flowi4_mark = skb->mark;
1238 
1239 		rcu_read_lock();
1240 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1241 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1242 		else
1243 			src = inet_select_addr(rt->dst.dev,
1244 					       rt_nexthop(rt, iph->daddr),
1245 					       RT_SCOPE_UNIVERSE);
1246 		rcu_read_unlock();
1247 	}
1248 	memcpy(addr, &src, 4);
1249 }
1250 
1251 #ifdef CONFIG_IP_ROUTE_CLASSID
1252 static void set_class_tag(struct rtable *rt, u32 tag)
1253 {
1254 	if (!(rt->dst.tclassid & 0xFFFF))
1255 		rt->dst.tclassid |= tag & 0xFFFF;
1256 	if (!(rt->dst.tclassid & 0xFFFF0000))
1257 		rt->dst.tclassid |= tag & 0xFFFF0000;
1258 }
1259 #endif
1260 
1261 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1262 {
1263 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1264 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1265 				    ip_rt_min_advmss);
1266 
1267 	return min(advmss, IPV4_MAX_PMTU - header_size);
1268 }
1269 
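/* MTU selection for a route, in order of preference: a still-valid
 * per-route PMTU learned from ICMP, then the RTAX_MTU metric, then the
 * device MTU (clamped to 576 for locked-metric gateway routes), capped
 * at IP_MAX_MTU and reduced by any lwtunnel encapsulation headroom.
 */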
1270 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1271 {
1272 	const struct rtable *rt = (const struct rtable *) dst;
1273 	unsigned int mtu = rt->rt_pmtu;
1274 
1275 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1276 		mtu = dst_metric_raw(dst, RTAX_MTU);
1277 
1278 	if (mtu)
1279 		return mtu;
1280 
1281 	mtu = READ_ONCE(dst->dev->mtu);
1282 
1283 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1284 		if (rt->rt_uses_gateway && mtu > 576)
1285 			mtu = 576;
1286 	}
1287 
1288 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1289 
1290 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1291 }
1292 
1293 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1294 {
1295 	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1296 	struct fib_nh_exception *fnhe;
1297 	u32 hval;
1298 
1299 	if (!hash)
1300 		return NULL;
1301 
1302 	hval = fnhe_hashfun(daddr);
1303 
1304 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1305 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1306 		if (fnhe->fnhe_daddr == daddr)
1307 			return fnhe;
1308 	}
1309 	return NULL;
1310 }
1311 
1312 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1313 			      __be32 daddr, const bool do_cache)
1314 {
1315 	bool ret = false;
1316 
1317 	spin_lock_bh(&fnhe_lock);
1318 
1319 	if (daddr == fnhe->fnhe_daddr) {
1320 		struct rtable __rcu **porig;
1321 		struct rtable *orig;
1322 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1323 
1324 		if (rt_is_input_route(rt))
1325 			porig = &fnhe->fnhe_rth_input;
1326 		else
1327 			porig = &fnhe->fnhe_rth_output;
1328 		orig = rcu_dereference(*porig);
1329 
1330 		if (fnhe->fnhe_genid != genid) {
1331 			fnhe->fnhe_genid = genid;
1332 			fnhe->fnhe_gw = 0;
1333 			fnhe->fnhe_pmtu = 0;
1334 			fnhe->fnhe_expires = 0;
1335 			fnhe_flush_routes(fnhe);
1336 			orig = NULL;
1337 		}
1338 		fill_route_from_fnhe(rt, fnhe);
1339 		if (!rt->rt_gateway)
1340 			rt->rt_gateway = daddr;
1341 
1342 		if (do_cache) {
1343 			dst_hold(&rt->dst);
1344 			rcu_assign_pointer(*porig, rt);
1345 			if (orig) {
1346 				dst_dev_put(&orig->dst);
1347 				dst_release(&orig->dst);
1348 			}
1349 			ret = true;
1350 		}
1351 
1352 		fnhe->fnhe_stamp = jiffies;
1353 	}
1354 	spin_unlock_bh(&fnhe_lock);
1355 
1356 	return ret;
1357 }
1358 
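/* Cache a route on its FIB nexthop: input routes go into nh->nh_rth_input,
 * output routes into the per-cpu nh->nh_pcpu_rth_output slot.  cmpxchg()
 * is used so that a concurrent writer simply wins and the loser drops
 * its reference again.
 */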
1359 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1360 {
1361 	struct rtable *orig, *prev, **p;
1362 	bool ret = true;
1363 
1364 	if (rt_is_input_route(rt)) {
1365 		p = (struct rtable **)&nh->nh_rth_input;
1366 	} else {
1367 		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1368 	}
1369 	orig = *p;
1370 
1371 	/* hold dst before doing cmpxchg() to avoid race condition
1372 	 * on this dst
1373 	 */
1374 	dst_hold(&rt->dst);
1375 	prev = cmpxchg(p, orig, rt);
1376 	if (prev == orig) {
1377 		if (orig) {
1378 			dst_dev_put(&orig->dst);
1379 			dst_release(&orig->dst);
1380 		}
1381 	} else {
1382 		dst_release(&rt->dst);
1383 		ret = false;
1384 	}
1385 
1386 	return ret;
1387 }
1388 
1389 struct uncached_list {
1390 	spinlock_t		lock;
1391 	struct list_head	head;
1392 };
1393 
1394 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1395 
1396 static void rt_add_uncached_list(struct rtable *rt)
1397 {
1398 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1399 
1400 	rt->rt_uncached_list = ul;
1401 
1402 	spin_lock_bh(&ul->lock);
1403 	list_add_tail(&rt->rt_uncached, &ul->head);
1404 	spin_unlock_bh(&ul->lock);
1405 }
1406 
1407 static void ipv4_dst_destroy(struct dst_entry *dst)
1408 {
1409 	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1410 	struct rtable *rt = (struct rtable *) dst;
1411 
1412 	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1413 		kfree(p);
1414 
1415 	if (!list_empty(&rt->rt_uncached)) {
1416 		struct uncached_list *ul = rt->rt_uncached_list;
1417 
1418 		spin_lock_bh(&ul->lock);
1419 		list_del(&rt->rt_uncached);
1420 		spin_unlock_bh(&ul->lock);
1421 	}
1422 }
1423 
1424 void rt_flush_dev(struct net_device *dev)
1425 {
1426 	struct net *net = dev_net(dev);
1427 	struct rtable *rt;
1428 	int cpu;
1429 
1430 	for_each_possible_cpu(cpu) {
1431 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1432 
1433 		spin_lock_bh(&ul->lock);
1434 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1435 			if (rt->dst.dev != dev)
1436 				continue;
1437 			rt->dst.dev = net->loopback_dev;
1438 			dev_hold(rt->dst.dev);
1439 			dev_put(dev);
1440 		}
1441 		spin_unlock_bh(&ul->lock);
1442 	}
1443 }
1444 
1445 static bool rt_cache_valid(const struct rtable *rt)
1446 {
1447 	return	rt &&
1448 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1449 		!rt_is_expired(rt);
1450 }
1451 
1452 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1453 			   const struct fib_result *res,
1454 			   struct fib_nh_exception *fnhe,
1455 			   struct fib_info *fi, u16 type, u32 itag,
1456 			   const bool do_cache)
1457 {
1458 	bool cached = false;
1459 
1460 	if (fi) {
1461 		struct fib_nh *nh = &FIB_RES_NH(*res);
1462 
1463 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1464 			rt->rt_gateway = nh->nh_gw;
1465 			rt->rt_uses_gateway = 1;
1466 		}
1467 		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1468 		if (fi->fib_metrics != &dst_default_metrics) {
1469 			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1470 			refcount_inc(&fi->fib_metrics->refcnt);
1471 		}
1472 #ifdef CONFIG_IP_ROUTE_CLASSID
1473 		rt->dst.tclassid = nh->nh_tclassid;
1474 #endif
1475 		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1476 		if (unlikely(fnhe))
1477 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1478 		else if (do_cache)
1479 			cached = rt_cache_route(nh, rt);
1480 		if (unlikely(!cached)) {
1481 			/* Routes we intend to cache in nexthop exception or
1482 			 * FIB nexthop have the DST_NOCACHE bit clear.
1483 			 * However, if we are unsuccessful at storing this
1484 			 * route into the cache we really need to set it.
1485 			 */
1486 			if (!rt->rt_gateway)
1487 				rt->rt_gateway = daddr;
1488 			rt_add_uncached_list(rt);
1489 		}
1490 	} else
1491 		rt_add_uncached_list(rt);
1492 
1493 #ifdef CONFIG_IP_ROUTE_CLASSID
1494 #ifdef CONFIG_IP_MULTIPLE_TABLES
1495 	set_class_tag(rt, res->tclassid);
1496 #endif
1497 	set_class_tag(rt, itag);
1498 #endif
1499 }
1500 
1501 struct rtable *rt_dst_alloc(struct net_device *dev,
1502 			    unsigned int flags, u16 type,
1503 			    bool nopolicy, bool noxfrm, bool will_cache)
1504 {
1505 	struct rtable *rt;
1506 
1507 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1508 		       (will_cache ? 0 : DST_HOST) |
1509 		       (nopolicy ? DST_NOPOLICY : 0) |
1510 		       (noxfrm ? DST_NOXFRM : 0));
1511 
1512 	if (rt) {
1513 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1514 		rt->rt_flags = flags;
1515 		rt->rt_type = type;
1516 		rt->rt_is_input = 0;
1517 		rt->rt_iif = 0;
1518 		rt->rt_pmtu = 0;
1519 		rt->rt_gateway = 0;
1520 		rt->rt_uses_gateway = 0;
1521 		rt->rt_table_id = 0;
1522 		INIT_LIST_HEAD(&rt->rt_uncached);
1523 
1524 		rt->dst.output = ip_output;
1525 		if (flags & RTCF_LOCAL)
1526 			rt->dst.input = ip_local_deliver;
1527 	}
1528 
1529 	return rt;
1530 }
1531 EXPORT_SYMBOL(rt_dst_alloc);
1532 
1533 /* called in rcu_read_lock() section */
1534 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1535 			  u8 tos, struct net_device *dev,
1536 			  struct in_device *in_dev, u32 *itag)
1537 {
1538 	int err;
1539 
1540 	/* Primary sanity checks. */
1541 	if (!in_dev)
1542 		return -EINVAL;
1543 
1544 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1545 	    skb->protocol != htons(ETH_P_IP))
1546 		return -EINVAL;
1547 
1548 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1549 		return -EINVAL;
1550 
1551 	if (ipv4_is_zeronet(saddr)) {
1552 		if (!ipv4_is_local_multicast(daddr))
1553 			return -EINVAL;
1554 	} else {
1555 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1556 					  in_dev, itag);
1557 		if (err < 0)
1558 			return err;
1559 	}
1560 	return 0;
1561 }
1562 
1563 /* called in rcu_read_lock() section */
1564 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1565 			     u8 tos, struct net_device *dev, int our)
1566 {
1567 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1568 	unsigned int flags = RTCF_MULTICAST;
1569 	struct rtable *rth;
1570 	u32 itag = 0;
1571 	int err;
1572 
1573 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1574 	if (err)
1575 		return err;
1576 
1577 	if (our)
1578 		flags |= RTCF_LOCAL;
1579 
1580 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1581 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1582 	if (!rth)
1583 		return -ENOBUFS;
1584 
1585 #ifdef CONFIG_IP_ROUTE_CLASSID
1586 	rth->dst.tclassid = itag;
1587 #endif
1588 	rth->dst.output = ip_rt_bug;
1589 	rth->rt_is_input= 1;
1590 
1591 #ifdef CONFIG_IP_MROUTE
1592 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1593 		rth->dst.input = ip_mr_input;
1594 #endif
1595 	RT_CACHE_STAT_INC(in_slow_mc);
1596 
1597 	skb_dst_set(skb, &rth->dst);
1598 	return 0;
1599 }
1600 
1601 
1602 static void ip_handle_martian_source(struct net_device *dev,
1603 				     struct in_device *in_dev,
1604 				     struct sk_buff *skb,
1605 				     __be32 daddr,
1606 				     __be32 saddr)
1607 {
1608 	RT_CACHE_STAT_INC(in_martian_src);
1609 #ifdef CONFIG_IP_ROUTE_VERBOSE
1610 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1611 		/*
1612 		 *	RFC1812 recommendation: if the source is martian,
1613 		 *	the only hint is the MAC header.
1614 		 */
1615 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1616 			&daddr, &saddr, dev->name);
1617 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1618 			print_hex_dump(KERN_WARNING, "ll header: ",
1619 				       DUMP_PREFIX_OFFSET, 16, 1,
1620 				       skb_mac_header(skb),
1621 				       dev->hard_header_len, true);
1622 		}
1623 	}
1624 #endif
1625 }
1626 
1627 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1628 {
1629 	struct fnhe_hash_bucket *hash;
1630 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1631 	u32 hval = fnhe_hashfun(daddr);
1632 
1633 	spin_lock_bh(&fnhe_lock);
1634 
1635 	hash = rcu_dereference_protected(nh->nh_exceptions,
1636 					 lockdep_is_held(&fnhe_lock));
1637 	hash += hval;
1638 
1639 	fnhe_p = &hash->chain;
1640 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1641 	while (fnhe) {
1642 		if (fnhe->fnhe_daddr == daddr) {
1643 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1644 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1645 			fnhe_flush_routes(fnhe);
1646 			kfree_rcu(fnhe, rcu);
1647 			break;
1648 		}
1649 		fnhe_p = &fnhe->fnhe_next;
1650 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1651 						 lockdep_is_held(&fnhe_lock));
1652 	}
1653 
1654 	spin_unlock_bh(&fnhe_lock);
1655 }
1656 
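/* If the route carries a light-weight tunnel state that wants to handle
 * packets itself, divert dst input/output through the lwtunnel hooks and
 * remember the original handlers.
 */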
1657 static void set_lwt_redirect(struct rtable *rth)
1658 {
1659 	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1660 		rth->dst.lwtstate->orig_output = rth->dst.output;
1661 		rth->dst.output = lwtunnel_output;
1662 	}
1663 
1664 	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1665 		rth->dst.lwtstate->orig_input = rth->dst.input;
1666 		rth->dst.input = lwtunnel_input;
1667 	}
1668 }
1669 
1670 /* called in rcu_read_lock() section */
1671 static int __mkroute_input(struct sk_buff *skb,
1672 			   const struct fib_result *res,
1673 			   struct in_device *in_dev,
1674 			   __be32 daddr, __be32 saddr, u32 tos)
1675 {
1676 	struct fib_nh_exception *fnhe;
1677 	struct rtable *rth;
1678 	int err;
1679 	struct in_device *out_dev;
1680 	bool do_cache;
1681 	u32 itag = 0;
1682 
1683 	/* get a working reference to the output device */
1684 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1685 	if (!out_dev) {
1686 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1687 		return -EINVAL;
1688 	}
1689 
1690 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1691 				  in_dev->dev, in_dev, &itag);
1692 	if (err < 0) {
1693 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1694 					 saddr);
1695 
1696 		goto cleanup;
1697 	}
1698 
1699 	do_cache = res->fi && !itag;
1700 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1701 	    skb->protocol == htons(ETH_P_IP) &&
1702 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1703 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1704 		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1705 
1706 	if (skb->protocol != htons(ETH_P_IP)) {
1707 		/* Not IP (i.e. ARP). Do not create a route if it is
1708 		 * invalid for proxy ARP. DNAT routes are always valid.
1709 		 *
1710 		 * The proxy ARP feature has been extended to allow ARP
1711 		 * replies back on the same interface, to support
1712 		 * Private VLAN switch technologies. See arp.c.
1713 		 */
1714 		if (out_dev == in_dev &&
1715 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1716 			err = -EINVAL;
1717 			goto cleanup;
1718 		}
1719 	}
1720 
1721 	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1722 	if (do_cache) {
1723 		if (fnhe) {
1724 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1725 			if (rth && rth->dst.expires &&
1726 			    time_after(jiffies, rth->dst.expires)) {
1727 				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1728 				fnhe = NULL;
1729 			} else {
1730 				goto rt_cache;
1731 			}
1732 		}
1733 
1734 		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1735 
1736 rt_cache:
1737 		if (rt_cache_valid(rth)) {
1738 			skb_dst_set_noref(skb, &rth->dst);
1739 			goto out;
1740 		}
1741 	}
1742 
1743 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1744 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1745 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1746 	if (!rth) {
1747 		err = -ENOBUFS;
1748 		goto cleanup;
1749 	}
1750 
1751 	rth->rt_is_input = 1;
1752 	if (res->table)
1753 		rth->rt_table_id = res->table->tb_id;
1754 	RT_CACHE_STAT_INC(in_slow_tot);
1755 
1756 	rth->dst.input = ip_forward;
1757 
1758 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1759 		       do_cache);
1760 	set_lwt_redirect(rth);
1761 	skb_dst_set(skb, &rth->dst);
1762 out:
1763 	err = 0;
1764  cleanup:
1765 	return err;
1766 }
1767 
1768 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1769 /* To make ICMP packets follow the right flow, the multipath hash is
1770  * calculated from the inner IP addresses.
1771  */
1772 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1773 				 struct flow_keys *hash_keys)
1774 {
1775 	const struct iphdr *outer_iph = ip_hdr(skb);
1776 	const struct iphdr *inner_iph;
1777 	const struct icmphdr *icmph;
1778 	struct iphdr _inner_iph;
1779 	struct icmphdr _icmph;
1780 
1781 	hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1782 	hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1783 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1784 		return;
1785 
1786 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1787 		return;
1788 
1789 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1790 				   &_icmph);
1791 	if (!icmph)
1792 		return;
1793 
1794 	if (icmph->type != ICMP_DEST_UNREACH &&
1795 	    icmph->type != ICMP_REDIRECT &&
1796 	    icmph->type != ICMP_TIME_EXCEEDED &&
1797 	    icmph->type != ICMP_PARAMETERPROB)
1798 		return;
1799 
1800 	inner_iph = skb_header_pointer(skb,
1801 				       outer_iph->ihl * 4 + sizeof(_icmph),
1802 				       sizeof(_inner_iph), &_inner_iph);
1803 	if (!inner_iph)
1804 		return;
1805 	hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1806 	hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1807 }
1808 
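/* Multipath nexthop selection.  Which fields feed the hash depends on the
 * net.ipv4.fib_multipath_hash_policy sysctl: 0 hashes the L3 addresses
 * only (using the inner header for ICMP errors, see above), 1 hashes the
 * L4 five-tuple.
 */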
1809 /* if skb is set it will be used and fl4 can be NULL */
1810 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1811 		       const struct sk_buff *skb)
1812 {
1813 	struct net *net = fi->fib_net;
1814 	struct flow_keys hash_keys;
1815 	u32 mhash;
1816 
1817 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1818 	case 0:
1819 		memset(&hash_keys, 0, sizeof(hash_keys));
1820 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1821 		if (skb) {
1822 			ip_multipath_l3_keys(skb, &hash_keys);
1823 		} else {
1824 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1825 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1826 		}
1827 		break;
1828 	case 1:
1829 		/* skb is currently provided only when forwarding */
1830 		if (skb) {
1831 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1832 			struct flow_keys keys;
1833 
1834 			/* short-circuit if we already have L4 hash present */
1835 			if (skb->l4_hash)
1836 				return skb_get_hash_raw(skb) >> 1;
1837 			memset(&hash_keys, 0, sizeof(hash_keys));
1838 			skb_flow_dissect_flow_keys(skb, &keys, flag);
1839 
1840 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1841 			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1842 			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1843 			hash_keys.ports.src = keys.ports.src;
1844 			hash_keys.ports.dst = keys.ports.dst;
1845 			hash_keys.basic.ip_proto = keys.basic.ip_proto;
1846 		} else {
1847 			memset(&hash_keys, 0, sizeof(hash_keys));
1848 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1849 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1850 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1851 			hash_keys.ports.src = fl4->fl4_sport;
1852 			hash_keys.ports.dst = fl4->fl4_dport;
1853 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1854 		}
1855 		break;
1856 	}
1857 	mhash = flow_hash_from_keys(&hash_keys);
1858 
1859 	return mhash >> 1;
1860 }
1861 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1862 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1863 
1864 static int ip_mkroute_input(struct sk_buff *skb,
1865 			    struct fib_result *res,
1866 			    struct in_device *in_dev,
1867 			    __be32 daddr, __be32 saddr, u32 tos)
1868 {
1869 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1870 	if (res->fi && res->fi->fib_nhs > 1) {
1871 		int h = fib_multipath_hash(res->fi, NULL, skb);
1872 
1873 		fib_select_multipath(res, h);
1874 	}
1875 #endif
1876 
1877 	/* create a routing cache entry */
1878 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1879 }
1880 
1881 /*
1882  *	NOTE. We drop all packets that have a local source
1883  *	address, because every properly looped-back packet
1884  *	must already have the correct destination attached by the output routine.
1885  *
1886  *	This approach solves two big problems:
1887  *	1. Non-simplex devices are handled properly.
1888  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1889  *	Called with rcu_read_lock().
1890  */
1891 
1892 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1893 			       u8 tos, struct net_device *dev,
1894 			       struct fib_result *res)
1895 {
1896 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1897 	struct ip_tunnel_info *tun_info;
1898 	struct flowi4	fl4;
1899 	unsigned int	flags = 0;
1900 	u32		itag = 0;
1901 	struct rtable	*rth;
1902 	int		err = -EINVAL;
1903 	struct net    *net = dev_net(dev);
1904 	bool do_cache;
1905 
1906 	/* IP on this device is disabled. */
1907 
1908 	if (!in_dev)
1909 		goto out;
1910 
1911 	/* Check for the weirdest martians, which may not be detected
1912 	   by fib_lookup.
1913 	 */
1914 
1915 	tun_info = skb_tunnel_info(skb);
1916 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1917 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1918 	else
1919 		fl4.flowi4_tun_key.tun_id = 0;
1920 	skb_dst_drop(skb);
1921 
1922 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1923 		goto martian_source;
1924 
1925 	res->fi = NULL;
1926 	res->table = NULL;
1927 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1928 		goto brd_input;
1929 
1930 	/* Accept zero addresses only for limited broadcast;
1931 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1932 	 */
1933 	if (ipv4_is_zeronet(saddr))
1934 		goto martian_source;
1935 
1936 	if (ipv4_is_zeronet(daddr))
1937 		goto martian_destination;
1938 
1939 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1940 	 * more than once, and calls it only if daddr and/or saddr are loopback addresses.
1941 	 */
1942 	if (ipv4_is_loopback(daddr)) {
1943 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1944 			goto martian_destination;
1945 	} else if (ipv4_is_loopback(saddr)) {
1946 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1947 			goto martian_source;
1948 	}
1949 
1950 	/*
1951 	 *	Now we are ready to route the packet.
1952 	 */
1953 	fl4.flowi4_oif = 0;
1954 	fl4.flowi4_iif = dev->ifindex;
1955 	fl4.flowi4_mark = skb->mark;
1956 	fl4.flowi4_tos = tos;
1957 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1958 	fl4.flowi4_flags = 0;
1959 	fl4.daddr = daddr;
1960 	fl4.saddr = saddr;
1961 	fl4.flowi4_uid = sock_net_uid(net, NULL);
1962 	err = fib_lookup(net, &fl4, res, 0);
1963 	if (err != 0) {
1964 		if (!IN_DEV_FORWARD(in_dev))
1965 			err = -EHOSTUNREACH;
1966 		goto no_route;
1967 	}
1968 
1969 	if (res->type == RTN_BROADCAST)
1970 		goto brd_input;
1971 
1972 	if (res->type == RTN_LOCAL) {
1973 		err = fib_validate_source(skb, saddr, daddr, tos,
1974 					  0, dev, in_dev, &itag);
1975 		if (err < 0)
1976 			goto martian_source;
1977 		goto local_input;
1978 	}
1979 
1980 	if (!IN_DEV_FORWARD(in_dev)) {
1981 		err = -EHOSTUNREACH;
1982 		goto no_route;
1983 	}
1984 	if (res->type != RTN_UNICAST)
1985 		goto martian_destination;
1986 
1987 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1988 out:	return err;
1989 
1990 brd_input:
1991 	if (skb->protocol != htons(ETH_P_IP))
1992 		goto e_inval;
1993 
1994 	if (!ipv4_is_zeronet(saddr)) {
1995 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1996 					  in_dev, &itag);
1997 		if (err < 0)
1998 			goto martian_source;
1999 	}
2000 	flags |= RTCF_BROADCAST;
2001 	res->type = RTN_BROADCAST;
2002 	RT_CACHE_STAT_INC(in_brd);
2003 
2004 local_input:
2005 	do_cache = false;
2006 	if (res->fi) {
2007 		if (!itag) {
2008 			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2009 			if (rt_cache_valid(rth)) {
2010 				skb_dst_set_noref(skb, &rth->dst);
2011 				err = 0;
2012 				goto out;
2013 			}
2014 			do_cache = true;
2015 		}
2016 	}
2017 
2018 	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2019 			   flags | RTCF_LOCAL, res->type,
2020 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2021 	if (!rth)
2022 		goto e_nobufs;
2023 
2024 	rth->dst.output = ip_rt_bug;
2025 #ifdef CONFIG_IP_ROUTE_CLASSID
2026 	rth->dst.tclassid = itag;
2027 #endif
2028 	rth->rt_is_input = 1;
2029 	if (res->table)
2030 		rth->rt_table_id = res->table->tb_id;
2031 
2032 	RT_CACHE_STAT_INC(in_slow_tot);
2033 	if (res->type == RTN_UNREACHABLE) {
2034 		rth->dst.input = ip_error;
2035 		rth->dst.error = -err;
2036 		rth->rt_flags &= ~RTCF_LOCAL;
2037 	}
2038 
2039 	if (do_cache) {
2040 		struct fib_nh *nh = &FIB_RES_NH(*res);
2041 
2042 		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2043 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2044 			WARN_ON(rth->dst.input == lwtunnel_input);
2045 			rth->dst.lwtstate->orig_input = rth->dst.input;
2046 			rth->dst.input = lwtunnel_input;
2047 		}
2048 
2049 		if (unlikely(!rt_cache_route(nh, rth)))
2050 			rt_add_uncached_list(rth);
2051 	}
2052 	skb_dst_set(skb, &rth->dst);
2053 	err = 0;
2054 	goto out;
2055 
2056 no_route:
2057 	RT_CACHE_STAT_INC(in_no_route);
2058 	res->type = RTN_UNREACHABLE;
2059 	res->fi = NULL;
2060 	res->table = NULL;
2061 	goto local_input;
2062 
2063 	/*
2064 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2065 	 */
2066 martian_destination:
2067 	RT_CACHE_STAT_INC(in_martian_dst);
2068 #ifdef CONFIG_IP_ROUTE_VERBOSE
2069 	if (IN_DEV_LOG_MARTIANS(in_dev))
2070 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2071 				     &daddr, &saddr, dev->name);
2072 #endif
2073 
2074 e_inval:
2075 	err = -EINVAL;
2076 	goto out;
2077 
2078 e_nobufs:
2079 	err = -ENOBUFS;
2080 	goto out;
2081 
2082 martian_source:
2083 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2084 	goto out;
2085 }
2086 
2087 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2088 			 u8 tos, struct net_device *dev)
2089 {
2090 	struct fib_result res;
2091 	int err;
2092 
2093 	tos &= IPTOS_RT_MASK;
2094 	rcu_read_lock();
2095 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2096 	rcu_read_unlock();
2097 
2098 	return err;
2099 }
2100 EXPORT_SYMBOL(ip_route_input_noref);
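
/* Illustrative sketch (not part of the original file): the usual ingress
 * pattern followed by receive-path callers such as ip_rcv_finish() -- if the
 * skb does not already carry a dst (e.g. set by early demux), resolve the
 * input route from the IPv4 header and the receiving device.  The helper
 * name is hypothetical.
 */
static inline int example_route_incoming_skb(struct sk_buff *skb,
					     struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb_dst(skb))
		return 0;	/* already routed */

	return ip_route_input_noref(skb, iph->daddr, iph->saddr,
				    iph->tos, dev);
}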
2101 
2102 /* called with rcu_read_lock held */
2103 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2104 		       u8 tos, struct net_device *dev, struct fib_result *res)
2105 {
2106 	/* Multicast recognition logic has been moved from the route cache to here.
2107 	   The problem was that too many Ethernet cards have broken/missing
2108 	   hardware multicast filters :-( As a result, a host on a multicast
2109 	   network acquires a lot of useless route cache entries, e.g. for
2110 	   SDR messages from all over the world. Now we try to get rid of them.
2111 	   Really, provided the software IP multicast filter is organized
2112 	   reasonably (at least, hashed), it does not result in a slowdown
2113 	   compared with route cache reject entries.
2114 	   Note that multicast routers are not affected, because a
2115 	   route cache entry is created eventually.
2116 	 */
2117 	if (ipv4_is_multicast(daddr)) {
2118 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2119 		int our = 0;
2120 		int err = -EINVAL;
2121 
2122 		if (in_dev)
2123 			our = ip_check_mc_rcu(in_dev, daddr, saddr,
2124 					      ip_hdr(skb)->protocol);
2125 
2126 		/* check l3 master if no match yet */
2127 		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2128 			struct in_device *l3_in_dev;
2129 
2130 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2131 			if (l3_in_dev)
2132 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2133 						      ip_hdr(skb)->protocol);
2134 		}
2135 
2136 		if (our
2137 #ifdef CONFIG_IP_MROUTE
2138 			||
2139 		    (!ipv4_is_local_multicast(daddr) &&
2140 		     IN_DEV_MFORWARD(in_dev))
2141 #endif
2142 		   ) {
2143 			err = ip_route_input_mc(skb, daddr, saddr,
2144 						tos, dev, our);
2145 		}
2146 		return err;
2147 	}
2148 
2149 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2150 }
2151 
2152 /* called with rcu_read_lock() */
2153 static struct rtable *__mkroute_output(const struct fib_result *res,
2154 				       const struct flowi4 *fl4, int orig_oif,
2155 				       struct net_device *dev_out,
2156 				       unsigned int flags)
2157 {
2158 	struct fib_info *fi = res->fi;
2159 	struct fib_nh_exception *fnhe;
2160 	struct in_device *in_dev;
2161 	u16 type = res->type;
2162 	struct rtable *rth;
2163 	bool do_cache;
2164 
2165 	in_dev = __in_dev_get_rcu(dev_out);
2166 	if (!in_dev)
2167 		return ERR_PTR(-EINVAL);
2168 
2169 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2170 		if (ipv4_is_loopback(fl4->saddr) &&
2171 		    !(dev_out->flags & IFF_LOOPBACK) &&
2172 		    !netif_is_l3_master(dev_out))
2173 			return ERR_PTR(-EINVAL);
2174 
2175 	if (ipv4_is_lbcast(fl4->daddr))
2176 		type = RTN_BROADCAST;
2177 	else if (ipv4_is_multicast(fl4->daddr))
2178 		type = RTN_MULTICAST;
2179 	else if (ipv4_is_zeronet(fl4->daddr))
2180 		return ERR_PTR(-EINVAL);
2181 
2182 	if (dev_out->flags & IFF_LOOPBACK)
2183 		flags |= RTCF_LOCAL;
2184 
2185 	do_cache = true;
2186 	if (type == RTN_BROADCAST) {
2187 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2188 		fi = NULL;
2189 	} else if (type == RTN_MULTICAST) {
2190 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2191 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2192 				     fl4->flowi4_proto))
2193 			flags &= ~RTCF_LOCAL;
2194 		else
2195 			do_cache = false;
2196 		/* If a multicast route does not exist, use the
2197 		 * default one, but do not gateway in this case.
2198 		 * Yes, it is a hack.
2199 		 */
2200 		if (fi && res->prefixlen < 4)
2201 			fi = NULL;
2202 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2203 		   (orig_oif != dev_out->ifindex)) {
2204 		/* For local routes that require a particular output interface
2205 		 * we do not want to cache the result.  Caching the result
2206 		 * causes incorrect behaviour when there are multiple source
2207 		 * addresses on the interface; the end result is that if the
2208 		 * intended recipient is waiting on that interface for the
2209 		 * packet, it won't be received, because it will be delivered on
2210 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2211 		 * be set to the loopback interface as well.
2212 		 */
2213 		fi = NULL;
2214 	}
2215 
2216 	fnhe = NULL;
2217 	do_cache &= fi != NULL;
2218 	if (do_cache) {
2219 		struct rtable __rcu **prth;
2220 		struct fib_nh *nh = &FIB_RES_NH(*res);
2221 
2222 		fnhe = find_exception(nh, fl4->daddr);
2223 		if (fnhe) {
2224 			prth = &fnhe->fnhe_rth_output;
2225 			rth = rcu_dereference(*prth);
2226 			if (rth && rth->dst.expires &&
2227 			    time_after(jiffies, rth->dst.expires)) {
2228 				ip_del_fnhe(nh, fl4->daddr);
2229 				fnhe = NULL;
2230 			} else {
2231 				goto rt_cache;
2232 			}
2233 		}
2234 
2235 		if (unlikely(fl4->flowi4_flags &
2236 			     FLOWI_FLAG_KNOWN_NH &&
2237 			     !(nh->nh_gw &&
2238 			       nh->nh_scope == RT_SCOPE_LINK))) {
2239 			do_cache = false;
2240 			goto add;
2241 		}
2242 		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2243 		rth = rcu_dereference(*prth);
2244 
2245 rt_cache:
2246 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2247 			return rth;
2248 	}
2249 
2250 add:
2251 	rth = rt_dst_alloc(dev_out, flags, type,
2252 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2253 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2254 			   do_cache);
2255 	if (!rth)
2256 		return ERR_PTR(-ENOBUFS);
2257 
2258 	rth->rt_iif = orig_oif;
2259 	if (res->table)
2260 		rth->rt_table_id = res->table->tb_id;
2261 
2262 	RT_CACHE_STAT_INC(out_slow_tot);
2263 
2264 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2265 		if (flags & RTCF_LOCAL &&
2266 		    !(dev_out->flags & IFF_LOOPBACK)) {
2267 			rth->dst.output = ip_mc_output;
2268 			RT_CACHE_STAT_INC(out_slow_mc);
2269 		}
2270 #ifdef CONFIG_IP_MROUTE
2271 		if (type == RTN_MULTICAST) {
2272 			if (IN_DEV_MFORWARD(in_dev) &&
2273 			    !ipv4_is_local_multicast(fl4->daddr)) {
2274 				rth->dst.input = ip_mr_input;
2275 				rth->dst.output = ip_mc_output;
2276 			}
2277 		}
2278 #endif
2279 	}
2280 
2281 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2282 	set_lwt_redirect(rth);
2283 
2284 	return rth;
2285 }
2286 
2287 /*
2288  * Major route resolver routine.
2289  */
2290 
2291 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2292 					const struct sk_buff *skb)
2293 {
2294 	__u8 tos = RT_FL_TOS(fl4);
2295 	struct fib_result res;
2296 	struct rtable *rth;
2297 
2298 	res.tclassid	= 0;
2299 	res.fi		= NULL;
2300 	res.table	= NULL;
2301 
2302 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2303 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2304 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2305 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2306 
2307 	rcu_read_lock();
2308 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2309 	rcu_read_unlock();
2310 
2311 	return rth;
2312 }
2313 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
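
/* Illustrative sketch (not part of the original file): how an output-path
 * caller typically fills a flowi4 and resolves a route.  The helper name is
 * hypothetical; the ip_route_output_key()/ip_route_output_ports() wrappers
 * in include/net/route.h follow the same pattern.
 */
static inline struct rtable *example_output_lookup(struct net *net, __be32 daddr,
						   __be32 saddr, int oif, u8 tos)
{
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;		/* destination to route towards */
	fl4.saddr = saddr;		/* may be 0; a source is then selected */
	fl4.flowi4_oif = oif;		/* 0 means "any interface" */
	fl4.flowi4_tos = tos;

	return ip_route_output_key_hash(net, &fl4, NULL);
}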
2314 
2315 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2316 					    struct fib_result *res,
2317 					    const struct sk_buff *skb)
2318 {
2319 	struct net_device *dev_out = NULL;
2320 	int orig_oif = fl4->flowi4_oif;
2321 	unsigned int flags = 0;
2322 	struct rtable *rth;
2323 	int err = -ENETUNREACH;
2324 
2325 	if (fl4->saddr) {
2326 		rth = ERR_PTR(-EINVAL);
2327 		if (ipv4_is_multicast(fl4->saddr) ||
2328 		    ipv4_is_lbcast(fl4->saddr) ||
2329 		    ipv4_is_zeronet(fl4->saddr))
2330 			goto out;
2331 
2332 		/* I removed the check for oif == dev_out->oif here.
2333 		   It was wrong for two reasons:
2334 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2335 		      is assigned to multiple interfaces.
2336 		   2. Moreover, we are allowed to send packets with a saddr
2337 		      of another iface. --ANK
2338 		 */
2339 
2340 		if (fl4->flowi4_oif == 0 &&
2341 		    (ipv4_is_multicast(fl4->daddr) ||
2342 		     ipv4_is_lbcast(fl4->daddr))) {
2343 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2344 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2345 			if (!dev_out)
2346 				goto out;
2347 
2348 			/* Special hack: the user can direct multicasts
2349 			   and limited broadcast via the necessary interface
2350 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2351 			   This hack is not just for fun, it allows
2352 			   vic, vat and friends to work.
2353 			   They bind a socket to loopback, set the ttl to zero
2354 			   and expect that it will work.
2355 			   From the viewpoint of the routing cache they are broken,
2356 			   because we are not allowed to build a multicast path
2357 			   with a loopback source addr (look, the routing cache
2358 			   cannot know that the ttl is zero, so that the packet
2359 			   will not leave this host and the route is valid).
2360 			   Luckily, this hack is a good workaround.
2361 			 */
2362 
2363 			fl4->flowi4_oif = dev_out->ifindex;
2364 			goto make_route;
2365 		}
2366 
2367 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2368 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2369 			if (!__ip_dev_find(net, fl4->saddr, false))
2370 				goto out;
2371 		}
2372 	}
2373 
2374 
2375 	if (fl4->flowi4_oif) {
2376 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2377 		rth = ERR_PTR(-ENODEV);
2378 		if (!dev_out)
2379 			goto out;
2380 
2381 		/* RACE: Check return value of inet_select_addr instead. */
2382 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2383 			rth = ERR_PTR(-ENETUNREACH);
2384 			goto out;
2385 		}
2386 		if (ipv4_is_local_multicast(fl4->daddr) ||
2387 		    ipv4_is_lbcast(fl4->daddr) ||
2388 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2389 			if (!fl4->saddr)
2390 				fl4->saddr = inet_select_addr(dev_out, 0,
2391 							      RT_SCOPE_LINK);
2392 			goto make_route;
2393 		}
2394 		if (!fl4->saddr) {
2395 			if (ipv4_is_multicast(fl4->daddr))
2396 				fl4->saddr = inet_select_addr(dev_out, 0,
2397 							      fl4->flowi4_scope);
2398 			else if (!fl4->daddr)
2399 				fl4->saddr = inet_select_addr(dev_out, 0,
2400 							      RT_SCOPE_HOST);
2401 		}
2402 	}
2403 
2404 	if (!fl4->daddr) {
2405 		fl4->daddr = fl4->saddr;
2406 		if (!fl4->daddr)
2407 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2408 		dev_out = net->loopback_dev;
2409 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2410 		res->type = RTN_LOCAL;
2411 		flags |= RTCF_LOCAL;
2412 		goto make_route;
2413 	}
2414 
2415 	err = fib_lookup(net, fl4, res, 0);
2416 	if (err) {
2417 		res->fi = NULL;
2418 		res->table = NULL;
2419 		if (fl4->flowi4_oif &&
2420 		    (ipv4_is_multicast(fl4->daddr) ||
2421 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2422 			/* Apparently, the routing tables are wrong. Assume
2423 			   that the destination is on-link.
2424 
2425 			   WHY? DW.
2426 			   Because we are allowed to send to an iface
2427 			   even if it has NO routes and NO assigned
2428 			   addresses. When oif is specified, the routing
2429 			   tables are looked up with only one purpose:
2430 			   to check whether the destination is gatewayed rather
2431 			   than direct. Moreover, if MSG_DONTROUTE is set,
2432 			   we send the packet, ignoring both the routing tables
2433 			   and the ifaddr state. --ANK
2434 
2435 
2436 			   We could do this even if oif is unknown,
2437 			   likely as IPv6 does, but we do not.
2438 			 */
2439 
2440 			if (fl4->saddr == 0)
2441 				fl4->saddr = inet_select_addr(dev_out, 0,
2442 							      RT_SCOPE_LINK);
2443 			res->type = RTN_UNICAST;
2444 			goto make_route;
2445 		}
2446 		rth = ERR_PTR(err);
2447 		goto out;
2448 	}
2449 
2450 	if (res->type == RTN_LOCAL) {
2451 		if (!fl4->saddr) {
2452 			if (res->fi->fib_prefsrc)
2453 				fl4->saddr = res->fi->fib_prefsrc;
2454 			else
2455 				fl4->saddr = fl4->daddr;
2456 		}
2457 
2458 		/* L3 master device is the loopback for that domain */
2459 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2460 			net->loopback_dev;
2461 
2462 		/* make sure orig_oif points to fib result device even
2463 		 * though packet rx/tx happens over loopback or l3mdev
2464 		 */
2465 		orig_oif = FIB_RES_OIF(*res);
2466 
2467 		fl4->flowi4_oif = dev_out->ifindex;
2468 		flags |= RTCF_LOCAL;
2469 		goto make_route;
2470 	}
2471 
2472 	fib_select_path(net, res, fl4, skb);
2473 
2474 	dev_out = FIB_RES_DEV(*res);
2475 	fl4->flowi4_oif = dev_out->ifindex;
2476 
2477 
2478 make_route:
2479 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2480 
2481 out:
2482 	return rth;
2483 }
2484 
2485 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2486 {
2487 	return NULL;
2488 }
2489 
2490 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2491 {
2492 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2493 
2494 	return mtu ? : dst->dev->mtu;
2495 }
2496 
2497 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2498 					  struct sk_buff *skb, u32 mtu)
2499 {
2500 }
2501 
2502 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2503 				       struct sk_buff *skb)
2504 {
2505 }
2506 
2507 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2508 					  unsigned long old)
2509 {
2510 	return NULL;
2511 }
2512 
2513 static struct dst_ops ipv4_dst_blackhole_ops = {
2514 	.family			=	AF_INET,
2515 	.check			=	ipv4_blackhole_dst_check,
2516 	.mtu			=	ipv4_blackhole_mtu,
2517 	.default_advmss		=	ipv4_default_advmss,
2518 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2519 	.redirect		=	ipv4_rt_blackhole_redirect,
2520 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2521 	.neigh_lookup		=	ipv4_neigh_lookup,
2522 };
2523 
2524 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2525 {
2526 	struct rtable *ort = (struct rtable *) dst_orig;
2527 	struct rtable *rt;
2528 
2529 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2530 	if (rt) {
2531 		struct dst_entry *new = &rt->dst;
2532 
2533 		new->__use = 1;
2534 		new->input = dst_discard;
2535 		new->output = dst_discard_out;
2536 
2537 		new->dev = net->loopback_dev;
2538 		if (new->dev)
2539 			dev_hold(new->dev);
2540 
2541 		rt->rt_is_input = ort->rt_is_input;
2542 		rt->rt_iif = ort->rt_iif;
2543 		rt->rt_pmtu = ort->rt_pmtu;
2544 
2545 		rt->rt_genid = rt_genid_ipv4(net);
2546 		rt->rt_flags = ort->rt_flags;
2547 		rt->rt_type = ort->rt_type;
2548 		rt->rt_gateway = ort->rt_gateway;
2549 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2550 
2551 		INIT_LIST_HEAD(&rt->rt_uncached);
2552 	}
2553 
2554 	dst_release(dst_orig);
2555 
2556 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2557 }
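
/* Usage note (illustrative assumption, not from the original file):
 * ipv4_blackhole_route() is used by the IPv4 xfrm code as its
 * blackhole_route hook; when IPsec policy resolution cannot complete for a
 * non-blocking caller, the original route is swapped for this dst, whose
 * dst_discard / dst_discard_out handlers (set above) silently drop packets.
 */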
2558 
2559 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2560 				    const struct sock *sk)
2561 {
2562 	struct rtable *rt = __ip_route_output_key(net, flp4);
2563 
2564 	if (IS_ERR(rt))
2565 		return rt;
2566 
2567 	if (flp4->flowi4_proto)
2568 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2569 							flowi4_to_flowi(flp4),
2570 							sk, 0);
2571 
2572 	return rt;
2573 }
2574 EXPORT_SYMBOL_GPL(ip_route_output_flow);
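
/* Usage note (illustrative, not from the original file): this is the
 * xfrm-aware wrapper that socket-level callers reach via helpers such as
 * ip_route_connect() in include/net/route.h.  When flowi4_proto is set,
 * the plain route from __ip_route_output_key() is passed through
 * xfrm_lookup_route() so that IPsec policy may wrap or replace the dst
 * before it is returned.
 */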
2575 
2576 /* called with rcu_read_lock held */
2577 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2578 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2579 			u32 seq)
2580 {
2581 	struct rtable *rt = skb_rtable(skb);
2582 	struct rtmsg *r;
2583 	struct nlmsghdr *nlh;
2584 	unsigned long expires = 0;
2585 	u32 error;
2586 	u32 metrics[RTAX_MAX];
2587 
2588 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2589 	if (!nlh)
2590 		return -EMSGSIZE;
2591 
2592 	r = nlmsg_data(nlh);
2593 	r->rtm_family	 = AF_INET;
2594 	r->rtm_dst_len	= 32;
2595 	r->rtm_src_len	= 0;
2596 	r->rtm_tos	= fl4->flowi4_tos;
2597 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2598 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2599 		goto nla_put_failure;
2600 	r->rtm_type	= rt->rt_type;
2601 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2602 	r->rtm_protocol = RTPROT_UNSPEC;
2603 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2604 	if (rt->rt_flags & RTCF_NOTIFY)
2605 		r->rtm_flags |= RTM_F_NOTIFY;
2606 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2607 		r->rtm_flags |= RTCF_DOREDIRECT;
2608 
2609 	if (nla_put_in_addr(skb, RTA_DST, dst))
2610 		goto nla_put_failure;
2611 	if (src) {
2612 		r->rtm_src_len = 32;
2613 		if (nla_put_in_addr(skb, RTA_SRC, src))
2614 			goto nla_put_failure;
2615 	}
2616 	if (rt->dst.dev &&
2617 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2618 		goto nla_put_failure;
2619 #ifdef CONFIG_IP_ROUTE_CLASSID
2620 	if (rt->dst.tclassid &&
2621 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2622 		goto nla_put_failure;
2623 #endif
2624 	if (!rt_is_input_route(rt) &&
2625 	    fl4->saddr != src) {
2626 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2627 			goto nla_put_failure;
2628 	}
2629 	if (rt->rt_uses_gateway &&
2630 	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2631 		goto nla_put_failure;
2632 
2633 	expires = rt->dst.expires;
2634 	if (expires) {
2635 		unsigned long now = jiffies;
2636 
2637 		if (time_before(now, expires))
2638 			expires -= now;
2639 		else
2640 			expires = 0;
2641 	}
2642 
2643 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2644 	if (rt->rt_pmtu && expires)
2645 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2646 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2647 		goto nla_put_failure;
2648 
2649 	if (fl4->flowi4_mark &&
2650 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2651 		goto nla_put_failure;
2652 
2653 	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2654 	    nla_put_u32(skb, RTA_UID,
2655 			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2656 		goto nla_put_failure;
2657 
2658 	error = rt->dst.error;
2659 
2660 	if (rt_is_input_route(rt)) {
2661 #ifdef CONFIG_IP_MROUTE
2662 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2663 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2664 			int err = ipmr_get_route(net, skb,
2665 						 fl4->saddr, fl4->daddr,
2666 						 r, portid);
2667 
2668 			if (err <= 0) {
2669 				if (err == 0)
2670 					return 0;
2671 				goto nla_put_failure;
2672 			}
2673 		} else
2674 #endif
2675 			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2676 				goto nla_put_failure;
2677 	}
2678 
2679 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2680 		goto nla_put_failure;
2681 
2682 	nlmsg_end(skb, nlh);
2683 	return 0;
2684 
2685 nla_put_failure:
2686 	nlmsg_cancel(skb, nlh);
2687 	return -EMSGSIZE;
2688 }
2689 
2690 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2691 			     struct netlink_ext_ack *extack)
2692 {
2693 	struct net *net = sock_net(in_skb->sk);
2694 	struct rtmsg *rtm;
2695 	struct nlattr *tb[RTA_MAX+1];
2696 	struct fib_result res = {};
2697 	struct rtable *rt = NULL;
2698 	struct flowi4 fl4;
2699 	__be32 dst = 0;
2700 	__be32 src = 0;
2701 	u32 iif;
2702 	int err;
2703 	int mark;
2704 	struct sk_buff *skb;
2705 	u32 table_id = RT_TABLE_MAIN;
2706 	kuid_t uid;
2707 
2708 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2709 			  extack);
2710 	if (err < 0)
2711 		goto errout;
2712 
2713 	rtm = nlmsg_data(nlh);
2714 
2715 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2716 	if (!skb) {
2717 		err = -ENOBUFS;
2718 		goto errout;
2719 	}
2720 
2721 	/* Reserve room for dummy headers; this skb can pass
2722 	   through a good chunk of the routing engine.
2723 	 */
2724 	skb_reset_mac_header(skb);
2725 	skb_reset_network_header(skb);
2726 
2727 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2728 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2729 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2730 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2731 	if (tb[RTA_UID])
2732 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2733 	else
2734 		uid = (iif ? INVALID_UID : current_uid());
2735 
2736 	/* Bugfix: need to give ip_route_input enough of an IP header to
2737 	 * not gag.
2738 	 */
2739 	ip_hdr(skb)->protocol = IPPROTO_UDP;
2740 	ip_hdr(skb)->saddr = src;
2741 	ip_hdr(skb)->daddr = dst;
2742 
2743 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2744 
2745 	memset(&fl4, 0, sizeof(fl4));
2746 	fl4.daddr = dst;
2747 	fl4.saddr = src;
2748 	fl4.flowi4_tos = rtm->rtm_tos;
2749 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2750 	fl4.flowi4_mark = mark;
2751 	fl4.flowi4_uid = uid;
2752 
2753 	rcu_read_lock();
2754 
2755 	if (iif) {
2756 		struct net_device *dev;
2757 
2758 		dev = dev_get_by_index_rcu(net, iif);
2759 		if (!dev) {
2760 			err = -ENODEV;
2761 			goto errout_free;
2762 		}
2763 
2764 		skb->protocol	= htons(ETH_P_IP);
2765 		skb->dev	= dev;
2766 		skb->mark	= mark;
2767 		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2768 					 dev, &res);
2769 
2770 		rt = skb_rtable(skb);
2771 		if (err == 0 && rt->dst.error)
2772 			err = -rt->dst.error;
2773 	} else {
2774 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
2775 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2776 		err = 0;
2777 		if (IS_ERR(rt))
2778 			err = PTR_ERR(rt);
2779 		else
2780 			skb_dst_set(skb, &rt->dst);
2781 	}
2782 
2783 	if (err)
2784 		goto errout_free;
2785 
2786 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2787 		rt->rt_flags |= RTCF_NOTIFY;
2788 
2789 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2790 		table_id = rt->rt_table_id;
2791 
2792 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2793 		if (!res.fi) {
2794 			err = fib_props[res.type].error;
2795 			if (!err)
2796 				err = -EHOSTUNREACH;
2797 			goto errout_free;
2798 		}
2799 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2800 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2801 				    rt->rt_type, res.prefix, res.prefixlen,
2802 				    fl4.flowi4_tos, res.fi, 0);
2803 	} else {
2804 		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2805 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2806 	}
2807 	if (err < 0)
2808 		goto errout_free;
2809 
2810 	rcu_read_unlock();
2811 
2812 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2813 errout:
2814 	return err;
2815 
2816 errout_free:
2817 	rcu_read_unlock();
2818 	kfree_skb(skb);
2819 	goto errout;
2820 }
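
/* Usage note (illustrative, not from the original file): this handler backs
 * RTM_GETROUTE requests such as "ip route get 192.0.2.1 from 198.51.100.1
 * iif eth0 mark 7".  RTA_DST/RTA_SRC/RTA_IIF/RTA_OIF/RTA_MARK/RTA_UID feed
 * the fields parsed above; with an iif the input path (ip_route_input_rcu)
 * is exercised, otherwise the output path (ip_route_output_key_hash_rcu),
 * and the result is returned either via rt_fill_info() or, with
 * RTM_F_FIB_MATCH, via fib_dump_info().
 */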
2821 
2822 void ip_rt_multicast_event(struct in_device *in_dev)
2823 {
2824 	rt_cache_flush(dev_net(in_dev->dev));
2825 }
2826 
2827 #ifdef CONFIG_SYSCTL
2828 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2829 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2830 static int ip_rt_gc_elasticity __read_mostly	= 8;
2831 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
2832 
2833 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2834 					void __user *buffer,
2835 					size_t *lenp, loff_t *ppos)
2836 {
2837 	struct net *net = (struct net *)__ctl->extra1;
2838 
2839 	if (write) {
2840 		rt_cache_flush(net);
2841 		fnhe_genid_bump(net);
2842 		return 0;
2843 	}
2844 
2845 	return -EINVAL;
2846 }
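
/* Usage note (illustrative, not from the original file): the handler above
 * backs the write-only /proc/sys/net/ipv4/route/flush entry (mode 0200 in
 * ipv4_route_flush_table below).  Writing any value, e.g.
 * "sysctl -w net.ipv4.route.flush=1", flushes the writing netns's cached
 * routes and bumps its fnhe generation id; reads fail with -EINVAL.
 */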
2847 
2848 static struct ctl_table ipv4_route_table[] = {
2849 	{
2850 		.procname	= "gc_thresh",
2851 		.data		= &ipv4_dst_ops.gc_thresh,
2852 		.maxlen		= sizeof(int),
2853 		.mode		= 0644,
2854 		.proc_handler	= proc_dointvec,
2855 	},
2856 	{
2857 		.procname	= "max_size",
2858 		.data		= &ip_rt_max_size,
2859 		.maxlen		= sizeof(int),
2860 		.mode		= 0644,
2861 		.proc_handler	= proc_dointvec,
2862 	},
2863 	{
2864 		/*  Deprecated. Use gc_min_interval_ms */
2865 
2866 		.procname	= "gc_min_interval",
2867 		.data		= &ip_rt_gc_min_interval,
2868 		.maxlen		= sizeof(int),
2869 		.mode		= 0644,
2870 		.proc_handler	= proc_dointvec_jiffies,
2871 	},
2872 	{
2873 		.procname	= "gc_min_interval_ms",
2874 		.data		= &ip_rt_gc_min_interval,
2875 		.maxlen		= sizeof(int),
2876 		.mode		= 0644,
2877 		.proc_handler	= proc_dointvec_ms_jiffies,
2878 	},
2879 	{
2880 		.procname	= "gc_timeout",
2881 		.data		= &ip_rt_gc_timeout,
2882 		.maxlen		= sizeof(int),
2883 		.mode		= 0644,
2884 		.proc_handler	= proc_dointvec_jiffies,
2885 	},
2886 	{
2887 		.procname	= "gc_interval",
2888 		.data		= &ip_rt_gc_interval,
2889 		.maxlen		= sizeof(int),
2890 		.mode		= 0644,
2891 		.proc_handler	= proc_dointvec_jiffies,
2892 	},
2893 	{
2894 		.procname	= "redirect_load",
2895 		.data		= &ip_rt_redirect_load,
2896 		.maxlen		= sizeof(int),
2897 		.mode		= 0644,
2898 		.proc_handler	= proc_dointvec,
2899 	},
2900 	{
2901 		.procname	= "redirect_number",
2902 		.data		= &ip_rt_redirect_number,
2903 		.maxlen		= sizeof(int),
2904 		.mode		= 0644,
2905 		.proc_handler	= proc_dointvec,
2906 	},
2907 	{
2908 		.procname	= "redirect_silence",
2909 		.data		= &ip_rt_redirect_silence,
2910 		.maxlen		= sizeof(int),
2911 		.mode		= 0644,
2912 		.proc_handler	= proc_dointvec,
2913 	},
2914 	{
2915 		.procname	= "error_cost",
2916 		.data		= &ip_rt_error_cost,
2917 		.maxlen		= sizeof(int),
2918 		.mode		= 0644,
2919 		.proc_handler	= proc_dointvec,
2920 	},
2921 	{
2922 		.procname	= "error_burst",
2923 		.data		= &ip_rt_error_burst,
2924 		.maxlen		= sizeof(int),
2925 		.mode		= 0644,
2926 		.proc_handler	= proc_dointvec,
2927 	},
2928 	{
2929 		.procname	= "gc_elasticity",
2930 		.data		= &ip_rt_gc_elasticity,
2931 		.maxlen		= sizeof(int),
2932 		.mode		= 0644,
2933 		.proc_handler	= proc_dointvec,
2934 	},
2935 	{
2936 		.procname	= "mtu_expires",
2937 		.data		= &ip_rt_mtu_expires,
2938 		.maxlen		= sizeof(int),
2939 		.mode		= 0644,
2940 		.proc_handler	= proc_dointvec_jiffies,
2941 	},
2942 	{
2943 		.procname	= "min_pmtu",
2944 		.data		= &ip_rt_min_pmtu,
2945 		.maxlen		= sizeof(int),
2946 		.mode		= 0644,
2947 		.proc_handler	= proc_dointvec_minmax,
2948 		.extra1		= &ip_min_valid_pmtu,
2949 	},
2950 	{
2951 		.procname	= "min_adv_mss",
2952 		.data		= &ip_rt_min_advmss,
2953 		.maxlen		= sizeof(int),
2954 		.mode		= 0644,
2955 		.proc_handler	= proc_dointvec,
2956 	},
2957 	{ }
2958 };
2959 
2960 static struct ctl_table ipv4_route_flush_table[] = {
2961 	{
2962 		.procname	= "flush",
2963 		.maxlen		= sizeof(int),
2964 		.mode		= 0200,
2965 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2966 	},
2967 	{ },
2968 };
2969 
2970 static __net_init int sysctl_route_net_init(struct net *net)
2971 {
2972 	struct ctl_table *tbl;
2973 
2974 	tbl = ipv4_route_flush_table;
2975 	if (!net_eq(net, &init_net)) {
2976 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2977 		if (!tbl)
2978 			goto err_dup;
2979 
2980 		/* Don't export sysctls to unprivileged users */
2981 		if (net->user_ns != &init_user_ns)
2982 			tbl[0].procname = NULL;
2983 	}
2984 	tbl[0].extra1 = net;
2985 
2986 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2987 	if (!net->ipv4.route_hdr)
2988 		goto err_reg;
2989 	return 0;
2990 
2991 err_reg:
2992 	if (tbl != ipv4_route_flush_table)
2993 		kfree(tbl);
2994 err_dup:
2995 	return -ENOMEM;
2996 }
2997 
2998 static __net_exit void sysctl_route_net_exit(struct net *net)
2999 {
3000 	struct ctl_table *tbl;
3001 
3002 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3003 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3004 	BUG_ON(tbl == ipv4_route_flush_table);
3005 	kfree(tbl);
3006 }
3007 
3008 static __net_initdata struct pernet_operations sysctl_route_ops = {
3009 	.init = sysctl_route_net_init,
3010 	.exit = sysctl_route_net_exit,
3011 };
3012 #endif
3013 
3014 static __net_init int rt_genid_init(struct net *net)
3015 {
3016 	atomic_set(&net->ipv4.rt_genid, 0);
3017 	atomic_set(&net->fnhe_genid, 0);
3018 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3019 	return 0;
3020 }
3021 
3022 static __net_initdata struct pernet_operations rt_genid_ops = {
3023 	.init = rt_genid_init,
3024 };
3025 
3026 static int __net_init ipv4_inetpeer_init(struct net *net)
3027 {
3028 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3029 
3030 	if (!bp)
3031 		return -ENOMEM;
3032 	inet_peer_base_init(bp);
3033 	net->ipv4.peers = bp;
3034 	return 0;
3035 }
3036 
3037 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3038 {
3039 	struct inet_peer_base *bp = net->ipv4.peers;
3040 
3041 	net->ipv4.peers = NULL;
3042 	inetpeer_invalidate_tree(bp);
3043 	kfree(bp);
3044 }
3045 
3046 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3047 	.init	=	ipv4_inetpeer_init,
3048 	.exit	=	ipv4_inetpeer_exit,
3049 };
3050 
3051 #ifdef CONFIG_IP_ROUTE_CLASSID
3052 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3053 #endif /* CONFIG_IP_ROUTE_CLASSID */
3054 
3055 int __init ip_rt_init(void)
3056 {
3057 	int cpu;
3058 
3059 	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3060 	if (!ip_idents)
3061 		panic("IP: failed to allocate ip_idents\n");
3062 
3063 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3064 
3065 	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3066 	if (!ip_tstamps)
3067 		panic("IP: failed to allocate ip_tstamps\n");
3068 
3069 	for_each_possible_cpu(cpu) {
3070 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3071 
3072 		INIT_LIST_HEAD(&ul->head);
3073 		spin_lock_init(&ul->lock);
3074 	}
3075 #ifdef CONFIG_IP_ROUTE_CLASSID
3076 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3077 	if (!ip_rt_acct)
3078 		panic("IP: failed to allocate ip_rt_acct\n");
3079 #endif
3080 
3081 	ipv4_dst_ops.kmem_cachep =
3082 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3083 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3084 
3085 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3086 
3087 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3088 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3089 
3090 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3091 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3092 
3093 	ipv4_dst_ops.gc_thresh = ~0;
3094 	ip_rt_max_size = INT_MAX;
3095 
3096 	devinet_init();
3097 	ip_fib_init();
3098 
3099 	if (ip_rt_proc_init())
3100 		pr_err("Unable to create route proc files\n");
3101 #ifdef CONFIG_XFRM
3102 	xfrm_init();
3103 	xfrm4_init();
3104 #endif
3105 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3106 		      RTNL_FLAG_DOIT_UNLOCKED);
3107 
3108 #ifdef CONFIG_SYSCTL
3109 	register_pernet_subsys(&sysctl_route_ops);
3110 #endif
3111 	register_pernet_subsys(&rt_genid_ops);
3112 	register_pernet_subsys(&ipv4_inetpeer_ops);
3113 	return 0;
3114 }
3115 
3116 #ifdef CONFIG_SYSCTL
3117 /*
3118  * We really need to sanitize the damn ipv4 init order, then all
3119  * this nonsense will go away.
3120  */
3121 void __init ip_static_sysctl_init(void)
3122 {
3123 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3124 }
3125 #endif
3126