xref: /openbmc/linux/net/ipv4/route.c (revision 5a244f48)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD,
35  *					though our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116 
117 #include "fib_lookup.h"
118 
119 #define RT_FL_TOS(oldflp4) \
120 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
121 
122 #define RT_GC_TIMEOUT (300*HZ)
123 
124 static int ip_rt_max_size;
125 static int ip_rt_redirect_number __read_mostly	= 9;
126 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly	= HZ;
129 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
130 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly	= 256;
133 
134 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
135 /*
136  *	Interface to generic destination cache.
137  */
138 
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143 static void		 ipv4_link_failure(struct sk_buff *skb);
144 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
145 					   struct sk_buff *skb, u32 mtu);
146 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 					struct sk_buff *skb);
148 static void		ipv4_dst_destroy(struct dst_entry *dst);
149 
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152 	WARN_ON(1);
153 	return NULL;
154 }
155 
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 					   struct sk_buff *skb,
158 					   const void *daddr);
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160 
161 static struct dst_ops ipv4_dst_ops = {
162 	.family =		AF_INET,
163 	.check =		ipv4_dst_check,
164 	.default_advmss =	ipv4_default_advmss,
165 	.mtu =			ipv4_mtu,
166 	.cow_metrics =		ipv4_cow_metrics,
167 	.destroy =		ipv4_dst_destroy,
168 	.negative_advice =	ipv4_negative_advice,
169 	.link_failure =		ipv4_link_failure,
170 	.update_pmtu =		ip_rt_update_pmtu,
171 	.redirect =		ip_do_redirect,
172 	.local_out =		__ip_local_out,
173 	.neigh_lookup =		ipv4_neigh_lookup,
174 	.confirm_neigh =	ipv4_confirm_neigh,
175 };
176 
177 #define ECN_OR_COST(class)	TC_PRIO_##class
178 
179 const __u8 ip_tos2prio[16] = {
180 	TC_PRIO_BESTEFFORT,
181 	ECN_OR_COST(BESTEFFORT),
182 	TC_PRIO_BESTEFFORT,
183 	ECN_OR_COST(BESTEFFORT),
184 	TC_PRIO_BULK,
185 	ECN_OR_COST(BULK),
186 	TC_PRIO_BULK,
187 	ECN_OR_COST(BULK),
188 	TC_PRIO_INTERACTIVE,
189 	ECN_OR_COST(INTERACTIVE),
190 	TC_PRIO_INTERACTIVE,
191 	ECN_OR_COST(INTERACTIVE),
192 	TC_PRIO_INTERACTIVE_BULK,
193 	ECN_OR_COST(INTERACTIVE_BULK),
194 	TC_PRIO_INTERACTIVE_BULK,
195 	ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
198 
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201 
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205 	if (*pos)
206 		return NULL;
207 	return SEQ_START_TOKEN;
208 }
209 
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212 	++*pos;
213 	return NULL;
214 }
215 
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219 
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222 	if (v == SEQ_START_TOKEN)
223 		seq_printf(seq, "%-127s\n",
224 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 			   "HHUptod\tSpecDst");
227 	return 0;
228 }
229 
230 static const struct seq_operations rt_cache_seq_ops = {
231 	.start  = rt_cache_seq_start,
232 	.next   = rt_cache_seq_next,
233 	.stop   = rt_cache_seq_stop,
234 	.show   = rt_cache_seq_show,
235 };
236 
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239 	return seq_open(file, &rt_cache_seq_ops);
240 }
241 
242 static const struct file_operations rt_cache_seq_fops = {
243 	.owner	 = THIS_MODULE,
244 	.open	 = rt_cache_seq_open,
245 	.read	 = seq_read,
246 	.llseek	 = seq_lseek,
247 	.release = seq_release,
248 };
249 
250 
251 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
252 {
253 	int cpu;
254 
255 	if (*pos == 0)
256 		return SEQ_START_TOKEN;
257 
258 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
259 		if (!cpu_possible(cpu))
260 			continue;
261 		*pos = cpu+1;
262 		return &per_cpu(rt_cache_stat, cpu);
263 	}
264 	return NULL;
265 }
266 
267 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
268 {
269 	int cpu;
270 
271 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
272 		if (!cpu_possible(cpu))
273 			continue;
274 		*pos = cpu+1;
275 		return &per_cpu(rt_cache_stat, cpu);
276 	}
277 	return NULL;
278 
279 }
280 
281 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
282 {
283 
284 }
285 
286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287 {
288 	struct rt_cache_stat *st = v;
289 
290 	if (v == SEQ_START_TOKEN) {
291 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
292 		return 0;
293 	}
294 
295 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
296 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
297 		   dst_entries_get_slow(&ipv4_dst_ops),
298 		   0, /* st->in_hit */
299 		   st->in_slow_tot,
300 		   st->in_slow_mc,
301 		   st->in_no_route,
302 		   st->in_brd,
303 		   st->in_martian_dst,
304 		   st->in_martian_src,
305 
306 		   0, /* st->out_hit */
307 		   st->out_slow_tot,
308 		   st->out_slow_mc,
309 
310 		   0, /* st->gc_total */
311 		   0, /* st->gc_ignored */
312 		   0, /* st->gc_goal_miss */
313 		   0, /* st->gc_dst_overflow */
314 		   0, /* st->in_hlist_search */
315 		   0  /* st->out_hlist_search */
316 		);
317 	return 0;
318 }
319 
320 static const struct seq_operations rt_cpu_seq_ops = {
321 	.start  = rt_cpu_seq_start,
322 	.next   = rt_cpu_seq_next,
323 	.stop   = rt_cpu_seq_stop,
324 	.show   = rt_cpu_seq_show,
325 };
326 
327 
328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329 {
330 	return seq_open(file, &rt_cpu_seq_ops);
331 }
332 
333 static const struct file_operations rt_cpu_seq_fops = {
334 	.owner	 = THIS_MODULE,
335 	.open	 = rt_cpu_seq_open,
336 	.read	 = seq_read,
337 	.llseek	 = seq_lseek,
338 	.release = seq_release,
339 };
340 
341 #ifdef CONFIG_IP_ROUTE_CLASSID
342 static int rt_acct_proc_show(struct seq_file *m, void *v)
343 {
344 	struct ip_rt_acct *dst, *src;
345 	unsigned int i, j;
346 
347 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
348 	if (!dst)
349 		return -ENOMEM;
350 
351 	for_each_possible_cpu(i) {
352 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
353 		for (j = 0; j < 256; j++) {
354 			dst[j].o_bytes   += src[j].o_bytes;
355 			dst[j].o_packets += src[j].o_packets;
356 			dst[j].i_bytes   += src[j].i_bytes;
357 			dst[j].i_packets += src[j].i_packets;
358 		}
359 	}
360 
361 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
362 	kfree(dst);
363 	return 0;
364 }
365 
366 static int rt_acct_proc_open(struct inode *inode, struct file *file)
367 {
368 	return single_open(file, rt_acct_proc_show, NULL);
369 }
370 
371 static const struct file_operations rt_acct_proc_fops = {
372 	.owner		= THIS_MODULE,
373 	.open		= rt_acct_proc_open,
374 	.read		= seq_read,
375 	.llseek		= seq_lseek,
376 	.release	= single_release,
377 };
378 #endif
379 
380 static int __net_init ip_rt_do_proc_init(struct net *net)
381 {
382 	struct proc_dir_entry *pde;
383 
384 	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
385 			  &rt_cache_seq_fops);
386 	if (!pde)
387 		goto err1;
388 
389 	pde = proc_create("rt_cache", S_IRUGO,
390 			  net->proc_net_stat, &rt_cpu_seq_fops);
391 	if (!pde)
392 		goto err2;
393 
394 #ifdef CONFIG_IP_ROUTE_CLASSID
395 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
396 	if (!pde)
397 		goto err3;
398 #endif
399 	return 0;
400 
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 err3:
403 	remove_proc_entry("rt_cache", net->proc_net_stat);
404 #endif
405 err2:
406 	remove_proc_entry("rt_cache", net->proc_net);
407 err1:
408 	return -ENOMEM;
409 }
410 
411 static void __net_exit ip_rt_do_proc_exit(struct net *net)
412 {
413 	remove_proc_entry("rt_cache", net->proc_net_stat);
414 	remove_proc_entry("rt_cache", net->proc_net);
415 #ifdef CONFIG_IP_ROUTE_CLASSID
416 	remove_proc_entry("rt_acct", net->proc_net);
417 #endif
418 }
419 
420 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
421 	.init = ip_rt_do_proc_init,
422 	.exit = ip_rt_do_proc_exit,
423 };
424 
425 static int __init ip_rt_proc_init(void)
426 {
427 	return register_pernet_subsys(&ip_rt_proc_ops);
428 }
429 
430 #else
431 static inline int ip_rt_proc_init(void)
432 {
433 	return 0;
434 }
435 #endif /* CONFIG_PROC_FS */
436 
437 static inline bool rt_is_expired(const struct rtable *rth)
438 {
439 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
440 }
441 
442 void rt_cache_flush(struct net *net)
443 {
444 	rt_genid_bump_ipv4(net);
445 }
446 
447 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
448 					   struct sk_buff *skb,
449 					   const void *daddr)
450 {
451 	struct net_device *dev = dst->dev;
452 	const __be32 *pkey = daddr;
453 	const struct rtable *rt;
454 	struct neighbour *n;
455 
456 	rt = (const struct rtable *) dst;
457 	if (rt->rt_gateway)
458 		pkey = (const __be32 *) &rt->rt_gateway;
459 	else if (skb)
460 		pkey = &ip_hdr(skb)->daddr;
461 
462 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
463 	if (n)
464 		return n;
465 	return neigh_create(&arp_tbl, pkey, dev);
466 }
467 
468 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
469 {
470 	struct net_device *dev = dst->dev;
471 	const __be32 *pkey = daddr;
472 	const struct rtable *rt;
473 
474 	rt = (const struct rtable *)dst;
475 	if (rt->rt_gateway)
476 		pkey = (const __be32 *)&rt->rt_gateway;
477 	else if (!daddr ||
478 		 (rt->rt_flags &
479 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
480 		return;
481 
482 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
483 }
484 
485 #define IP_IDENTS_SZ 2048u
486 
487 static atomic_t *ip_idents __read_mostly;
488 static u32 *ip_tstamps __read_mostly;
489 
490 /* In order to protect privacy, we add a perturbation to identifiers
491  * if one generator is seldom used. This makes it hard for an attacker
492  * to infer how many packets were sent between two points in time.
493  */
494 u32 ip_idents_reserve(u32 hash, int segs)
495 {
496 	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
497 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
498 	u32 old = ACCESS_ONCE(*p_tstamp);
499 	u32 now = (u32)jiffies;
500 	u32 new, delta = 0;
501 
502 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
503 		delta = prandom_u32_max(now - old);
504 
505 	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
506 	do {
507 		old = (u32)atomic_read(p_id);
508 		new = old + delta + segs;
509 	} while (atomic_cmpxchg(p_id, old, new) != old);
510 
511 	return new - segs;
512 }
513 EXPORT_SYMBOL(ip_idents_reserve);
514 
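/* Pick an IP ID for a locally generated datagram (or a GSO train of @segs
 * segments): hash (daddr, saddr, protocol) with a boot-time random key into
 * one of the IP_IDENTS_SZ generators above and reserve @segs consecutive
 * ids from it.
 */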
515 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
516 {
517 	static u32 ip_idents_hashrnd __read_mostly;
518 	u32 hash, id;
519 
520 	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
521 
522 	hash = jhash_3words((__force u32)iph->daddr,
523 			    (__force u32)iph->saddr,
524 			    iph->protocol ^ net_hash_mix(net),
525 			    ip_idents_hashrnd);
526 	id = ip_idents_reserve(hash, segs);
527 	iph->id = htons(id);
528 }
529 EXPORT_SYMBOL(__ip_select_ident);
530 
531 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
532 			     const struct sock *sk,
533 			     const struct iphdr *iph,
534 			     int oif, u8 tos,
535 			     u8 prot, u32 mark, int flow_flags)
536 {
537 	if (sk) {
538 		const struct inet_sock *inet = inet_sk(sk);
539 
540 		oif = sk->sk_bound_dev_if;
541 		mark = sk->sk_mark;
542 		tos = RT_CONN_FLAGS(sk);
543 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
544 	}
545 	flowi4_init_output(fl4, oif, mark, tos,
546 			   RT_SCOPE_UNIVERSE, prot,
547 			   flow_flags,
548 			   iph->daddr, iph->saddr, 0, 0,
549 			   sock_net_uid(net, sk));
550 }
551 
552 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
553 			       const struct sock *sk)
554 {
555 	const struct net *net = dev_net(skb->dev);
556 	const struct iphdr *iph = ip_hdr(skb);
557 	int oif = skb->dev->ifindex;
558 	u8 tos = RT_TOS(iph->tos);
559 	u8 prot = iph->protocol;
560 	u32 mark = skb->mark;
561 
562 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
563 }
564 
565 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
566 {
567 	const struct inet_sock *inet = inet_sk(sk);
568 	const struct ip_options_rcu *inet_opt;
569 	__be32 daddr = inet->inet_daddr;
570 
571 	rcu_read_lock();
572 	inet_opt = rcu_dereference(inet->inet_opt);
573 	if (inet_opt && inet_opt->opt.srr)
574 		daddr = inet_opt->opt.faddr;
575 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
576 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
577 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
578 			   inet_sk_flowi_flags(sk),
579 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
580 	rcu_read_unlock();
581 }
582 
583 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
584 				 const struct sk_buff *skb)
585 {
586 	if (skb)
587 		build_skb_flow_key(fl4, skb, sk);
588 	else
589 		build_sk_flow_key(fl4, sk);
590 }
591 
592 static DEFINE_SPINLOCK(fnhe_lock);
593 
594 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
595 {
596 	struct rtable *rt;
597 
598 	rt = rcu_dereference(fnhe->fnhe_rth_input);
599 	if (rt) {
600 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
601 		dst_dev_put(&rt->dst);
602 		dst_release(&rt->dst);
603 	}
604 	rt = rcu_dereference(fnhe->fnhe_rth_output);
605 	if (rt) {
606 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
607 		dst_dev_put(&rt->dst);
608 		dst_release(&rt->dst);
609 	}
610 }
611 
612 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
613 {
614 	struct fib_nh_exception *fnhe, *oldest;
615 
616 	oldest = rcu_dereference(hash->chain);
617 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
618 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
619 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
620 			oldest = fnhe;
621 	}
622 	fnhe_flush_routes(oldest);
623 	return oldest;
624 }
625 
626 static inline u32 fnhe_hashfun(__be32 daddr)
627 {
628 	static u32 fnhe_hashrnd __read_mostly;
629 	u32 hval;
630 
631 	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
632 	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
633 	return hash_32(hval, FNHE_HASH_SHIFT);
634 }
635 
636 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
637 {
638 	rt->rt_pmtu = fnhe->fnhe_pmtu;
639 	rt->dst.expires = fnhe->fnhe_expires;
640 
641 	if (fnhe->fnhe_gw) {
642 		rt->rt_flags |= RTCF_REDIRECTED;
643 		rt->rt_gateway = fnhe->fnhe_gw;
644 		rt->rt_uses_gateway = 1;
645 	}
646 }
647 
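/* Record a redirect gateway and/or a learned PMTU as a next-hop exception
 * for @daddr on @nh.  The exception hash is allocated lazily; if a chain
 * grows past FNHE_RECLAIM_DEPTH the oldest entry is recycled.  Cached
 * routes that might be affected are refreshed or marked obsolete so the
 * next lookup re-validates them.
 */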
648 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
649 				  u32 pmtu, unsigned long expires)
650 {
651 	struct fnhe_hash_bucket *hash;
652 	struct fib_nh_exception *fnhe;
653 	struct rtable *rt;
654 	unsigned int i;
655 	int depth;
656 	u32 hval = fnhe_hashfun(daddr);
657 
658 	spin_lock_bh(&fnhe_lock);
659 
660 	hash = rcu_dereference(nh->nh_exceptions);
661 	if (!hash) {
662 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
663 		if (!hash)
664 			goto out_unlock;
665 		rcu_assign_pointer(nh->nh_exceptions, hash);
666 	}
667 
668 	hash += hval;
669 
670 	depth = 0;
671 	for (fnhe = rcu_dereference(hash->chain); fnhe;
672 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
673 		if (fnhe->fnhe_daddr == daddr)
674 			break;
675 		depth++;
676 	}
677 
678 	if (fnhe) {
679 		if (gw)
680 			fnhe->fnhe_gw = gw;
681 		if (pmtu) {
682 			fnhe->fnhe_pmtu = pmtu;
683 			fnhe->fnhe_expires = max(1UL, expires);
684 		}
685 		/* Update all cached dsts too */
686 		rt = rcu_dereference(fnhe->fnhe_rth_input);
687 		if (rt)
688 			fill_route_from_fnhe(rt, fnhe);
689 		rt = rcu_dereference(fnhe->fnhe_rth_output);
690 		if (rt)
691 			fill_route_from_fnhe(rt, fnhe);
692 	} else {
693 		if (depth > FNHE_RECLAIM_DEPTH)
694 			fnhe = fnhe_oldest(hash);
695 		else {
696 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
697 			if (!fnhe)
698 				goto out_unlock;
699 
700 			fnhe->fnhe_next = hash->chain;
701 			rcu_assign_pointer(hash->chain, fnhe);
702 		}
703 		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
704 		fnhe->fnhe_daddr = daddr;
705 		fnhe->fnhe_gw = gw;
706 		fnhe->fnhe_pmtu = pmtu;
707 		fnhe->fnhe_expires = expires;
708 
709 		/* Exception created; mark the cached routes for the nexthop
710 		 * stale, so anyone caching them rechecks whether this
711 		 * exception applies to them.
712 		 */
713 		rt = rcu_dereference(nh->nh_rth_input);
714 		if (rt)
715 			rt->dst.obsolete = DST_OBSOLETE_KILL;
716 
717 		for_each_possible_cpu(i) {
718 			struct rtable __rcu **prt;
719 			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
720 			rt = rcu_dereference(*prt);
721 			if (rt)
722 				rt->dst.obsolete = DST_OBSOLETE_KILL;
723 		}
724 	}
725 
726 	fnhe->fnhe_stamp = jiffies;
727 
728 out_unlock:
729 	spin_unlock_bh(&fnhe_lock);
730 }
731 
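/* Process an ICMP redirect for @rt: check the redirect type, validate the
 * advertised gateway (it must not be multicast, broadcast or zeronet and
 * must be reachable on the link), resolve it in the neighbour table, and if
 * acceptable record it as a next-hop exception; with @kill_route the
 * current cached route is also marked DST_OBSOLETE_KILL so it is replaced.
 */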
732 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
733 			     bool kill_route)
734 {
735 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
736 	__be32 old_gw = ip_hdr(skb)->saddr;
737 	struct net_device *dev = skb->dev;
738 	struct in_device *in_dev;
739 	struct fib_result res;
740 	struct neighbour *n;
741 	struct net *net;
742 
743 	switch (icmp_hdr(skb)->code & 7) {
744 	case ICMP_REDIR_NET:
745 	case ICMP_REDIR_NETTOS:
746 	case ICMP_REDIR_HOST:
747 	case ICMP_REDIR_HOSTTOS:
748 		break;
749 
750 	default:
751 		return;
752 	}
753 
754 	if (rt->rt_gateway != old_gw)
755 		return;
756 
757 	in_dev = __in_dev_get_rcu(dev);
758 	if (!in_dev)
759 		return;
760 
761 	net = dev_net(dev);
762 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
763 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
764 	    ipv4_is_zeronet(new_gw))
765 		goto reject_redirect;
766 
767 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
768 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
769 			goto reject_redirect;
770 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
771 			goto reject_redirect;
772 	} else {
773 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
774 			goto reject_redirect;
775 	}
776 
777 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
778 	if (!n)
779 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
780 	if (!IS_ERR(n)) {
781 		if (!(n->nud_state & NUD_VALID)) {
782 			neigh_event_send(n, NULL);
783 		} else {
784 			if (fib_lookup(net, fl4, &res, 0) == 0) {
785 				struct fib_nh *nh = &FIB_RES_NH(res);
786 
787 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
788 						0, jiffies + ip_rt_gc_timeout);
789 			}
790 			if (kill_route)
791 				rt->dst.obsolete = DST_OBSOLETE_KILL;
792 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
793 		}
794 		neigh_release(n);
795 	}
796 	return;
797 
798 reject_redirect:
799 #ifdef CONFIG_IP_ROUTE_VERBOSE
800 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
801 		const struct iphdr *iph = (const struct iphdr *) skb->data;
802 		__be32 daddr = iph->daddr;
803 		__be32 saddr = iph->saddr;
804 
805 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
806 				     "  Advised path = %pI4 -> %pI4\n",
807 				     &old_gw, dev->name, &new_gw,
808 				     &saddr, &daddr);
809 	}
810 #endif
811 	;
812 }
813 
814 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
815 {
816 	struct rtable *rt;
817 	struct flowi4 fl4;
818 	const struct iphdr *iph = (const struct iphdr *) skb->data;
819 	struct net *net = dev_net(skb->dev);
820 	int oif = skb->dev->ifindex;
821 	u8 tos = RT_TOS(iph->tos);
822 	u8 prot = iph->protocol;
823 	u32 mark = skb->mark;
824 
825 	rt = (struct rtable *) dst;
826 
827 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
828 	__ip_do_redirect(rt, skb, &fl4, true);
829 }
830 
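/* Called when a socket gets negative advice about its cached route: drop it
 * if it is obsolete, was installed by a redirect, or carries an expiring
 * PMTU, forcing the caller to perform a fresh lookup.
 */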
831 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
832 {
833 	struct rtable *rt = (struct rtable *)dst;
834 	struct dst_entry *ret = dst;
835 
836 	if (rt) {
837 		if (dst->obsolete > 0) {
838 			ip_rt_put(rt);
839 			ret = NULL;
840 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
841 			   rt->dst.expires) {
842 			ip_rt_put(rt);
843 			ret = NULL;
844 		}
845 	}
846 	return ret;
847 }
848 
849 /*
850  * Algorithm:
851  *	1. The first ip_rt_redirect_number redirects are sent
852  *	   with exponential backoff, then we stop sending them at all,
853  *	   assuming that the host ignores our redirects.
854  *	2. If we did not see packets requiring redirects
855  *	   during ip_rt_redirect_silence, we assume that the host
856  *	   forgot the redirected route and start sending redirects again.
857  *
858  * This algorithm is much cheaper and more intelligent than dumb load limiting
859  * in icmp.c.
860  *
861  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
862  * and "frag. need" (breaks PMTU discovery) in icmp.c.
863  */
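/* A rough worked example (editorial note, not from the original source),
 * assuming the default values defined above (ip_rt_redirect_number = 9,
 * ip_rt_redirect_load = HZ/50, ip_rt_redirect_silence = (HZ/50) << 10):
 * the first redirect to a peer is sent immediately; the k-th one only once
 * jiffies has passed rate_last + (ip_rt_redirect_load << k), i.e. roughly
 * 40ms, 80ms, 160ms, ... between consecutive redirects.  After 9 redirects
 * nothing more is sent until the peer triggers no redirects for about 20
 * seconds, which resets rate_tokens.
 */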
864 
865 void ip_rt_send_redirect(struct sk_buff *skb)
866 {
867 	struct rtable *rt = skb_rtable(skb);
868 	struct in_device *in_dev;
869 	struct inet_peer *peer;
870 	struct net *net;
871 	int log_martians;
872 	int vif;
873 
874 	rcu_read_lock();
875 	in_dev = __in_dev_get_rcu(rt->dst.dev);
876 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
877 		rcu_read_unlock();
878 		return;
879 	}
880 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
881 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
882 	rcu_read_unlock();
883 
884 	net = dev_net(rt->dst.dev);
885 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
886 	if (!peer) {
887 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
888 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
889 		return;
890 	}
891 
892 	/* No redirected packets during ip_rt_redirect_silence;
893 	 * reset the algorithm.
894 	 */
895 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
896 		peer->rate_tokens = 0;
897 
898 	/* Too many ignored redirects; do not send anything;
899 	 * just set peer->rate_last to the last seen redirected packet.
900 	 */
901 	if (peer->rate_tokens >= ip_rt_redirect_number) {
902 		peer->rate_last = jiffies;
903 		goto out_put_peer;
904 	}
905 
906 	/* Check for load limit; set rate_last to the latest sent
907 	 * redirect.
908 	 */
909 	if (peer->rate_tokens == 0 ||
910 	    time_after(jiffies,
911 		       (peer->rate_last +
912 			(ip_rt_redirect_load << peer->rate_tokens)))) {
913 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
914 
915 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
916 		peer->rate_last = jiffies;
917 		++peer->rate_tokens;
918 #ifdef CONFIG_IP_ROUTE_VERBOSE
919 		if (log_martians &&
920 		    peer->rate_tokens == ip_rt_redirect_number)
921 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
922 					     &ip_hdr(skb)->saddr, inet_iif(skb),
923 					     &ip_hdr(skb)->daddr, &gw);
924 #endif
925 	}
926 out_put_peer:
927 	inet_putpeer(peer);
928 }
929 
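/* dst.input handler for packets that hit an error route: account the error
 * in the SNMP counters, rate-limit per source address via the inet_peer
 * token bucket (ip_rt_error_cost / ip_rt_error_burst) and, if allowed, send
 * the matching ICMP destination-unreachable code before freeing the skb.
 */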
930 static int ip_error(struct sk_buff *skb)
931 {
932 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
933 	struct rtable *rt = skb_rtable(skb);
934 	struct inet_peer *peer;
935 	unsigned long now;
936 	struct net *net;
937 	bool send;
938 	int code;
939 
940 	/* IP on this device is disabled. */
941 	if (!in_dev)
942 		goto out;
943 
944 	net = dev_net(rt->dst.dev);
945 	if (!IN_DEV_FORWARD(in_dev)) {
946 		switch (rt->dst.error) {
947 		case EHOSTUNREACH:
948 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
949 			break;
950 
951 		case ENETUNREACH:
952 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
953 			break;
954 		}
955 		goto out;
956 	}
957 
958 	switch (rt->dst.error) {
959 	case EINVAL:
960 	default:
961 		goto out;
962 	case EHOSTUNREACH:
963 		code = ICMP_HOST_UNREACH;
964 		break;
965 	case ENETUNREACH:
966 		code = ICMP_NET_UNREACH;
967 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
968 		break;
969 	case EACCES:
970 		code = ICMP_PKT_FILTERED;
971 		break;
972 	}
973 
974 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
975 			       l3mdev_master_ifindex(skb->dev), 1);
976 
977 	send = true;
978 	if (peer) {
979 		now = jiffies;
980 		peer->rate_tokens += now - peer->rate_last;
981 		if (peer->rate_tokens > ip_rt_error_burst)
982 			peer->rate_tokens = ip_rt_error_burst;
983 		peer->rate_last = now;
984 		if (peer->rate_tokens >= ip_rt_error_cost)
985 			peer->rate_tokens -= ip_rt_error_cost;
986 		else
987 			send = false;
988 		inet_putpeer(peer);
989 	}
990 	if (send)
991 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
992 
993 out:	kfree_skb(skb);
994 	return 0;
995 }
996 
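/* Apply a learned path MTU to @rt's next hop: increases and locked MTU
 * metrics are ignored, the value is clamped to ip_rt_min_pmtu, and the
 * result is stored as a next-hop exception that expires after
 * ip_rt_mtu_expires.
 */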
997 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
998 {
999 	struct dst_entry *dst = &rt->dst;
1000 	struct fib_result res;
1001 
1002 	if (dst_metric_locked(dst, RTAX_MTU))
1003 		return;
1004 
1005 	if (ipv4_mtu(dst) < mtu)
1006 		return;
1007 
1008 	if (mtu < ip_rt_min_pmtu)
1009 		mtu = ip_rt_min_pmtu;
1010 
1011 	if (rt->rt_pmtu == mtu &&
1012 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1013 		return;
1014 
1015 	rcu_read_lock();
1016 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1017 		struct fib_nh *nh = &FIB_RES_NH(res);
1018 
1019 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1020 				      jiffies + ip_rt_mtu_expires);
1021 	}
1022 	rcu_read_unlock();
1023 }
1024 
1025 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1026 			      struct sk_buff *skb, u32 mtu)
1027 {
1028 	struct rtable *rt = (struct rtable *) dst;
1029 	struct flowi4 fl4;
1030 
1031 	ip_rt_build_flow_key(&fl4, sk, skb);
1032 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1033 }
1034 
1035 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1036 		      int oif, u32 mark, u8 protocol, int flow_flags)
1037 {
1038 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1039 	struct flowi4 fl4;
1040 	struct rtable *rt;
1041 
1042 	if (!mark)
1043 		mark = IP4_REPLY_MARK(net, skb->mark);
1044 
1045 	__build_flow_key(net, &fl4, NULL, iph, oif,
1046 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1047 	rt = __ip_route_output_key(net, &fl4);
1048 	if (!IS_ERR(rt)) {
1049 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1050 		ip_rt_put(rt);
1051 	}
1052 }
1053 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1054 
1055 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1056 {
1057 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1058 	struct flowi4 fl4;
1059 	struct rtable *rt;
1060 
1061 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1062 
1063 	if (!fl4.flowi4_mark)
1064 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1065 
1066 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1067 	if (!IS_ERR(rt)) {
1068 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1069 		ip_rt_put(rt);
1070 	}
1071 }
1072 
1073 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074 {
1075 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1076 	struct flowi4 fl4;
1077 	struct rtable *rt;
1078 	struct dst_entry *odst = NULL;
1079 	bool new = false;
1080 	struct net *net = sock_net(sk);
1081 
1082 	bh_lock_sock(sk);
1083 
1084 	if (!ip_sk_accept_pmtu(sk))
1085 		goto out;
1086 
1087 	odst = sk_dst_get(sk);
1088 
1089 	if (sock_owned_by_user(sk) || !odst) {
1090 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1091 		goto out;
1092 	}
1093 
1094 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1095 
1096 	rt = (struct rtable *)odst;
1097 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1098 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1099 		if (IS_ERR(rt))
1100 			goto out;
1101 
1102 		new = true;
1103 	}
1104 
1105 	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1106 
1107 	if (!dst_check(&rt->dst, 0)) {
1108 		if (new)
1109 			dst_release(&rt->dst);
1110 
1111 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1112 		if (IS_ERR(rt))
1113 			goto out;
1114 
1115 		new = true;
1116 	}
1117 
1118 	if (new)
1119 		sk_dst_set(sk, &rt->dst);
1120 
1121 out:
1122 	bh_unlock_sock(sk);
1123 	dst_release(odst);
1124 }
1125 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1126 
1127 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1128 		   int oif, u32 mark, u8 protocol, int flow_flags)
1129 {
1130 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1131 	struct flowi4 fl4;
1132 	struct rtable *rt;
1133 
1134 	__build_flow_key(net, &fl4, NULL, iph, oif,
1135 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1136 	rt = __ip_route_output_key(net, &fl4);
1137 	if (!IS_ERR(rt)) {
1138 		__ip_do_redirect(rt, skb, &fl4, false);
1139 		ip_rt_put(rt);
1140 	}
1141 }
1142 EXPORT_SYMBOL_GPL(ipv4_redirect);
1143 
1144 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1145 {
1146 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1147 	struct flowi4 fl4;
1148 	struct rtable *rt;
1149 	struct net *net = sock_net(sk);
1150 
1151 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1152 	rt = __ip_route_output_key(net, &fl4);
1153 	if (!IS_ERR(rt)) {
1154 		__ip_do_redirect(rt, skb, &fl4, false);
1155 		ip_rt_put(rt);
1156 	}
1157 }
1158 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1159 
1160 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1161 {
1162 	struct rtable *rt = (struct rtable *) dst;
1163 
1164 	/* All IPV4 dsts are created with ->obsolete set to the value
1165 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1166 	 * into this function always.
1167 	 *
1168 	 * When a PMTU/redirect information update invalidates a route,
1169 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1170 	 * DST_OBSOLETE_DEAD by dst_free().
1171 	 */
1172 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1173 		return NULL;
1174 	return dst;
1175 }
1176 
1177 static void ipv4_link_failure(struct sk_buff *skb)
1178 {
1179 	struct rtable *rt;
1180 
1181 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1182 
1183 	rt = skb_rtable(skb);
1184 	if (rt)
1185 		dst_set_expires(&rt->dst, 0);
1186 }
1187 
1188 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1189 {
1190 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1191 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1192 		 skb->dev ? skb->dev->name : "?");
1193 	kfree_skb(skb);
1194 	WARN_ON(1);
1195 	return 0;
1196 }
1197 
1198 /*
1199    We do not cache the source address of the outgoing interface,
1200    because it is used only by the IP RR, TS and SRR options,
1201    so it stays out of the fast path.
1202 
1203    BTW remember: "addr" is allowed to be unaligned
1204    in IP options!
1205  */
1206 
1207 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1208 {
1209 	__be32 src;
1210 
1211 	if (rt_is_output_route(rt))
1212 		src = ip_hdr(skb)->saddr;
1213 	else {
1214 		struct fib_result res;
1215 		struct flowi4 fl4;
1216 		struct iphdr *iph;
1217 
1218 		iph = ip_hdr(skb);
1219 
1220 		memset(&fl4, 0, sizeof(fl4));
1221 		fl4.daddr = iph->daddr;
1222 		fl4.saddr = iph->saddr;
1223 		fl4.flowi4_tos = RT_TOS(iph->tos);
1224 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1225 		fl4.flowi4_iif = skb->dev->ifindex;
1226 		fl4.flowi4_mark = skb->mark;
1227 
1228 		rcu_read_lock();
1229 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1230 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1231 		else
1232 			src = inet_select_addr(rt->dst.dev,
1233 					       rt_nexthop(rt, iph->daddr),
1234 					       RT_SCOPE_UNIVERSE);
1235 		rcu_read_unlock();
1236 	}
1237 	memcpy(addr, &src, 4);
1238 }
1239 
1240 #ifdef CONFIG_IP_ROUTE_CLASSID
1241 static void set_class_tag(struct rtable *rt, u32 tag)
1242 {
1243 	if (!(rt->dst.tclassid & 0xFFFF))
1244 		rt->dst.tclassid |= tag & 0xFFFF;
1245 	if (!(rt->dst.tclassid & 0xFFFF0000))
1246 		rt->dst.tclassid |= tag & 0xFFFF0000;
1247 }
1248 #endif
1249 
1250 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1251 {
1252 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1253 	unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size,
1254 				    ip_rt_min_advmss);
1255 
1256 	return min(advmss, IPV4_MAX_PMTU - header_size);
1257 }
1258 
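/* Report the effective MTU for a dst: a still-valid per-route PMTU wins,
 * then the RTAX_MTU metric, then the device MTU (capped at 576 when the
 * MTU metric is locked on a gatewayed route), minus any lwtunnel
 * encapsulation headroom.
 */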
1259 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1260 {
1261 	const struct rtable *rt = (const struct rtable *) dst;
1262 	unsigned int mtu = rt->rt_pmtu;
1263 
1264 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1265 		mtu = dst_metric_raw(dst, RTAX_MTU);
1266 
1267 	if (mtu)
1268 		return mtu;
1269 
1270 	mtu = READ_ONCE(dst->dev->mtu);
1271 
1272 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1273 		if (rt->rt_uses_gateway && mtu > 576)
1274 			mtu = 576;
1275 	}
1276 
1277 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1278 
1279 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1280 }
1281 
1282 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1283 {
1284 	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1285 	struct fib_nh_exception *fnhe;
1286 	u32 hval;
1287 
1288 	if (!hash)
1289 		return NULL;
1290 
1291 	hval = fnhe_hashfun(daddr);
1292 
1293 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1294 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1295 		if (fnhe->fnhe_daddr == daddr)
1296 			return fnhe;
1297 	}
1298 	return NULL;
1299 }
1300 
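/* Bind a new route to the next-hop exception matching @daddr: copy the
 * exception's gateway/PMTU into @rt and, if @do_cache, store @rt in the
 * exception's input or output slot.  Exceptions from an old genid are
 * cleared first.  Returns true if the route was cached in the exception.
 */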
1301 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1302 			      __be32 daddr, const bool do_cache)
1303 {
1304 	bool ret = false;
1305 
1306 	spin_lock_bh(&fnhe_lock);
1307 
1308 	if (daddr == fnhe->fnhe_daddr) {
1309 		struct rtable __rcu **porig;
1310 		struct rtable *orig;
1311 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1312 
1313 		if (rt_is_input_route(rt))
1314 			porig = &fnhe->fnhe_rth_input;
1315 		else
1316 			porig = &fnhe->fnhe_rth_output;
1317 		orig = rcu_dereference(*porig);
1318 
1319 		if (fnhe->fnhe_genid != genid) {
1320 			fnhe->fnhe_genid = genid;
1321 			fnhe->fnhe_gw = 0;
1322 			fnhe->fnhe_pmtu = 0;
1323 			fnhe->fnhe_expires = 0;
1324 			fnhe_flush_routes(fnhe);
1325 			orig = NULL;
1326 		}
1327 		fill_route_from_fnhe(rt, fnhe);
1328 		if (!rt->rt_gateway)
1329 			rt->rt_gateway = daddr;
1330 
1331 		if (do_cache) {
1332 			dst_hold(&rt->dst);
1333 			rcu_assign_pointer(*porig, rt);
1334 			if (orig) {
1335 				dst_dev_put(&orig->dst);
1336 				dst_release(&orig->dst);
1337 			}
1338 			ret = true;
1339 		}
1340 
1341 		fnhe->fnhe_stamp = jiffies;
1342 	}
1343 	spin_unlock_bh(&fnhe_lock);
1344 
1345 	return ret;
1346 }
1347 
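/* Try to cache @rt in the nexthop: the shared input slot for input routes,
 * the per-cpu output slot otherwise.  cmpxchg() guards against concurrent
 * updates; the displaced route is released on success, while on a lost race
 * our extra hold is dropped and false is returned.
 */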
1348 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1349 {
1350 	struct rtable *orig, *prev, **p;
1351 	bool ret = true;
1352 
1353 	if (rt_is_input_route(rt)) {
1354 		p = (struct rtable **)&nh->nh_rth_input;
1355 	} else {
1356 		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1357 	}
1358 	orig = *p;
1359 
1360 	/* hold dst before doing cmpxchg() to avoid race condition
1361 	 * on this dst
1362 	 */
1363 	dst_hold(&rt->dst);
1364 	prev = cmpxchg(p, orig, rt);
1365 	if (prev == orig) {
1366 		if (orig) {
1367 			dst_dev_put(&orig->dst);
1368 			dst_release(&orig->dst);
1369 		}
1370 	} else {
1371 		dst_release(&rt->dst);
1372 		ret = false;
1373 	}
1374 
1375 	return ret;
1376 }
1377 
1378 struct uncached_list {
1379 	spinlock_t		lock;
1380 	struct list_head	head;
1381 };
1382 
1383 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1384 
1385 static void rt_add_uncached_list(struct rtable *rt)
1386 {
1387 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1388 
1389 	rt->rt_uncached_list = ul;
1390 
1391 	spin_lock_bh(&ul->lock);
1392 	list_add_tail(&rt->rt_uncached, &ul->head);
1393 	spin_unlock_bh(&ul->lock);
1394 }
1395 
1396 static void ipv4_dst_destroy(struct dst_entry *dst)
1397 {
1398 	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1399 	struct rtable *rt = (struct rtable *) dst;
1400 
1401 	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1402 		kfree(p);
1403 
1404 	if (!list_empty(&rt->rt_uncached)) {
1405 		struct uncached_list *ul = rt->rt_uncached_list;
1406 
1407 		spin_lock_bh(&ul->lock);
1408 		list_del(&rt->rt_uncached);
1409 		spin_unlock_bh(&ul->lock);
1410 	}
1411 }
1412 
1413 void rt_flush_dev(struct net_device *dev)
1414 {
1415 	struct net *net = dev_net(dev);
1416 	struct rtable *rt;
1417 	int cpu;
1418 
1419 	for_each_possible_cpu(cpu) {
1420 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1421 
1422 		spin_lock_bh(&ul->lock);
1423 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1424 			if (rt->dst.dev != dev)
1425 				continue;
1426 			rt->dst.dev = net->loopback_dev;
1427 			dev_hold(rt->dst.dev);
1428 			dev_put(dev);
1429 		}
1430 		spin_unlock_bh(&ul->lock);
1431 	}
1432 }
1433 
1434 static bool rt_cache_valid(const struct rtable *rt)
1435 {
1436 	return	rt &&
1437 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1438 		!rt_is_expired(rt);
1439 }
1440 
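/* Fill in nexthop-derived state for a freshly built route: gateway, metrics,
 * classid and lwtunnel state, then try to cache it either in a matching
 * next-hop exception or in the nexthop itself.  Routes that end up uncached
 * are put on the per-cpu uncached list so rt_flush_dev() can still reach
 * them.
 */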
1441 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1442 			   const struct fib_result *res,
1443 			   struct fib_nh_exception *fnhe,
1444 			   struct fib_info *fi, u16 type, u32 itag,
1445 			   const bool do_cache)
1446 {
1447 	bool cached = false;
1448 
1449 	if (fi) {
1450 		struct fib_nh *nh = &FIB_RES_NH(*res);
1451 
1452 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1453 			rt->rt_gateway = nh->nh_gw;
1454 			rt->rt_uses_gateway = 1;
1455 		}
1456 		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1457 		if (fi->fib_metrics != &dst_default_metrics) {
1458 			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1459 			refcount_inc(&fi->fib_metrics->refcnt);
1460 		}
1461 #ifdef CONFIG_IP_ROUTE_CLASSID
1462 		rt->dst.tclassid = nh->nh_tclassid;
1463 #endif
1464 		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1465 		if (unlikely(fnhe))
1466 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1467 		else if (do_cache)
1468 			cached = rt_cache_route(nh, rt);
1469 		if (unlikely(!cached)) {
1470 			/* Routes we intend to cache in nexthop exception or
1471 			 * FIB nexthop have the DST_NOCACHE bit clear.
1472 			 * However, if we are unsuccessful at storing this
1473 			 * route into the cache we really need to set it.
1474 			 */
1475 			if (!rt->rt_gateway)
1476 				rt->rt_gateway = daddr;
1477 			rt_add_uncached_list(rt);
1478 		}
1479 	} else
1480 		rt_add_uncached_list(rt);
1481 
1482 #ifdef CONFIG_IP_ROUTE_CLASSID
1483 #ifdef CONFIG_IP_MULTIPLE_TABLES
1484 	set_class_tag(rt, res->tclassid);
1485 #endif
1486 	set_class_tag(rt, itag);
1487 #endif
1488 }
1489 
1490 struct rtable *rt_dst_alloc(struct net_device *dev,
1491 			    unsigned int flags, u16 type,
1492 			    bool nopolicy, bool noxfrm, bool will_cache)
1493 {
1494 	struct rtable *rt;
1495 
1496 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1497 		       (will_cache ? 0 : DST_HOST) |
1498 		       (nopolicy ? DST_NOPOLICY : 0) |
1499 		       (noxfrm ? DST_NOXFRM : 0));
1500 
1501 	if (rt) {
1502 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1503 		rt->rt_flags = flags;
1504 		rt->rt_type = type;
1505 		rt->rt_is_input = 0;
1506 		rt->rt_iif = 0;
1507 		rt->rt_pmtu = 0;
1508 		rt->rt_gateway = 0;
1509 		rt->rt_uses_gateway = 0;
1510 		rt->rt_table_id = 0;
1511 		INIT_LIST_HEAD(&rt->rt_uncached);
1512 
1513 		rt->dst.output = ip_output;
1514 		if (flags & RTCF_LOCAL)
1515 			rt->dst.input = ip_local_deliver;
1516 	}
1517 
1518 	return rt;
1519 }
1520 EXPORT_SYMBOL(rt_dst_alloc);
1521 
1522 /* called in rcu_read_lock() section */
1523 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1524 			  u8 tos, struct net_device *dev,
1525 			  struct in_device *in_dev, u32 *itag)
1526 {
1527 	int err;
1528 
1529 	/* Primary sanity checks. */
1530 	if (!in_dev)
1531 		return -EINVAL;
1532 
1533 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1534 	    skb->protocol != htons(ETH_P_IP))
1535 		return -EINVAL;
1536 
1537 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1538 		return -EINVAL;
1539 
1540 	if (ipv4_is_zeronet(saddr)) {
1541 		if (!ipv4_is_local_multicast(daddr))
1542 			return -EINVAL;
1543 	} else {
1544 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1545 					  in_dev, itag);
1546 		if (err < 0)
1547 			return err;
1548 	}
1549 	return 0;
1550 }
1551 
1552 /* called in rcu_read_lock() section */
1553 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1554 			     u8 tos, struct net_device *dev, int our)
1555 {
1556 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1557 	unsigned int flags = RTCF_MULTICAST;
1558 	struct rtable *rth;
1559 	u32 itag = 0;
1560 	int err;
1561 
1562 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1563 	if (err)
1564 		return err;
1565 
1566 	if (our)
1567 		flags |= RTCF_LOCAL;
1568 
1569 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1570 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1571 	if (!rth)
1572 		return -ENOBUFS;
1573 
1574 #ifdef CONFIG_IP_ROUTE_CLASSID
1575 	rth->dst.tclassid = itag;
1576 #endif
1577 	rth->dst.output = ip_rt_bug;
1578 	rth->rt_is_input= 1;
1579 
1580 #ifdef CONFIG_IP_MROUTE
1581 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1582 		rth->dst.input = ip_mr_input;
1583 #endif
1584 	RT_CACHE_STAT_INC(in_slow_mc);
1585 
1586 	skb_dst_set(skb, &rth->dst);
1587 	return 0;
1588 }
1589 
1590 
1591 static void ip_handle_martian_source(struct net_device *dev,
1592 				     struct in_device *in_dev,
1593 				     struct sk_buff *skb,
1594 				     __be32 daddr,
1595 				     __be32 saddr)
1596 {
1597 	RT_CACHE_STAT_INC(in_martian_src);
1598 #ifdef CONFIG_IP_ROUTE_VERBOSE
1599 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1600 		/*
1601 		 *	RFC1812 recommendation: if the source is martian,
1602 		 *	the only hint is the MAC header.
1603 		 */
1604 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1605 			&daddr, &saddr, dev->name);
1606 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1607 			print_hex_dump(KERN_WARNING, "ll header: ",
1608 				       DUMP_PREFIX_OFFSET, 16, 1,
1609 				       skb_mac_header(skb),
1610 				       dev->hard_header_len, true);
1611 		}
1612 	}
1613 #endif
1614 }
1615 
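/* Delete the next-hop exception for @daddr: unlink it from its hash chain
 * under fnhe_lock, flush the routes it cached and free it after an RCU
 * grace period.
 */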
1616 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1617 {
1618 	struct fnhe_hash_bucket *hash;
1619 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1620 	u32 hval = fnhe_hashfun(daddr);
1621 
1622 	spin_lock_bh(&fnhe_lock);
1623 
1624 	hash = rcu_dereference_protected(nh->nh_exceptions,
1625 					 lockdep_is_held(&fnhe_lock));
1626 	hash += hval;
1627 
1628 	fnhe_p = &hash->chain;
1629 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1630 	while (fnhe) {
1631 		if (fnhe->fnhe_daddr == daddr) {
1632 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1633 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1634 			fnhe_flush_routes(fnhe);
1635 			kfree_rcu(fnhe, rcu);
1636 			break;
1637 		}
1638 		fnhe_p = &fnhe->fnhe_next;
1639 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1640 						 lockdep_is_held(&fnhe_lock));
1641 	}
1642 
1643 	spin_unlock_bh(&fnhe_lock);
1644 }
1645 
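/* If the route's lwtunnel state wants to see output and/or input packets,
 * remember the original handlers and route dst.output/dst.input through the
 * lwtunnel hooks.
 */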
1646 static void set_lwt_redirect(struct rtable *rth)
1647 {
1648 	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1649 		rth->dst.lwtstate->orig_output = rth->dst.output;
1650 		rth->dst.output = lwtunnel_output;
1651 	}
1652 
1653 	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1654 		rth->dst.lwtstate->orig_input = rth->dst.input;
1655 		rth->dst.input = lwtunnel_input;
1656 	}
1657 }
1658 
1659 /* called in rcu_read_lock() section */
1660 static int __mkroute_input(struct sk_buff *skb,
1661 			   const struct fib_result *res,
1662 			   struct in_device *in_dev,
1663 			   __be32 daddr, __be32 saddr, u32 tos)
1664 {
1665 	struct fib_nh_exception *fnhe;
1666 	struct rtable *rth;
1667 	int err;
1668 	struct in_device *out_dev;
1669 	bool do_cache;
1670 	u32 itag = 0;
1671 
1672 	/* get a working reference to the output device */
1673 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1674 	if (!out_dev) {
1675 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1676 		return -EINVAL;
1677 	}
1678 
1679 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1680 				  in_dev->dev, in_dev, &itag);
1681 	if (err < 0) {
1682 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1683 					 saddr);
1684 
1685 		goto cleanup;
1686 	}
1687 
1688 	do_cache = res->fi && !itag;
1689 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1690 	    skb->protocol == htons(ETH_P_IP) &&
1691 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1692 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1693 		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1694 
1695 	if (skb->protocol != htons(ETH_P_IP)) {
1696 		/* Not IP (i.e. ARP). Do not create a route if it is
1697 		 * invalid for proxy arp. DNAT routes are always valid.
1698 		 *
1699 		 * The proxy arp feature has been extended to allow ARP
1700 		 * replies back out the same interface, to support
1701 		 * Private VLAN switch technologies. See arp.c.
1702 		 */
1703 		if (out_dev == in_dev &&
1704 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1705 			err = -EINVAL;
1706 			goto cleanup;
1707 		}
1708 	}
1709 
1710 	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1711 	if (do_cache) {
1712 		if (fnhe) {
1713 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1714 			if (rth && rth->dst.expires &&
1715 			    time_after(jiffies, rth->dst.expires)) {
1716 				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1717 				fnhe = NULL;
1718 			} else {
1719 				goto rt_cache;
1720 			}
1721 		}
1722 
1723 		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1724 
1725 rt_cache:
1726 		if (rt_cache_valid(rth)) {
1727 			skb_dst_set_noref(skb, &rth->dst);
1728 			goto out;
1729 		}
1730 	}
1731 
1732 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1733 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1734 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1735 	if (!rth) {
1736 		err = -ENOBUFS;
1737 		goto cleanup;
1738 	}
1739 
1740 	rth->rt_is_input = 1;
1741 	if (res->table)
1742 		rth->rt_table_id = res->table->tb_id;
1743 	RT_CACHE_STAT_INC(in_slow_tot);
1744 
1745 	rth->dst.input = ip_forward;
1746 
1747 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1748 		       do_cache);
1749 	set_lwt_redirect(rth);
1750 	skb_dst_set(skb, &rth->dst);
1751 out:
1752 	err = 0;
1753  cleanup:
1754 	return err;
1755 }
1756 
1757 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1758 /* To make ICMP packets follow the right flow, the multipath hash is
1759  * calculated from the inner IP addresses.
1760  */
1761 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1762 				 struct flow_keys *hash_keys)
1763 {
1764 	const struct iphdr *outer_iph = ip_hdr(skb);
1765 	const struct iphdr *inner_iph;
1766 	const struct icmphdr *icmph;
1767 	struct iphdr _inner_iph;
1768 	struct icmphdr _icmph;
1769 
1770 	hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1771 	hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1772 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1773 		return;
1774 
1775 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1776 		return;
1777 
1778 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1779 				   &_icmph);
1780 	if (!icmph)
1781 		return;
1782 
1783 	if (icmph->type != ICMP_DEST_UNREACH &&
1784 	    icmph->type != ICMP_REDIRECT &&
1785 	    icmph->type != ICMP_TIME_EXCEEDED &&
1786 	    icmph->type != ICMP_PARAMETERPROB)
1787 		return;
1788 
1789 	inner_iph = skb_header_pointer(skb,
1790 				       outer_iph->ihl * 4 + sizeof(_icmph),
1791 				       sizeof(_inner_iph), &_inner_iph);
1792 	if (!inner_iph)
1793 		return;
1794 	hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1795 	hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1796 }
1797 
1798 /* if skb is set it will be used and fl4 can be NULL */
1799 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1800 		       const struct sk_buff *skb)
1801 {
1802 	struct net *net = fi->fib_net;
1803 	struct flow_keys hash_keys;
1804 	u32 mhash;
1805 
1806 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1807 	case 0:
1808 		memset(&hash_keys, 0, sizeof(hash_keys));
1809 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1810 		if (skb) {
1811 			ip_multipath_l3_keys(skb, &hash_keys);
1812 		} else {
1813 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1814 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1815 		}
1816 		break;
1817 	case 1:
1818 		/* skb is currently provided only when forwarding */
1819 		if (skb) {
1820 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1821 			struct flow_keys keys;
1822 
1823 			/* short-circuit if we already have L4 hash present */
1824 			if (skb->l4_hash)
1825 				return skb_get_hash_raw(skb) >> 1;
1826 			memset(&hash_keys, 0, sizeof(hash_keys));
1827 			skb_flow_dissect_flow_keys(skb, &keys, flag);
1828 			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1829 			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1830 			hash_keys.ports.src = keys.ports.src;
1831 			hash_keys.ports.dst = keys.ports.dst;
1832 			hash_keys.basic.ip_proto = keys.basic.ip_proto;
1833 		} else {
1834 			memset(&hash_keys, 0, sizeof(hash_keys));
1835 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1836 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1837 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1838 			hash_keys.ports.src = fl4->fl4_sport;
1839 			hash_keys.ports.dst = fl4->fl4_dport;
1840 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1841 		}
1842 		break;
1843 	}
1844 	mhash = flow_hash_from_keys(&hash_keys);
1845 
1846 	return mhash >> 1;
1847 }
1848 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1849 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1850 
1851 static int ip_mkroute_input(struct sk_buff *skb,
1852 			    struct fib_result *res,
1853 			    struct in_device *in_dev,
1854 			    __be32 daddr, __be32 saddr, u32 tos)
1855 {
1856 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1857 	if (res->fi && res->fi->fib_nhs > 1) {
1858 		int h = fib_multipath_hash(res->fi, NULL, skb);
1859 
1860 		fib_select_multipath(res, h);
1861 	}
1862 #endif
1863 
1864 	/* create a routing cache entry */
1865 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1866 }
1867 
1868 /*
1869  *	NOTE. We drop all packets that have local source
1870  *	addresses, because every properly looped back packet
1871  *	must already have the correct destination attached by the output routine.
1872  *
1873  *	This approach solves two big problems:
1874  *	1. Non-simplex devices are handled properly.
1875  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1876  *	Called with rcu_read_lock()
1877  */
1878 
1879 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1880 			       u8 tos, struct net_device *dev,
1881 			       struct fib_result *res)
1882 {
1883 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1884 	struct ip_tunnel_info *tun_info;
1885 	struct flowi4	fl4;
1886 	unsigned int	flags = 0;
1887 	u32		itag = 0;
1888 	struct rtable	*rth;
1889 	int		err = -EINVAL;
1890 	struct net    *net = dev_net(dev);
1891 	bool do_cache;
1892 
1893 	/* IP on this device is disabled. */
1894 
1895 	if (!in_dev)
1896 		goto out;
1897 
1898 	/* Check for the most weird martians, which cannot be detected
1899 	   by fib_lookup.
1900 	 */
1901 
1902 	tun_info = skb_tunnel_info(skb);
1903 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1904 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1905 	else
1906 		fl4.flowi4_tun_key.tun_id = 0;
1907 	skb_dst_drop(skb);
1908 
1909 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1910 		goto martian_source;
1911 
1912 	res->fi = NULL;
1913 	res->table = NULL;
1914 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1915 		goto brd_input;
1916 
1917 	/* Accept zero addresses only for limited broadcast;
1918 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1919 	 */
1920 	if (ipv4_is_zeronet(saddr))
1921 		goto martian_source;
1922 
1923 	if (ipv4_is_zeronet(daddr))
1924 		goto martian_destination;
1925 
1926 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1927 	 * and calls it at most once if daddr and/or saddr is a loopback address
1928 	 */
1929 	if (ipv4_is_loopback(daddr)) {
1930 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1931 			goto martian_destination;
1932 	} else if (ipv4_is_loopback(saddr)) {
1933 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1934 			goto martian_source;
1935 	}
1936 
1937 	/*
1938 	 *	Now we are ready to route packet.
1939 	 */
1940 	fl4.flowi4_oif = 0;
1941 	fl4.flowi4_iif = dev->ifindex;
1942 	fl4.flowi4_mark = skb->mark;
1943 	fl4.flowi4_tos = tos;
1944 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1945 	fl4.flowi4_flags = 0;
1946 	fl4.daddr = daddr;
1947 	fl4.saddr = saddr;
1948 	fl4.flowi4_uid = sock_net_uid(net, NULL);
1949 	err = fib_lookup(net, &fl4, res, 0);
1950 	if (err != 0) {
1951 		if (!IN_DEV_FORWARD(in_dev))
1952 			err = -EHOSTUNREACH;
1953 		goto no_route;
1954 	}
1955 
1956 	if (res->type == RTN_BROADCAST)
1957 		goto brd_input;
1958 
1959 	if (res->type == RTN_LOCAL) {
1960 		err = fib_validate_source(skb, saddr, daddr, tos,
1961 					  0, dev, in_dev, &itag);
1962 		if (err < 0)
1963 			goto martian_source;
1964 		goto local_input;
1965 	}
1966 
1967 	if (!IN_DEV_FORWARD(in_dev)) {
1968 		err = -EHOSTUNREACH;
1969 		goto no_route;
1970 	}
1971 	if (res->type != RTN_UNICAST)
1972 		goto martian_destination;
1973 
1974 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1975 out:	return err;
1976 
1977 brd_input:
1978 	if (skb->protocol != htons(ETH_P_IP))
1979 		goto e_inval;
1980 
1981 	if (!ipv4_is_zeronet(saddr)) {
1982 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1983 					  in_dev, &itag);
1984 		if (err < 0)
1985 			goto martian_source;
1986 	}
1987 	flags |= RTCF_BROADCAST;
1988 	res->type = RTN_BROADCAST;
1989 	RT_CACHE_STAT_INC(in_brd);
1990 
1991 local_input:
1992 	do_cache = false;
1993 	if (res->fi) {
1994 		if (!itag) {
1995 			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1996 			if (rt_cache_valid(rth)) {
1997 				skb_dst_set_noref(skb, &rth->dst);
1998 				err = 0;
1999 				goto out;
2000 			}
2001 			do_cache = true;
2002 		}
2003 	}
2004 
2005 	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2006 			   flags | RTCF_LOCAL, res->type,
2007 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2008 	if (!rth)
2009 		goto e_nobufs;
2010 
2011 	rth->dst.output = ip_rt_bug;
2012 #ifdef CONFIG_IP_ROUTE_CLASSID
2013 	rth->dst.tclassid = itag;
2014 #endif
2015 	rth->rt_is_input = 1;
2016 	if (res->table)
2017 		rth->rt_table_id = res->table->tb_id;
2018 
2019 	RT_CACHE_STAT_INC(in_slow_tot);
2020 	if (res->type == RTN_UNREACHABLE) {
2021 		rth->dst.input = ip_error;
2022 		rth->dst.error = -err;
2023 		rth->rt_flags &= ~RTCF_LOCAL;
2024 	}
2025 
2026 	if (do_cache) {
2027 		struct fib_nh *nh = &FIB_RES_NH(*res);
2028 
2029 		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2030 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2031 			WARN_ON(rth->dst.input == lwtunnel_input);
2032 			rth->dst.lwtstate->orig_input = rth->dst.input;
2033 			rth->dst.input = lwtunnel_input;
2034 		}
2035 
2036 		if (unlikely(!rt_cache_route(nh, rth)))
2037 			rt_add_uncached_list(rth);
2038 	}
2039 	skb_dst_set(skb, &rth->dst);
2040 	err = 0;
2041 	goto out;
2042 
2043 no_route:
2044 	RT_CACHE_STAT_INC(in_no_route);
2045 	res->type = RTN_UNREACHABLE;
2046 	res->fi = NULL;
2047 	res->table = NULL;
2048 	goto local_input;
2049 
2050 	/*
2051 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2052 	 */
2053 martian_destination:
2054 	RT_CACHE_STAT_INC(in_martian_dst);
2055 #ifdef CONFIG_IP_ROUTE_VERBOSE
2056 	if (IN_DEV_LOG_MARTIANS(in_dev))
2057 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2058 				     &daddr, &saddr, dev->name);
2059 #endif
2060 
2061 e_inval:
2062 	err = -EINVAL;
2063 	goto out;
2064 
2065 e_nobufs:
2066 	err = -ENOBUFS;
2067 	goto out;
2068 
2069 martian_source:
2070 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2071 	goto out;
2072 }
2073 
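/* ip_route_input_noref - resolve the input route for an skb.
 *
 * Thin wrapper around ip_route_input_rcu(): it masks the TOS bits and
 * holds the RCU read lock for the duration of the lookup.
 */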
2074 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2075 			 u8 tos, struct net_device *dev)
2076 {
2077 	struct fib_result res;
2078 	int err;
2079 
2080 	tos &= IPTOS_RT_MASK;
2081 	rcu_read_lock();
2082 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2083 	rcu_read_unlock();
2084 
2085 	return err;
2086 }
2087 EXPORT_SYMBOL(ip_route_input_noref);
2088 
2089 /* called with rcu_read_lock held */
2090 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2091 		       u8 tos, struct net_device *dev, struct fib_result *res)
2092 {
2093 	/* Multicast recognition logic was moved from the route cache to here.
2094 	   The problem was that too many Ethernet cards have broken/missing
2095 	   hardware multicast filters :-( As a result, a host on a multicast
2096 	   network acquires a lot of useless route cache entries, e.g. for
2097 	   SDR messages from all over the world. Now we try to get rid of them.
2098 	   Really, provided the software IP multicast filter is organized
2099 	   reasonably (at least, hashed), it does not result in a slowdown
2100 	   compared with route cache reject entries.
2101 	   Note that multicast routers are not affected, because a
2102 	   route cache entry is created eventually.
2103 	 */
2104 	if (ipv4_is_multicast(daddr)) {
2105 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2106 		int our = 0;
2107 		int err = -EINVAL;
2108 
2109 		if (in_dev)
2110 			our = ip_check_mc_rcu(in_dev, daddr, saddr,
2111 					      ip_hdr(skb)->protocol);
2112 
2113 		/* check l3 master if no match yet */
2114 		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2115 			struct in_device *l3_in_dev;
2116 
2117 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2118 			if (l3_in_dev)
2119 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2120 						      ip_hdr(skb)->protocol);
2121 		}
2122 
2123 		if (our
2124 #ifdef CONFIG_IP_MROUTE
2125 			||
2126 		    (!ipv4_is_local_multicast(daddr) &&
2127 		     IN_DEV_MFORWARD(in_dev))
2128 #endif
2129 		   ) {
2130 			err = ip_route_input_mc(skb, daddr, saddr,
2131 						tos, dev, our);
2132 		}
2133 		return err;
2134 	}
2135 
2136 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2137 }
2138 
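/* __mkroute_output - build (or reuse a cached) rtable for an output route.
 *
 * Honours per-nexthop exceptions (fnhe), dropping ones that have expired,
 * and skips caching for broadcast routes, for multicast routes delivered
 * locally, and for local routes bound to a particular output interface.
 */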
2139 /* called with rcu_read_lock() */
2140 static struct rtable *__mkroute_output(const struct fib_result *res,
2141 				       const struct flowi4 *fl4, int orig_oif,
2142 				       struct net_device *dev_out,
2143 				       unsigned int flags)
2144 {
2145 	struct fib_info *fi = res->fi;
2146 	struct fib_nh_exception *fnhe;
2147 	struct in_device *in_dev;
2148 	u16 type = res->type;
2149 	struct rtable *rth;
2150 	bool do_cache;
2151 
2152 	in_dev = __in_dev_get_rcu(dev_out);
2153 	if (!in_dev)
2154 		return ERR_PTR(-EINVAL);
2155 
2156 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2157 		if (ipv4_is_loopback(fl4->saddr) &&
2158 		    !(dev_out->flags & IFF_LOOPBACK) &&
2159 		    !netif_is_l3_master(dev_out))
2160 			return ERR_PTR(-EINVAL);
2161 
2162 	if (ipv4_is_lbcast(fl4->daddr))
2163 		type = RTN_BROADCAST;
2164 	else if (ipv4_is_multicast(fl4->daddr))
2165 		type = RTN_MULTICAST;
2166 	else if (ipv4_is_zeronet(fl4->daddr))
2167 		return ERR_PTR(-EINVAL);
2168 
2169 	if (dev_out->flags & IFF_LOOPBACK)
2170 		flags |= RTCF_LOCAL;
2171 
2172 	do_cache = true;
2173 	if (type == RTN_BROADCAST) {
2174 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2175 		fi = NULL;
2176 	} else if (type == RTN_MULTICAST) {
2177 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2178 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2179 				     fl4->flowi4_proto))
2180 			flags &= ~RTCF_LOCAL;
2181 		else
2182 			do_cache = false;
2183 		/* If a multicast route does not exist, use
2184 		 * the default one, but do not gateway in this case.
2185 		 * Yes, it is a hack.
2186 		 */
2187 		if (fi && res->prefixlen < 4)
2188 			fi = NULL;
2189 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2190 		   (orig_oif != dev_out->ifindex)) {
2191 		/* For local routes that require a particular output interface
2192 		 * we do not want to cache the result.  Caching the result
2193 		 * causes incorrect behaviour when there are multiple source
2194 		 * addresses on the interface, the end result being that if the
2195 		 * intended recipient is waiting on that interface for the
2196 		 * packet he won't receive it because it will be delivered on
2197 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2198 		 * be set to the loopback interface as well.
2199 		 */
2200 		fi = NULL;
2201 	}
2202 
2203 	fnhe = NULL;
2204 	do_cache &= fi != NULL;
2205 	if (do_cache) {
2206 		struct rtable __rcu **prth;
2207 		struct fib_nh *nh = &FIB_RES_NH(*res);
2208 
2209 		fnhe = find_exception(nh, fl4->daddr);
2210 		if (fnhe) {
2211 			prth = &fnhe->fnhe_rth_output;
2212 			rth = rcu_dereference(*prth);
2213 			if (rth && rth->dst.expires &&
2214 			    time_after(jiffies, rth->dst.expires)) {
2215 				ip_del_fnhe(nh, fl4->daddr);
2216 				fnhe = NULL;
2217 			} else {
2218 				goto rt_cache;
2219 			}
2220 		}
2221 
2222 		if (unlikely(fl4->flowi4_flags &
2223 			     FLOWI_FLAG_KNOWN_NH &&
2224 			     !(nh->nh_gw &&
2225 			       nh->nh_scope == RT_SCOPE_LINK))) {
2226 			do_cache = false;
2227 			goto add;
2228 		}
2229 		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2230 		rth = rcu_dereference(*prth);
2231 
2232 rt_cache:
2233 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2234 			return rth;
2235 	}
2236 
2237 add:
2238 	rth = rt_dst_alloc(dev_out, flags, type,
2239 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2240 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2241 			   do_cache);
2242 	if (!rth)
2243 		return ERR_PTR(-ENOBUFS);
2244 
2245 	rth->rt_iif = orig_oif;
2246 	if (res->table)
2247 		rth->rt_table_id = res->table->tb_id;
2248 
2249 	RT_CACHE_STAT_INC(out_slow_tot);
2250 
2251 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2252 		if (flags & RTCF_LOCAL &&
2253 		    !(dev_out->flags & IFF_LOOPBACK)) {
2254 			rth->dst.output = ip_mc_output;
2255 			RT_CACHE_STAT_INC(out_slow_mc);
2256 		}
2257 #ifdef CONFIG_IP_MROUTE
2258 		if (type == RTN_MULTICAST) {
2259 			if (IN_DEV_MFORWARD(in_dev) &&
2260 			    !ipv4_is_local_multicast(fl4->daddr)) {
2261 				rth->dst.input = ip_mr_input;
2262 				rth->dst.output = ip_mc_output;
2263 			}
2264 		}
2265 #endif
2266 	}
2267 
2268 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2269 	set_lwt_redirect(rth);
2270 
2271 	return rth;
2272 }
2273 
2274 /*
2275  * Major route resolver routine.
2276  */
2277 
2278 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2279 					const struct sk_buff *skb)
2280 {
2281 	__u8 tos = RT_FL_TOS(fl4);
2282 	struct fib_result res;
2283 	struct rtable *rth;
2284 
2285 	res.tclassid	= 0;
2286 	res.fi		= NULL;
2287 	res.table	= NULL;
2288 
2289 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2290 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2291 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2292 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2293 
2294 	rcu_read_lock();
2295 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2296 	rcu_read_unlock();
2297 
2298 	return rth;
2299 }
2300 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2301 
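/* ip_route_output_key_hash_rcu - the slow path of output route resolution.
 *
 * Validates the source address, chooses an output device and source
 * address when they are not given, performs the FIB lookup and finally
 * hands the result to __mkroute_output().  Called with rcu_read_lock()
 * held.
 */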
2302 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2303 					    struct fib_result *res,
2304 					    const struct sk_buff *skb)
2305 {
2306 	struct net_device *dev_out = NULL;
2307 	int orig_oif = fl4->flowi4_oif;
2308 	unsigned int flags = 0;
2309 	struct rtable *rth;
2310 	int err = -ENETUNREACH;
2311 
2312 	if (fl4->saddr) {
2313 		rth = ERR_PTR(-EINVAL);
2314 		if (ipv4_is_multicast(fl4->saddr) ||
2315 		    ipv4_is_lbcast(fl4->saddr) ||
2316 		    ipv4_is_zeronet(fl4->saddr))
2317 			goto out;
2318 
2319 		/* I removed the check for oif == dev_out->oif here.
2320 		   It was wrong for two reasons:
2321 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2322 		      is assigned to multiple interfaces.
2323 		   2. Moreover, we are allowed to send packets with the saddr
2324 		      of another iface. --ANK
2325 		 */
2326 
2327 		if (fl4->flowi4_oif == 0 &&
2328 		    (ipv4_is_multicast(fl4->daddr) ||
2329 		     ipv4_is_lbcast(fl4->daddr))) {
2330 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2331 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2332 			if (!dev_out)
2333 				goto out;
2334 
2335 			/* Special hack: the user can direct multicasts
2336 			   and limited broadcast via the necessary interface
2337 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2338 			   This hack is not just for fun, it allows
2339 			   vic, vat and friends to work.
2340 			   They bind the socket to loopback, set ttl to zero
2341 			   and expect that it will work.
2342 			   From the viewpoint of the routing cache they are broken,
2343 			   because we are not allowed to build a multicast path
2344 			   with a loopback source address (look, the routing cache
2345 			   cannot know that ttl is zero, so that the packet
2346 			   will not leave this host and the route is valid).
2347 			   Luckily, this hack is a good workaround.
2348 			 */
2349 
2350 			fl4->flowi4_oif = dev_out->ifindex;
2351 			goto make_route;
2352 		}
2353 
2354 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2355 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2356 			if (!__ip_dev_find(net, fl4->saddr, false))
2357 				goto out;
2358 		}
2359 	}
2360 
2361 
2362 	if (fl4->flowi4_oif) {
2363 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2364 		rth = ERR_PTR(-ENODEV);
2365 		if (!dev_out)
2366 			goto out;
2367 
2368 		/* RACE: Check return value of inet_select_addr instead. */
2369 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2370 			rth = ERR_PTR(-ENETUNREACH);
2371 			goto out;
2372 		}
2373 		if (ipv4_is_local_multicast(fl4->daddr) ||
2374 		    ipv4_is_lbcast(fl4->daddr) ||
2375 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2376 			if (!fl4->saddr)
2377 				fl4->saddr = inet_select_addr(dev_out, 0,
2378 							      RT_SCOPE_LINK);
2379 			goto make_route;
2380 		}
2381 		if (!fl4->saddr) {
2382 			if (ipv4_is_multicast(fl4->daddr))
2383 				fl4->saddr = inet_select_addr(dev_out, 0,
2384 							      fl4->flowi4_scope);
2385 			else if (!fl4->daddr)
2386 				fl4->saddr = inet_select_addr(dev_out, 0,
2387 							      RT_SCOPE_HOST);
2388 		}
2389 	}
2390 
2391 	if (!fl4->daddr) {
2392 		fl4->daddr = fl4->saddr;
2393 		if (!fl4->daddr)
2394 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2395 		dev_out = net->loopback_dev;
2396 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2397 		res->type = RTN_LOCAL;
2398 		flags |= RTCF_LOCAL;
2399 		goto make_route;
2400 	}
2401 
2402 	err = fib_lookup(net, fl4, res, 0);
2403 	if (err) {
2404 		res->fi = NULL;
2405 		res->table = NULL;
2406 		if (fl4->flowi4_oif &&
2407 		    (ipv4_is_multicast(fl4->daddr) ||
2408 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2409 			/* Apparently, the routing tables are wrong. Assume
2410 			   that the destination is on-link.
2411 
2412 			   WHY? DW.
2413 			   Because we are allowed to send to an iface
2414 			   even if it has NO routes and NO assigned
2415 			   addresses. When oif is specified, the routing
2416 			   tables are looked up with only one purpose:
2417 			   to catch whether the destination is gatewayed rather
2418 			   than direct. Moreover, if MSG_DONTROUTE is set,
2419 			   we send the packet, ignoring both routing tables
2420 			   and ifaddr state. --ANK
2421 
2422 
2423 			   We could do this even if oif is unknown,
2424 			   likely as IPv6 does, but we do not.
2425 			 */
2426 
2427 			if (fl4->saddr == 0)
2428 				fl4->saddr = inet_select_addr(dev_out, 0,
2429 							      RT_SCOPE_LINK);
2430 			res->type = RTN_UNICAST;
2431 			goto make_route;
2432 		}
2433 		rth = ERR_PTR(err);
2434 		goto out;
2435 	}
2436 
2437 	if (res->type == RTN_LOCAL) {
2438 		if (!fl4->saddr) {
2439 			if (res->fi->fib_prefsrc)
2440 				fl4->saddr = res->fi->fib_prefsrc;
2441 			else
2442 				fl4->saddr = fl4->daddr;
2443 		}
2444 
2445 		/* L3 master device is the loopback for that domain */
2446 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2447 			net->loopback_dev;
2448 
2449 		/* make sure orig_oif points to fib result device even
2450 		 * though packet rx/tx happens over loopback or l3mdev
2451 		 */
2452 		orig_oif = FIB_RES_OIF(*res);
2453 
2454 		fl4->flowi4_oif = dev_out->ifindex;
2455 		flags |= RTCF_LOCAL;
2456 		goto make_route;
2457 	}
2458 
2459 	fib_select_path(net, res, fl4, skb);
2460 
2461 	dev_out = FIB_RES_DEV(*res);
2462 	fl4->flowi4_oif = dev_out->ifindex;
2463 
2464 
2465 make_route:
2466 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2467 
2468 out:
2469 	return rth;
2470 }
2471 
2472 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2473 {
2474 	return NULL;
2475 }
2476 
2477 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2478 {
2479 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2480 
2481 	return mtu ? : dst->dev->mtu;
2482 }
2483 
2484 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2485 					  struct sk_buff *skb, u32 mtu)
2486 {
2487 }
2488 
2489 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2490 				       struct sk_buff *skb)
2491 {
2492 }
2493 
2494 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2495 					  unsigned long old)
2496 {
2497 	return NULL;
2498 }
2499 
2500 static struct dst_ops ipv4_dst_blackhole_ops = {
2501 	.family			=	AF_INET,
2502 	.check			=	ipv4_blackhole_dst_check,
2503 	.mtu			=	ipv4_blackhole_mtu,
2504 	.default_advmss		=	ipv4_default_advmss,
2505 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2506 	.redirect		=	ipv4_rt_blackhole_redirect,
2507 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2508 	.neigh_lookup		=	ipv4_neigh_lookup,
2509 };
2510 
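/* ipv4_blackhole_route - clone an existing route into a "blackhole" dst
 * whose input and output handlers simply discard packets.  The relevant
 * fields are copied from dst_orig, which is then released.
 */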
2511 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2512 {
2513 	struct rtable *ort = (struct rtable *) dst_orig;
2514 	struct rtable *rt;
2515 
2516 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2517 	if (rt) {
2518 		struct dst_entry *new = &rt->dst;
2519 
2520 		new->__use = 1;
2521 		new->input = dst_discard;
2522 		new->output = dst_discard_out;
2523 
2524 		new->dev = net->loopback_dev;
2525 		if (new->dev)
2526 			dev_hold(new->dev);
2527 
2528 		rt->rt_is_input = ort->rt_is_input;
2529 		rt->rt_iif = ort->rt_iif;
2530 		rt->rt_pmtu = ort->rt_pmtu;
2531 
2532 		rt->rt_genid = rt_genid_ipv4(net);
2533 		rt->rt_flags = ort->rt_flags;
2534 		rt->rt_type = ort->rt_type;
2535 		rt->rt_gateway = ort->rt_gateway;
2536 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2537 
2538 		INIT_LIST_HEAD(&rt->rt_uncached);
2539 	}
2540 
2541 	dst_release(dst_orig);
2542 
2543 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2544 }
2545 
2546 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2547 				    const struct sock *sk)
2548 {
2549 	struct rtable *rt = __ip_route_output_key(net, flp4);
2550 
2551 	if (IS_ERR(rt))
2552 		return rt;
2553 
2554 	if (flp4->flowi4_proto)
2555 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2556 							flowi4_to_flowi(flp4),
2557 							sk, 0);
2558 
2559 	return rt;
2560 }
2561 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2562 
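/* rt_fill_info - dump a resolved route as an RTM_NEWROUTE netlink message,
 * filling the rtmsg header and the RTA_* attributes (addresses, oif,
 * metrics, mark, uid, cacheinfo) into @skb.
 */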
2563 /* called with rcu_read_lock held */
2564 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2565 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2566 			u32 seq)
2567 {
2568 	struct rtable *rt = skb_rtable(skb);
2569 	struct rtmsg *r;
2570 	struct nlmsghdr *nlh;
2571 	unsigned long expires = 0;
2572 	u32 error;
2573 	u32 metrics[RTAX_MAX];
2574 
2575 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2576 	if (!nlh)
2577 		return -EMSGSIZE;
2578 
2579 	r = nlmsg_data(nlh);
2580 	r->rtm_family	 = AF_INET;
2581 	r->rtm_dst_len	= 32;
2582 	r->rtm_src_len	= 0;
2583 	r->rtm_tos	= fl4->flowi4_tos;
2584 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2585 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2586 		goto nla_put_failure;
2587 	r->rtm_type	= rt->rt_type;
2588 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2589 	r->rtm_protocol = RTPROT_UNSPEC;
2590 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2591 	if (rt->rt_flags & RTCF_NOTIFY)
2592 		r->rtm_flags |= RTM_F_NOTIFY;
2593 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2594 		r->rtm_flags |= RTCF_DOREDIRECT;
2595 
2596 	if (nla_put_in_addr(skb, RTA_DST, dst))
2597 		goto nla_put_failure;
2598 	if (src) {
2599 		r->rtm_src_len = 32;
2600 		if (nla_put_in_addr(skb, RTA_SRC, src))
2601 			goto nla_put_failure;
2602 	}
2603 	if (rt->dst.dev &&
2604 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2605 		goto nla_put_failure;
2606 #ifdef CONFIG_IP_ROUTE_CLASSID
2607 	if (rt->dst.tclassid &&
2608 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2609 		goto nla_put_failure;
2610 #endif
2611 	if (!rt_is_input_route(rt) &&
2612 	    fl4->saddr != src) {
2613 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2614 			goto nla_put_failure;
2615 	}
2616 	if (rt->rt_uses_gateway &&
2617 	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2618 		goto nla_put_failure;
2619 
2620 	expires = rt->dst.expires;
2621 	if (expires) {
2622 		unsigned long now = jiffies;
2623 
2624 		if (time_before(now, expires))
2625 			expires -= now;
2626 		else
2627 			expires = 0;
2628 	}
2629 
2630 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2631 	if (rt->rt_pmtu && expires)
2632 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2633 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2634 		goto nla_put_failure;
2635 
2636 	if (fl4->flowi4_mark &&
2637 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2638 		goto nla_put_failure;
2639 
2640 	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2641 	    nla_put_u32(skb, RTA_UID,
2642 			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2643 		goto nla_put_failure;
2644 
2645 	error = rt->dst.error;
2646 
2647 	if (rt_is_input_route(rt)) {
2648 #ifdef CONFIG_IP_MROUTE
2649 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2650 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2651 			int err = ipmr_get_route(net, skb,
2652 						 fl4->saddr, fl4->daddr,
2653 						 r, portid);
2654 
2655 			if (err <= 0) {
2656 				if (err == 0)
2657 					return 0;
2658 				goto nla_put_failure;
2659 			}
2660 		} else
2661 #endif
2662 			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2663 				goto nla_put_failure;
2664 	}
2665 
2666 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2667 		goto nla_put_failure;
2668 
2669 	nlmsg_end(skb, nlh);
2670 	return 0;
2671 
2672 nla_put_failure:
2673 	nlmsg_cancel(skb, nlh);
2674 	return -EMSGSIZE;
2675 }
2676 
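/* inet_rtm_getroute - RTM_GETROUTE handler.
 *
 * Builds a dummy skb from the request attributes, resolves the route via
 * the input path (when RTA_IIF is given) or the output path, and replies
 * to the requester with an RTM_NEWROUTE message describing the result.
 */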
2677 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2678 			     struct netlink_ext_ack *extack)
2679 {
2680 	struct net *net = sock_net(in_skb->sk);
2681 	struct rtmsg *rtm;
2682 	struct nlattr *tb[RTA_MAX+1];
2683 	struct fib_result res = {};
2684 	struct rtable *rt = NULL;
2685 	struct flowi4 fl4;
2686 	__be32 dst = 0;
2687 	__be32 src = 0;
2688 	u32 iif;
2689 	int err;
2690 	int mark;
2691 	struct sk_buff *skb;
2692 	u32 table_id = RT_TABLE_MAIN;
2693 	kuid_t uid;
2694 
2695 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2696 			  extack);
2697 	if (err < 0)
2698 		goto errout;
2699 
2700 	rtm = nlmsg_data(nlh);
2701 
2702 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2703 	if (!skb) {
2704 		err = -ENOBUFS;
2705 		goto errout;
2706 	}
2707 
2708 	/* Reserve room for dummy headers; this skb can pass
2709 	   through a good chunk of the routing engine.
2710 	 */
2711 	skb_reset_mac_header(skb);
2712 	skb_reset_network_header(skb);
2713 
2714 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2715 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2716 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2717 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2718 	if (tb[RTA_UID])
2719 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2720 	else
2721 		uid = (iif ? INVALID_UID : current_uid());
2722 
2723 	/* Bugfix: need to give ip_route_input enough of an IP header to
2724 	 * not gag.
2725 	 */
2726 	ip_hdr(skb)->protocol = IPPROTO_UDP;
2727 	ip_hdr(skb)->saddr = src;
2728 	ip_hdr(skb)->daddr = dst;
2729 
2730 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2731 
2732 	memset(&fl4, 0, sizeof(fl4));
2733 	fl4.daddr = dst;
2734 	fl4.saddr = src;
2735 	fl4.flowi4_tos = rtm->rtm_tos;
2736 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2737 	fl4.flowi4_mark = mark;
2738 	fl4.flowi4_uid = uid;
2739 
2740 	rcu_read_lock();
2741 
2742 	if (iif) {
2743 		struct net_device *dev;
2744 
2745 		dev = dev_get_by_index_rcu(net, iif);
2746 		if (!dev) {
2747 			err = -ENODEV;
2748 			goto errout_free;
2749 		}
2750 
2751 		skb->protocol	= htons(ETH_P_IP);
2752 		skb->dev	= dev;
2753 		skb->mark	= mark;
2754 		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2755 					 dev, &res);
2756 
2757 		rt = skb_rtable(skb);
2758 		if (err == 0 && rt->dst.error)
2759 			err = -rt->dst.error;
2760 	} else {
2761 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2762 		err = 0;
2763 		if (IS_ERR(rt))
2764 			err = PTR_ERR(rt);
2765 		else
2766 			skb_dst_set(skb, &rt->dst);
2767 	}
2768 
2769 	if (err)
2770 		goto errout_free;
2771 
2772 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2773 		rt->rt_flags |= RTCF_NOTIFY;
2774 
2775 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2776 		table_id = rt->rt_table_id;
2777 
2778 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2779 		if (!res.fi) {
2780 			err = fib_props[res.type].error;
2781 			if (!err)
2782 				err = -EHOSTUNREACH;
2783 			goto errout_free;
2784 		}
2785 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2786 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2787 				    rt->rt_type, res.prefix, res.prefixlen,
2788 				    fl4.flowi4_tos, res.fi, 0);
2789 	} else {
2790 		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2791 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2792 	}
2793 	if (err < 0)
2794 		goto errout_free;
2795 
2796 	rcu_read_unlock();
2797 
2798 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2799 errout:
2800 	return err;
2801 
2802 errout_free:
2803 	rcu_read_unlock();
2804 	kfree_skb(skb);
2805 	goto errout;
2806 }
2807 
2808 void ip_rt_multicast_event(struct in_device *in_dev)
2809 {
2810 	rt_cache_flush(dev_net(in_dev->dev));
2811 }
2812 
2813 #ifdef CONFIG_SYSCTL
2814 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2815 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2816 static int ip_rt_gc_elasticity __read_mostly	= 8;
2817 
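/* Writing any value to /proc/sys/net/ipv4/route/flush (e.g.
 * "echo 1 > /proc/sys/net/ipv4/route/flush") invalidates the cached
 * routes and exceptions by bumping the generation counters; reads of
 * this file are rejected with -EINVAL.
 */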
2818 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2819 					void __user *buffer,
2820 					size_t *lenp, loff_t *ppos)
2821 {
2822 	struct net *net = (struct net *)__ctl->extra1;
2823 
2824 	if (write) {
2825 		rt_cache_flush(net);
2826 		fnhe_genid_bump(net);
2827 		return 0;
2828 	}
2829 
2830 	return -EINVAL;
2831 }
2832 
2833 static struct ctl_table ipv4_route_table[] = {
2834 	{
2835 		.procname	= "gc_thresh",
2836 		.data		= &ipv4_dst_ops.gc_thresh,
2837 		.maxlen		= sizeof(int),
2838 		.mode		= 0644,
2839 		.proc_handler	= proc_dointvec,
2840 	},
2841 	{
2842 		.procname	= "max_size",
2843 		.data		= &ip_rt_max_size,
2844 		.maxlen		= sizeof(int),
2845 		.mode		= 0644,
2846 		.proc_handler	= proc_dointvec,
2847 	},
2848 	{
2849 		/*  Deprecated. Use gc_min_interval_ms */
2850 
2851 		.procname	= "gc_min_interval",
2852 		.data		= &ip_rt_gc_min_interval,
2853 		.maxlen		= sizeof(int),
2854 		.mode		= 0644,
2855 		.proc_handler	= proc_dointvec_jiffies,
2856 	},
2857 	{
2858 		.procname	= "gc_min_interval_ms",
2859 		.data		= &ip_rt_gc_min_interval,
2860 		.maxlen		= sizeof(int),
2861 		.mode		= 0644,
2862 		.proc_handler	= proc_dointvec_ms_jiffies,
2863 	},
2864 	{
2865 		.procname	= "gc_timeout",
2866 		.data		= &ip_rt_gc_timeout,
2867 		.maxlen		= sizeof(int),
2868 		.mode		= 0644,
2869 		.proc_handler	= proc_dointvec_jiffies,
2870 	},
2871 	{
2872 		.procname	= "gc_interval",
2873 		.data		= &ip_rt_gc_interval,
2874 		.maxlen		= sizeof(int),
2875 		.mode		= 0644,
2876 		.proc_handler	= proc_dointvec_jiffies,
2877 	},
2878 	{
2879 		.procname	= "redirect_load",
2880 		.data		= &ip_rt_redirect_load,
2881 		.maxlen		= sizeof(int),
2882 		.mode		= 0644,
2883 		.proc_handler	= proc_dointvec,
2884 	},
2885 	{
2886 		.procname	= "redirect_number",
2887 		.data		= &ip_rt_redirect_number,
2888 		.maxlen		= sizeof(int),
2889 		.mode		= 0644,
2890 		.proc_handler	= proc_dointvec,
2891 	},
2892 	{
2893 		.procname	= "redirect_silence",
2894 		.data		= &ip_rt_redirect_silence,
2895 		.maxlen		= sizeof(int),
2896 		.mode		= 0644,
2897 		.proc_handler	= proc_dointvec,
2898 	},
2899 	{
2900 		.procname	= "error_cost",
2901 		.data		= &ip_rt_error_cost,
2902 		.maxlen		= sizeof(int),
2903 		.mode		= 0644,
2904 		.proc_handler	= proc_dointvec,
2905 	},
2906 	{
2907 		.procname	= "error_burst",
2908 		.data		= &ip_rt_error_burst,
2909 		.maxlen		= sizeof(int),
2910 		.mode		= 0644,
2911 		.proc_handler	= proc_dointvec,
2912 	},
2913 	{
2914 		.procname	= "gc_elasticity",
2915 		.data		= &ip_rt_gc_elasticity,
2916 		.maxlen		= sizeof(int),
2917 		.mode		= 0644,
2918 		.proc_handler	= proc_dointvec,
2919 	},
2920 	{
2921 		.procname	= "mtu_expires",
2922 		.data		= &ip_rt_mtu_expires,
2923 		.maxlen		= sizeof(int),
2924 		.mode		= 0644,
2925 		.proc_handler	= proc_dointvec_jiffies,
2926 	},
2927 	{
2928 		.procname	= "min_pmtu",
2929 		.data		= &ip_rt_min_pmtu,
2930 		.maxlen		= sizeof(int),
2931 		.mode		= 0644,
2932 		.proc_handler	= proc_dointvec,
2933 	},
2934 	{
2935 		.procname	= "min_adv_mss",
2936 		.data		= &ip_rt_min_advmss,
2937 		.maxlen		= sizeof(int),
2938 		.mode		= 0644,
2939 		.proc_handler	= proc_dointvec,
2940 	},
2941 	{ }
2942 };
2943 
2944 static struct ctl_table ipv4_route_flush_table[] = {
2945 	{
2946 		.procname	= "flush",
2947 		.maxlen		= sizeof(int),
2948 		.mode		= 0200,
2949 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2950 	},
2951 	{ },
2952 };
2953 
2954 static __net_init int sysctl_route_net_init(struct net *net)
2955 {
2956 	struct ctl_table *tbl;
2957 
2958 	tbl = ipv4_route_flush_table;
2959 	if (!net_eq(net, &init_net)) {
2960 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2961 		if (!tbl)
2962 			goto err_dup;
2963 
2964 		/* Don't export sysctls to unprivileged users */
2965 		if (net->user_ns != &init_user_ns)
2966 			tbl[0].procname = NULL;
2967 	}
2968 	tbl[0].extra1 = net;
2969 
2970 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2971 	if (!net->ipv4.route_hdr)
2972 		goto err_reg;
2973 	return 0;
2974 
2975 err_reg:
2976 	if (tbl != ipv4_route_flush_table)
2977 		kfree(tbl);
2978 err_dup:
2979 	return -ENOMEM;
2980 }
2981 
2982 static __net_exit void sysctl_route_net_exit(struct net *net)
2983 {
2984 	struct ctl_table *tbl;
2985 
2986 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2987 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2988 	BUG_ON(tbl == ipv4_route_flush_table);
2989 	kfree(tbl);
2990 }
2991 
2992 static __net_initdata struct pernet_operations sysctl_route_ops = {
2993 	.init = sysctl_route_net_init,
2994 	.exit = sysctl_route_net_exit,
2995 };
2996 #endif
2997 
2998 static __net_init int rt_genid_init(struct net *net)
2999 {
3000 	atomic_set(&net->ipv4.rt_genid, 0);
3001 	atomic_set(&net->fnhe_genid, 0);
3002 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3003 	return 0;
3004 }
3005 
3006 static __net_initdata struct pernet_operations rt_genid_ops = {
3007 	.init = rt_genid_init,
3008 };
3009 
3010 static int __net_init ipv4_inetpeer_init(struct net *net)
3011 {
3012 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3013 
3014 	if (!bp)
3015 		return -ENOMEM;
3016 	inet_peer_base_init(bp);
3017 	net->ipv4.peers = bp;
3018 	return 0;
3019 }
3020 
3021 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3022 {
3023 	struct inet_peer_base *bp = net->ipv4.peers;
3024 
3025 	net->ipv4.peers = NULL;
3026 	inetpeer_invalidate_tree(bp);
3027 	kfree(bp);
3028 }
3029 
3030 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3031 	.init	=	ipv4_inetpeer_init,
3032 	.exit	=	ipv4_inetpeer_exit,
3033 };
3034 
3035 #ifdef CONFIG_IP_ROUTE_CLASSID
3036 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3037 #endif /* CONFIG_IP_ROUTE_CLASSID */
3038 
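/* ip_rt_init - boot-time initialisation of the IPv4 routing subsystem:
 * the ip_idents/ip_tstamps arrays, the per-cpu uncached lists, the dst
 * kmem cache, /proc files, the RTM_GETROUTE handler and the per-netns
 * sysctl/genid/inetpeer operations.
 */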
3039 int __init ip_rt_init(void)
3040 {
3041 	int rc = 0;
3042 	int cpu;
3043 
3044 	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3045 	if (!ip_idents)
3046 		panic("IP: failed to allocate ip_idents\n");
3047 
3048 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3049 
3050 	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3051 	if (!ip_tstamps)
3052 		panic("IP: failed to allocate ip_tstamps\n");
3053 
3054 	for_each_possible_cpu(cpu) {
3055 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3056 
3057 		INIT_LIST_HEAD(&ul->head);
3058 		spin_lock_init(&ul->lock);
3059 	}
3060 #ifdef CONFIG_IP_ROUTE_CLASSID
3061 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3062 	if (!ip_rt_acct)
3063 		panic("IP: failed to allocate ip_rt_acct\n");
3064 #endif
3065 
3066 	ipv4_dst_ops.kmem_cachep =
3067 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3068 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3069 
3070 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3071 
3072 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3073 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3074 
3075 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3076 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3077 
3078 	ipv4_dst_ops.gc_thresh = ~0;
3079 	ip_rt_max_size = INT_MAX;
3080 
3081 	devinet_init();
3082 	ip_fib_init();
3083 
3084 	if (ip_rt_proc_init())
3085 		pr_err("Unable to create route proc files\n");
3086 #ifdef CONFIG_XFRM
3087 	xfrm_init();
3088 	xfrm4_init();
3089 #endif
3090 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3091 		      RTNL_FLAG_DOIT_UNLOCKED);
3092 
3093 #ifdef CONFIG_SYSCTL
3094 	register_pernet_subsys(&sysctl_route_ops);
3095 #endif
3096 	register_pernet_subsys(&rt_genid_ops);
3097 	register_pernet_subsys(&ipv4_inetpeer_ops);
3098 	return rc;
3099 }
3100 
3101 #ifdef CONFIG_SYSCTL
3102 /*
3103  * We really need to sanitize the damn ipv4 init order, then all
3104  * this nonsense will go away.
3105  */
3106 void __init ip_static_sysctl_init(void)
3107 {
3108 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3109 }
3110 #endif
3111