xref: /openbmc/linux/net/ipv4/route.c (revision 93707cbabcc8baf2b2b5f4a99c1f08ee83eb7abd)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116 
117 #include "fib_lookup.h"
118 
119 #define RT_FL_TOS(oldflp4) \
120 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
121 
122 #define RT_GC_TIMEOUT (300*HZ)
123 
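/* Boot-time defaults for the routing tunables below; most of them are
 * exposed through the net.ipv4.route.* sysctl table defined further
 * down in this file (under CONFIG_SYSCTL).
 */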
124 static int ip_rt_max_size;
125 static int ip_rt_redirect_number __read_mostly	= 9;
126 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly	= HZ;
129 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
130 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly	= 256;
133 
134 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
135 /*
136  *	Interface to generic destination cache.
137  */
138 
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143 static void		 ipv4_link_failure(struct sk_buff *skb);
144 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
145 					   struct sk_buff *skb, u32 mtu);
146 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 					struct sk_buff *skb);
148 static void		ipv4_dst_destroy(struct dst_entry *dst);
149 
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152 	WARN_ON(1);
153 	return NULL;
154 }
155 
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 					   struct sk_buff *skb,
158 					   const void *daddr);
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160 
161 static struct dst_ops ipv4_dst_ops = {
162 	.family =		AF_INET,
163 	.check =		ipv4_dst_check,
164 	.default_advmss =	ipv4_default_advmss,
165 	.mtu =			ipv4_mtu,
166 	.cow_metrics =		ipv4_cow_metrics,
167 	.destroy =		ipv4_dst_destroy,
168 	.negative_advice =	ipv4_negative_advice,
169 	.link_failure =		ipv4_link_failure,
170 	.update_pmtu =		ip_rt_update_pmtu,
171 	.redirect =		ip_do_redirect,
172 	.local_out =		__ip_local_out,
173 	.neigh_lookup =		ipv4_neigh_lookup,
174 	.confirm_neigh =	ipv4_confirm_neigh,
175 };
176 
177 #define ECN_OR_COST(class)	TC_PRIO_##class
178 
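/* rt_tos2priority() in <net/route.h> indexes this table with the four
 * TOS bits, i.e. ip_tos2prio[IPTOS_TOS(tos) >> 1]; for example
 * IPTOS_LOWDELAY (0x10) selects entry 8, TC_PRIO_INTERACTIVE, while a
 * zero TOS maps to TC_PRIO_BESTEFFORT.
 */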
179 const __u8 ip_tos2prio[16] = {
180 	TC_PRIO_BESTEFFORT,
181 	ECN_OR_COST(BESTEFFORT),
182 	TC_PRIO_BESTEFFORT,
183 	ECN_OR_COST(BESTEFFORT),
184 	TC_PRIO_BULK,
185 	ECN_OR_COST(BULK),
186 	TC_PRIO_BULK,
187 	ECN_OR_COST(BULK),
188 	TC_PRIO_INTERACTIVE,
189 	ECN_OR_COST(INTERACTIVE),
190 	TC_PRIO_INTERACTIVE,
191 	ECN_OR_COST(INTERACTIVE),
192 	TC_PRIO_INTERACTIVE_BULK,
193 	ECN_OR_COST(INTERACTIVE_BULK),
194 	TC_PRIO_INTERACTIVE_BULK,
195 	ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
198 
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201 
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205 	if (*pos)
206 		return NULL;
207 	return SEQ_START_TOKEN;
208 }
209 
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212 	++*pos;
213 	return NULL;
214 }
215 
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219 
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222 	if (v == SEQ_START_TOKEN)
223 		seq_printf(seq, "%-127s\n",
224 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 			   "HHUptod\tSpecDst");
227 	return 0;
228 }
229 
230 static const struct seq_operations rt_cache_seq_ops = {
231 	.start  = rt_cache_seq_start,
232 	.next   = rt_cache_seq_next,
233 	.stop   = rt_cache_seq_stop,
234 	.show   = rt_cache_seq_show,
235 };
236 
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239 	return seq_open(file, &rt_cache_seq_ops);
240 }
241 
242 static const struct file_operations rt_cache_seq_fops = {
243 	.open	 = rt_cache_seq_open,
244 	.read	 = seq_read,
245 	.llseek	 = seq_lseek,
246 	.release = seq_release,
247 };
248 
249 
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252 	int cpu;
253 
254 	if (*pos == 0)
255 		return SEQ_START_TOKEN;
256 
257 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 		if (!cpu_possible(cpu))
259 			continue;
260 		*pos = cpu+1;
261 		return &per_cpu(rt_cache_stat, cpu);
262 	}
263 	return NULL;
264 }
265 
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268 	int cpu;
269 
270 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 		if (!cpu_possible(cpu))
272 			continue;
273 		*pos = cpu+1;
274 		return &per_cpu(rt_cache_stat, cpu);
275 	}
276 	return NULL;
277 
278 }
279 
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282 
283 }
284 
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 	struct rt_cache_stat *st = v;
288 
289 	if (v == SEQ_START_TOKEN) {
290 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 		return 0;
292 	}
293 
294 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
295 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 		   dst_entries_get_slow(&ipv4_dst_ops),
297 		   0, /* st->in_hit */
298 		   st->in_slow_tot,
299 		   st->in_slow_mc,
300 		   st->in_no_route,
301 		   st->in_brd,
302 		   st->in_martian_dst,
303 		   st->in_martian_src,
304 
305 		   0, /* st->out_hit */
306 		   st->out_slow_tot,
307 		   st->out_slow_mc,
308 
309 		   0, /* st->gc_total */
310 		   0, /* st->gc_ignored */
311 		   0, /* st->gc_goal_miss */
312 		   0, /* st->gc_dst_overflow */
313 		   0, /* st->in_hlist_search */
314 		   0  /* st->out_hlist_search */
315 		);
316 	return 0;
317 }
318 
319 static const struct seq_operations rt_cpu_seq_ops = {
320 	.start  = rt_cpu_seq_start,
321 	.next   = rt_cpu_seq_next,
322 	.stop   = rt_cpu_seq_stop,
323 	.show   = rt_cpu_seq_show,
324 };
325 
326 
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 	return seq_open(file, &rt_cpu_seq_ops);
330 }
331 
332 static const struct file_operations rt_cpu_seq_fops = {
333 	.open	 = rt_cpu_seq_open,
334 	.read	 = seq_read,
335 	.llseek	 = seq_lseek,
336 	.release = seq_release,
337 };
338 
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342 	struct ip_rt_acct *dst, *src;
343 	unsigned int i, j;
344 
345 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 	if (!dst)
347 		return -ENOMEM;
348 
349 	for_each_possible_cpu(i) {
350 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 		for (j = 0; j < 256; j++) {
352 			dst[j].o_bytes   += src[j].o_bytes;
353 			dst[j].o_packets += src[j].o_packets;
354 			dst[j].i_bytes   += src[j].i_bytes;
355 			dst[j].i_packets += src[j].i_packets;
356 		}
357 	}
358 
359 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 	kfree(dst);
361 	return 0;
362 }
363 
364 static int rt_acct_proc_open(struct inode *inode, struct file *file)
365 {
366 	return single_open(file, rt_acct_proc_show, NULL);
367 }
368 
369 static const struct file_operations rt_acct_proc_fops = {
370 	.open		= rt_acct_proc_open,
371 	.read		= seq_read,
372 	.llseek		= seq_lseek,
373 	.release	= single_release,
374 };
375 #endif
376 
377 static int __net_init ip_rt_do_proc_init(struct net *net)
378 {
379 	struct proc_dir_entry *pde;
380 
381 	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
382 			  &rt_cache_seq_fops);
383 	if (!pde)
384 		goto err1;
385 
386 	pde = proc_create("rt_cache", S_IRUGO,
387 			  net->proc_net_stat, &rt_cpu_seq_fops);
388 	if (!pde)
389 		goto err2;
390 
391 #ifdef CONFIG_IP_ROUTE_CLASSID
392 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
393 	if (!pde)
394 		goto err3;
395 #endif
396 	return 0;
397 
398 #ifdef CONFIG_IP_ROUTE_CLASSID
399 err3:
400 	remove_proc_entry("rt_cache", net->proc_net_stat);
401 #endif
402 err2:
403 	remove_proc_entry("rt_cache", net->proc_net);
404 err1:
405 	return -ENOMEM;
406 }
407 
408 static void __net_exit ip_rt_do_proc_exit(struct net *net)
409 {
410 	remove_proc_entry("rt_cache", net->proc_net_stat);
411 	remove_proc_entry("rt_cache", net->proc_net);
412 #ifdef CONFIG_IP_ROUTE_CLASSID
413 	remove_proc_entry("rt_acct", net->proc_net);
414 #endif
415 }
416 
417 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
418 	.init = ip_rt_do_proc_init,
419 	.exit = ip_rt_do_proc_exit,
420 	.async = true,
421 };
422 
423 static int __init ip_rt_proc_init(void)
424 {
425 	return register_pernet_subsys(&ip_rt_proc_ops);
426 }
427 
428 #else
429 static inline int ip_rt_proc_init(void)
430 {
431 	return 0;
432 }
433 #endif /* CONFIG_PROC_FS */
434 
435 static inline bool rt_is_expired(const struct rtable *rth)
436 {
437 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
438 }
439 
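/* Flushing the IPv4 routing cache is O(1): it just bumps the per-netns
 * generation id. Cached rtables carry the genid they were created with,
 * so any entry whose rt_genid no longer matches is reported stale by
 * rt_is_expired() above and is replaced lazily on its next lookup.
 */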
440 void rt_cache_flush(struct net *net)
441 {
442 	rt_genid_bump_ipv4(net);
443 }
444 
445 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
446 					   struct sk_buff *skb,
447 					   const void *daddr)
448 {
449 	struct net_device *dev = dst->dev;
450 	const __be32 *pkey = daddr;
451 	const struct rtable *rt;
452 	struct neighbour *n;
453 
454 	rt = (const struct rtable *) dst;
455 	if (rt->rt_gateway)
456 		pkey = (const __be32 *) &rt->rt_gateway;
457 	else if (skb)
458 		pkey = &ip_hdr(skb)->daddr;
459 
460 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
461 	if (n)
462 		return n;
463 	return neigh_create(&arp_tbl, pkey, dev);
464 }
465 
466 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
467 {
468 	struct net_device *dev = dst->dev;
469 	const __be32 *pkey = daddr;
470 	const struct rtable *rt;
471 
472 	rt = (const struct rtable *)dst;
473 	if (rt->rt_gateway)
474 		pkey = (const __be32 *)&rt->rt_gateway;
475 	else if (!daddr ||
476 		 (rt->rt_flags &
477 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
478 		return;
479 
480 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
481 }
482 
483 #define IP_IDENTS_SZ 2048u
484 
485 static atomic_t *ip_idents __read_mostly;
486 static u32 *ip_tstamps __read_mostly;
487 
488 /* In order to protect privacy, we add a perturbation to identifiers
489  * if one generator is seldom used. This makes it hard for an attacker
490  * to infer how many packets were sent between two points in time.
491  */
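/* Implementation note: each of the IP_IDENTS_SZ buckets remembers the
 * jiffies value of its last use; if a bucket was not used during the
 * current jiffy, a random offset bounded by the idle time is added to
 * its counter in addition to @segs. The value returned is the first ID
 * of the reserved range.
 */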
492 u32 ip_idents_reserve(u32 hash, int segs)
493 {
494 	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
495 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
496 	u32 old = READ_ONCE(*p_tstamp);
497 	u32 now = (u32)jiffies;
498 	u32 new, delta = 0;
499 
500 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
501 		delta = prandom_u32_max(now - old);
502 
503 	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
504 	do {
505 		old = (u32)atomic_read(p_id);
506 		new = old + delta + segs;
507 	} while (atomic_cmpxchg(p_id, old, new) != old);
508 
509 	return new - segs;
510 }
511 EXPORT_SYMBOL(ip_idents_reserve);
512 
513 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
514 {
515 	static u32 ip_idents_hashrnd __read_mostly;
516 	u32 hash, id;
517 
518 	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
519 
520 	hash = jhash_3words((__force u32)iph->daddr,
521 			    (__force u32)iph->saddr,
522 			    iph->protocol ^ net_hash_mix(net),
523 			    ip_idents_hashrnd);
524 	id = ip_idents_reserve(hash, segs);
525 	iph->id = htons(id);
526 }
527 EXPORT_SYMBOL(__ip_select_ident);
528 
529 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
530 			     const struct sock *sk,
531 			     const struct iphdr *iph,
532 			     int oif, u8 tos,
533 			     u8 prot, u32 mark, int flow_flags)
534 {
535 	if (sk) {
536 		const struct inet_sock *inet = inet_sk(sk);
537 
538 		oif = sk->sk_bound_dev_if;
539 		mark = sk->sk_mark;
540 		tos = RT_CONN_FLAGS(sk);
541 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
542 	}
543 	flowi4_init_output(fl4, oif, mark, tos,
544 			   RT_SCOPE_UNIVERSE, prot,
545 			   flow_flags,
546 			   iph->daddr, iph->saddr, 0, 0,
547 			   sock_net_uid(net, sk));
548 }
549 
550 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
551 			       const struct sock *sk)
552 {
553 	const struct net *net = dev_net(skb->dev);
554 	const struct iphdr *iph = ip_hdr(skb);
555 	int oif = skb->dev->ifindex;
556 	u8 tos = RT_TOS(iph->tos);
557 	u8 prot = iph->protocol;
558 	u32 mark = skb->mark;
559 
560 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
561 }
562 
563 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
564 {
565 	const struct inet_sock *inet = inet_sk(sk);
566 	const struct ip_options_rcu *inet_opt;
567 	__be32 daddr = inet->inet_daddr;
568 
569 	rcu_read_lock();
570 	inet_opt = rcu_dereference(inet->inet_opt);
571 	if (inet_opt && inet_opt->opt.srr)
572 		daddr = inet_opt->opt.faddr;
573 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
574 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
575 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
576 			   inet_sk_flowi_flags(sk),
577 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
578 	rcu_read_unlock();
579 }
580 
581 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
582 				 const struct sk_buff *skb)
583 {
584 	if (skb)
585 		build_skb_flow_key(fl4, skb, sk);
586 	else
587 		build_sk_flow_key(fl4, sk);
588 }
589 
590 static DEFINE_SPINLOCK(fnhe_lock);
591 
592 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
593 {
594 	struct rtable *rt;
595 
596 	rt = rcu_dereference(fnhe->fnhe_rth_input);
597 	if (rt) {
598 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
599 		dst_dev_put(&rt->dst);
600 		dst_release(&rt->dst);
601 	}
602 	rt = rcu_dereference(fnhe->fnhe_rth_output);
603 	if (rt) {
604 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
605 		dst_dev_put(&rt->dst);
606 		dst_release(&rt->dst);
607 	}
608 }
609 
610 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
611 {
612 	struct fib_nh_exception *fnhe, *oldest;
613 
614 	oldest = rcu_dereference(hash->chain);
615 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
616 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
617 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
618 			oldest = fnhe;
619 	}
620 	fnhe_flush_routes(oldest);
621 	return oldest;
622 }
623 
624 static inline u32 fnhe_hashfun(__be32 daddr)
625 {
626 	static u32 fnhe_hashrnd __read_mostly;
627 	u32 hval;
628 
629 	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
630 	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
631 	return hash_32(hval, FNHE_HASH_SHIFT);
632 }
633 
634 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
635 {
636 	rt->rt_pmtu = fnhe->fnhe_pmtu;
637 	rt->dst.expires = fnhe->fnhe_expires;
638 
639 	if (fnhe->fnhe_gw) {
640 		rt->rt_flags |= RTCF_REDIRECTED;
641 		rt->rt_gateway = fnhe->fnhe_gw;
642 		rt->rt_uses_gateway = 1;
643 	}
644 }
645 
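/* Record (or refresh) a next-hop exception learned from an ICMP
 * redirect or a PMTU update. Exceptions live in a per-nexthop hash of
 * FNHE_HASH_SIZE buckets keyed by destination; once a chain grows past
 * FNHE_RECLAIM_DEPTH the oldest entry is recycled instead of allocating
 * a new one. Newly created exceptions also mark the routes cached on
 * the nexthop DST_OBSOLETE_KILL so they are revalidated against the
 * exception.
 */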
646 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
647 				  u32 pmtu, unsigned long expires)
648 {
649 	struct fnhe_hash_bucket *hash;
650 	struct fib_nh_exception *fnhe;
651 	struct rtable *rt;
652 	u32 genid, hval;
653 	unsigned int i;
654 	int depth;
655 
656 	genid = fnhe_genid(dev_net(nh->nh_dev));
657 	hval = fnhe_hashfun(daddr);
658 
659 	spin_lock_bh(&fnhe_lock);
660 
661 	hash = rcu_dereference(nh->nh_exceptions);
662 	if (!hash) {
663 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
664 		if (!hash)
665 			goto out_unlock;
666 		rcu_assign_pointer(nh->nh_exceptions, hash);
667 	}
668 
669 	hash += hval;
670 
671 	depth = 0;
672 	for (fnhe = rcu_dereference(hash->chain); fnhe;
673 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
674 		if (fnhe->fnhe_daddr == daddr)
675 			break;
676 		depth++;
677 	}
678 
679 	if (fnhe) {
680 		if (fnhe->fnhe_genid != genid)
681 			fnhe->fnhe_genid = genid;
682 		if (gw)
683 			fnhe->fnhe_gw = gw;
684 		if (pmtu)
685 			fnhe->fnhe_pmtu = pmtu;
686 		fnhe->fnhe_expires = max(1UL, expires);
687 		/* Update all cached dsts too */
688 		rt = rcu_dereference(fnhe->fnhe_rth_input);
689 		if (rt)
690 			fill_route_from_fnhe(rt, fnhe);
691 		rt = rcu_dereference(fnhe->fnhe_rth_output);
692 		if (rt)
693 			fill_route_from_fnhe(rt, fnhe);
694 	} else {
695 		if (depth > FNHE_RECLAIM_DEPTH)
696 			fnhe = fnhe_oldest(hash);
697 		else {
698 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
699 			if (!fnhe)
700 				goto out_unlock;
701 
702 			fnhe->fnhe_next = hash->chain;
703 			rcu_assign_pointer(hash->chain, fnhe);
704 		}
705 		fnhe->fnhe_genid = genid;
706 		fnhe->fnhe_daddr = daddr;
707 		fnhe->fnhe_gw = gw;
708 		fnhe->fnhe_pmtu = pmtu;
709 		fnhe->fnhe_expires = expires;
710 
711 		/* Exception created; mark the cached routes for the nexthop
712 		 * stale, so anyone caching it rechecks if this exception
713 		 * applies to them.
714 		 */
715 		rt = rcu_dereference(nh->nh_rth_input);
716 		if (rt)
717 			rt->dst.obsolete = DST_OBSOLETE_KILL;
718 
719 		for_each_possible_cpu(i) {
720 			struct rtable __rcu **prt;
721 			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
722 			rt = rcu_dereference(*prt);
723 			if (rt)
724 				rt->dst.obsolete = DST_OBSOLETE_KILL;
725 		}
726 	}
727 
728 	fnhe->fnhe_stamp = jiffies;
729 
730 out_unlock:
731 	spin_unlock_bh(&fnhe_lock);
732 }
733 
734 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
735 			     bool kill_route)
736 {
737 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
738 	__be32 old_gw = ip_hdr(skb)->saddr;
739 	struct net_device *dev = skb->dev;
740 	struct in_device *in_dev;
741 	struct fib_result res;
742 	struct neighbour *n;
743 	struct net *net;
744 
745 	switch (icmp_hdr(skb)->code & 7) {
746 	case ICMP_REDIR_NET:
747 	case ICMP_REDIR_NETTOS:
748 	case ICMP_REDIR_HOST:
749 	case ICMP_REDIR_HOSTTOS:
750 		break;
751 
752 	default:
753 		return;
754 	}
755 
756 	if (rt->rt_gateway != old_gw)
757 		return;
758 
759 	in_dev = __in_dev_get_rcu(dev);
760 	if (!in_dev)
761 		return;
762 
763 	net = dev_net(dev);
764 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
765 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
766 	    ipv4_is_zeronet(new_gw))
767 		goto reject_redirect;
768 
769 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
770 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
771 			goto reject_redirect;
772 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
773 			goto reject_redirect;
774 	} else {
775 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
776 			goto reject_redirect;
777 	}
778 
779 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
780 	if (!n)
781 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
782 	if (!IS_ERR(n)) {
783 		if (!(n->nud_state & NUD_VALID)) {
784 			neigh_event_send(n, NULL);
785 		} else {
786 			if (fib_lookup(net, fl4, &res, 0) == 0) {
787 				struct fib_nh *nh = &FIB_RES_NH(res);
788 
789 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
790 						0, jiffies + ip_rt_gc_timeout);
791 			}
792 			if (kill_route)
793 				rt->dst.obsolete = DST_OBSOLETE_KILL;
794 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
795 		}
796 		neigh_release(n);
797 	}
798 	return;
799 
800 reject_redirect:
801 #ifdef CONFIG_IP_ROUTE_VERBOSE
802 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
803 		const struct iphdr *iph = (const struct iphdr *) skb->data;
804 		__be32 daddr = iph->daddr;
805 		__be32 saddr = iph->saddr;
806 
807 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
808 				     "  Advised path = %pI4 -> %pI4\n",
809 				     &old_gw, dev->name, &new_gw,
810 				     &saddr, &daddr);
811 	}
812 #endif
813 	;
814 }
815 
816 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
817 {
818 	struct rtable *rt;
819 	struct flowi4 fl4;
820 	const struct iphdr *iph = (const struct iphdr *) skb->data;
821 	struct net *net = dev_net(skb->dev);
822 	int oif = skb->dev->ifindex;
823 	u8 tos = RT_TOS(iph->tos);
824 	u8 prot = iph->protocol;
825 	u32 mark = skb->mark;
826 
827 	rt = (struct rtable *) dst;
828 
829 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
830 	__ip_do_redirect(rt, skb, &fl4, true);
831 }
832 
833 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
834 {
835 	struct rtable *rt = (struct rtable *)dst;
836 	struct dst_entry *ret = dst;
837 
838 	if (rt) {
839 		if (dst->obsolete > 0) {
840 			ip_rt_put(rt);
841 			ret = NULL;
842 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
843 			   rt->dst.expires) {
844 			ip_rt_put(rt);
845 			ret = NULL;
846 		}
847 	}
848 	return ret;
849 }
850 
851 /*
852  * Algorithm:
853  *	1. The first ip_rt_redirect_number redirects are sent
854  *	   with exponential backoff, then we stop sending them at all,
855  *	   assuming that the host ignores our redirects.
856  *	2. If we did not see packets requiring redirects
857  *	   during ip_rt_redirect_silence, we assume that the host
858  *	   forgot the redirected route and we start sending redirects again.
859  *
860  * This algorithm is much cheaper and more intelligent than dumb load limiting
861  * in icmp.c.
862  *
863  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
864  * and "frag. need" (breaks PMTU discovery) in icmp.c.
865  */
866 
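/* With the default tunables this means at most ip_rt_redirect_number
 * (9) redirects per peer, spaced at least ip_rt_redirect_load <<
 * rate_tokens jiffies apart (HZ/50, doubling with each redirect sent),
 * with the token count reset after ip_rt_redirect_silence jiffies of
 * quiet.
 */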
867 void ip_rt_send_redirect(struct sk_buff *skb)
868 {
869 	struct rtable *rt = skb_rtable(skb);
870 	struct in_device *in_dev;
871 	struct inet_peer *peer;
872 	struct net *net;
873 	int log_martians;
874 	int vif;
875 
876 	rcu_read_lock();
877 	in_dev = __in_dev_get_rcu(rt->dst.dev);
878 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
879 		rcu_read_unlock();
880 		return;
881 	}
882 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
883 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
884 	rcu_read_unlock();
885 
886 	net = dev_net(rt->dst.dev);
887 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
888 	if (!peer) {
889 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
890 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
891 		return;
892 	}
893 
894 	/* No redirected packets during ip_rt_redirect_silence;
895 	 * reset the algorithm.
896 	 */
897 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
898 		peer->rate_tokens = 0;
899 
900 	/* Too many ignored redirects; do not send anything.
901 	 * Set dst.rate_last to the last seen redirected packet.
902 	 */
903 	if (peer->rate_tokens >= ip_rt_redirect_number) {
904 		peer->rate_last = jiffies;
905 		goto out_put_peer;
906 	}
907 
908 	/* Check for load limit; set rate_last to the latest sent
909 	 * redirect.
910 	 */
911 	if (peer->rate_tokens == 0 ||
912 	    time_after(jiffies,
913 		       (peer->rate_last +
914 			(ip_rt_redirect_load << peer->rate_tokens)))) {
915 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
916 
917 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
918 		peer->rate_last = jiffies;
919 		++peer->rate_tokens;
920 #ifdef CONFIG_IP_ROUTE_VERBOSE
921 		if (log_martians &&
922 		    peer->rate_tokens == ip_rt_redirect_number)
923 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
924 					     &ip_hdr(skb)->saddr, inet_iif(skb),
925 					     &ip_hdr(skb)->daddr, &gw);
926 #endif
927 	}
928 out_put_peer:
929 	inet_putpeer(peer);
930 }
931 
932 static int ip_error(struct sk_buff *skb)
933 {
934 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
935 	struct rtable *rt = skb_rtable(skb);
936 	struct inet_peer *peer;
937 	unsigned long now;
938 	struct net *net;
939 	bool send;
940 	int code;
941 
942 	/* IP on this device is disabled. */
943 	if (!in_dev)
944 		goto out;
945 
946 	net = dev_net(rt->dst.dev);
947 	if (!IN_DEV_FORWARD(in_dev)) {
948 		switch (rt->dst.error) {
949 		case EHOSTUNREACH:
950 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
951 			break;
952 
953 		case ENETUNREACH:
954 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
955 			break;
956 		}
957 		goto out;
958 	}
959 
960 	switch (rt->dst.error) {
961 	case EINVAL:
962 	default:
963 		goto out;
964 	case EHOSTUNREACH:
965 		code = ICMP_HOST_UNREACH;
966 		break;
967 	case ENETUNREACH:
968 		code = ICMP_NET_UNREACH;
969 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
970 		break;
971 	case EACCES:
972 		code = ICMP_PKT_FILTERED;
973 		break;
974 	}
975 
976 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
977 			       l3mdev_master_ifindex(skb->dev), 1);
978 
979 	send = true;
980 	if (peer) {
981 		now = jiffies;
982 		peer->rate_tokens += now - peer->rate_last;
983 		if (peer->rate_tokens > ip_rt_error_burst)
984 			peer->rate_tokens = ip_rt_error_burst;
985 		peer->rate_last = now;
986 		if (peer->rate_tokens >= ip_rt_error_cost)
987 			peer->rate_tokens -= ip_rt_error_cost;
988 		else
989 			send = false;
990 		inet_putpeer(peer);
991 	}
992 	if (send)
993 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
994 
995 out:	kfree_skb(skb);
996 	return 0;
997 }
998 
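/* Learn a smaller path MTU for this flow: the value is clamped to at
 * least ip_rt_min_pmtu and recorded as a next-hop exception expiring
 * after ip_rt_mtu_expires (10 minutes by default), after which the
 * route falls back to its normal MTU.
 */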
999 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1000 {
1001 	struct dst_entry *dst = &rt->dst;
1002 	struct fib_result res;
1003 
1004 	if (dst_metric_locked(dst, RTAX_MTU))
1005 		return;
1006 
1007 	if (ipv4_mtu(dst) < mtu)
1008 		return;
1009 
1010 	if (mtu < ip_rt_min_pmtu)
1011 		mtu = ip_rt_min_pmtu;
1012 
1013 	if (rt->rt_pmtu == mtu &&
1014 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1015 		return;
1016 
1017 	rcu_read_lock();
1018 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1019 		struct fib_nh *nh = &FIB_RES_NH(res);
1020 
1021 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1022 				      jiffies + ip_rt_mtu_expires);
1023 	}
1024 	rcu_read_unlock();
1025 }
1026 
1027 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1028 			      struct sk_buff *skb, u32 mtu)
1029 {
1030 	struct rtable *rt = (struct rtable *) dst;
1031 	struct flowi4 fl4;
1032 
1033 	ip_rt_build_flow_key(&fl4, sk, skb);
1034 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1035 }
1036 
1037 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1038 		      int oif, u32 mark, u8 protocol, int flow_flags)
1039 {
1040 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1041 	struct flowi4 fl4;
1042 	struct rtable *rt;
1043 
1044 	if (!mark)
1045 		mark = IP4_REPLY_MARK(net, skb->mark);
1046 
1047 	__build_flow_key(net, &fl4, NULL, iph, oif,
1048 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1049 	rt = __ip_route_output_key(net, &fl4);
1050 	if (!IS_ERR(rt)) {
1051 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1052 		ip_rt_put(rt);
1053 	}
1054 }
1055 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1056 
1057 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1058 {
1059 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1060 	struct flowi4 fl4;
1061 	struct rtable *rt;
1062 
1063 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1064 
1065 	if (!fl4.flowi4_mark)
1066 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1067 
1068 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1069 	if (!IS_ERR(rt)) {
1070 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1071 		ip_rt_put(rt);
1072 	}
1073 }
1074 
1075 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1076 {
1077 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1078 	struct flowi4 fl4;
1079 	struct rtable *rt;
1080 	struct dst_entry *odst = NULL;
1081 	bool new = false;
1082 	struct net *net = sock_net(sk);
1083 
1084 	bh_lock_sock(sk);
1085 
1086 	if (!ip_sk_accept_pmtu(sk))
1087 		goto out;
1088 
1089 	odst = sk_dst_get(sk);
1090 
1091 	if (sock_owned_by_user(sk) || !odst) {
1092 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1093 		goto out;
1094 	}
1095 
1096 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1097 
1098 	rt = (struct rtable *)odst;
1099 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1100 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1101 		if (IS_ERR(rt))
1102 			goto out;
1103 
1104 		new = true;
1105 	}
1106 
1107 	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1108 
1109 	if (!dst_check(&rt->dst, 0)) {
1110 		if (new)
1111 			dst_release(&rt->dst);
1112 
1113 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1114 		if (IS_ERR(rt))
1115 			goto out;
1116 
1117 		new = true;
1118 	}
1119 
1120 	if (new)
1121 		sk_dst_set(sk, &rt->dst);
1122 
1123 out:
1124 	bh_unlock_sock(sk);
1125 	dst_release(odst);
1126 }
1127 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1128 
1129 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1130 		   int oif, u32 mark, u8 protocol, int flow_flags)
1131 {
1132 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1133 	struct flowi4 fl4;
1134 	struct rtable *rt;
1135 
1136 	__build_flow_key(net, &fl4, NULL, iph, oif,
1137 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1138 	rt = __ip_route_output_key(net, &fl4);
1139 	if (!IS_ERR(rt)) {
1140 		__ip_do_redirect(rt, skb, &fl4, false);
1141 		ip_rt_put(rt);
1142 	}
1143 }
1144 EXPORT_SYMBOL_GPL(ipv4_redirect);
1145 
1146 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1147 {
1148 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1149 	struct flowi4 fl4;
1150 	struct rtable *rt;
1151 	struct net *net = sock_net(sk);
1152 
1153 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1154 	rt = __ip_route_output_key(net, &fl4);
1155 	if (!IS_ERR(rt)) {
1156 		__ip_do_redirect(rt, skb, &fl4, false);
1157 		ip_rt_put(rt);
1158 	}
1159 }
1160 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1161 
1162 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1163 {
1164 	struct rtable *rt = (struct rtable *) dst;
1165 
1166 	/* All IPV4 dsts are created with ->obsolete set to the value
1167 	 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
1168 	 * down into this function.
1169 	 *
1170 	 * When a PMTU/redirect information update invalidates a route,
1171 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1172 	 * DST_OBSOLETE_DEAD by dst_free().
1173 	 */
1174 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1175 		return NULL;
1176 	return dst;
1177 }
1178 
1179 static void ipv4_link_failure(struct sk_buff *skb)
1180 {
1181 	struct rtable *rt;
1182 
1183 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1184 
1185 	rt = skb_rtable(skb);
1186 	if (rt)
1187 		dst_set_expires(&rt->dst, 0);
1188 }
1189 
1190 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1191 {
1192 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1193 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1194 		 skb->dev ? skb->dev->name : "?");
1195 	kfree_skb(skb);
1196 	WARN_ON(1);
1197 	return 0;
1198 }
1199 
1200 /*
1201    We do not cache the source address of the outgoing interface,
1202    because it is used only by the IP RR, TS and SRR options,
1203    so it is out of the fast path.
1204 
1205    BTW remember: "addr" is allowed to be unaligned
1206    in IP options!
1207  */
1208 
1209 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1210 {
1211 	__be32 src;
1212 
1213 	if (rt_is_output_route(rt))
1214 		src = ip_hdr(skb)->saddr;
1215 	else {
1216 		struct fib_result res;
1217 		struct flowi4 fl4;
1218 		struct iphdr *iph;
1219 
1220 		iph = ip_hdr(skb);
1221 
1222 		memset(&fl4, 0, sizeof(fl4));
1223 		fl4.daddr = iph->daddr;
1224 		fl4.saddr = iph->saddr;
1225 		fl4.flowi4_tos = RT_TOS(iph->tos);
1226 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1227 		fl4.flowi4_iif = skb->dev->ifindex;
1228 		fl4.flowi4_mark = skb->mark;
1229 
1230 		rcu_read_lock();
1231 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1232 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1233 		else
1234 			src = inet_select_addr(rt->dst.dev,
1235 					       rt_nexthop(rt, iph->daddr),
1236 					       RT_SCOPE_UNIVERSE);
1237 		rcu_read_unlock();
1238 	}
1239 	memcpy(addr, &src, 4);
1240 }
1241 
1242 #ifdef CONFIG_IP_ROUTE_CLASSID
1243 static void set_class_tag(struct rtable *rt, u32 tag)
1244 {
1245 	if (!(rt->dst.tclassid & 0xFFFF))
1246 		rt->dst.tclassid |= tag & 0xFFFF;
1247 	if (!(rt->dst.tclassid & 0xFFFF0000))
1248 		rt->dst.tclassid |= tag & 0xFFFF0000;
1249 }
1250 #endif
1251 
1252 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1253 {
1254 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1255 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1256 				    ip_rt_min_advmss);
1257 
1258 	return min(advmss, IPV4_MAX_PMTU - header_size);
1259 }
1260 
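/* Effective MTU of a route: a still-valid learned PMTU or the RTAX_MTU
 * metric is returned as-is; otherwise fall back to the device MTU,
 * capped at 576 for gateway routes whose MTU metric is locked, clamped
 * to IP_MAX_MTU and reduced by any lwtunnel encapsulation headroom.
 */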
1261 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1262 {
1263 	const struct rtable *rt = (const struct rtable *) dst;
1264 	unsigned int mtu = rt->rt_pmtu;
1265 
1266 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1267 		mtu = dst_metric_raw(dst, RTAX_MTU);
1268 
1269 	if (mtu)
1270 		return mtu;
1271 
1272 	mtu = READ_ONCE(dst->dev->mtu);
1273 
1274 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1275 		if (rt->rt_uses_gateway && mtu > 576)
1276 			mtu = 576;
1277 	}
1278 
1279 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1280 
1281 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1282 }
1283 
1284 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1285 {
1286 	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1287 	struct fib_nh_exception *fnhe;
1288 	u32 hval;
1289 
1290 	if (!hash)
1291 		return NULL;
1292 
1293 	hval = fnhe_hashfun(daddr);
1294 
1295 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1296 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1297 		if (fnhe->fnhe_daddr == daddr)
1298 			return fnhe;
1299 	}
1300 	return NULL;
1301 }
1302 
1303 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1304 			      __be32 daddr, const bool do_cache)
1305 {
1306 	bool ret = false;
1307 
1308 	spin_lock_bh(&fnhe_lock);
1309 
1310 	if (daddr == fnhe->fnhe_daddr) {
1311 		struct rtable __rcu **porig;
1312 		struct rtable *orig;
1313 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1314 
1315 		if (rt_is_input_route(rt))
1316 			porig = &fnhe->fnhe_rth_input;
1317 		else
1318 			porig = &fnhe->fnhe_rth_output;
1319 		orig = rcu_dereference(*porig);
1320 
1321 		if (fnhe->fnhe_genid != genid) {
1322 			fnhe->fnhe_genid = genid;
1323 			fnhe->fnhe_gw = 0;
1324 			fnhe->fnhe_pmtu = 0;
1325 			fnhe->fnhe_expires = 0;
1326 			fnhe_flush_routes(fnhe);
1327 			orig = NULL;
1328 		}
1329 		fill_route_from_fnhe(rt, fnhe);
1330 		if (!rt->rt_gateway)
1331 			rt->rt_gateway = daddr;
1332 
1333 		if (do_cache) {
1334 			dst_hold(&rt->dst);
1335 			rcu_assign_pointer(*porig, rt);
1336 			if (orig) {
1337 				dst_dev_put(&orig->dst);
1338 				dst_release(&orig->dst);
1339 			}
1340 			ret = true;
1341 		}
1342 
1343 		fnhe->fnhe_stamp = jiffies;
1344 	}
1345 	spin_unlock_bh(&fnhe_lock);
1346 
1347 	return ret;
1348 }
1349 
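/* Try to cache @rt in its FIB nexthop: input routes go into
 * nh_rth_input, output routes into the per-CPU nh_pcpu_rth_output slot.
 * cmpxchg() makes the swap race-free; if another CPU won the race, the
 * extra dst reference is dropped and false is returned so the caller
 * puts the route on the uncached list instead.
 */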
1350 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1351 {
1352 	struct rtable *orig, *prev, **p;
1353 	bool ret = true;
1354 
1355 	if (rt_is_input_route(rt)) {
1356 		p = (struct rtable **)&nh->nh_rth_input;
1357 	} else {
1358 		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1359 	}
1360 	orig = *p;
1361 
1362 	/* hold dst before doing cmpxchg() to avoid race condition
1363 	 * on this dst
1364 	 */
1365 	dst_hold(&rt->dst);
1366 	prev = cmpxchg(p, orig, rt);
1367 	if (prev == orig) {
1368 		if (orig) {
1369 			dst_dev_put(&orig->dst);
1370 			dst_release(&orig->dst);
1371 		}
1372 	} else {
1373 		dst_release(&rt->dst);
1374 		ret = false;
1375 	}
1376 
1377 	return ret;
1378 }
1379 
1380 struct uncached_list {
1381 	spinlock_t		lock;
1382 	struct list_head	head;
1383 };
1384 
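/* Routes that could not be cached in a FIB nexthop sit on this per-CPU
 * list so that rt_flush_dev() can still find them and repoint their
 * device reference at the loopback device when the underlying
 * netdevice goes away.
 */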
1385 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1386 
1387 static void rt_add_uncached_list(struct rtable *rt)
1388 {
1389 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1390 
1391 	rt->rt_uncached_list = ul;
1392 
1393 	spin_lock_bh(&ul->lock);
1394 	list_add_tail(&rt->rt_uncached, &ul->head);
1395 	spin_unlock_bh(&ul->lock);
1396 }
1397 
1398 static void ipv4_dst_destroy(struct dst_entry *dst)
1399 {
1400 	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1401 	struct rtable *rt = (struct rtable *) dst;
1402 
1403 	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1404 		kfree(p);
1405 
1406 	if (!list_empty(&rt->rt_uncached)) {
1407 		struct uncached_list *ul = rt->rt_uncached_list;
1408 
1409 		spin_lock_bh(&ul->lock);
1410 		list_del(&rt->rt_uncached);
1411 		spin_unlock_bh(&ul->lock);
1412 	}
1413 }
1414 
1415 void rt_flush_dev(struct net_device *dev)
1416 {
1417 	struct net *net = dev_net(dev);
1418 	struct rtable *rt;
1419 	int cpu;
1420 
1421 	for_each_possible_cpu(cpu) {
1422 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1423 
1424 		spin_lock_bh(&ul->lock);
1425 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1426 			if (rt->dst.dev != dev)
1427 				continue;
1428 			rt->dst.dev = net->loopback_dev;
1429 			dev_hold(rt->dst.dev);
1430 			dev_put(dev);
1431 		}
1432 		spin_unlock_bh(&ul->lock);
1433 	}
1434 }
1435 
1436 static bool rt_cache_valid(const struct rtable *rt)
1437 {
1438 	return	rt &&
1439 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1440 		!rt_is_expired(rt);
1441 }
1442 
1443 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1444 			   const struct fib_result *res,
1445 			   struct fib_nh_exception *fnhe,
1446 			   struct fib_info *fi, u16 type, u32 itag,
1447 			   const bool do_cache)
1448 {
1449 	bool cached = false;
1450 
1451 	if (fi) {
1452 		struct fib_nh *nh = &FIB_RES_NH(*res);
1453 
1454 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1455 			rt->rt_gateway = nh->nh_gw;
1456 			rt->rt_uses_gateway = 1;
1457 		}
1458 		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1459 		if (fi->fib_metrics != &dst_default_metrics) {
1460 			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1461 			refcount_inc(&fi->fib_metrics->refcnt);
1462 		}
1463 #ifdef CONFIG_IP_ROUTE_CLASSID
1464 		rt->dst.tclassid = nh->nh_tclassid;
1465 #endif
1466 		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1467 		if (unlikely(fnhe))
1468 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1469 		else if (do_cache)
1470 			cached = rt_cache_route(nh, rt);
1471 		if (unlikely(!cached)) {
1472 			/* Routes we intend to cache in a nexthop exception or
1473 			 * FIB nexthop have the DST_NOCACHE bit clear.
1474 			 * However, if we are unsuccessful at storing this
1475 			 * route into the cache we really need to set it.
1476 			 */
1477 			if (!rt->rt_gateway)
1478 				rt->rt_gateway = daddr;
1479 			rt_add_uncached_list(rt);
1480 		}
1481 	} else
1482 		rt_add_uncached_list(rt);
1483 
1484 #ifdef CONFIG_IP_ROUTE_CLASSID
1485 #ifdef CONFIG_IP_MULTIPLE_TABLES
1486 	set_class_tag(rt, res->tclassid);
1487 #endif
1488 	set_class_tag(rt, itag);
1489 #endif
1490 }
1491 
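/* Allocate and minimally initialise a struct rtable. DST_HOST is set
 * only when the route will not be cached in a nexthop; the nopolicy
 * and noxfrm arguments translate into DST_NOPOLICY and DST_NOXFRM on
 * the new dst.
 */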
1492 struct rtable *rt_dst_alloc(struct net_device *dev,
1493 			    unsigned int flags, u16 type,
1494 			    bool nopolicy, bool noxfrm, bool will_cache)
1495 {
1496 	struct rtable *rt;
1497 
1498 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1499 		       (will_cache ? 0 : DST_HOST) |
1500 		       (nopolicy ? DST_NOPOLICY : 0) |
1501 		       (noxfrm ? DST_NOXFRM : 0));
1502 
1503 	if (rt) {
1504 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1505 		rt->rt_flags = flags;
1506 		rt->rt_type = type;
1507 		rt->rt_is_input = 0;
1508 		rt->rt_iif = 0;
1509 		rt->rt_pmtu = 0;
1510 		rt->rt_gateway = 0;
1511 		rt->rt_uses_gateway = 0;
1512 		rt->rt_table_id = 0;
1513 		INIT_LIST_HEAD(&rt->rt_uncached);
1514 
1515 		rt->dst.output = ip_output;
1516 		if (flags & RTCF_LOCAL)
1517 			rt->dst.input = ip_local_deliver;
1518 	}
1519 
1520 	return rt;
1521 }
1522 EXPORT_SYMBOL(rt_dst_alloc);
1523 
1524 /* called in rcu_read_lock() section */
1525 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1526 			  u8 tos, struct net_device *dev,
1527 			  struct in_device *in_dev, u32 *itag)
1528 {
1529 	int err;
1530 
1531 	/* Primary sanity checks. */
1532 	if (!in_dev)
1533 		return -EINVAL;
1534 
1535 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1536 	    skb->protocol != htons(ETH_P_IP))
1537 		return -EINVAL;
1538 
1539 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1540 		return -EINVAL;
1541 
1542 	if (ipv4_is_zeronet(saddr)) {
1543 		if (!ipv4_is_local_multicast(daddr))
1544 			return -EINVAL;
1545 	} else {
1546 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1547 					  in_dev, itag);
1548 		if (err < 0)
1549 			return err;
1550 	}
1551 	return 0;
1552 }
1553 
1554 /* called in rcu_read_lock() section */
1555 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1556 			     u8 tos, struct net_device *dev, int our)
1557 {
1558 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1559 	unsigned int flags = RTCF_MULTICAST;
1560 	struct rtable *rth;
1561 	u32 itag = 0;
1562 	int err;
1563 
1564 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1565 	if (err)
1566 		return err;
1567 
1568 	if (our)
1569 		flags |= RTCF_LOCAL;
1570 
1571 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1572 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1573 	if (!rth)
1574 		return -ENOBUFS;
1575 
1576 #ifdef CONFIG_IP_ROUTE_CLASSID
1577 	rth->dst.tclassid = itag;
1578 #endif
1579 	rth->dst.output = ip_rt_bug;
1580 	rth->rt_is_input= 1;
1581 
1582 #ifdef CONFIG_IP_MROUTE
1583 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1584 		rth->dst.input = ip_mr_input;
1585 #endif
1586 	RT_CACHE_STAT_INC(in_slow_mc);
1587 
1588 	skb_dst_set(skb, &rth->dst);
1589 	return 0;
1590 }
1591 
1592 
1593 static void ip_handle_martian_source(struct net_device *dev,
1594 				     struct in_device *in_dev,
1595 				     struct sk_buff *skb,
1596 				     __be32 daddr,
1597 				     __be32 saddr)
1598 {
1599 	RT_CACHE_STAT_INC(in_martian_src);
1600 #ifdef CONFIG_IP_ROUTE_VERBOSE
1601 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1602 		/*
1603 		 *	RFC1812 recommendation: if the source is martian,
1604 		 *	the only hint is the MAC header.
1605 		 */
1606 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1607 			&daddr, &saddr, dev->name);
1608 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1609 			print_hex_dump(KERN_WARNING, "ll header: ",
1610 				       DUMP_PREFIX_OFFSET, 16, 1,
1611 				       skb_mac_header(skb),
1612 				       dev->hard_header_len, true);
1613 		}
1614 	}
1615 #endif
1616 }
1617 
1618 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1619 {
1620 	struct fnhe_hash_bucket *hash;
1621 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1622 	u32 hval = fnhe_hashfun(daddr);
1623 
1624 	spin_lock_bh(&fnhe_lock);
1625 
1626 	hash = rcu_dereference_protected(nh->nh_exceptions,
1627 					 lockdep_is_held(&fnhe_lock));
1628 	hash += hval;
1629 
1630 	fnhe_p = &hash->chain;
1631 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1632 	while (fnhe) {
1633 		if (fnhe->fnhe_daddr == daddr) {
1634 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1635 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1636 			fnhe_flush_routes(fnhe);
1637 			kfree_rcu(fnhe, rcu);
1638 			break;
1639 		}
1640 		fnhe_p = &fnhe->fnhe_next;
1641 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1642 						 lockdep_is_held(&fnhe_lock));
1643 	}
1644 
1645 	spin_unlock_bh(&fnhe_lock);
1646 }
1647 
1648 /* called in rcu_read_lock() section */
1649 static int __mkroute_input(struct sk_buff *skb,
1650 			   const struct fib_result *res,
1651 			   struct in_device *in_dev,
1652 			   __be32 daddr, __be32 saddr, u32 tos)
1653 {
1654 	struct fib_nh_exception *fnhe;
1655 	struct rtable *rth;
1656 	int err;
1657 	struct in_device *out_dev;
1658 	bool do_cache;
1659 	u32 itag = 0;
1660 
1661 	/* get a working reference to the output device */
1662 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1663 	if (!out_dev) {
1664 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1665 		return -EINVAL;
1666 	}
1667 
1668 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1669 				  in_dev->dev, in_dev, &itag);
1670 	if (err < 0) {
1671 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1672 					 saddr);
1673 
1674 		goto cleanup;
1675 	}
1676 
1677 	do_cache = res->fi && !itag;
1678 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1679 	    skb->protocol == htons(ETH_P_IP) &&
1680 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1681 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1682 		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1683 
1684 	if (skb->protocol != htons(ETH_P_IP)) {
1685 		/* Not IP (i.e. ARP). Do not create a route if it is
1686 		 * invalid for proxy ARP. DNAT routes are always valid.
1687 		 *
1688 		 * The proxy ARP feature has been extended to allow ARP
1689 		 * replies back on the same interface, to support
1690 		 * private VLAN switch technologies. See arp.c.
1691 		 */
1692 		if (out_dev == in_dev &&
1693 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1694 			err = -EINVAL;
1695 			goto cleanup;
1696 		}
1697 	}
1698 
1699 	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1700 	if (do_cache) {
1701 		if (fnhe) {
1702 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1703 			if (rth && rth->dst.expires &&
1704 			    time_after(jiffies, rth->dst.expires)) {
1705 				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1706 				fnhe = NULL;
1707 			} else {
1708 				goto rt_cache;
1709 			}
1710 		}
1711 
1712 		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1713 
1714 rt_cache:
1715 		if (rt_cache_valid(rth)) {
1716 			skb_dst_set_noref(skb, &rth->dst);
1717 			goto out;
1718 		}
1719 	}
1720 
1721 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1722 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1723 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1724 	if (!rth) {
1725 		err = -ENOBUFS;
1726 		goto cleanup;
1727 	}
1728 
1729 	rth->rt_is_input = 1;
1730 	if (res->table)
1731 		rth->rt_table_id = res->table->tb_id;
1732 	RT_CACHE_STAT_INC(in_slow_tot);
1733 
1734 	rth->dst.input = ip_forward;
1735 
1736 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1737 		       do_cache);
1738 	lwtunnel_set_redirect(&rth->dst);
1739 	skb_dst_set(skb, &rth->dst);
1740 out:
1741 	err = 0;
1742  cleanup:
1743 	return err;
1744 }
1745 
1746 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1747 /* To make ICMP packets follow the right flow, the multipath hash is
1748  * calculated from the inner IP addresses.
1749  */
1750 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1751 				 struct flow_keys *hash_keys)
1752 {
1753 	const struct iphdr *outer_iph = ip_hdr(skb);
1754 	const struct iphdr *inner_iph;
1755 	const struct icmphdr *icmph;
1756 	struct iphdr _inner_iph;
1757 	struct icmphdr _icmph;
1758 
1759 	hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1760 	hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1761 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1762 		return;
1763 
1764 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1765 		return;
1766 
1767 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1768 				   &_icmph);
1769 	if (!icmph)
1770 		return;
1771 
1772 	if (icmph->type != ICMP_DEST_UNREACH &&
1773 	    icmph->type != ICMP_REDIRECT &&
1774 	    icmph->type != ICMP_TIME_EXCEEDED &&
1775 	    icmph->type != ICMP_PARAMETERPROB)
1776 		return;
1777 
1778 	inner_iph = skb_header_pointer(skb,
1779 				       outer_iph->ihl * 4 + sizeof(_icmph),
1780 				       sizeof(_inner_iph), &_inner_iph);
1781 	if (!inner_iph)
1782 		return;
1783 	hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1784 	hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1785 }
1786 
1787 /* if skb is set it will be used and fl4 can be NULL */
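/* Policy 0 hashes the layer-3 addresses only (using the inner header
 * for ICMP errors, as above); policy 1 hashes the five-tuple, taken
 * from the flow dissector when forwarding an skb or from @fl4 for
 * locally generated traffic. The top bit of the hash is dropped so the
 * value stays within the 31-bit range used by fib_select_multipath().
 */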
1788 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1789 		       const struct sk_buff *skb)
1790 {
1791 	struct net *net = fi->fib_net;
1792 	struct flow_keys hash_keys;
1793 	u32 mhash;
1794 
1795 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1796 	case 0:
1797 		memset(&hash_keys, 0, sizeof(hash_keys));
1798 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1799 		if (skb) {
1800 			ip_multipath_l3_keys(skb, &hash_keys);
1801 		} else {
1802 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1803 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1804 		}
1805 		break;
1806 	case 1:
1807 		/* skb is currently provided only when forwarding */
1808 		if (skb) {
1809 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1810 			struct flow_keys keys;
1811 
1812 			/* short-circuit if we already have L4 hash present */
1813 			if (skb->l4_hash)
1814 				return skb_get_hash_raw(skb) >> 1;
1815 			memset(&hash_keys, 0, sizeof(hash_keys));
1816 			skb_flow_dissect_flow_keys(skb, &keys, flag);
1817 			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1818 			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1819 			hash_keys.ports.src = keys.ports.src;
1820 			hash_keys.ports.dst = keys.ports.dst;
1821 			hash_keys.basic.ip_proto = keys.basic.ip_proto;
1822 		} else {
1823 			memset(&hash_keys, 0, sizeof(hash_keys));
1824 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1825 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1826 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1827 			hash_keys.ports.src = fl4->fl4_sport;
1828 			hash_keys.ports.dst = fl4->fl4_dport;
1829 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1830 		}
1831 		break;
1832 	}
1833 	mhash = flow_hash_from_keys(&hash_keys);
1834 
1835 	return mhash >> 1;
1836 }
1837 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1838 
1839 static int ip_mkroute_input(struct sk_buff *skb,
1840 			    struct fib_result *res,
1841 			    struct in_device *in_dev,
1842 			    __be32 daddr, __be32 saddr, u32 tos)
1843 {
1844 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1845 	if (res->fi && res->fi->fib_nhs > 1) {
1846 		int h = fib_multipath_hash(res->fi, NULL, skb);
1847 
1848 		fib_select_multipath(res, h);
1849 	}
1850 #endif
1851 
1852 	/* create a routing cache entry */
1853 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1854 }
1855 
1856 /*
1857  *	NOTE. We drop all packets that have a local source
1858  *	address, because every properly looped-back packet
1859  *	must already have the correct destination attached by the output routine.
1860  *
1861  *	This approach solves two big problems:
1862  *	1. Non-simplex devices are handled properly.
1863  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1864  *	Called with rcu_read_lock().
1865  */
1866 
1867 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1868 			       u8 tos, struct net_device *dev,
1869 			       struct fib_result *res)
1870 {
1871 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1872 	struct ip_tunnel_info *tun_info;
1873 	struct flowi4	fl4;
1874 	unsigned int	flags = 0;
1875 	u32		itag = 0;
1876 	struct rtable	*rth;
1877 	int		err = -EINVAL;
1878 	struct net    *net = dev_net(dev);
1879 	bool do_cache;
1880 
1881 	/* IP on this device is disabled. */
1882 
1883 	if (!in_dev)
1884 		goto out;
1885 
1886 	/* Check for the weirdest martians, which cannot be detected
1887 	   by fib_lookup.
1888 	 */
1889 
1890 	tun_info = skb_tunnel_info(skb);
1891 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1892 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1893 	else
1894 		fl4.flowi4_tun_key.tun_id = 0;
1895 	skb_dst_drop(skb);
1896 
1897 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1898 		goto martian_source;
1899 
1900 	res->fi = NULL;
1901 	res->table = NULL;
1902 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1903 		goto brd_input;
1904 
1905 	/* Accept zero addresses only for the limited broadcast address;
1906 	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
1907 	 */
1908 	if (ipv4_is_zeronet(saddr))
1909 		goto martian_source;
1910 
1911 	if (ipv4_is_zeronet(daddr))
1912 		goto martian_destination;
1913 
1914 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1915 	 * and calls it at most once when daddr and/or saddr are loopback addresses
1916 	 */
1917 	if (ipv4_is_loopback(daddr)) {
1918 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1919 			goto martian_destination;
1920 	} else if (ipv4_is_loopback(saddr)) {
1921 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1922 			goto martian_source;
1923 	}
1924 
1925 	/*
1926 	 *	Now we are ready to route the packet.
1927 	 */
1928 	fl4.flowi4_oif = 0;
1929 	fl4.flowi4_iif = dev->ifindex;
1930 	fl4.flowi4_mark = skb->mark;
1931 	fl4.flowi4_tos = tos;
1932 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1933 	fl4.flowi4_flags = 0;
1934 	fl4.daddr = daddr;
1935 	fl4.saddr = saddr;
1936 	fl4.flowi4_uid = sock_net_uid(net, NULL);
1937 	err = fib_lookup(net, &fl4, res, 0);
1938 	if (err != 0) {
1939 		if (!IN_DEV_FORWARD(in_dev))
1940 			err = -EHOSTUNREACH;
1941 		goto no_route;
1942 	}
1943 
1944 	if (res->type == RTN_BROADCAST)
1945 		goto brd_input;
1946 
1947 	if (res->type == RTN_LOCAL) {
1948 		err = fib_validate_source(skb, saddr, daddr, tos,
1949 					  0, dev, in_dev, &itag);
1950 		if (err < 0)
1951 			goto martian_source;
1952 		goto local_input;
1953 	}
1954 
1955 	if (!IN_DEV_FORWARD(in_dev)) {
1956 		err = -EHOSTUNREACH;
1957 		goto no_route;
1958 	}
1959 	if (res->type != RTN_UNICAST)
1960 		goto martian_destination;
1961 
1962 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1963 out:	return err;
1964 
1965 brd_input:
1966 	if (skb->protocol != htons(ETH_P_IP))
1967 		goto e_inval;
1968 
1969 	if (!ipv4_is_zeronet(saddr)) {
1970 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1971 					  in_dev, &itag);
1972 		if (err < 0)
1973 			goto martian_source;
1974 	}
1975 	flags |= RTCF_BROADCAST;
1976 	res->type = RTN_BROADCAST;
1977 	RT_CACHE_STAT_INC(in_brd);
1978 
1979 local_input:
1980 	do_cache = false;
1981 	if (res->fi) {
1982 		if (!itag) {
1983 			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1984 			if (rt_cache_valid(rth)) {
1985 				skb_dst_set_noref(skb, &rth->dst);
1986 				err = 0;
1987 				goto out;
1988 			}
1989 			do_cache = true;
1990 		}
1991 	}
1992 
1993 	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
1994 			   flags | RTCF_LOCAL, res->type,
1995 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1996 	if (!rth)
1997 		goto e_nobufs;
1998 
1999 	rth->dst.output = ip_rt_bug;
2000 #ifdef CONFIG_IP_ROUTE_CLASSID
2001 	rth->dst.tclassid = itag;
2002 #endif
2003 	rth->rt_is_input = 1;
2004 	if (res->table)
2005 		rth->rt_table_id = res->table->tb_id;
2006 
2007 	RT_CACHE_STAT_INC(in_slow_tot);
2008 	if (res->type == RTN_UNREACHABLE) {
2009 		rth->dst.input = ip_error;
2010 		rth->dst.error = -err;
2011 		rth->rt_flags &= ~RTCF_LOCAL;
2012 	}
2013 
2014 	if (do_cache) {
2015 		struct fib_nh *nh = &FIB_RES_NH(*res);
2016 
2017 		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2018 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2019 			WARN_ON(rth->dst.input == lwtunnel_input);
2020 			rth->dst.lwtstate->orig_input = rth->dst.input;
2021 			rth->dst.input = lwtunnel_input;
2022 		}
2023 
2024 		if (unlikely(!rt_cache_route(nh, rth)))
2025 			rt_add_uncached_list(rth);
2026 	}
2027 	skb_dst_set(skb, &rth->dst);
2028 	err = 0;
2029 	goto out;
2030 
2031 no_route:
2032 	RT_CACHE_STAT_INC(in_no_route);
2033 	res->type = RTN_UNREACHABLE;
2034 	res->fi = NULL;
2035 	res->table = NULL;
2036 	goto local_input;
2037 
2038 	/*
2039 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2040 	 */
2041 martian_destination:
2042 	RT_CACHE_STAT_INC(in_martian_dst);
2043 #ifdef CONFIG_IP_ROUTE_VERBOSE
2044 	if (IN_DEV_LOG_MARTIANS(in_dev))
2045 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2046 				     &daddr, &saddr, dev->name);
2047 #endif
2048 
2049 e_inval:
2050 	err = -EINVAL;
2051 	goto out;
2052 
2053 e_nobufs:
2054 	err = -ENOBUFS;
2055 	goto out;
2056 
2057 martian_source:
2058 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2059 	goto out;
2060 }
2061 
2062 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2063 			 u8 tos, struct net_device *dev)
2064 {
2065 	struct fib_result res;
2066 	int err;
2067 
2068 	tos &= IPTOS_RT_MASK;
2069 	rcu_read_lock();
2070 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2071 	rcu_read_unlock();
2072 
2073 	return err;
2074 }
2075 EXPORT_SYMBOL(ip_route_input_noref);
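/* A short usage sketch, loosely modelled on what the receive path
 * (ip_rcv_finish()) does with this helper: route the packet, then let
 * dst_input() dispatch it.  The function name and the minimal error
 * handling are illustrative only.
 */
static inline int example_route_and_deliver(struct sk_buff *skb,
					    struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
				   iph->tos, dev);
	if (err)
		return err;

	/* skb now carries an input route; dst_input() calls rth->dst.input,
	 * typically ip_local_deliver() or ip_forward().
	 */
	return dst_input(skb);
}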
2076 
2077 /* called with rcu_read_lock held */
2078 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2079 		       u8 tos, struct net_device *dev, struct fib_result *res)
2080 {
2081 	/* Multicast recognition logic was moved from the route cache to here.
2082 	   The problem was that too many Ethernet cards have broken/missing
2083 	   hardware multicast filters :-( As a result, a host on a multicast
2084 	   network acquires a lot of useless route cache entries, e.g. for
2085 	   SDR messages from all over the world. Now we try to get rid of them.
2086 	   Provided the software IP multicast filter is organized reasonably
2087 	   (at least hashed), this does not cause a slowdown compared with
2088 	   route cache reject entries.
2089 	   Note that multicast routers are not affected, because a route
2090 	   cache entry is created eventually.
2091 	 */
2092 	if (ipv4_is_multicast(daddr)) {
2093 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2094 		int our = 0;
2095 		int err = -EINVAL;
2096 
2097 		if (in_dev)
2098 			our = ip_check_mc_rcu(in_dev, daddr, saddr,
2099 					      ip_hdr(skb)->protocol);
2100 
2101 		/* check l3 master if no match yet */
2102 		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2103 			struct in_device *l3_in_dev;
2104 
2105 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2106 			if (l3_in_dev)
2107 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2108 						      ip_hdr(skb)->protocol);
2109 		}
2110 
2111 		if (our
2112 #ifdef CONFIG_IP_MROUTE
2113 			||
2114 		    (!ipv4_is_local_multicast(daddr) &&
2115 		     IN_DEV_MFORWARD(in_dev))
2116 #endif
2117 		   ) {
2118 			err = ip_route_input_mc(skb, daddr, saddr,
2119 						tos, dev, our);
2120 		}
2121 		return err;
2122 	}
2123 
2124 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2125 }
2126 
2127 /* called with rcu_read_lock() */
2128 static struct rtable *__mkroute_output(const struct fib_result *res,
2129 				       const struct flowi4 *fl4, int orig_oif,
2130 				       struct net_device *dev_out,
2131 				       unsigned int flags)
2132 {
2133 	struct fib_info *fi = res->fi;
2134 	struct fib_nh_exception *fnhe;
2135 	struct in_device *in_dev;
2136 	u16 type = res->type;
2137 	struct rtable *rth;
2138 	bool do_cache;
2139 
2140 	in_dev = __in_dev_get_rcu(dev_out);
2141 	if (!in_dev)
2142 		return ERR_PTR(-EINVAL);
2143 
2144 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2145 		if (ipv4_is_loopback(fl4->saddr) &&
2146 		    !(dev_out->flags & IFF_LOOPBACK) &&
2147 		    !netif_is_l3_master(dev_out))
2148 			return ERR_PTR(-EINVAL);
2149 
2150 	if (ipv4_is_lbcast(fl4->daddr))
2151 		type = RTN_BROADCAST;
2152 	else if (ipv4_is_multicast(fl4->daddr))
2153 		type = RTN_MULTICAST;
2154 	else if (ipv4_is_zeronet(fl4->daddr))
2155 		return ERR_PTR(-EINVAL);
2156 
2157 	if (dev_out->flags & IFF_LOOPBACK)
2158 		flags |= RTCF_LOCAL;
2159 
2160 	do_cache = true;
2161 	if (type == RTN_BROADCAST) {
2162 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2163 		fi = NULL;
2164 	} else if (type == RTN_MULTICAST) {
2165 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2166 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2167 				     fl4->flowi4_proto))
2168 			flags &= ~RTCF_LOCAL;
2169 		else
2170 			do_cache = false;
2171 		/* If a multicast route does not exist, use
2172 		 * the default one, but do not use a gateway in this case.
2173 		 * Yes, it is a hack.
2174 		 */
2175 		if (fi && res->prefixlen < 4)
2176 			fi = NULL;
2177 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2178 		   (orig_oif != dev_out->ifindex)) {
2179 		/* For local routes that require a particular output interface
2180 		 * we do not want to cache the result.  Caching the result
2181 		 * causes incorrect behaviour when there are multiple source
2182 		 * addresses on the interface; the end result is that if the
2183 		 * intended recipient is waiting on that interface for the
2184 		 * packet, it won't be received, because it will be delivered on
2185 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2186 		 * be set to the loopback interface as well.
2187 		 */
2188 		fi = NULL;
2189 	}
2190 
2191 	fnhe = NULL;
2192 	do_cache &= fi != NULL;
2193 	if (do_cache) {
2194 		struct rtable __rcu **prth;
2195 		struct fib_nh *nh = &FIB_RES_NH(*res);
2196 
2197 		fnhe = find_exception(nh, fl4->daddr);
2198 		if (fnhe) {
2199 			prth = &fnhe->fnhe_rth_output;
2200 			rth = rcu_dereference(*prth);
2201 			if (rth && rth->dst.expires &&
2202 			    time_after(jiffies, rth->dst.expires)) {
2203 				ip_del_fnhe(nh, fl4->daddr);
2204 				fnhe = NULL;
2205 			} else {
2206 				goto rt_cache;
2207 			}
2208 		}
2209 
2210 		if (unlikely(fl4->flowi4_flags &
2211 			     FLOWI_FLAG_KNOWN_NH &&
2212 			     !(nh->nh_gw &&
2213 			       nh->nh_scope == RT_SCOPE_LINK))) {
2214 			do_cache = false;
2215 			goto add;
2216 		}
2217 		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2218 		rth = rcu_dereference(*prth);
2219 
2220 rt_cache:
2221 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2222 			return rth;
2223 	}
2224 
2225 add:
2226 	rth = rt_dst_alloc(dev_out, flags, type,
2227 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2228 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2229 			   do_cache);
2230 	if (!rth)
2231 		return ERR_PTR(-ENOBUFS);
2232 
2233 	rth->rt_iif = orig_oif;
2234 	if (res->table)
2235 		rth->rt_table_id = res->table->tb_id;
2236 
2237 	RT_CACHE_STAT_INC(out_slow_tot);
2238 
2239 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2240 		if (flags & RTCF_LOCAL &&
2241 		    !(dev_out->flags & IFF_LOOPBACK)) {
2242 			rth->dst.output = ip_mc_output;
2243 			RT_CACHE_STAT_INC(out_slow_mc);
2244 		}
2245 #ifdef CONFIG_IP_MROUTE
2246 		if (type == RTN_MULTICAST) {
2247 			if (IN_DEV_MFORWARD(in_dev) &&
2248 			    !ipv4_is_local_multicast(fl4->daddr)) {
2249 				rth->dst.input = ip_mr_input;
2250 				rth->dst.output = ip_mc_output;
2251 			}
2252 		}
2253 #endif
2254 	}
2255 
2256 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2257 	lwtunnel_set_redirect(&rth->dst);
2258 
2259 	return rth;
2260 }
2261 
2262 /*
2263  * Major route resolver routine.
2264  */
2265 
2266 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2267 					const struct sk_buff *skb)
2268 {
2269 	__u8 tos = RT_FL_TOS(fl4);
2270 	struct fib_result res;
2271 	struct rtable *rth;
2272 
2273 	res.tclassid	= 0;
2274 	res.fi		= NULL;
2275 	res.table	= NULL;
2276 
2277 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2278 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2279 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2280 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2281 
2282 	rcu_read_lock();
2283 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2284 	rcu_read_unlock();
2285 
2286 	return rth;
2287 }
2288 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2289 
2290 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2291 					    struct fib_result *res,
2292 					    const struct sk_buff *skb)
2293 {
2294 	struct net_device *dev_out = NULL;
2295 	int orig_oif = fl4->flowi4_oif;
2296 	unsigned int flags = 0;
2297 	struct rtable *rth;
2298 	int err = -ENETUNREACH;
2299 
2300 	if (fl4->saddr) {
2301 		rth = ERR_PTR(-EINVAL);
2302 		if (ipv4_is_multicast(fl4->saddr) ||
2303 		    ipv4_is_lbcast(fl4->saddr) ||
2304 		    ipv4_is_zeronet(fl4->saddr))
2305 			goto out;
2306 
2307 		/* I removed the check for oif == dev_out->oif here.
2308 		   It was wrong for two reasons:
2309 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2310 		      is assigned to multiple interfaces.
2311 		   2. Moreover, we are allowed to send packets with the saddr
2312 		      of another iface. --ANK
2313 		 */
2314 
2315 		if (fl4->flowi4_oif == 0 &&
2316 		    (ipv4_is_multicast(fl4->daddr) ||
2317 		     ipv4_is_lbcast(fl4->daddr))) {
2318 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2319 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2320 			if (!dev_out)
2321 				goto out;
2322 
2323 			/* Special hack: the user can direct multicasts
2324 			   and limited broadcasts via the desired interface
2325 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2326 			   This hack is not just for fun, it allows
2327 			   vic, vat and friends to work.
2328 			   They bind the socket to loopback, set the ttl to zero
2329 			   and expect that it will work.
2330 			   From the viewpoint of the routing cache they are broken,
2331 			   because we are not allowed to build a multicast path
2332 			   with a loopback source addr (look, the routing cache
2333 			   cannot know that the ttl is zero, so the packet
2334 			   will not leave this host and the route is valid).
2335 			   Luckily, this hack is a good workaround.
2336 			 */
2337 
2338 			fl4->flowi4_oif = dev_out->ifindex;
2339 			goto make_route;
2340 		}
2341 
2342 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2343 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2344 			if (!__ip_dev_find(net, fl4->saddr, false))
2345 				goto out;
2346 		}
2347 	}
2348 
2349 
2350 	if (fl4->flowi4_oif) {
2351 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2352 		rth = ERR_PTR(-ENODEV);
2353 		if (!dev_out)
2354 			goto out;
2355 
2356 		/* RACE: Check return value of inet_select_addr instead. */
2357 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2358 			rth = ERR_PTR(-ENETUNREACH);
2359 			goto out;
2360 		}
2361 		if (ipv4_is_local_multicast(fl4->daddr) ||
2362 		    ipv4_is_lbcast(fl4->daddr) ||
2363 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2364 			if (!fl4->saddr)
2365 				fl4->saddr = inet_select_addr(dev_out, 0,
2366 							      RT_SCOPE_LINK);
2367 			goto make_route;
2368 		}
2369 		if (!fl4->saddr) {
2370 			if (ipv4_is_multicast(fl4->daddr))
2371 				fl4->saddr = inet_select_addr(dev_out, 0,
2372 							      fl4->flowi4_scope);
2373 			else if (!fl4->daddr)
2374 				fl4->saddr = inet_select_addr(dev_out, 0,
2375 							      RT_SCOPE_HOST);
2376 		}
2377 	}
2378 
2379 	if (!fl4->daddr) {
2380 		fl4->daddr = fl4->saddr;
2381 		if (!fl4->daddr)
2382 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2383 		dev_out = net->loopback_dev;
2384 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2385 		res->type = RTN_LOCAL;
2386 		flags |= RTCF_LOCAL;
2387 		goto make_route;
2388 	}
2389 
2390 	err = fib_lookup(net, fl4, res, 0);
2391 	if (err) {
2392 		res->fi = NULL;
2393 		res->table = NULL;
2394 		if (fl4->flowi4_oif &&
2395 		    (ipv4_is_multicast(fl4->daddr) ||
2396 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2397 			/* Apparently, the routing tables are wrong. Assume
2398 			   that the destination is on-link.
2399
2400 			   WHY? DW.
2401 			   Because we are allowed to send to an iface
2402 			   even if it has NO routes and NO assigned
2403 			   addresses. When oif is specified, the routing
2404 			   tables are looked up with only one purpose:
2405 			   to check whether the destination is gatewayed
2406 			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2407 			   we send the packet, ignoring both the routing tables
2408 			   and the ifaddr state. --ANK
2409
2410
2411 			   We could do this even when oif is unknown,
2412 			   as IPv6 likely does, but we do not.
2413 			 */
2414 
2415 			if (fl4->saddr == 0)
2416 				fl4->saddr = inet_select_addr(dev_out, 0,
2417 							      RT_SCOPE_LINK);
2418 			res->type = RTN_UNICAST;
2419 			goto make_route;
2420 		}
2421 		rth = ERR_PTR(err);
2422 		goto out;
2423 	}
2424 
2425 	if (res->type == RTN_LOCAL) {
2426 		if (!fl4->saddr) {
2427 			if (res->fi->fib_prefsrc)
2428 				fl4->saddr = res->fi->fib_prefsrc;
2429 			else
2430 				fl4->saddr = fl4->daddr;
2431 		}
2432 
2433 		/* L3 master device is the loopback for that domain */
2434 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2435 			net->loopback_dev;
2436 
2437 		/* make sure orig_oif points to the fib result device even
2438 		 * though packet rx/tx happens over loopback or an l3mdev
2439 		 */
2440 		orig_oif = FIB_RES_OIF(*res);
2441 
2442 		fl4->flowi4_oif = dev_out->ifindex;
2443 		flags |= RTCF_LOCAL;
2444 		goto make_route;
2445 	}
2446 
2447 	fib_select_path(net, res, fl4, skb);
2448 
2449 	dev_out = FIB_RES_DEV(*res);
2450 	fl4->flowi4_oif = dev_out->ifindex;
2451 
2452 
2453 make_route:
2454 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2455 
2456 out:
2457 	return rth;
2458 }
2459 
2460 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2461 {
2462 	return NULL;
2463 }
2464 
2465 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2466 {
2467 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2468 
2469 	return mtu ? : dst->dev->mtu;
2470 }
2471 
2472 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2473 					  struct sk_buff *skb, u32 mtu)
2474 {
2475 }
2476 
2477 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2478 				       struct sk_buff *skb)
2479 {
2480 }
2481 
2482 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2483 					  unsigned long old)
2484 {
2485 	return NULL;
2486 }
2487 
2488 static struct dst_ops ipv4_dst_blackhole_ops = {
2489 	.family			=	AF_INET,
2490 	.check			=	ipv4_blackhole_dst_check,
2491 	.mtu			=	ipv4_blackhole_mtu,
2492 	.default_advmss		=	ipv4_default_advmss,
2493 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2494 	.redirect		=	ipv4_rt_blackhole_redirect,
2495 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2496 	.neigh_lookup		=	ipv4_neigh_lookup,
2497 };
2498 
2499 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2500 {
2501 	struct rtable *ort = (struct rtable *) dst_orig;
2502 	struct rtable *rt;
2503 
2504 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2505 	if (rt) {
2506 		struct dst_entry *new = &rt->dst;
2507 
2508 		new->__use = 1;
2509 		new->input = dst_discard;
2510 		new->output = dst_discard_out;
2511 
2512 		new->dev = net->loopback_dev;
2513 		if (new->dev)
2514 			dev_hold(new->dev);
2515 
2516 		rt->rt_is_input = ort->rt_is_input;
2517 		rt->rt_iif = ort->rt_iif;
2518 		rt->rt_pmtu = ort->rt_pmtu;
2519 
2520 		rt->rt_genid = rt_genid_ipv4(net);
2521 		rt->rt_flags = ort->rt_flags;
2522 		rt->rt_type = ort->rt_type;
2523 		rt->rt_gateway = ort->rt_gateway;
2524 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2525 
2526 		INIT_LIST_HEAD(&rt->rt_uncached);
2527 	}
2528 
2529 	dst_release(dst_orig);
2530 
2531 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2532 }
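/* The route built above silently discards anything sent through it
 * (dst_discard/dst_discard_out), never reports itself as valid
 * (ipv4_blackhole_dst_check() returns NULL) and ignores PMTU and redirect
 * updates.  It is handed out in place of a real route when traffic must be
 * dropped while keeping a usable dst, e.g. by the xfrm code while policy
 * resolution is pending.
 */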
2533 
2534 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2535 				    const struct sock *sk)
2536 {
2537 	struct rtable *rt = __ip_route_output_key(net, flp4);
2538 
2539 	if (IS_ERR(rt))
2540 		return rt;
2541 
2542 	if (flp4->flowi4_proto)
2543 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2544 							flowi4_to_flowi(flp4),
2545 							sk, 0);
2546 
2547 	return rt;
2548 }
2549 EXPORT_SYMBOL_GPL(ip_route_output_flow);
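/* A minimal output-lookup sketch, assuming the usual flowi4 conventions:
 * fill in the destination (and optionally source, oif and protocol), call
 * the resolver, then release the route with ip_rt_put() when done.  The
 * function name and the chosen protocol are placeholders for this example.
 */
static inline int example_output_lookup(struct net *net, __be32 daddr)
{
	struct rtable *rt;
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.flowi4_proto = IPPROTO_UDP;

	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* fl4.saddr now holds the source address selected for this route */
	ip_rt_put(rt);
	return 0;
}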
2550 
2551 /* called with rcu_read_lock held */
2552 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2553 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2554 			u32 seq)
2555 {
2556 	struct rtable *rt = skb_rtable(skb);
2557 	struct rtmsg *r;
2558 	struct nlmsghdr *nlh;
2559 	unsigned long expires = 0;
2560 	u32 error;
2561 	u32 metrics[RTAX_MAX];
2562 
2563 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2564 	if (!nlh)
2565 		return -EMSGSIZE;
2566 
2567 	r = nlmsg_data(nlh);
2568 	r->rtm_family	 = AF_INET;
2569 	r->rtm_dst_len	= 32;
2570 	r->rtm_src_len	= 0;
2571 	r->rtm_tos	= fl4->flowi4_tos;
2572 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2573 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2574 		goto nla_put_failure;
2575 	r->rtm_type	= rt->rt_type;
2576 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2577 	r->rtm_protocol = RTPROT_UNSPEC;
2578 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2579 	if (rt->rt_flags & RTCF_NOTIFY)
2580 		r->rtm_flags |= RTM_F_NOTIFY;
2581 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2582 		r->rtm_flags |= RTCF_DOREDIRECT;
2583 
2584 	if (nla_put_in_addr(skb, RTA_DST, dst))
2585 		goto nla_put_failure;
2586 	if (src) {
2587 		r->rtm_src_len = 32;
2588 		if (nla_put_in_addr(skb, RTA_SRC, src))
2589 			goto nla_put_failure;
2590 	}
2591 	if (rt->dst.dev &&
2592 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2593 		goto nla_put_failure;
2594 #ifdef CONFIG_IP_ROUTE_CLASSID
2595 	if (rt->dst.tclassid &&
2596 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2597 		goto nla_put_failure;
2598 #endif
2599 	if (!rt_is_input_route(rt) &&
2600 	    fl4->saddr != src) {
2601 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2602 			goto nla_put_failure;
2603 	}
2604 	if (rt->rt_uses_gateway &&
2605 	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2606 		goto nla_put_failure;
2607 
2608 	expires = rt->dst.expires;
2609 	if (expires) {
2610 		unsigned long now = jiffies;
2611 
2612 		if (time_before(now, expires))
2613 			expires -= now;
2614 		else
2615 			expires = 0;
2616 	}
2617 
2618 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2619 	if (rt->rt_pmtu && expires)
2620 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2621 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2622 		goto nla_put_failure;
2623 
2624 	if (fl4->flowi4_mark &&
2625 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2626 		goto nla_put_failure;
2627 
2628 	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2629 	    nla_put_u32(skb, RTA_UID,
2630 			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2631 		goto nla_put_failure;
2632 
2633 	error = rt->dst.error;
2634 
2635 	if (rt_is_input_route(rt)) {
2636 #ifdef CONFIG_IP_MROUTE
2637 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2638 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2639 			int err = ipmr_get_route(net, skb,
2640 						 fl4->saddr, fl4->daddr,
2641 						 r, portid);
2642 
2643 			if (err <= 0) {
2644 				if (err == 0)
2645 					return 0;
2646 				goto nla_put_failure;
2647 			}
2648 		} else
2649 #endif
2650 			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2651 				goto nla_put_failure;
2652 	}
2653 
2654 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2655 		goto nla_put_failure;
2656 
2657 	nlmsg_end(skb, nlh);
2658 	return 0;
2659 
2660 nla_put_failure:
2661 	nlmsg_cancel(skb, nlh);
2662 	return -EMSGSIZE;
2663 }
2664 
2665 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2666 			     struct netlink_ext_ack *extack)
2667 {
2668 	struct net *net = sock_net(in_skb->sk);
2669 	struct rtmsg *rtm;
2670 	struct nlattr *tb[RTA_MAX+1];
2671 	struct fib_result res = {};
2672 	struct rtable *rt = NULL;
2673 	struct flowi4 fl4;
2674 	__be32 dst = 0;
2675 	__be32 src = 0;
2676 	u32 iif;
2677 	int err;
2678 	int mark;
2679 	struct sk_buff *skb;
2680 	u32 table_id = RT_TABLE_MAIN;
2681 	kuid_t uid;
2682 
2683 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2684 			  extack);
2685 	if (err < 0)
2686 		goto errout;
2687 
2688 	rtm = nlmsg_data(nlh);
2689 
2690 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2691 	if (!skb) {
2692 		err = -ENOBUFS;
2693 		goto errout;
2694 	}
2695 
2696 	/* Reserve room for dummy headers; this skb can pass
2697 	   through a good chunk of the routing engine.
2698 	 */
2699 	skb_reset_mac_header(skb);
2700 	skb_reset_network_header(skb);
2701 
2702 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2703 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2704 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2705 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2706 	if (tb[RTA_UID])
2707 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2708 	else
2709 		uid = (iif ? INVALID_UID : current_uid());
2710 
2711 	/* Bugfix: need to give ip_route_input enough of an IP header to
2712 	 * not gag.
2713 	 */
2714 	ip_hdr(skb)->protocol = IPPROTO_UDP;
2715 	ip_hdr(skb)->saddr = src;
2716 	ip_hdr(skb)->daddr = dst;
2717 
2718 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2719 
2720 	memset(&fl4, 0, sizeof(fl4));
2721 	fl4.daddr = dst;
2722 	fl4.saddr = src;
2723 	fl4.flowi4_tos = rtm->rtm_tos;
2724 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2725 	fl4.flowi4_mark = mark;
2726 	fl4.flowi4_uid = uid;
2727 
2728 	rcu_read_lock();
2729 
2730 	if (iif) {
2731 		struct net_device *dev;
2732 
2733 		dev = dev_get_by_index_rcu(net, iif);
2734 		if (!dev) {
2735 			err = -ENODEV;
2736 			goto errout_free;
2737 		}
2738 
2739 		skb->protocol	= htons(ETH_P_IP);
2740 		skb->dev	= dev;
2741 		skb->mark	= mark;
2742 		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2743 					 dev, &res);
2744 
2745 		rt = skb_rtable(skb);
2746 		if (err == 0 && rt->dst.error)
2747 			err = -rt->dst.error;
2748 	} else {
2749 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
2750 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2751 		err = 0;
2752 		if (IS_ERR(rt))
2753 			err = PTR_ERR(rt);
2754 		else
2755 			skb_dst_set(skb, &rt->dst);
2756 	}
2757 
2758 	if (err)
2759 		goto errout_free;
2760 
2761 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2762 		rt->rt_flags |= RTCF_NOTIFY;
2763 
2764 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2765 		table_id = rt->rt_table_id;
2766 
2767 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2768 		if (!res.fi) {
2769 			err = fib_props[res.type].error;
2770 			if (!err)
2771 				err = -EHOSTUNREACH;
2772 			goto errout_free;
2773 		}
2774 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2775 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2776 				    rt->rt_type, res.prefix, res.prefixlen,
2777 				    fl4.flowi4_tos, res.fi, 0);
2778 	} else {
2779 		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2780 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2781 	}
2782 	if (err < 0)
2783 		goto errout_free;
2784 
2785 	rcu_read_unlock();
2786 
2787 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2788 errout:
2789 	return err;
2790 
2791 errout_free:
2792 	rcu_read_unlock();
2793 	kfree_skb(skb);
2794 	goto errout;
2795 }
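/* inet_rtm_getroute() above services RTM_GETROUTE requests, i.e. what
 * "ip route get <addr> ..." sends: an input lookup is simulated when
 * RTA_IIF is supplied, an output lookup otherwise, and the reply carries
 * either the matched FIB entry (RTM_F_FIB_MATCH, "ip route get ...
 * fibmatch") or the resolved route via rt_fill_info().
 */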
2796 
2797 void ip_rt_multicast_event(struct in_device *in_dev)
2798 {
2799 	rt_cache_flush(dev_net(in_dev->dev));
2800 }
2801 
2802 #ifdef CONFIG_SYSCTL
2803 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2804 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2805 static int ip_rt_gc_elasticity __read_mostly	= 8;
2806 
2807 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2808 					void __user *buffer,
2809 					size_t *lenp, loff_t *ppos)
2810 {
2811 	struct net *net = (struct net *)__ctl->extra1;
2812 
2813 	if (write) {
2814 		rt_cache_flush(net);
2815 		fnhe_genid_bump(net);
2816 		return 0;
2817 	}
2818 
2819 	return -EINVAL;
2820 }
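/* Writing any value to the "flush" entry registered below (for example
 * "echo 1 > /proc/sys/net/ipv4/route/flush") ends up here, flushing the
 * per-netns routing cache and bumping the fnhe genid; reads are rejected
 * with -EINVAL since the entry is write-only.
 */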
2821 
2822 static struct ctl_table ipv4_route_table[] = {
2823 	{
2824 		.procname	= "gc_thresh",
2825 		.data		= &ipv4_dst_ops.gc_thresh,
2826 		.maxlen		= sizeof(int),
2827 		.mode		= 0644,
2828 		.proc_handler	= proc_dointvec,
2829 	},
2830 	{
2831 		.procname	= "max_size",
2832 		.data		= &ip_rt_max_size,
2833 		.maxlen		= sizeof(int),
2834 		.mode		= 0644,
2835 		.proc_handler	= proc_dointvec,
2836 	},
2837 	{
2838 		/*  Deprecated. Use gc_min_interval_ms */
2839 
2840 		.procname	= "gc_min_interval",
2841 		.data		= &ip_rt_gc_min_interval,
2842 		.maxlen		= sizeof(int),
2843 		.mode		= 0644,
2844 		.proc_handler	= proc_dointvec_jiffies,
2845 	},
2846 	{
2847 		.procname	= "gc_min_interval_ms",
2848 		.data		= &ip_rt_gc_min_interval,
2849 		.maxlen		= sizeof(int),
2850 		.mode		= 0644,
2851 		.proc_handler	= proc_dointvec_ms_jiffies,
2852 	},
2853 	{
2854 		.procname	= "gc_timeout",
2855 		.data		= &ip_rt_gc_timeout,
2856 		.maxlen		= sizeof(int),
2857 		.mode		= 0644,
2858 		.proc_handler	= proc_dointvec_jiffies,
2859 	},
2860 	{
2861 		.procname	= "gc_interval",
2862 		.data		= &ip_rt_gc_interval,
2863 		.maxlen		= sizeof(int),
2864 		.mode		= 0644,
2865 		.proc_handler	= proc_dointvec_jiffies,
2866 	},
2867 	{
2868 		.procname	= "redirect_load",
2869 		.data		= &ip_rt_redirect_load,
2870 		.maxlen		= sizeof(int),
2871 		.mode		= 0644,
2872 		.proc_handler	= proc_dointvec,
2873 	},
2874 	{
2875 		.procname	= "redirect_number",
2876 		.data		= &ip_rt_redirect_number,
2877 		.maxlen		= sizeof(int),
2878 		.mode		= 0644,
2879 		.proc_handler	= proc_dointvec,
2880 	},
2881 	{
2882 		.procname	= "redirect_silence",
2883 		.data		= &ip_rt_redirect_silence,
2884 		.maxlen		= sizeof(int),
2885 		.mode		= 0644,
2886 		.proc_handler	= proc_dointvec,
2887 	},
2888 	{
2889 		.procname	= "error_cost",
2890 		.data		= &ip_rt_error_cost,
2891 		.maxlen		= sizeof(int),
2892 		.mode		= 0644,
2893 		.proc_handler	= proc_dointvec,
2894 	},
2895 	{
2896 		.procname	= "error_burst",
2897 		.data		= &ip_rt_error_burst,
2898 		.maxlen		= sizeof(int),
2899 		.mode		= 0644,
2900 		.proc_handler	= proc_dointvec,
2901 	},
2902 	{
2903 		.procname	= "gc_elasticity",
2904 		.data		= &ip_rt_gc_elasticity,
2905 		.maxlen		= sizeof(int),
2906 		.mode		= 0644,
2907 		.proc_handler	= proc_dointvec,
2908 	},
2909 	{
2910 		.procname	= "mtu_expires",
2911 		.data		= &ip_rt_mtu_expires,
2912 		.maxlen		= sizeof(int),
2913 		.mode		= 0644,
2914 		.proc_handler	= proc_dointvec_jiffies,
2915 	},
2916 	{
2917 		.procname	= "min_pmtu",
2918 		.data		= &ip_rt_min_pmtu,
2919 		.maxlen		= sizeof(int),
2920 		.mode		= 0644,
2921 		.proc_handler	= proc_dointvec,
2922 	},
2923 	{
2924 		.procname	= "min_adv_mss",
2925 		.data		= &ip_rt_min_advmss,
2926 		.maxlen		= sizeof(int),
2927 		.mode		= 0644,
2928 		.proc_handler	= proc_dointvec,
2929 	},
2930 	{ }
2931 };
2932 
2933 static struct ctl_table ipv4_route_flush_table[] = {
2934 	{
2935 		.procname	= "flush",
2936 		.maxlen		= sizeof(int),
2937 		.mode		= 0200,
2938 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2939 	},
2940 	{ },
2941 };
2942 
2943 static __net_init int sysctl_route_net_init(struct net *net)
2944 {
2945 	struct ctl_table *tbl;
2946 
2947 	tbl = ipv4_route_flush_table;
2948 	if (!net_eq(net, &init_net)) {
2949 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2950 		if (!tbl)
2951 			goto err_dup;
2952 
2953 		/* Don't export sysctls to unprivileged users */
2954 		if (net->user_ns != &init_user_ns)
2955 			tbl[0].procname = NULL;
2956 	}
2957 	tbl[0].extra1 = net;
2958 
2959 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2960 	if (!net->ipv4.route_hdr)
2961 		goto err_reg;
2962 	return 0;
2963 
2964 err_reg:
2965 	if (tbl != ipv4_route_flush_table)
2966 		kfree(tbl);
2967 err_dup:
2968 	return -ENOMEM;
2969 }
2970 
2971 static __net_exit void sysctl_route_net_exit(struct net *net)
2972 {
2973 	struct ctl_table *tbl;
2974 
2975 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2976 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2977 	BUG_ON(tbl == ipv4_route_flush_table);
2978 	kfree(tbl);
2979 }
2980 
2981 static __net_initdata struct pernet_operations sysctl_route_ops = {
2982 	.init = sysctl_route_net_init,
2983 	.exit = sysctl_route_net_exit,
2984 	.async = true,
2985 };
2986 #endif
2987 
2988 static __net_init int rt_genid_init(struct net *net)
2989 {
2990 	atomic_set(&net->ipv4.rt_genid, 0);
2991 	atomic_set(&net->fnhe_genid, 0);
2992 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
2993 	return 0;
2994 }
2995 
2996 static __net_initdata struct pernet_operations rt_genid_ops = {
2997 	.init = rt_genid_init,
2998 	.async = true,
2999 };
3000 
3001 static int __net_init ipv4_inetpeer_init(struct net *net)
3002 {
3003 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3004 
3005 	if (!bp)
3006 		return -ENOMEM;
3007 	inet_peer_base_init(bp);
3008 	net->ipv4.peers = bp;
3009 	return 0;
3010 }
3011 
3012 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3013 {
3014 	struct inet_peer_base *bp = net->ipv4.peers;
3015 
3016 	net->ipv4.peers = NULL;
3017 	inetpeer_invalidate_tree(bp);
3018 	kfree(bp);
3019 }
3020 
3021 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3022 	.init	=	ipv4_inetpeer_init,
3023 	.exit	=	ipv4_inetpeer_exit,
3024 	.async	=	true,
3025 };
3026 
3027 #ifdef CONFIG_IP_ROUTE_CLASSID
3028 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3029 #endif /* CONFIG_IP_ROUTE_CLASSID */
3030 
3031 int __init ip_rt_init(void)
3032 {
3033 	int cpu;
3034 
3035 	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3036 	if (!ip_idents)
3037 		panic("IP: failed to allocate ip_idents\n");
3038 
3039 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3040 
3041 	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3042 	if (!ip_tstamps)
3043 		panic("IP: failed to allocate ip_tstamps\n");
3044 
3045 	for_each_possible_cpu(cpu) {
3046 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3047 
3048 		INIT_LIST_HEAD(&ul->head);
3049 		spin_lock_init(&ul->lock);
3050 	}
3051 #ifdef CONFIG_IP_ROUTE_CLASSID
3052 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3053 	if (!ip_rt_acct)
3054 		panic("IP: failed to allocate ip_rt_acct\n");
3055 #endif
3056 
3057 	ipv4_dst_ops.kmem_cachep =
3058 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3059 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3060 
3061 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3062 
3063 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3064 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3065 
3066 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3067 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3068 
3069 	ipv4_dst_ops.gc_thresh = ~0;
3070 	ip_rt_max_size = INT_MAX;
3071 
3072 	devinet_init();
3073 	ip_fib_init();
3074 
3075 	if (ip_rt_proc_init())
3076 		pr_err("Unable to create route proc files\n");
3077 #ifdef CONFIG_XFRM
3078 	xfrm_init();
3079 	xfrm4_init();
3080 #endif
3081 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3082 		      RTNL_FLAG_DOIT_UNLOCKED);
3083 
3084 #ifdef CONFIG_SYSCTL
3085 	register_pernet_subsys(&sysctl_route_ops);
3086 #endif
3087 	register_pernet_subsys(&rt_genid_ops);
3088 	register_pernet_subsys(&ipv4_inetpeer_ops);
3089 	return 0;
3090 }
3091 
3092 #ifdef CONFIG_SYSCTL
3093 /*
3094  * We really need to sanitize the damn ipv4 init order, then all
3095  * this nonsense will go away.
3096  */
3097 void __init ip_static_sysctl_init(void)
3098 {
3099 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3100 }
3101 #endif
3102