xref: /openbmc/linux/net/ipv4/route.c (revision 5e21a3ec)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		ROUTE - implementation of the IP router.
8  *
9  * Authors:	Ross Biro
10  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *		Alan Cox	:	Verify area fixes.
17  *		Alan Cox	:	cli() protects routing changes
18  *		Rui Oliveira	:	ICMP routing table updates
19  *		(rco@di.uminho.pt)	Routing table insertion and update
20  *		Linus Torvalds	:	Rewrote bits to be sensible
21  *		Alan Cox	:	Added BSD route gw semantics
22  *		Alan Cox	:	Super /proc >4K
23  *		Alan Cox	:	MTU in route table
24  *		Alan Cox	: 	MSS actually. Also added the window
25  *					clamper.
26  *		Sam Lantinga	:	Fixed route matching in rt_del()
27  *		Alan Cox	:	Routing cache support.
28  *		Alan Cox	:	Removed compatibility cruft.
29  *		Alan Cox	:	RTF_REJECT support.
30  *		Alan Cox	:	TCP irtt support.
31  *		Jonathan Naylor	:	Added Metric support.
32  *	Miquel van Smoorenburg	:	BSD API fixes.
33  *	Miquel van Smoorenburg	:	Metrics.
34  *		Alan Cox	:	Use __u32 properly
35  *		Alan Cox	:	Aligned routing errors more closely with BSD
36  *					our system is still very different.
37  *		Alan Cox	:	Faster /proc handling
38  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39  *					routing caches and better behaviour.
40  *
41  *		Olaf Erb	:	irtt wasn't being copied right.
42  *		Bjorn Ekwall	:	Kerneld route support.
43  *		Alan Cox	:	Multicast fixed (I hope)
44  * 		Pavel Krauz	:	Limited broadcast fixed
45  *		Mike McLagan	:	Routing by source
46  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47  *					route.c and rewritten from scratch.
48  *		Andi Kleen	:	Load-limit warning messages.
49  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53  *		Marc Boucher	:	routing by fwmark
54  *	Robert Olsson		:	Added rt_cache statistics
55  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
59  */
60 
61 #define pr_fmt(fmt) "IPv4: " fmt
62 
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/string.h>
70 #include <linux/socket.h>
71 #include <linux/sockios.h>
72 #include <linux/errno.h>
73 #include <linux/in.h>
74 #include <linux/inet.h>
75 #include <linux/netdevice.h>
76 #include <linux/proc_fs.h>
77 #include <linux/init.h>
78 #include <linux/skbuff.h>
79 #include <linux/inetdevice.h>
80 #include <linux/igmp.h>
81 #include <linux/pkt_sched.h>
82 #include <linux/mroute.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/random.h>
85 #include <linux/rcupdate.h>
86 #include <linux/times.h>
87 #include <linux/slab.h>
88 #include <linux/jhash.h>
89 #include <net/dst.h>
90 #include <net/dst_metadata.h>
91 #include <net/net_namespace.h>
92 #include <net/protocol.h>
93 #include <net/ip.h>
94 #include <net/route.h>
95 #include <net/inetpeer.h>
96 #include <net/sock.h>
97 #include <net/ip_fib.h>
98 #include <net/nexthop.h>
99 #include <net/arp.h>
100 #include <net/tcp.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #include <net/lwtunnel.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 #include <net/secure_seq.h>
110 #include <net/ip_tunnels.h>
111 #include <net/l3mdev.h>
112 
113 #include "fib_lookup.h"
114 
115 #define RT_FL_TOS(oldflp4) \
116 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117 
118 #define RT_GC_TIMEOUT (300*HZ)
119 
120 static int ip_rt_max_size;
121 static int ip_rt_redirect_number __read_mostly	= 9;
122 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
123 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
124 static int ip_rt_error_cost __read_mostly	= HZ;
125 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
126 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
127 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
128 static int ip_rt_min_advmss __read_mostly	= 256;
129 
130 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
131 
132 /*
133  *	Interface to generic destination cache.
134  */
135 
136 INDIRECT_CALLABLE_SCOPE
137 struct dst_entry	*ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
139 INDIRECT_CALLABLE_SCOPE
140 unsigned int		ipv4_mtu(const struct dst_entry *dst);
141 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
142 static void		 ipv4_link_failure(struct sk_buff *skb);
143 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
144 					   struct sk_buff *skb, u32 mtu,
145 					   bool confirm_neigh);
146 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 					struct sk_buff *skb);
148 static void		ipv4_dst_destroy(struct dst_entry *dst);
149 
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152 	WARN_ON(1);
153 	return NULL;
154 }
155 
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 					   struct sk_buff *skb,
158 					   const void *daddr);
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160 
161 static struct dst_ops ipv4_dst_ops = {
162 	.family =		AF_INET,
163 	.check =		ipv4_dst_check,
164 	.default_advmss =	ipv4_default_advmss,
165 	.mtu =			ipv4_mtu,
166 	.cow_metrics =		ipv4_cow_metrics,
167 	.destroy =		ipv4_dst_destroy,
168 	.negative_advice =	ipv4_negative_advice,
169 	.link_failure =		ipv4_link_failure,
170 	.update_pmtu =		ip_rt_update_pmtu,
171 	.redirect =		ip_do_redirect,
172 	.local_out =		__ip_local_out,
173 	.neigh_lookup =		ipv4_neigh_lookup,
174 	.confirm_neigh =	ipv4_confirm_neigh,
175 };
176 
177 #define ECN_OR_COST(class)	TC_PRIO_##class
178 
179 const __u8 ip_tos2prio[16] = {
180 	TC_PRIO_BESTEFFORT,
181 	ECN_OR_COST(BESTEFFORT),
182 	TC_PRIO_BESTEFFORT,
183 	ECN_OR_COST(BESTEFFORT),
184 	TC_PRIO_BULK,
185 	ECN_OR_COST(BULK),
186 	TC_PRIO_BULK,
187 	ECN_OR_COST(BULK),
188 	TC_PRIO_INTERACTIVE,
189 	ECN_OR_COST(INTERACTIVE),
190 	TC_PRIO_INTERACTIVE,
191 	ECN_OR_COST(INTERACTIVE),
192 	TC_PRIO_INTERACTIVE_BULK,
193 	ECN_OR_COST(INTERACTIVE_BULK),
194 	TC_PRIO_INTERACTIVE_BULK,
195 	ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
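
/* For illustration: the table above is indexed with the four TOS bits of
 * the IPv4 header shifted past the lowest bit, as rt_tos2priority() in
 * include/net/route.h does:
 *
 *	u8 tos  = IPTOS_LOWDELAY;			(0x10)
 *	u8 prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];	(TC_PRIO_INTERACTIVE)
 */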
198 
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201 
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205 	if (*pos)
206 		return NULL;
207 	return SEQ_START_TOKEN;
208 }
209 
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212 	++*pos;
213 	return NULL;
214 }
215 
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219 
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222 	if (v == SEQ_START_TOKEN)
223 		seq_printf(seq, "%-127s\n",
224 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 			   "HHUptod\tSpecDst");
227 	return 0;
228 }
229 
230 static const struct seq_operations rt_cache_seq_ops = {
231 	.start  = rt_cache_seq_start,
232 	.next   = rt_cache_seq_next,
233 	.stop   = rt_cache_seq_stop,
234 	.show   = rt_cache_seq_show,
235 };
236 
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239 	return seq_open(file, &rt_cache_seq_ops);
240 }
241 
242 static const struct proc_ops rt_cache_proc_ops = {
243 	.proc_open	= rt_cache_seq_open,
244 	.proc_read	= seq_read,
245 	.proc_lseek	= seq_lseek,
246 	.proc_release	= seq_release,
247 };
248 
249 
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252 	int cpu;
253 
254 	if (*pos == 0)
255 		return SEQ_START_TOKEN;
256 
257 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 		if (!cpu_possible(cpu))
259 			continue;
260 		*pos = cpu+1;
261 		return &per_cpu(rt_cache_stat, cpu);
262 	}
263 	return NULL;
264 }
265 
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268 	int cpu;
269 
270 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 		if (!cpu_possible(cpu))
272 			continue;
273 		*pos = cpu+1;
274 		return &per_cpu(rt_cache_stat, cpu);
275 	}
276 	(*pos)++;
277 	return NULL;
278 
279 }
280 
281 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
282 {
283 
284 }
285 
286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287 {
288 	struct rt_cache_stat *st = v;
289 
290 	if (v == SEQ_START_TOKEN) {
291 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
292 		return 0;
293 	}
294 
295 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
296 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
297 		   dst_entries_get_slow(&ipv4_dst_ops),
298 		   0, /* st->in_hit */
299 		   st->in_slow_tot,
300 		   st->in_slow_mc,
301 		   st->in_no_route,
302 		   st->in_brd,
303 		   st->in_martian_dst,
304 		   st->in_martian_src,
305 
306 		   0, /* st->out_hit */
307 		   st->out_slow_tot,
308 		   st->out_slow_mc,
309 
310 		   0, /* st->gc_total */
311 		   0, /* st->gc_ignored */
312 		   0, /* st->gc_goal_miss */
313 		   0, /* st->gc_dst_overflow */
314 		   0, /* st->in_hlist_search */
315 		   0  /* st->out_hlist_search */
316 		);
317 	return 0;
318 }
319 
320 static const struct seq_operations rt_cpu_seq_ops = {
321 	.start  = rt_cpu_seq_start,
322 	.next   = rt_cpu_seq_next,
323 	.stop   = rt_cpu_seq_stop,
324 	.show   = rt_cpu_seq_show,
325 };
326 
327 
328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329 {
330 	return seq_open(file, &rt_cpu_seq_ops);
331 }
332 
333 static const struct proc_ops rt_cpu_proc_ops = {
334 	.proc_open	= rt_cpu_seq_open,
335 	.proc_read	= seq_read,
336 	.proc_lseek	= seq_lseek,
337 	.proc_release	= seq_release,
338 };
339 
340 #ifdef CONFIG_IP_ROUTE_CLASSID
341 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 {
343 	struct ip_rt_acct *dst, *src;
344 	unsigned int i, j;
345 
346 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 	if (!dst)
348 		return -ENOMEM;
349 
350 	for_each_possible_cpu(i) {
351 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352 		for (j = 0; j < 256; j++) {
353 			dst[j].o_bytes   += src[j].o_bytes;
354 			dst[j].o_packets += src[j].o_packets;
355 			dst[j].i_bytes   += src[j].i_bytes;
356 			dst[j].i_packets += src[j].i_packets;
357 		}
358 	}
359 
360 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361 	kfree(dst);
362 	return 0;
363 }
364 #endif
365 
366 static int __net_init ip_rt_do_proc_init(struct net *net)
367 {
368 	struct proc_dir_entry *pde;
369 
370 	pde = proc_create("rt_cache", 0444, net->proc_net,
371 			  &rt_cache_proc_ops);
372 	if (!pde)
373 		goto err1;
374 
375 	pde = proc_create("rt_cache", 0444,
376 			  net->proc_net_stat, &rt_cpu_proc_ops);
377 	if (!pde)
378 		goto err2;
379 
380 #ifdef CONFIG_IP_ROUTE_CLASSID
381 	pde = proc_create_single("rt_acct", 0, net->proc_net,
382 			rt_acct_proc_show);
383 	if (!pde)
384 		goto err3;
385 #endif
386 	return 0;
387 
388 #ifdef CONFIG_IP_ROUTE_CLASSID
389 err3:
390 	remove_proc_entry("rt_cache", net->proc_net_stat);
391 #endif
392 err2:
393 	remove_proc_entry("rt_cache", net->proc_net);
394 err1:
395 	return -ENOMEM;
396 }
397 
398 static void __net_exit ip_rt_do_proc_exit(struct net *net)
399 {
400 	remove_proc_entry("rt_cache", net->proc_net_stat);
401 	remove_proc_entry("rt_cache", net->proc_net);
402 #ifdef CONFIG_IP_ROUTE_CLASSID
403 	remove_proc_entry("rt_acct", net->proc_net);
404 #endif
405 }
406 
407 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
408 	.init = ip_rt_do_proc_init,
409 	.exit = ip_rt_do_proc_exit,
410 };
411 
412 static int __init ip_rt_proc_init(void)
413 {
414 	return register_pernet_subsys(&ip_rt_proc_ops);
415 }
416 
417 #else
418 static inline int ip_rt_proc_init(void)
419 {
420 	return 0;
421 }
422 #endif /* CONFIG_PROC_FS */
423 
424 static inline bool rt_is_expired(const struct rtable *rth)
425 {
426 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
427 }
428 
429 void rt_cache_flush(struct net *net)
430 {
431 	rt_genid_bump_ipv4(net);
432 }
433 
434 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
435 					   struct sk_buff *skb,
436 					   const void *daddr)
437 {
438 	const struct rtable *rt = container_of(dst, struct rtable, dst);
439 	struct net_device *dev = dst->dev;
440 	struct neighbour *n;
441 
442 	rcu_read_lock_bh();
443 
444 	if (likely(rt->rt_gw_family == AF_INET)) {
445 		n = ip_neigh_gw4(dev, rt->rt_gw4);
446 	} else if (rt->rt_gw_family == AF_INET6) {
447 		n = ip_neigh_gw6(dev, &rt->rt_gw6);
448 	} else {
449 		__be32 pkey;
450 
451 		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
452 		n = ip_neigh_gw4(dev, pkey);
453 	}
454 
455 	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
456 		n = NULL;
457 
458 	rcu_read_unlock_bh();
459 
460 	return n;
461 }
462 
463 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
464 {
465 	const struct rtable *rt = container_of(dst, struct rtable, dst);
466 	struct net_device *dev = dst->dev;
467 	const __be32 *pkey = daddr;
468 
469 	if (rt->rt_gw_family == AF_INET) {
470 		pkey = (const __be32 *)&rt->rt_gw4;
471 	} else if (rt->rt_gw_family == AF_INET6) {
472 		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
473 	} else if (!daddr ||
474 		 (rt->rt_flags &
475 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
476 		return;
477 	}
478 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
479 }
480 
481 #define IP_IDENTS_SZ 2048u
482 
483 static atomic_t *ip_idents __read_mostly;
484 static u32 *ip_tstamps __read_mostly;
485 
486 /* In order to protect privacy, we add a perturbation to identifiers
487  * if one generator is seldom used. This makes hard for an attacker
488  * to infer how many packets were sent between two points in time.
489  */
490 u32 ip_idents_reserve(u32 hash, int segs)
491 {
492 	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
493 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
494 	u32 old = READ_ONCE(*p_tstamp);
495 	u32 now = (u32)jiffies;
496 	u32 delta = 0;
497 
498 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
499 		delta = prandom_u32_max(now - old);
500 
501 	/* If UBSAN reports an error here, please make sure your compiler
502 	 * supports -fno-strict-overflow before reporting it: that was a bug
503 	 * in UBSAN, and it has been fixed in GCC 8.
504 	 */
505 	return atomic_add_return(segs + delta, p_id) - segs;
506 }
507 EXPORT_SYMBOL(ip_idents_reserve);
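
/* For illustration: two reservations from the same bucket separated by an
 * idle gap of D jiffies do not return adjacent values,
 *
 *	id1 = ip_idents_reserve(hash, 1);
 *	(bucket idle for D jiffies)
 *	id2 = ip_idents_reserve(hash, 1);
 *
 * because id2 - id1 is 1 plus a random delta drawn from [0, D), so an
 * observer cannot count the datagrams sent in between.
 */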
508 
509 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
510 {
511 	u32 hash, id;
512 
513 	/* Note the following code is not safe, but this is okay. */
514 	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
515 		get_random_bytes(&net->ipv4.ip_id_key,
516 				 sizeof(net->ipv4.ip_id_key));
517 
518 	hash = siphash_3u32((__force u32)iph->daddr,
519 			    (__force u32)iph->saddr,
520 			    iph->protocol,
521 			    &net->ipv4.ip_id_key);
522 	id = ip_idents_reserve(hash, segs);
523 	iph->id = htons(id);
524 }
525 EXPORT_SYMBOL(__ip_select_ident);
526 
527 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
528 			     const struct sock *sk,
529 			     const struct iphdr *iph,
530 			     int oif, u8 tos,
531 			     u8 prot, u32 mark, int flow_flags)
532 {
533 	if (sk) {
534 		const struct inet_sock *inet = inet_sk(sk);
535 
536 		oif = sk->sk_bound_dev_if;
537 		mark = sk->sk_mark;
538 		tos = RT_CONN_FLAGS(sk);
539 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
540 	}
541 	flowi4_init_output(fl4, oif, mark, tos,
542 			   RT_SCOPE_UNIVERSE, prot,
543 			   flow_flags,
544 			   iph->daddr, iph->saddr, 0, 0,
545 			   sock_net_uid(net, sk));
546 }
547 
548 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
549 			       const struct sock *sk)
550 {
551 	const struct net *net = dev_net(skb->dev);
552 	const struct iphdr *iph = ip_hdr(skb);
553 	int oif = skb->dev->ifindex;
554 	u8 tos = RT_TOS(iph->tos);
555 	u8 prot = iph->protocol;
556 	u32 mark = skb->mark;
557 
558 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
559 }
560 
561 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
562 {
563 	const struct inet_sock *inet = inet_sk(sk);
564 	const struct ip_options_rcu *inet_opt;
565 	__be32 daddr = inet->inet_daddr;
566 
567 	rcu_read_lock();
568 	inet_opt = rcu_dereference(inet->inet_opt);
569 	if (inet_opt && inet_opt->opt.srr)
570 		daddr = inet_opt->opt.faddr;
571 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
572 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
573 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
574 			   inet_sk_flowi_flags(sk),
575 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
576 	rcu_read_unlock();
577 }
578 
579 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
580 				 const struct sk_buff *skb)
581 {
582 	if (skb)
583 		build_skb_flow_key(fl4, skb, sk);
584 	else
585 		build_sk_flow_key(fl4, sk);
586 }
587 
588 static DEFINE_SPINLOCK(fnhe_lock);
589 
590 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
591 {
592 	struct rtable *rt;
593 
594 	rt = rcu_dereference(fnhe->fnhe_rth_input);
595 	if (rt) {
596 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
597 		dst_dev_put(&rt->dst);
598 		dst_release(&rt->dst);
599 	}
600 	rt = rcu_dereference(fnhe->fnhe_rth_output);
601 	if (rt) {
602 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
603 		dst_dev_put(&rt->dst);
604 		dst_release(&rt->dst);
605 	}
606 }
607 
608 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
609 {
610 	struct fib_nh_exception *fnhe, *oldest;
611 
612 	oldest = rcu_dereference(hash->chain);
613 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
614 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
615 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
616 			oldest = fnhe;
617 	}
618 	fnhe_flush_routes(oldest);
619 	return oldest;
620 }
621 
622 static inline u32 fnhe_hashfun(__be32 daddr)
623 {
624 	static u32 fnhe_hashrnd __read_mostly;
625 	u32 hval;
626 
627 	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
628 	hval = jhash_1word((__force u32)daddr, fnhe_hashrnd);
629 	return hash_32(hval, FNHE_HASH_SHIFT);
630 }
631 
632 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
633 {
634 	rt->rt_pmtu = fnhe->fnhe_pmtu;
635 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
636 	rt->dst.expires = fnhe->fnhe_expires;
637 
638 	if (fnhe->fnhe_gw) {
639 		rt->rt_flags |= RTCF_REDIRECTED;
640 		rt->rt_uses_gateway = 1;
641 		rt->rt_gw_family = AF_INET;
642 		rt->rt_gw4 = fnhe->fnhe_gw;
643 	}
644 }
645 
646 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
647 				  __be32 gw, u32 pmtu, bool lock,
648 				  unsigned long expires)
649 {
650 	struct fnhe_hash_bucket *hash;
651 	struct fib_nh_exception *fnhe;
652 	struct rtable *rt;
653 	u32 genid, hval;
654 	unsigned int i;
655 	int depth;
656 
657 	genid = fnhe_genid(dev_net(nhc->nhc_dev));
658 	hval = fnhe_hashfun(daddr);
659 
660 	spin_lock_bh(&fnhe_lock);
661 
662 	hash = rcu_dereference(nhc->nhc_exceptions);
663 	if (!hash) {
664 		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
665 		if (!hash)
666 			goto out_unlock;
667 		rcu_assign_pointer(nhc->nhc_exceptions, hash);
668 	}
669 
670 	hash += hval;
671 
672 	depth = 0;
673 	for (fnhe = rcu_dereference(hash->chain); fnhe;
674 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
675 		if (fnhe->fnhe_daddr == daddr)
676 			break;
677 		depth++;
678 	}
679 
680 	if (fnhe) {
681 		if (fnhe->fnhe_genid != genid)
682 			fnhe->fnhe_genid = genid;
683 		if (gw)
684 			fnhe->fnhe_gw = gw;
685 		if (pmtu) {
686 			fnhe->fnhe_pmtu = pmtu;
687 			fnhe->fnhe_mtu_locked = lock;
688 		}
689 		fnhe->fnhe_expires = max(1UL, expires);
690 		/* Update all cached dsts too */
691 		rt = rcu_dereference(fnhe->fnhe_rth_input);
692 		if (rt)
693 			fill_route_from_fnhe(rt, fnhe);
694 		rt = rcu_dereference(fnhe->fnhe_rth_output);
695 		if (rt)
696 			fill_route_from_fnhe(rt, fnhe);
697 	} else {
698 		if (depth > FNHE_RECLAIM_DEPTH)
699 			fnhe = fnhe_oldest(hash);
700 		else {
701 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
702 			if (!fnhe)
703 				goto out_unlock;
704 
705 			fnhe->fnhe_next = hash->chain;
706 			rcu_assign_pointer(hash->chain, fnhe);
707 		}
708 		fnhe->fnhe_genid = genid;
709 		fnhe->fnhe_daddr = daddr;
710 		fnhe->fnhe_gw = gw;
711 		fnhe->fnhe_pmtu = pmtu;
712 		fnhe->fnhe_mtu_locked = lock;
713 		fnhe->fnhe_expires = max(1UL, expires);
714 
715 		/* Exception created; mark the cached routes for the nexthop
716 		 * stale, so anyone caching it rechecks if this exception
717 		 * applies to them.
718 		 */
719 		rt = rcu_dereference(nhc->nhc_rth_input);
720 		if (rt)
721 			rt->dst.obsolete = DST_OBSOLETE_KILL;
722 
723 		for_each_possible_cpu(i) {
724 			struct rtable __rcu **prt;
725 			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
726 			rt = rcu_dereference(*prt);
727 			if (rt)
728 				rt->dst.obsolete = DST_OBSOLETE_KILL;
729 		}
730 	}
731 
732 	fnhe->fnhe_stamp = jiffies;
733 
734 out_unlock:
735 	spin_unlock_bh(&fnhe_lock);
736 }
737 
738 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
739 			     bool kill_route)
740 {
741 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
742 	__be32 old_gw = ip_hdr(skb)->saddr;
743 	struct net_device *dev = skb->dev;
744 	struct in_device *in_dev;
745 	struct fib_result res;
746 	struct neighbour *n;
747 	struct net *net;
748 
749 	switch (icmp_hdr(skb)->code & 7) {
750 	case ICMP_REDIR_NET:
751 	case ICMP_REDIR_NETTOS:
752 	case ICMP_REDIR_HOST:
753 	case ICMP_REDIR_HOSTTOS:
754 		break;
755 
756 	default:
757 		return;
758 	}
759 
760 	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
761 		return;
762 
763 	in_dev = __in_dev_get_rcu(dev);
764 	if (!in_dev)
765 		return;
766 
767 	net = dev_net(dev);
768 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
769 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
770 	    ipv4_is_zeronet(new_gw))
771 		goto reject_redirect;
772 
773 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
774 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
775 			goto reject_redirect;
776 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
777 			goto reject_redirect;
778 	} else {
779 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
780 			goto reject_redirect;
781 	}
782 
783 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
784 	if (!n)
785 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
786 	if (!IS_ERR(n)) {
787 		if (!(n->nud_state & NUD_VALID)) {
788 			neigh_event_send(n, NULL);
789 		} else {
790 			if (fib_lookup(net, fl4, &res, 0) == 0) {
791 				struct fib_nh_common *nhc;
792 
793 				fib_select_path(net, &res, fl4, skb);
794 				nhc = FIB_RES_NHC(res);
795 				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
796 						0, false,
797 						jiffies + ip_rt_gc_timeout);
798 			}
799 			if (kill_route)
800 				rt->dst.obsolete = DST_OBSOLETE_KILL;
801 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
802 		}
803 		neigh_release(n);
804 	}
805 	return;
806 
807 reject_redirect:
808 #ifdef CONFIG_IP_ROUTE_VERBOSE
809 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
810 		const struct iphdr *iph = (const struct iphdr *) skb->data;
811 		__be32 daddr = iph->daddr;
812 		__be32 saddr = iph->saddr;
813 
814 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
815 				     "  Advised path = %pI4 -> %pI4\n",
816 				     &old_gw, dev->name, &new_gw,
817 				     &saddr, &daddr);
818 	}
819 #endif
820 	;
821 }
822 
823 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
824 {
825 	struct rtable *rt;
826 	struct flowi4 fl4;
827 	const struct iphdr *iph = (const struct iphdr *) skb->data;
828 	struct net *net = dev_net(skb->dev);
829 	int oif = skb->dev->ifindex;
830 	u8 tos = RT_TOS(iph->tos);
831 	u8 prot = iph->protocol;
832 	u32 mark = skb->mark;
833 
834 	rt = (struct rtable *) dst;
835 
836 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
837 	__ip_do_redirect(rt, skb, &fl4, true);
838 }
839 
840 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
841 {
842 	struct rtable *rt = (struct rtable *)dst;
843 	struct dst_entry *ret = dst;
844 
845 	if (rt) {
846 		if (dst->obsolete > 0) {
847 			ip_rt_put(rt);
848 			ret = NULL;
849 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
850 			   rt->dst.expires) {
851 			ip_rt_put(rt);
852 			ret = NULL;
853 		}
854 	}
855 	return ret;
856 }
857 
858 /*
859  * Algorithm:
860  *	1. The first ip_rt_redirect_number redirects are sent
861  *	   with exponential backoff, then we stop sending them at all,
862  *	   with exponential backoff, then we stop sending them altogether,
863  *	   assuming that the host ignores our redirects.
864  *	2. If we did not see packets requiring redirects
865  *	   during ip_rt_redirect_silence, we assume that the host
866  *	   forgot the redirected route and start sending redirects again.
867  * This algorithm is much cheaper and more intelligent than dumb load limiting
868  * in icmp.c.
869  *
870  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
871  * and "frag. need" (breaks PMTU discovery) in icmp.c.
872  */
873 
874 void ip_rt_send_redirect(struct sk_buff *skb)
875 {
876 	struct rtable *rt = skb_rtable(skb);
877 	struct in_device *in_dev;
878 	struct inet_peer *peer;
879 	struct net *net;
880 	int log_martians;
881 	int vif;
882 
883 	rcu_read_lock();
884 	in_dev = __in_dev_get_rcu(rt->dst.dev);
885 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
886 		rcu_read_unlock();
887 		return;
888 	}
889 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
890 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
891 	rcu_read_unlock();
892 
893 	net = dev_net(rt->dst.dev);
894 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
895 	if (!peer) {
896 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
897 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
898 		return;
899 	}
900 
901 	/* No redirected packets during ip_rt_redirect_silence;
902 	 * reset the algorithm.
903 	 */
904 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
905 		peer->rate_tokens = 0;
906 		peer->n_redirects = 0;
907 	}
908 
909 	/* Too many ignored redirects; do not send anything and
910 	 * set peer->rate_last to the time of the last seen redirected packet.
911 	 */
912 	if (peer->n_redirects >= ip_rt_redirect_number) {
913 		peer->rate_last = jiffies;
914 		goto out_put_peer;
915 	}
916 
917 	/* Check for load limit; set rate_last to the latest sent
918 	 * redirect.
919 	 */
920 	if (peer->n_redirects == 0 ||
921 	    time_after(jiffies,
922 		       (peer->rate_last +
923 			(ip_rt_redirect_load << peer->n_redirects)))) {
924 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
925 
926 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
927 		peer->rate_last = jiffies;
928 		++peer->n_redirects;
929 #ifdef CONFIG_IP_ROUTE_VERBOSE
930 		if (log_martians &&
931 		    peer->n_redirects == ip_rt_redirect_number)
932 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
933 					     &ip_hdr(skb)->saddr, inet_iif(skb),
934 					     &ip_hdr(skb)->daddr, &gw);
935 #endif
936 	}
937 out_put_peer:
938 	inet_putpeer(peer);
939 }
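
/* For illustration: with the defaults above (ip_rt_redirect_number = 9,
 * ip_rt_redirect_load = HZ/50), the (n+1)-th redirect to a peer is only
 * sent once
 *
 *	time_after(jiffies, peer->rate_last + (ip_rt_redirect_load << n))
 *
 * holds, so the minimum gap doubles after every redirect until nine have
 * been sent; after that nothing goes out until the peer has been quiet
 * for ip_rt_redirect_silence and the counters are reset.
 */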
940 
941 static int ip_error(struct sk_buff *skb)
942 {
943 	struct rtable *rt = skb_rtable(skb);
944 	struct net_device *dev = skb->dev;
945 	struct in_device *in_dev;
946 	struct inet_peer *peer;
947 	unsigned long now;
948 	struct net *net;
949 	bool send;
950 	int code;
951 
952 	if (netif_is_l3_master(skb->dev)) {
953 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
954 		if (!dev)
955 			goto out;
956 	}
957 
958 	in_dev = __in_dev_get_rcu(dev);
959 
960 	/* IP on this device is disabled. */
961 	if (!in_dev)
962 		goto out;
963 
964 	net = dev_net(rt->dst.dev);
965 	if (!IN_DEV_FORWARD(in_dev)) {
966 		switch (rt->dst.error) {
967 		case EHOSTUNREACH:
968 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
969 			break;
970 
971 		case ENETUNREACH:
972 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
973 			break;
974 		}
975 		goto out;
976 	}
977 
978 	switch (rt->dst.error) {
979 	case EINVAL:
980 	default:
981 		goto out;
982 	case EHOSTUNREACH:
983 		code = ICMP_HOST_UNREACH;
984 		break;
985 	case ENETUNREACH:
986 		code = ICMP_NET_UNREACH;
987 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
988 		break;
989 	case EACCES:
990 		code = ICMP_PKT_FILTERED;
991 		break;
992 	}
993 
994 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
995 			       l3mdev_master_ifindex(skb->dev), 1);
996 
997 	send = true;
998 	if (peer) {
999 		now = jiffies;
1000 		peer->rate_tokens += now - peer->rate_last;
1001 		if (peer->rate_tokens > ip_rt_error_burst)
1002 			peer->rate_tokens = ip_rt_error_burst;
1003 		peer->rate_last = now;
1004 		if (peer->rate_tokens >= ip_rt_error_cost)
1005 			peer->rate_tokens -= ip_rt_error_cost;
1006 		else
1007 			send = false;
1008 		inet_putpeer(peer);
1009 	}
1010 	if (send)
1011 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1012 
1013 out:	kfree_skb(skb);
1014 	return 0;
1015 }
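
/* For illustration: the peer->rate_tokens arithmetic above is a token
 * bucket refilled at one token per jiffy and capped at ip_rt_error_burst;
 * each ICMP error spends ip_rt_error_cost tokens. With the defaults
 * (cost = HZ, burst = 5 * HZ) this allows on average one ICMP_DEST_UNREACH
 * per second per peer, with a burst of up to five after an idle period.
 */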
1016 
1017 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1018 {
1019 	struct dst_entry *dst = &rt->dst;
1020 	struct net *net = dev_net(dst->dev);
1021 	struct fib_result res;
1022 	bool lock = false;
1023 	u32 old_mtu;
1024 
1025 	if (ip_mtu_locked(dst))
1026 		return;
1027 
1028 	old_mtu = ipv4_mtu(dst);
1029 	if (old_mtu < mtu)
1030 		return;
1031 
1032 	if (mtu < ip_rt_min_pmtu) {
1033 		lock = true;
1034 		mtu = min(old_mtu, ip_rt_min_pmtu);
1035 	}
1036 
1037 	if (rt->rt_pmtu == mtu && !lock &&
1038 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1039 		return;
1040 
1041 	rcu_read_lock();
1042 	if (fib_lookup(net, fl4, &res, 0) == 0) {
1043 		struct fib_nh_common *nhc;
1044 
1045 		fib_select_path(net, &res, fl4, NULL);
1046 		nhc = FIB_RES_NHC(res);
1047 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1048 				      jiffies + ip_rt_mtu_expires);
1049 	}
1050 	rcu_read_unlock();
1051 }
1052 
1053 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1054 			      struct sk_buff *skb, u32 mtu,
1055 			      bool confirm_neigh)
1056 {
1057 	struct rtable *rt = (struct rtable *) dst;
1058 	struct flowi4 fl4;
1059 
1060 	ip_rt_build_flow_key(&fl4, sk, skb);
1061 
1062 	/* Don't make lookup fail for bridged encapsulations */
1063 	if (skb && netif_is_any_bridge_port(skb->dev))
1064 		fl4.flowi4_oif = 0;
1065 
1066 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1067 }
1068 
1069 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1070 		      int oif, u8 protocol)
1071 {
1072 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1073 	struct flowi4 fl4;
1074 	struct rtable *rt;
1075 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1076 
1077 	__build_flow_key(net, &fl4, NULL, iph, oif,
1078 			 RT_TOS(iph->tos), protocol, mark, 0);
1079 	rt = __ip_route_output_key(net, &fl4);
1080 	if (!IS_ERR(rt)) {
1081 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1082 		ip_rt_put(rt);
1083 	}
1084 }
1085 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1086 
1087 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1088 {
1089 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1090 	struct flowi4 fl4;
1091 	struct rtable *rt;
1092 
1093 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1094 
1095 	if (!fl4.flowi4_mark)
1096 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1097 
1098 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1099 	if (!IS_ERR(rt)) {
1100 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1101 		ip_rt_put(rt);
1102 	}
1103 }
1104 
1105 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1106 {
1107 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1108 	struct flowi4 fl4;
1109 	struct rtable *rt;
1110 	struct dst_entry *odst = NULL;
1111 	bool new = false;
1112 	struct net *net = sock_net(sk);
1113 
1114 	bh_lock_sock(sk);
1115 
1116 	if (!ip_sk_accept_pmtu(sk))
1117 		goto out;
1118 
1119 	odst = sk_dst_get(sk);
1120 
1121 	if (sock_owned_by_user(sk) || !odst) {
1122 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1123 		goto out;
1124 	}
1125 
1126 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1127 
1128 	rt = (struct rtable *)odst;
1129 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1130 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1131 		if (IS_ERR(rt))
1132 			goto out;
1133 
1134 		new = true;
1135 	}
1136 
1137 	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1138 
1139 	if (!dst_check(&rt->dst, 0)) {
1140 		if (new)
1141 			dst_release(&rt->dst);
1142 
1143 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1144 		if (IS_ERR(rt))
1145 			goto out;
1146 
1147 		new = true;
1148 	}
1149 
1150 	if (new)
1151 		sk_dst_set(sk, &rt->dst);
1152 
1153 out:
1154 	bh_unlock_sock(sk);
1155 	dst_release(odst);
1156 }
1157 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1158 
1159 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1160 		   int oif, u8 protocol)
1161 {
1162 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1163 	struct flowi4 fl4;
1164 	struct rtable *rt;
1165 
1166 	__build_flow_key(net, &fl4, NULL, iph, oif,
1167 			 RT_TOS(iph->tos), protocol, 0, 0);
1168 	rt = __ip_route_output_key(net, &fl4);
1169 	if (!IS_ERR(rt)) {
1170 		__ip_do_redirect(rt, skb, &fl4, false);
1171 		ip_rt_put(rt);
1172 	}
1173 }
1174 EXPORT_SYMBOL_GPL(ipv4_redirect);
1175 
1176 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1177 {
1178 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1179 	struct flowi4 fl4;
1180 	struct rtable *rt;
1181 	struct net *net = sock_net(sk);
1182 
1183 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1184 	rt = __ip_route_output_key(net, &fl4);
1185 	if (!IS_ERR(rt)) {
1186 		__ip_do_redirect(rt, skb, &fl4, false);
1187 		ip_rt_put(rt);
1188 	}
1189 }
1190 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1191 
1192 INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
1193 							 u32 cookie)
1194 {
1195 	struct rtable *rt = (struct rtable *) dst;
1196 
1197 	/* All IPV4 dsts are created with ->obsolete set to the value
1198 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1199 	 * into this function always.
1200 	 *
1201 	 * When a PMTU/redirect information update invalidates a route,
1202 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1203 	 * DST_OBSOLETE_DEAD.
1204 	 */
1205 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1206 		return NULL;
1207 	return dst;
1208 }
1209 EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
1210 
1211 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1212 {
1213 	struct ip_options opt;
1214 	int res;
1215 
1216 	/* Recompile ip options since IPCB may not be valid anymore.
1217 	 * Also check we have a reasonable ipv4 header.
1218 	 */
1219 	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1220 	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1221 		return;
1222 
1223 	memset(&opt, 0, sizeof(opt));
1224 	if (ip_hdr(skb)->ihl > 5) {
1225 		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1226 			return;
1227 		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1228 
1229 		rcu_read_lock();
1230 		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1231 		rcu_read_unlock();
1232 
1233 		if (res)
1234 			return;
1235 	}
1236 	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1237 }
1238 
1239 static void ipv4_link_failure(struct sk_buff *skb)
1240 {
1241 	struct rtable *rt;
1242 
1243 	ipv4_send_dest_unreach(skb);
1244 
1245 	rt = skb_rtable(skb);
1246 	if (rt)
1247 		dst_set_expires(&rt->dst, 0);
1248 }
1249 
1250 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1251 {
1252 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1253 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1254 		 skb->dev ? skb->dev->name : "?");
1255 	kfree_skb(skb);
1256 	WARN_ON(1);
1257 	return 0;
1258 }
1259 
1260 /*
1261    We do not cache the source address of the outgoing interface,
1262    because it is used only by the IP RR, TS and SRR options,
1263    so it is out of the fast path.
1264 
1265    BTW remember: "addr" is allowed to be unaligned
1266    in IP options!
1267  */
1268 
1269 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1270 {
1271 	__be32 src;
1272 
1273 	if (rt_is_output_route(rt))
1274 		src = ip_hdr(skb)->saddr;
1275 	else {
1276 		struct fib_result res;
1277 		struct iphdr *iph = ip_hdr(skb);
1278 		struct flowi4 fl4 = {
1279 			.daddr = iph->daddr,
1280 			.saddr = iph->saddr,
1281 			.flowi4_tos = RT_TOS(iph->tos),
1282 			.flowi4_oif = rt->dst.dev->ifindex,
1283 			.flowi4_iif = skb->dev->ifindex,
1284 			.flowi4_mark = skb->mark,
1285 		};
1286 
1287 		rcu_read_lock();
1288 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1289 			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1290 		else
1291 			src = inet_select_addr(rt->dst.dev,
1292 					       rt_nexthop(rt, iph->daddr),
1293 					       RT_SCOPE_UNIVERSE);
1294 		rcu_read_unlock();
1295 	}
1296 	memcpy(addr, &src, 4);
1297 }
1298 
1299 #ifdef CONFIG_IP_ROUTE_CLASSID
1300 static void set_class_tag(struct rtable *rt, u32 tag)
1301 {
1302 	if (!(rt->dst.tclassid & 0xFFFF))
1303 		rt->dst.tclassid |= tag & 0xFFFF;
1304 	if (!(rt->dst.tclassid & 0xFFFF0000))
1305 		rt->dst.tclassid |= tag & 0xFFFF0000;
1306 }
1307 #endif
1308 
1309 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1310 {
1311 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1312 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1313 				    ip_rt_min_advmss);
1314 
1315 	return min(advmss, IPV4_MAX_PMTU - header_size);
1316 }
1317 
1318 INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
1319 {
1320 	const struct rtable *rt = (const struct rtable *)dst;
1321 	unsigned int mtu = rt->rt_pmtu;
1322 
1323 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1324 		mtu = dst_metric_raw(dst, RTAX_MTU);
1325 
1326 	if (mtu)
1327 		return mtu;
1328 
1329 	mtu = READ_ONCE(dst->dev->mtu);
1330 
1331 	if (unlikely(ip_mtu_locked(dst))) {
1332 		if (rt->rt_uses_gateway && mtu > 576)
1333 			mtu = 576;
1334 	}
1335 
1336 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1337 
1338 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1339 }
1340 EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
1341 
1342 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1343 {
1344 	struct fnhe_hash_bucket *hash;
1345 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1346 	u32 hval = fnhe_hashfun(daddr);
1347 
1348 	spin_lock_bh(&fnhe_lock);
1349 
1350 	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1351 					 lockdep_is_held(&fnhe_lock));
1352 	hash += hval;
1353 
1354 	fnhe_p = &hash->chain;
1355 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1356 	while (fnhe) {
1357 		if (fnhe->fnhe_daddr == daddr) {
1358 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1359 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1360 			/* set fnhe_daddr to 0 to ensure it won't bind with
1361 			 * new dsts in rt_bind_exception().
1362 			 */
1363 			fnhe->fnhe_daddr = 0;
1364 			fnhe_flush_routes(fnhe);
1365 			kfree_rcu(fnhe, rcu);
1366 			break;
1367 		}
1368 		fnhe_p = &fnhe->fnhe_next;
1369 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1370 						 lockdep_is_held(&fnhe_lock));
1371 	}
1372 
1373 	spin_unlock_bh(&fnhe_lock);
1374 }
1375 
1376 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1377 					       __be32 daddr)
1378 {
1379 	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1380 	struct fib_nh_exception *fnhe;
1381 	u32 hval;
1382 
1383 	if (!hash)
1384 		return NULL;
1385 
1386 	hval = fnhe_hashfun(daddr);
1387 
1388 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1389 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1390 		if (fnhe->fnhe_daddr == daddr) {
1391 			if (fnhe->fnhe_expires &&
1392 			    time_after(jiffies, fnhe->fnhe_expires)) {
1393 				ip_del_fnhe(nhc, daddr);
1394 				break;
1395 			}
1396 			return fnhe;
1397 		}
1398 	}
1399 	return NULL;
1400 }
1401 
1402 /* MTU selection:
1403  * 1. mtu on route is locked - use it
1404  * 2. mtu from nexthop exception
1405  * 3. mtu from egress device
1406  */
1407 
1408 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1409 {
1410 	struct fib_nh_common *nhc = res->nhc;
1411 	struct net_device *dev = nhc->nhc_dev;
1412 	struct fib_info *fi = res->fi;
1413 	u32 mtu = 0;
1414 
1415 	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1416 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1417 		mtu = fi->fib_mtu;
1418 
1419 	if (likely(!mtu)) {
1420 		struct fib_nh_exception *fnhe;
1421 
1422 		fnhe = find_exception(nhc, daddr);
1423 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1424 			mtu = fnhe->fnhe_pmtu;
1425 	}
1426 
1427 	if (likely(!mtu))
1428 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1429 
1430 	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1431 }
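
/* For illustration: a locked MTU metric wins over everything, e.g. for a
 * route configured with
 *
 *	ip route add 192.0.2.0/24 via 198.51.100.1 mtu lock 1400
 *
 * 1400 is returned even if a smaller PMTU exception was learned; without
 * the lock a non-expired exception for daddr is preferred, and only then
 * the egress device MTU (capped at IP_MAX_MTU and reduced by any lwtunnel
 * headroom) applies.
 */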
1432 
1433 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1434 			      __be32 daddr, const bool do_cache)
1435 {
1436 	bool ret = false;
1437 
1438 	spin_lock_bh(&fnhe_lock);
1439 
1440 	if (daddr == fnhe->fnhe_daddr) {
1441 		struct rtable __rcu **porig;
1442 		struct rtable *orig;
1443 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1444 
1445 		if (rt_is_input_route(rt))
1446 			porig = &fnhe->fnhe_rth_input;
1447 		else
1448 			porig = &fnhe->fnhe_rth_output;
1449 		orig = rcu_dereference(*porig);
1450 
1451 		if (fnhe->fnhe_genid != genid) {
1452 			fnhe->fnhe_genid = genid;
1453 			fnhe->fnhe_gw = 0;
1454 			fnhe->fnhe_pmtu = 0;
1455 			fnhe->fnhe_expires = 0;
1456 			fnhe->fnhe_mtu_locked = false;
1457 			fnhe_flush_routes(fnhe);
1458 			orig = NULL;
1459 		}
1460 		fill_route_from_fnhe(rt, fnhe);
1461 		if (!rt->rt_gw4) {
1462 			rt->rt_gw4 = daddr;
1463 			rt->rt_gw_family = AF_INET;
1464 		}
1465 
1466 		if (do_cache) {
1467 			dst_hold(&rt->dst);
1468 			rcu_assign_pointer(*porig, rt);
1469 			if (orig) {
1470 				dst_dev_put(&orig->dst);
1471 				dst_release(&orig->dst);
1472 			}
1473 			ret = true;
1474 		}
1475 
1476 		fnhe->fnhe_stamp = jiffies;
1477 	}
1478 	spin_unlock_bh(&fnhe_lock);
1479 
1480 	return ret;
1481 }
1482 
1483 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1484 {
1485 	struct rtable *orig, *prev, **p;
1486 	bool ret = true;
1487 
1488 	if (rt_is_input_route(rt)) {
1489 		p = (struct rtable **)&nhc->nhc_rth_input;
1490 	} else {
1491 		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1492 	}
1493 	orig = *p;
1494 
1495 	/* hold dst before doing cmpxchg() to avoid race condition
1496 	 * on this dst
1497 	 */
1498 	dst_hold(&rt->dst);
1499 	prev = cmpxchg(p, orig, rt);
1500 	if (prev == orig) {
1501 		if (orig) {
1502 			rt_add_uncached_list(orig);
1503 			dst_release(&orig->dst);
1504 		}
1505 	} else {
1506 		dst_release(&rt->dst);
1507 		ret = false;
1508 	}
1509 
1510 	return ret;
1511 }
1512 
1513 struct uncached_list {
1514 	spinlock_t		lock;
1515 	struct list_head	head;
1516 };
1517 
1518 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1519 
1520 void rt_add_uncached_list(struct rtable *rt)
1521 {
1522 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1523 
1524 	rt->rt_uncached_list = ul;
1525 
1526 	spin_lock_bh(&ul->lock);
1527 	list_add_tail(&rt->rt_uncached, &ul->head);
1528 	spin_unlock_bh(&ul->lock);
1529 }
1530 
1531 void rt_del_uncached_list(struct rtable *rt)
1532 {
1533 	if (!list_empty(&rt->rt_uncached)) {
1534 		struct uncached_list *ul = rt->rt_uncached_list;
1535 
1536 		spin_lock_bh(&ul->lock);
1537 		list_del(&rt->rt_uncached);
1538 		spin_unlock_bh(&ul->lock);
1539 	}
1540 }
1541 
1542 static void ipv4_dst_destroy(struct dst_entry *dst)
1543 {
1544 	struct rtable *rt = (struct rtable *)dst;
1545 
1546 	ip_dst_metrics_put(dst);
1547 	rt_del_uncached_list(rt);
1548 }
1549 
1550 void rt_flush_dev(struct net_device *dev)
1551 {
1552 	struct rtable *rt;
1553 	int cpu;
1554 
1555 	for_each_possible_cpu(cpu) {
1556 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1557 
1558 		spin_lock_bh(&ul->lock);
1559 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1560 			if (rt->dst.dev != dev)
1561 				continue;
1562 			rt->dst.dev = blackhole_netdev;
1563 			dev_hold(rt->dst.dev);
1564 			dev_put(dev);
1565 		}
1566 		spin_unlock_bh(&ul->lock);
1567 	}
1568 }
1569 
1570 static bool rt_cache_valid(const struct rtable *rt)
1571 {
1572 	return	rt &&
1573 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1574 		!rt_is_expired(rt);
1575 }
1576 
1577 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1578 			   const struct fib_result *res,
1579 			   struct fib_nh_exception *fnhe,
1580 			   struct fib_info *fi, u16 type, u32 itag,
1581 			   const bool do_cache)
1582 {
1583 	bool cached = false;
1584 
1585 	if (fi) {
1586 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1587 
1588 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1589 			rt->rt_uses_gateway = 1;
1590 			rt->rt_gw_family = nhc->nhc_gw_family;
1591 			/* only INET and INET6 are supported */
1592 			if (likely(nhc->nhc_gw_family == AF_INET))
1593 				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1594 			else
1595 				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1596 		}
1597 
1598 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1599 
1600 #ifdef CONFIG_IP_ROUTE_CLASSID
1601 		if (nhc->nhc_family == AF_INET) {
1602 			struct fib_nh *nh;
1603 
1604 			nh = container_of(nhc, struct fib_nh, nh_common);
1605 			rt->dst.tclassid = nh->nh_tclassid;
1606 		}
1607 #endif
1608 		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1609 		if (unlikely(fnhe))
1610 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1611 		else if (do_cache)
1612 			cached = rt_cache_route(nhc, rt);
1613 		if (unlikely(!cached)) {
1614 			/* Routes we intend to cache in nexthop exception or
1615 			 * FIB nexthop have the DST_NOCACHE bit clear.
1616 			 * However, if we are unsuccessful at storing this
1617 			 * route into the cache we really need to set it.
1618 			 */
1619 			if (!rt->rt_gw4) {
1620 				rt->rt_gw_family = AF_INET;
1621 				rt->rt_gw4 = daddr;
1622 			}
1623 			rt_add_uncached_list(rt);
1624 		}
1625 	} else
1626 		rt_add_uncached_list(rt);
1627 
1628 #ifdef CONFIG_IP_ROUTE_CLASSID
1629 #ifdef CONFIG_IP_MULTIPLE_TABLES
1630 	set_class_tag(rt, res->tclassid);
1631 #endif
1632 	set_class_tag(rt, itag);
1633 #endif
1634 }
1635 
1636 struct rtable *rt_dst_alloc(struct net_device *dev,
1637 			    unsigned int flags, u16 type,
1638 			    bool nopolicy, bool noxfrm)
1639 {
1640 	struct rtable *rt;
1641 
1642 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1643 		       (nopolicy ? DST_NOPOLICY : 0) |
1644 		       (noxfrm ? DST_NOXFRM : 0));
1645 
1646 	if (rt) {
1647 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1648 		rt->rt_flags = flags;
1649 		rt->rt_type = type;
1650 		rt->rt_is_input = 0;
1651 		rt->rt_iif = 0;
1652 		rt->rt_pmtu = 0;
1653 		rt->rt_mtu_locked = 0;
1654 		rt->rt_uses_gateway = 0;
1655 		rt->rt_gw_family = 0;
1656 		rt->rt_gw4 = 0;
1657 		INIT_LIST_HEAD(&rt->rt_uncached);
1658 
1659 		rt->dst.output = ip_output;
1660 		if (flags & RTCF_LOCAL)
1661 			rt->dst.input = ip_local_deliver;
1662 	}
1663 
1664 	return rt;
1665 }
1666 EXPORT_SYMBOL(rt_dst_alloc);
1667 
1668 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1669 {
1670 	struct rtable *new_rt;
1671 
1672 	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1673 			   rt->dst.flags);
1674 
1675 	if (new_rt) {
1676 		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1677 		new_rt->rt_flags = rt->rt_flags;
1678 		new_rt->rt_type = rt->rt_type;
1679 		new_rt->rt_is_input = rt->rt_is_input;
1680 		new_rt->rt_iif = rt->rt_iif;
1681 		new_rt->rt_pmtu = rt->rt_pmtu;
1682 		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1683 		new_rt->rt_gw_family = rt->rt_gw_family;
1684 		if (rt->rt_gw_family == AF_INET)
1685 			new_rt->rt_gw4 = rt->rt_gw4;
1686 		else if (rt->rt_gw_family == AF_INET6)
1687 			new_rt->rt_gw6 = rt->rt_gw6;
1688 		INIT_LIST_HEAD(&new_rt->rt_uncached);
1689 
1690 		new_rt->dst.input = rt->dst.input;
1691 		new_rt->dst.output = rt->dst.output;
1692 		new_rt->dst.error = rt->dst.error;
1693 		new_rt->dst.lastuse = jiffies;
1694 		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1695 	}
1696 	return new_rt;
1697 }
1698 EXPORT_SYMBOL(rt_dst_clone);
1699 
1700 /* called in rcu_read_lock() section */
1701 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1702 			  u8 tos, struct net_device *dev,
1703 			  struct in_device *in_dev, u32 *itag)
1704 {
1705 	int err;
1706 
1707 	/* Primary sanity checks. */
1708 	if (!in_dev)
1709 		return -EINVAL;
1710 
1711 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1712 	    skb->protocol != htons(ETH_P_IP))
1713 		return -EINVAL;
1714 
1715 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1716 		return -EINVAL;
1717 
1718 	if (ipv4_is_zeronet(saddr)) {
1719 		if (!ipv4_is_local_multicast(daddr) &&
1720 		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1721 			return -EINVAL;
1722 	} else {
1723 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1724 					  in_dev, itag);
1725 		if (err < 0)
1726 			return err;
1727 	}
1728 	return 0;
1729 }
1730 
1731 /* called in rcu_read_lock() section */
1732 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1733 			     u8 tos, struct net_device *dev, int our)
1734 {
1735 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1736 	unsigned int flags = RTCF_MULTICAST;
1737 	struct rtable *rth;
1738 	u32 itag = 0;
1739 	int err;
1740 
1741 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1742 	if (err)
1743 		return err;
1744 
1745 	if (our)
1746 		flags |= RTCF_LOCAL;
1747 
1748 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1749 			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
1750 	if (!rth)
1751 		return -ENOBUFS;
1752 
1753 #ifdef CONFIG_IP_ROUTE_CLASSID
1754 	rth->dst.tclassid = itag;
1755 #endif
1756 	rth->dst.output = ip_rt_bug;
1757 	rth->rt_is_input = 1;
1758 
1759 #ifdef CONFIG_IP_MROUTE
1760 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1761 		rth->dst.input = ip_mr_input;
1762 #endif
1763 	RT_CACHE_STAT_INC(in_slow_mc);
1764 
1765 	skb_dst_set(skb, &rth->dst);
1766 	return 0;
1767 }
1768 
1769 
1770 static void ip_handle_martian_source(struct net_device *dev,
1771 				     struct in_device *in_dev,
1772 				     struct sk_buff *skb,
1773 				     __be32 daddr,
1774 				     __be32 saddr)
1775 {
1776 	RT_CACHE_STAT_INC(in_martian_src);
1777 #ifdef CONFIG_IP_ROUTE_VERBOSE
1778 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1779 		/*
1780 		 *	RFC1812 recommendation, if source is martian,
1781 		 *	the only hint is MAC header.
1782 		 */
1783 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1784 			&daddr, &saddr, dev->name);
1785 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1786 			print_hex_dump(KERN_WARNING, "ll header: ",
1787 				       DUMP_PREFIX_OFFSET, 16, 1,
1788 				       skb_mac_header(skb),
1789 				       dev->hard_header_len, false);
1790 		}
1791 	}
1792 #endif
1793 }
1794 
1795 /* called in rcu_read_lock() section */
1796 static int __mkroute_input(struct sk_buff *skb,
1797 			   const struct fib_result *res,
1798 			   struct in_device *in_dev,
1799 			   __be32 daddr, __be32 saddr, u32 tos)
1800 {
1801 	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1802 	struct net_device *dev = nhc->nhc_dev;
1803 	struct fib_nh_exception *fnhe;
1804 	struct rtable *rth;
1805 	int err;
1806 	struct in_device *out_dev;
1807 	bool do_cache;
1808 	u32 itag = 0;
1809 
1810 	/* get a working reference to the output device */
1811 	out_dev = __in_dev_get_rcu(dev);
1812 	if (!out_dev) {
1813 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1814 		return -EINVAL;
1815 	}
1816 
1817 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1818 				  in_dev->dev, in_dev, &itag);
1819 	if (err < 0) {
1820 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1821 					 saddr);
1822 
1823 		goto cleanup;
1824 	}
1825 
1826 	do_cache = res->fi && !itag;
1827 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1828 	    skb->protocol == htons(ETH_P_IP)) {
1829 		__be32 gw;
1830 
1831 		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1832 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1833 		    inet_addr_onlink(out_dev, saddr, gw))
1834 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1835 	}
1836 
1837 	if (skb->protocol != htons(ETH_P_IP)) {
1838 		/* Not IP (i.e. ARP). Do not create a route if it is
1839 		 * invalid for proxy arp. DNAT routes are always valid.
1840 		 *
1841 		 * The proxy arp feature has been extended to allow ARP
1842 		 * replies back to the same interface, to support
1843 		 * Private VLAN switch technologies. See arp.c.
1844 		 */
1845 		if (out_dev == in_dev &&
1846 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1847 			err = -EINVAL;
1848 			goto cleanup;
1849 		}
1850 	}
1851 
1852 	fnhe = find_exception(nhc, daddr);
1853 	if (do_cache) {
1854 		if (fnhe)
1855 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1856 		else
1857 			rth = rcu_dereference(nhc->nhc_rth_input);
1858 		if (rt_cache_valid(rth)) {
1859 			skb_dst_set_noref(skb, &rth->dst);
1860 			goto out;
1861 		}
1862 	}
1863 
1864 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1865 			   IN_DEV_ORCONF(in_dev, NOPOLICY),
1866 			   IN_DEV_ORCONF(out_dev, NOXFRM));
1867 	if (!rth) {
1868 		err = -ENOBUFS;
1869 		goto cleanup;
1870 	}
1871 
1872 	rth->rt_is_input = 1;
1873 	RT_CACHE_STAT_INC(in_slow_tot);
1874 
1875 	rth->dst.input = ip_forward;
1876 
1877 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1878 		       do_cache);
1879 	lwtunnel_set_redirect(&rth->dst);
1880 	skb_dst_set(skb, &rth->dst);
1881 out:
1882 	err = 0;
1883  cleanup:
1884 	return err;
1885 }
1886 
1887 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1888 /* To make ICMP packets follow the right flow, the multipath hash is
1889  * calculated from the inner IP addresses.
1890  */
1891 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1892 				 struct flow_keys *hash_keys)
1893 {
1894 	const struct iphdr *outer_iph = ip_hdr(skb);
1895 	const struct iphdr *key_iph = outer_iph;
1896 	const struct iphdr *inner_iph;
1897 	const struct icmphdr *icmph;
1898 	struct iphdr _inner_iph;
1899 	struct icmphdr _icmph;
1900 
1901 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1902 		goto out;
1903 
1904 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1905 		goto out;
1906 
1907 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1908 				   &_icmph);
1909 	if (!icmph)
1910 		goto out;
1911 
1912 	if (!icmp_is_err(icmph->type))
1913 		goto out;
1914 
1915 	inner_iph = skb_header_pointer(skb,
1916 				       outer_iph->ihl * 4 + sizeof(_icmph),
1917 				       sizeof(_inner_iph), &_inner_iph);
1918 	if (!inner_iph)
1919 		goto out;
1920 
1921 	key_iph = inner_iph;
1922 out:
1923 	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1924 	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1925 }
1926 
1927 /* If skb is set it will be used and fl4 can be NULL. */
1928 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1929 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1930 {
1931 	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1932 	struct flow_keys hash_keys;
1933 	u32 mhash;
1934 
1935 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1936 	case 0:
1937 		memset(&hash_keys, 0, sizeof(hash_keys));
1938 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1939 		if (skb) {
1940 			ip_multipath_l3_keys(skb, &hash_keys);
1941 		} else {
1942 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1943 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1944 		}
1945 		break;
1946 	case 1:
1947 		/* skb is currently provided only when forwarding */
1948 		if (skb) {
1949 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1950 			struct flow_keys keys;
1951 
1952 			/* short-circuit if we already have L4 hash present */
1953 			if (skb->l4_hash)
1954 				return skb_get_hash_raw(skb) >> 1;
1955 
1956 			memset(&hash_keys, 0, sizeof(hash_keys));
1957 
1958 			if (!flkeys) {
1959 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1960 				flkeys = &keys;
1961 			}
1962 
1963 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1964 			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1965 			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1966 			hash_keys.ports.src = flkeys->ports.src;
1967 			hash_keys.ports.dst = flkeys->ports.dst;
1968 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1969 		} else {
1970 			memset(&hash_keys, 0, sizeof(hash_keys));
1971 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1972 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1973 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1974 			hash_keys.ports.src = fl4->fl4_sport;
1975 			hash_keys.ports.dst = fl4->fl4_dport;
1976 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1977 		}
1978 		break;
1979 	case 2:
1980 		memset(&hash_keys, 0, sizeof(hash_keys));
1981 		/* skb is currently provided only when forwarding */
1982 		if (skb) {
1983 			struct flow_keys keys;
1984 
1985 			skb_flow_dissect_flow_keys(skb, &keys, 0);
1986 			/* Inner can be v4 or v6 */
1987 			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1988 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1989 				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1990 				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1991 			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1992 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1993 				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1994 				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1995 				hash_keys.tags.flow_label = keys.tags.flow_label;
1996 				hash_keys.basic.ip_proto = keys.basic.ip_proto;
1997 			} else {
1998 				/* Same as case 0 */
1999 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2000 				ip_multipath_l3_keys(skb, &hash_keys);
2001 			}
2002 		} else {
2003 			/* Same as case 0 */
2004 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2005 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2006 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2007 		}
2008 		break;
2009 	}
2010 	mhash = flow_hash_from_keys(&hash_keys);
2011 
2012 	if (multipath_hash)
2013 		mhash = jhash_2words(mhash, multipath_hash, 0);
2014 
2015 	return mhash >> 1;
2016 }
2017 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
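
/* Illustrative note (not from the kernel sources): the policy switch in
 * fib_multipath_hash() above is driven by the per-netns
 * fib_multipath_hash_policy sysctl.  A minimal example of selecting L4
 * (5-tuple) hashing for ECMP routes from userspace:
 *
 *	# 0 = L3 addresses, 1 = L4 5-tuple, 2 = inner L3 of encapsulated packets
 *	sysctl -w net.ipv4.fib_multipath_hash_policy=1
 *
 * With policy 1 and a forwarded skb, the packet itself is dissected; for
 * locally generated traffic the fields already present in the flowi4 are used.
 */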
2018 
2019 static int ip_mkroute_input(struct sk_buff *skb,
2020 			    struct fib_result *res,
2021 			    struct in_device *in_dev,
2022 			    __be32 daddr, __be32 saddr, u32 tos,
2023 			    struct flow_keys *hkeys)
2024 {
2025 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2026 	if (res->fi && fib_info_num_path(res->fi) > 1) {
2027 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2028 
2029 		fib_select_multipath(res, h);
2030 	}
2031 #endif
2032 
2033 	/* create a routing cache entry */
2034 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2035 }
2036 
2037 /* Implements the same saddr-related checks as ip_route_input_slow(),
2038  * assuming daddr is valid and the destination is not a local broadcast one.
2039  * Uses the provided hint instead of performing a route lookup.
2040  */
2041 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2042 		      u8 tos, struct net_device *dev,
2043 		      const struct sk_buff *hint)
2044 {
2045 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2046 	struct rtable *rt = skb_rtable(hint);
2047 	struct net *net = dev_net(dev);
2048 	int err = -EINVAL;
2049 	u32 tag = 0;
2050 
2051 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2052 		goto martian_source;
2053 
2054 	if (ipv4_is_zeronet(saddr))
2055 		goto martian_source;
2056 
2057 	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2058 		goto martian_source;
2059 
2060 	if (rt->rt_type != RTN_LOCAL)
2061 		goto skip_validate_source;
2062 
2063 	tos &= IPTOS_RT_MASK;
2064 	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2065 	if (err < 0)
2066 		goto martian_source;
2067 
2068 skip_validate_source:
2069 	skb_dst_copy(skb, hint);
2070 	return 0;
2071 
2072 martian_source:
2073 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2074 	return err;
2075 }
2076 
2077 /*
2078  *	NOTE. We drop all packets that have local source
2079  *	addresses, because every properly looped-back packet
2080  *	must already have the correct destination attached by the output routine.
2081  *	Changes in the enforced policies must also be applied to
2082  *	ip_route_use_hint().
2083  *
2084  *	This approach solves two big problems:
2085  *	1. Non-simplex devices are handled properly.
2086  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2087  *	Called with rcu_read_lock().
2088  */
2089 
2090 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2091 			       u8 tos, struct net_device *dev,
2092 			       struct fib_result *res)
2093 {
2094 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2095 	struct flow_keys *flkeys = NULL, _flkeys;
2096 	struct net    *net = dev_net(dev);
2097 	struct ip_tunnel_info *tun_info;
2098 	int		err = -EINVAL;
2099 	unsigned int	flags = 0;
2100 	u32		itag = 0;
2101 	struct rtable	*rth;
2102 	struct flowi4	fl4;
2103 	bool do_cache = true;
2104 
2105 	/* IP on this device is disabled. */
2106 
2107 	if (!in_dev)
2108 		goto out;
2109 
2110 	/* Check for the weirdest martians, which cannot be detected
2111 	   by fib_lookup.
2112 	 */
2113 
2114 	tun_info = skb_tunnel_info(skb);
2115 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2116 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2117 	else
2118 		fl4.flowi4_tun_key.tun_id = 0;
2119 	skb_dst_drop(skb);
2120 
2121 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2122 		goto martian_source;
2123 
2124 	res->fi = NULL;
2125 	res->table = NULL;
2126 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2127 		goto brd_input;
2128 
2129 	/* Accept zero addresses only for limited broadcast;
2130 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2131 	 */
2132 	if (ipv4_is_zeronet(saddr))
2133 		goto martian_source;
2134 
2135 	if (ipv4_is_zeronet(daddr))
2136 		goto martian_destination;
2137 
2138 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2139 	 * calling it at most once when daddr and/or saddr is a loopback address.
2140 	 */
2141 	if (ipv4_is_loopback(daddr)) {
2142 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2143 			goto martian_destination;
2144 	} else if (ipv4_is_loopback(saddr)) {
2145 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2146 			goto martian_source;
2147 	}
2148 
2149 	/*
2150 	 *	Now we are ready to route the packet.
2151 	 */
2152 	fl4.flowi4_oif = 0;
2153 	fl4.flowi4_iif = dev->ifindex;
2154 	fl4.flowi4_mark = skb->mark;
2155 	fl4.flowi4_tos = tos;
2156 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2157 	fl4.flowi4_flags = 0;
2158 	fl4.daddr = daddr;
2159 	fl4.saddr = saddr;
2160 	fl4.flowi4_uid = sock_net_uid(net, NULL);
2161 	fl4.flowi4_multipath_hash = 0;
2162 
2163 	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2164 		flkeys = &_flkeys;
2165 	} else {
2166 		fl4.flowi4_proto = 0;
2167 		fl4.fl4_sport = 0;
2168 		fl4.fl4_dport = 0;
2169 	}
2170 
2171 	err = fib_lookup(net, &fl4, res, 0);
2172 	if (err != 0) {
2173 		if (!IN_DEV_FORWARD(in_dev))
2174 			err = -EHOSTUNREACH;
2175 		goto no_route;
2176 	}
2177 
2178 	if (res->type == RTN_BROADCAST) {
2179 		if (IN_DEV_BFORWARD(in_dev))
2180 			goto make_route;
2181 		/* do not cache if bc_forwarding is enabled */
2182 		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2183 			do_cache = false;
2184 		goto brd_input;
2185 	}
2186 
2187 	if (res->type == RTN_LOCAL) {
2188 		err = fib_validate_source(skb, saddr, daddr, tos,
2189 					  0, dev, in_dev, &itag);
2190 		if (err < 0)
2191 			goto martian_source;
2192 		goto local_input;
2193 	}
2194 
2195 	if (!IN_DEV_FORWARD(in_dev)) {
2196 		err = -EHOSTUNREACH;
2197 		goto no_route;
2198 	}
2199 	if (res->type != RTN_UNICAST)
2200 		goto martian_destination;
2201 
2202 make_route:
2203 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2204 out:	return err;
2205 
2206 brd_input:
2207 	if (skb->protocol != htons(ETH_P_IP))
2208 		goto e_inval;
2209 
2210 	if (!ipv4_is_zeronet(saddr)) {
2211 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2212 					  in_dev, &itag);
2213 		if (err < 0)
2214 			goto martian_source;
2215 	}
2216 	flags |= RTCF_BROADCAST;
2217 	res->type = RTN_BROADCAST;
2218 	RT_CACHE_STAT_INC(in_brd);
2219 
2220 local_input:
2221 	do_cache &= res->fi && !itag;
2222 	if (do_cache) {
2223 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2224 
2225 		rth = rcu_dereference(nhc->nhc_rth_input);
2226 		if (rt_cache_valid(rth)) {
2227 			skb_dst_set_noref(skb, &rth->dst);
2228 			err = 0;
2229 			goto out;
2230 		}
2231 	}
2232 
2233 	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2234 			   flags | RTCF_LOCAL, res->type,
2235 			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
2236 	if (!rth)
2237 		goto e_nobufs;
2238 
2239 	rth->dst.output = ip_rt_bug;
2240 #ifdef CONFIG_IP_ROUTE_CLASSID
2241 	rth->dst.tclassid = itag;
2242 #endif
2243 	rth->rt_is_input = 1;
2244 
2245 	RT_CACHE_STAT_INC(in_slow_tot);
2246 	if (res->type == RTN_UNREACHABLE) {
2247 		rth->dst.input = ip_error;
2248 		rth->dst.error = -err;
2249 		rth->rt_flags &= ~RTCF_LOCAL;
2250 	}
2251 
2252 	if (do_cache) {
2253 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2254 
2255 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2256 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2257 			WARN_ON(rth->dst.input == lwtunnel_input);
2258 			rth->dst.lwtstate->orig_input = rth->dst.input;
2259 			rth->dst.input = lwtunnel_input;
2260 		}
2261 
2262 		if (unlikely(!rt_cache_route(nhc, rth)))
2263 			rt_add_uncached_list(rth);
2264 	}
2265 	skb_dst_set(skb, &rth->dst);
2266 	err = 0;
2267 	goto out;
2268 
2269 no_route:
2270 	RT_CACHE_STAT_INC(in_no_route);
2271 	res->type = RTN_UNREACHABLE;
2272 	res->fi = NULL;
2273 	res->table = NULL;
2274 	goto local_input;
2275 
2276 	/*
2277 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2278 	 */
2279 martian_destination:
2280 	RT_CACHE_STAT_INC(in_martian_dst);
2281 #ifdef CONFIG_IP_ROUTE_VERBOSE
2282 	if (IN_DEV_LOG_MARTIANS(in_dev))
2283 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2284 				     &daddr, &saddr, dev->name);
2285 #endif
2286 
2287 e_inval:
2288 	err = -EINVAL;
2289 	goto out;
2290 
2291 e_nobufs:
2292 	err = -ENOBUFS;
2293 	goto out;
2294 
2295 martian_source:
2296 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2297 	goto out;
2298 }
2299 
2300 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2301 			 u8 tos, struct net_device *dev)
2302 {
2303 	struct fib_result res;
2304 	int err;
2305 
2306 	tos &= IPTOS_RT_MASK;
2307 	rcu_read_lock();
2308 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2309 	rcu_read_unlock();
2310 
2311 	return err;
2312 }
2313 EXPORT_SYMBOL(ip_route_input_noref);
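
/* Illustrative sketch (not from the kernel sources): a typical receive-path
 * caller resolves the input route for a freshly received packet roughly like
 * this, where iph points at the packet's IPv4 header and "drop" is a
 * hypothetical error label in the caller:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *
 * As the _noref name suggests, the dst attached to the skb may not carry a
 * reference, so it is only valid within the current packet/RCU context.
 */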
2314 
2315 /* called with rcu_read_lock held */
2316 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2317 		       u8 tos, struct net_device *dev, struct fib_result *res)
2318 {
2319 	/* Multicast recognition logic has been moved from the route cache to here.
2320 	   The problem was that too many Ethernet cards have broken/missing
2321 	   hardware multicast filters :-( As a result, a host on a multicast
2322 	   network acquires a lot of useless route cache entries, e.g. for
2323 	   SDR messages from all over the world. Now we try to get rid of them.
2324 	   Really, provided the software IP multicast filter is organized
2325 	   reasonably (at least, hashed), it does not result in a slowdown
2326 	   compared with route cache reject entries.
2327 	   Note that multicast routers are not affected, because a
2328 	   route cache entry is created eventually.
2329 	 */
2330 	if (ipv4_is_multicast(daddr)) {
2331 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2332 		int our = 0;
2333 		int err = -EINVAL;
2334 
2335 		if (!in_dev)
2336 			return err;
2337 		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2338 				      ip_hdr(skb)->protocol);
2339 
2340 		/* check l3 master if no match yet */
2341 		if (!our && netif_is_l3_slave(dev)) {
2342 			struct in_device *l3_in_dev;
2343 
2344 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2345 			if (l3_in_dev)
2346 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2347 						      ip_hdr(skb)->protocol);
2348 		}
2349 
2350 		if (our
2351 #ifdef CONFIG_IP_MROUTE
2352 			||
2353 		    (!ipv4_is_local_multicast(daddr) &&
2354 		     IN_DEV_MFORWARD(in_dev))
2355 #endif
2356 		   ) {
2357 			err = ip_route_input_mc(skb, daddr, saddr,
2358 						tos, dev, our);
2359 		}
2360 		return err;
2361 	}
2362 
2363 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2364 }
2365 
2366 /* called with rcu_read_lock() */
2367 static struct rtable *__mkroute_output(const struct fib_result *res,
2368 				       const struct flowi4 *fl4, int orig_oif,
2369 				       struct net_device *dev_out,
2370 				       unsigned int flags)
2371 {
2372 	struct fib_info *fi = res->fi;
2373 	struct fib_nh_exception *fnhe;
2374 	struct in_device *in_dev;
2375 	u16 type = res->type;
2376 	struct rtable *rth;
2377 	bool do_cache;
2378 
2379 	in_dev = __in_dev_get_rcu(dev_out);
2380 	if (!in_dev)
2381 		return ERR_PTR(-EINVAL);
2382 
2383 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2384 		if (ipv4_is_loopback(fl4->saddr) &&
2385 		    !(dev_out->flags & IFF_LOOPBACK) &&
2386 		    !netif_is_l3_master(dev_out))
2387 			return ERR_PTR(-EINVAL);
2388 
2389 	if (ipv4_is_lbcast(fl4->daddr))
2390 		type = RTN_BROADCAST;
2391 	else if (ipv4_is_multicast(fl4->daddr))
2392 		type = RTN_MULTICAST;
2393 	else if (ipv4_is_zeronet(fl4->daddr))
2394 		return ERR_PTR(-EINVAL);
2395 
2396 	if (dev_out->flags & IFF_LOOPBACK)
2397 		flags |= RTCF_LOCAL;
2398 
2399 	do_cache = true;
2400 	if (type == RTN_BROADCAST) {
2401 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2402 		fi = NULL;
2403 	} else if (type == RTN_MULTICAST) {
2404 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2405 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2406 				     fl4->flowi4_proto))
2407 			flags &= ~RTCF_LOCAL;
2408 		else
2409 			do_cache = false;
2410 		/* If the multicast route does not exist, use the
2411 		 * default one, but do not gateway in this case.
2412 		 * Yes, it is a hack.
2413 		 */
2414 		if (fi && res->prefixlen < 4)
2415 			fi = NULL;
2416 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2417 		   (orig_oif != dev_out->ifindex)) {
2418 		/* For local routes that require a particular output interface
2419 		 * we do not want to cache the result.  Caching the result
2420 		 * causes incorrect behaviour when there are multiple source
2421 		 * addresses on the interface, the end result being that if the
2422 		 * intended recipient is waiting on that interface for the
2423 		 * packet he won't receive it because it will be delivered on
2424 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2425 		 * be set to the loopback interface as well.
2426 		 */
2427 		do_cache = false;
2428 	}
2429 
2430 	fnhe = NULL;
2431 	do_cache &= fi != NULL;
2432 	if (fi) {
2433 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2434 		struct rtable __rcu **prth;
2435 
2436 		fnhe = find_exception(nhc, fl4->daddr);
2437 		if (!do_cache)
2438 			goto add;
2439 		if (fnhe) {
2440 			prth = &fnhe->fnhe_rth_output;
2441 		} else {
2442 			if (unlikely(fl4->flowi4_flags &
2443 				     FLOWI_FLAG_KNOWN_NH &&
2444 				     !(nhc->nhc_gw_family &&
2445 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2446 				do_cache = false;
2447 				goto add;
2448 			}
2449 			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2450 		}
2451 		rth = rcu_dereference(*prth);
2452 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2453 			return rth;
2454 	}
2455 
2456 add:
2457 	rth = rt_dst_alloc(dev_out, flags, type,
2458 			   IN_DEV_ORCONF(in_dev, NOPOLICY),
2459 			   IN_DEV_ORCONF(in_dev, NOXFRM));
2460 	if (!rth)
2461 		return ERR_PTR(-ENOBUFS);
2462 
2463 	rth->rt_iif = orig_oif;
2464 
2465 	RT_CACHE_STAT_INC(out_slow_tot);
2466 
2467 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2468 		if (flags & RTCF_LOCAL &&
2469 		    !(dev_out->flags & IFF_LOOPBACK)) {
2470 			rth->dst.output = ip_mc_output;
2471 			RT_CACHE_STAT_INC(out_slow_mc);
2472 		}
2473 #ifdef CONFIG_IP_MROUTE
2474 		if (type == RTN_MULTICAST) {
2475 			if (IN_DEV_MFORWARD(in_dev) &&
2476 			    !ipv4_is_local_multicast(fl4->daddr)) {
2477 				rth->dst.input = ip_mr_input;
2478 				rth->dst.output = ip_mc_output;
2479 			}
2480 		}
2481 #endif
2482 	}
2483 
2484 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2485 	lwtunnel_set_redirect(&rth->dst);
2486 
2487 	return rth;
2488 }
2489 
2490 /*
2491  * Major route resolver routine.
2492  */
2493 
2494 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2495 					const struct sk_buff *skb)
2496 {
2497 	__u8 tos = RT_FL_TOS(fl4);
2498 	struct fib_result res = {
2499 		.type		= RTN_UNSPEC,
2500 		.fi		= NULL,
2501 		.table		= NULL,
2502 		.tclassid	= 0,
2503 	};
2504 	struct rtable *rth;
2505 
2506 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2507 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2508 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2509 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2510 
2511 	rcu_read_lock();
2512 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2513 	rcu_read_unlock();
2514 
2515 	return rth;
2516 }
2517 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
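
/* Illustrative sketch (not from the kernel sources): a minimal output lookup
 * through the wrappers built on this resolver; daddr, saddr, tos and oif are
 * hypothetical values supplied by the caller:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.saddr		= saddr,
 *		.flowi4_tos	= tos & IPTOS_RT_MASK,
 *		.flowi4_oif	= oif,
 *	};
 *	struct rtable *rt;
 *
 *	rt = ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * ip_route_output_key() ends up in ip_route_output_key_hash() above; callers
 * that want protocol/port aware policy routing also fill flowi4_proto and the
 * fl4_sport/fl4_dport fields.
 */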
2518 
2519 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2520 					    struct fib_result *res,
2521 					    const struct sk_buff *skb)
2522 {
2523 	struct net_device *dev_out = NULL;
2524 	int orig_oif = fl4->flowi4_oif;
2525 	unsigned int flags = 0;
2526 	struct rtable *rth;
2527 	int err;
2528 
2529 	if (fl4->saddr) {
2530 		if (ipv4_is_multicast(fl4->saddr) ||
2531 		    ipv4_is_lbcast(fl4->saddr) ||
2532 		    ipv4_is_zeronet(fl4->saddr)) {
2533 			rth = ERR_PTR(-EINVAL);
2534 			goto out;
2535 		}
2536 
2537 		rth = ERR_PTR(-ENETUNREACH);
2538 
2539 		/* I removed the check for oif == dev_out->oif here.
2540 		   It was wrong for two reasons:
2541 		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2542 		      is assigned to multiple interfaces.
2543 		   2. Moreover, we are allowed to send packets with the saddr
2544 		      of another iface. --ANK
2545 		 */
2546 
2547 		if (fl4->flowi4_oif == 0 &&
2548 		    (ipv4_is_multicast(fl4->daddr) ||
2549 		     ipv4_is_lbcast(fl4->daddr))) {
2550 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2551 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2552 			if (!dev_out)
2553 				goto out;
2554 
2555 			/* Special hack: the user can direct multicasts
2556 			   and limited broadcast via the necessary interface
2557 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2558 			   This hack is not just for fun, it allows
2559 			   vic, vat and friends to work.
2560 			   They bind the socket to loopback, set the ttl to zero
2561 			   and expect that it will work.
2562 			   From the viewpoint of the routing cache they are broken,
2563 			   because we are not allowed to build a multicast path
2564 			   with a loopback source addr (the routing cache
2565 			   cannot know that the ttl is zero, so that the packet
2566 			   will never leave this host and the route is in fact valid).
2567 			   Luckily, this hack is a good workaround.
2568 			 */
2569 
2570 			fl4->flowi4_oif = dev_out->ifindex;
2571 			goto make_route;
2572 		}
2573 
2574 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2575 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2576 			if (!__ip_dev_find(net, fl4->saddr, false))
2577 				goto out;
2578 		}
2579 	}
2580 
2581 
2582 	if (fl4->flowi4_oif) {
2583 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2584 		rth = ERR_PTR(-ENODEV);
2585 		if (!dev_out)
2586 			goto out;
2587 
2588 		/* RACE: Check return value of inet_select_addr instead. */
2589 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2590 			rth = ERR_PTR(-ENETUNREACH);
2591 			goto out;
2592 		}
2593 		if (ipv4_is_local_multicast(fl4->daddr) ||
2594 		    ipv4_is_lbcast(fl4->daddr) ||
2595 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2596 			if (!fl4->saddr)
2597 				fl4->saddr = inet_select_addr(dev_out, 0,
2598 							      RT_SCOPE_LINK);
2599 			goto make_route;
2600 		}
2601 		if (!fl4->saddr) {
2602 			if (ipv4_is_multicast(fl4->daddr))
2603 				fl4->saddr = inet_select_addr(dev_out, 0,
2604 							      fl4->flowi4_scope);
2605 			else if (!fl4->daddr)
2606 				fl4->saddr = inet_select_addr(dev_out, 0,
2607 							      RT_SCOPE_HOST);
2608 		}
2609 	}
2610 
2611 	if (!fl4->daddr) {
2612 		fl4->daddr = fl4->saddr;
2613 		if (!fl4->daddr)
2614 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2615 		dev_out = net->loopback_dev;
2616 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2617 		res->type = RTN_LOCAL;
2618 		flags |= RTCF_LOCAL;
2619 		goto make_route;
2620 	}
2621 
2622 	err = fib_lookup(net, fl4, res, 0);
2623 	if (err) {
2624 		res->fi = NULL;
2625 		res->table = NULL;
2626 		if (fl4->flowi4_oif &&
2627 		    (ipv4_is_multicast(fl4->daddr) ||
2628 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2629 			/* Apparently, the routing tables are wrong. Assume
2630 			   that the destination is on link.
2631 
2632 			   WHY? DW.
2633 			   Because we are allowed to send to an iface
2634 			   even if it has NO routes and NO assigned
2635 			   addresses. When oif is specified, the routing
2636 			   tables are looked up with only one purpose:
2637 			   to catch whether the destination is gatewayed rather
2638 			   than direct. Moreover, if MSG_DONTROUTE is set,
2639 			   we send the packet, ignoring both the routing tables
2640 			   and the ifaddr state. --ANK
2641 
2642 
2643 			   We could do this even when oif is unknown,
2644 			   as IPv6 likely does, but we do not.
2645 			 */
2646 
2647 			if (fl4->saddr == 0)
2648 				fl4->saddr = inet_select_addr(dev_out, 0,
2649 							      RT_SCOPE_LINK);
2650 			res->type = RTN_UNICAST;
2651 			goto make_route;
2652 		}
2653 		rth = ERR_PTR(err);
2654 		goto out;
2655 	}
2656 
2657 	if (res->type == RTN_LOCAL) {
2658 		if (!fl4->saddr) {
2659 			if (res->fi->fib_prefsrc)
2660 				fl4->saddr = res->fi->fib_prefsrc;
2661 			else
2662 				fl4->saddr = fl4->daddr;
2663 		}
2664 
2665 		/* L3 master device is the loopback for that domain */
2666 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2667 			net->loopback_dev;
2668 
2669 		/* make sure orig_oif points to fib result device even
2670 		 * though packet rx/tx happens over loopback or l3mdev
2671 		 */
2672 		orig_oif = FIB_RES_OIF(*res);
2673 
2674 		fl4->flowi4_oif = dev_out->ifindex;
2675 		flags |= RTCF_LOCAL;
2676 		goto make_route;
2677 	}
2678 
2679 	fib_select_path(net, res, fl4, skb);
2680 
2681 	dev_out = FIB_RES_DEV(*res);
2682 
2683 make_route:
2684 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2685 
2686 out:
2687 	return rth;
2688 }
2689 
2690 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2691 {
2692 	return NULL;
2693 }
2694 
2695 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2696 {
2697 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2698 
2699 	return mtu ? : dst->dev->mtu;
2700 }
2701 
2702 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2703 					  struct sk_buff *skb, u32 mtu,
2704 					  bool confirm_neigh)
2705 {
2706 }
2707 
2708 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2709 				       struct sk_buff *skb)
2710 {
2711 }
2712 
2713 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2714 					  unsigned long old)
2715 {
2716 	return NULL;
2717 }
2718 
2719 static struct dst_ops ipv4_dst_blackhole_ops = {
2720 	.family			=	AF_INET,
2721 	.check			=	ipv4_blackhole_dst_check,
2722 	.mtu			=	ipv4_blackhole_mtu,
2723 	.default_advmss		=	ipv4_default_advmss,
2724 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2725 	.redirect		=	ipv4_rt_blackhole_redirect,
2726 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2727 	.neigh_lookup		=	ipv4_neigh_lookup,
2728 };
2729 
2730 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2731 {
2732 	struct rtable *ort = (struct rtable *) dst_orig;
2733 	struct rtable *rt;
2734 
2735 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2736 	if (rt) {
2737 		struct dst_entry *new = &rt->dst;
2738 
2739 		new->__use = 1;
2740 		new->input = dst_discard;
2741 		new->output = dst_discard_out;
2742 
2743 		new->dev = net->loopback_dev;
2744 		if (new->dev)
2745 			dev_hold(new->dev);
2746 
2747 		rt->rt_is_input = ort->rt_is_input;
2748 		rt->rt_iif = ort->rt_iif;
2749 		rt->rt_pmtu = ort->rt_pmtu;
2750 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2751 
2752 		rt->rt_genid = rt_genid_ipv4(net);
2753 		rt->rt_flags = ort->rt_flags;
2754 		rt->rt_type = ort->rt_type;
2755 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2756 		rt->rt_gw_family = ort->rt_gw_family;
2757 		if (rt->rt_gw_family == AF_INET)
2758 			rt->rt_gw4 = ort->rt_gw4;
2759 		else if (rt->rt_gw_family == AF_INET6)
2760 			rt->rt_gw6 = ort->rt_gw6;
2761 
2762 		INIT_LIST_HEAD(&rt->rt_uncached);
2763 	}
2764 
2765 	dst_release(dst_orig);
2766 
2767 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2768 }
2769 
2770 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2771 				    const struct sock *sk)
2772 {
2773 	struct rtable *rt = __ip_route_output_key(net, flp4);
2774 
2775 	if (IS_ERR(rt))
2776 		return rt;
2777 
2778 	if (flp4->flowi4_proto) {
2779 		flp4->flowi4_oif = rt->dst.dev->ifindex;
2780 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2781 							flowi4_to_flowi(flp4),
2782 							sk, 0);
2783 	}
2784 
2785 	return rt;
2786 }
2787 EXPORT_SYMBOL_GPL(ip_route_output_flow);
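
/* Illustrative sketch (not from the kernel sources): socket output paths use
 * this xfrm-aware wrapper; daddr and saddr are hypothetical values taken from
 * the connected socket:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.saddr		= saddr,
 *		.flowi4_proto	= sk->sk_protocol,
 *		.flowi4_oif	= sk->sk_bound_dev_if,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * When flowi4_proto is non-zero, the plain routing result is additionally run
 * through xfrm_lookup_route() so that IPsec policy can substitute the dst.
 */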
2788 
2789 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2790 				      struct net_device *dev,
2791 				      struct net *net, __be32 *saddr,
2792 				      const struct ip_tunnel_info *info,
2793 				      u8 protocol, bool use_cache)
2794 {
2795 #ifdef CONFIG_DST_CACHE
2796 	struct dst_cache *dst_cache;
2797 #endif
2798 	struct rtable *rt = NULL;
2799 	struct flowi4 fl4;
2800 	__u8 tos;
2801 
2802 #ifdef CONFIG_DST_CACHE
2803 	dst_cache = (struct dst_cache *)&info->dst_cache;
2804 	if (use_cache) {
2805 		rt = dst_cache_get_ip4(dst_cache, saddr);
2806 		if (rt)
2807 			return rt;
2808 	}
2809 #endif
2810 	memset(&fl4, 0, sizeof(fl4));
2811 	fl4.flowi4_mark = skb->mark;
2812 	fl4.flowi4_proto = protocol;
2813 	fl4.daddr = info->key.u.ipv4.dst;
2814 	fl4.saddr = info->key.u.ipv4.src;
2815 	tos = info->key.tos;
2816 	fl4.flowi4_tos = RT_TOS(tos);
2817 
2818 	rt = ip_route_output_key(net, &fl4);
2819 	if (IS_ERR(rt)) {
2820 		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2821 		return ERR_PTR(-ENETUNREACH);
2822 	}
2823 	if (rt->dst.dev == dev) { /* is this necessary? */
2824 		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2825 		ip_rt_put(rt);
2826 		return ERR_PTR(-ELOOP);
2827 	}
2828 #ifdef CONFIG_DST_CACHE
2829 	if (use_cache)
2830 		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2831 #endif
2832 	*saddr = fl4.saddr;
2833 	return rt;
2834 }
2835 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
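
/* Illustrative sketch (not from the kernel sources): a collect-metadata tunnel
 * transmit path might resolve the outer route like this, where "info" is the
 * ip_tunnel_info attached to the skb and use_cache reflects its key flags:
 *
 *	__be32 saddr;
 *	struct rtable *rt;
 *
 *	rt = ip_route_output_tunnel(skb, dev, dev_net(dev), &saddr, info,
 *				    IPPROTO_UDP, use_cache);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * On success, *saddr holds the source address chosen for the outer header.
 */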
2836 
2837 /* called with rcu_read_lock held */
2838 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2839 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2840 			struct sk_buff *skb, u32 portid, u32 seq,
2841 			unsigned int flags)
2842 {
2843 	struct rtmsg *r;
2844 	struct nlmsghdr *nlh;
2845 	unsigned long expires = 0;
2846 	u32 error;
2847 	u32 metrics[RTAX_MAX];
2848 
2849 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2850 	if (!nlh)
2851 		return -EMSGSIZE;
2852 
2853 	r = nlmsg_data(nlh);
2854 	r->rtm_family	 = AF_INET;
2855 	r->rtm_dst_len	= 32;
2856 	r->rtm_src_len	= 0;
2857 	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2858 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2859 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2860 		goto nla_put_failure;
2861 	r->rtm_type	= rt->rt_type;
2862 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2863 	r->rtm_protocol = RTPROT_UNSPEC;
2864 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2865 	if (rt->rt_flags & RTCF_NOTIFY)
2866 		r->rtm_flags |= RTM_F_NOTIFY;
2867 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2868 		r->rtm_flags |= RTCF_DOREDIRECT;
2869 
2870 	if (nla_put_in_addr(skb, RTA_DST, dst))
2871 		goto nla_put_failure;
2872 	if (src) {
2873 		r->rtm_src_len = 32;
2874 		if (nla_put_in_addr(skb, RTA_SRC, src))
2875 			goto nla_put_failure;
2876 	}
2877 	if (rt->dst.dev &&
2878 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2879 		goto nla_put_failure;
2880 	if (rt->dst.lwtstate &&
2881 	    lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
2882 		goto nla_put_failure;
2883 #ifdef CONFIG_IP_ROUTE_CLASSID
2884 	if (rt->dst.tclassid &&
2885 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2886 		goto nla_put_failure;
2887 #endif
2888 	if (fl4 && !rt_is_input_route(rt) &&
2889 	    fl4->saddr != src) {
2890 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2891 			goto nla_put_failure;
2892 	}
2893 	if (rt->rt_uses_gateway) {
2894 		if (rt->rt_gw_family == AF_INET &&
2895 		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2896 			goto nla_put_failure;
2897 		} else if (rt->rt_gw_family == AF_INET6) {
2898 			int alen = sizeof(struct in6_addr);
2899 			struct nlattr *nla;
2900 			struct rtvia *via;
2901 
2902 			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2903 			if (!nla)
2904 				goto nla_put_failure;
2905 
2906 			via = nla_data(nla);
2907 			via->rtvia_family = AF_INET6;
2908 			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2909 		}
2910 	}
2911 
2912 	expires = rt->dst.expires;
2913 	if (expires) {
2914 		unsigned long now = jiffies;
2915 
2916 		if (time_before(now, expires))
2917 			expires -= now;
2918 		else
2919 			expires = 0;
2920 	}
2921 
2922 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2923 	if (rt->rt_pmtu && expires)
2924 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2925 	if (rt->rt_mtu_locked && expires)
2926 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2927 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2928 		goto nla_put_failure;
2929 
2930 	if (fl4) {
2931 		if (fl4->flowi4_mark &&
2932 		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2933 			goto nla_put_failure;
2934 
2935 		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2936 		    nla_put_u32(skb, RTA_UID,
2937 				from_kuid_munged(current_user_ns(),
2938 						 fl4->flowi4_uid)))
2939 			goto nla_put_failure;
2940 
2941 		if (rt_is_input_route(rt)) {
2942 #ifdef CONFIG_IP_MROUTE
2943 			if (ipv4_is_multicast(dst) &&
2944 			    !ipv4_is_local_multicast(dst) &&
2945 			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2946 				int err = ipmr_get_route(net, skb,
2947 							 fl4->saddr, fl4->daddr,
2948 							 r, portid);
2949 
2950 				if (err <= 0) {
2951 					if (err == 0)
2952 						return 0;
2953 					goto nla_put_failure;
2954 				}
2955 			} else
2956 #endif
2957 				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2958 					goto nla_put_failure;
2959 		}
2960 	}
2961 
2962 	error = rt->dst.error;
2963 
2964 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2965 		goto nla_put_failure;
2966 
2967 	nlmsg_end(skb, nlh);
2968 	return 0;
2969 
2970 nla_put_failure:
2971 	nlmsg_cancel(skb, nlh);
2972 	return -EMSGSIZE;
2973 }
2974 
2975 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2976 			    struct netlink_callback *cb, u32 table_id,
2977 			    struct fnhe_hash_bucket *bucket, int genid,
2978 			    int *fa_index, int fa_start, unsigned int flags)
2979 {
2980 	int i;
2981 
2982 	for (i = 0; i < FNHE_HASH_SIZE; i++) {
2983 		struct fib_nh_exception *fnhe;
2984 
2985 		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2986 		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2987 			struct rtable *rt;
2988 			int err;
2989 
2990 			if (*fa_index < fa_start)
2991 				goto next;
2992 
2993 			if (fnhe->fnhe_genid != genid)
2994 				goto next;
2995 
2996 			if (fnhe->fnhe_expires &&
2997 			    time_after(jiffies, fnhe->fnhe_expires))
2998 				goto next;
2999 
3000 			rt = rcu_dereference(fnhe->fnhe_rth_input);
3001 			if (!rt)
3002 				rt = rcu_dereference(fnhe->fnhe_rth_output);
3003 			if (!rt)
3004 				goto next;
3005 
3006 			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3007 					   table_id, NULL, skb,
3008 					   NETLINK_CB(cb->skb).portid,
3009 					   cb->nlh->nlmsg_seq, flags);
3010 			if (err)
3011 				return err;
3012 next:
3013 			(*fa_index)++;
3014 		}
3015 	}
3016 
3017 	return 0;
3018 }
3019 
3020 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3021 		       u32 table_id, struct fib_info *fi,
3022 		       int *fa_index, int fa_start, unsigned int flags)
3023 {
3024 	struct net *net = sock_net(cb->skb->sk);
3025 	int nhsel, genid = fnhe_genid(net);
3026 
3027 	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3028 		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3029 		struct fnhe_hash_bucket *bucket;
3030 		int err;
3031 
3032 		if (nhc->nhc_flags & RTNH_F_DEAD)
3033 			continue;
3034 
3035 		rcu_read_lock();
3036 		bucket = rcu_dereference(nhc->nhc_exceptions);
3037 		err = 0;
3038 		if (bucket)
3039 			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3040 					       genid, fa_index, fa_start,
3041 					       flags);
3042 		rcu_read_unlock();
3043 		if (err)
3044 			return err;
3045 	}
3046 
3047 	return 0;
3048 }
3049 
3050 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3051 						   u8 ip_proto, __be16 sport,
3052 						   __be16 dport)
3053 {
3054 	struct sk_buff *skb;
3055 	struct iphdr *iph;
3056 
3057 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3058 	if (!skb)
3059 		return NULL;
3060 
3061 	/* Reserve room for dummy headers; this skb can pass
3062 	 * through a good chunk of the routing engine.
3063 	 */
3064 	skb_reset_mac_header(skb);
3065 	skb_reset_network_header(skb);
3066 	skb->protocol = htons(ETH_P_IP);
3067 	iph = skb_put(skb, sizeof(struct iphdr));
3068 	iph->protocol = ip_proto;
3069 	iph->saddr = src;
3070 	iph->daddr = dst;
3071 	iph->version = 0x4;
3072 	iph->frag_off = 0;
3073 	iph->ihl = 0x5;
3074 	skb_set_transport_header(skb, skb->len);
3075 
3076 	switch (iph->protocol) {
3077 	case IPPROTO_UDP: {
3078 		struct udphdr *udph;
3079 
3080 		udph = skb_put_zero(skb, sizeof(struct udphdr));
3081 		udph->source = sport;
3082 		udph->dest = dport;
3083 		udph->len = sizeof(struct udphdr);
3084 		udph->check = 0;
3085 		break;
3086 	}
3087 	case IPPROTO_TCP: {
3088 		struct tcphdr *tcph;
3089 
3090 		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3091 		tcph->source	= sport;
3092 		tcph->dest	= dport;
3093 		tcph->doff	= sizeof(struct tcphdr) / 4;
3094 		tcph->rst = 1;
3095 		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3096 					    src, dst, 0);
3097 		break;
3098 	}
3099 	case IPPROTO_ICMP: {
3100 		struct icmphdr *icmph;
3101 
3102 		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3103 		icmph->type = ICMP_ECHO;
3104 		icmph->code = 0;
3105 	}
3106 	}
3107 
3108 	return skb;
3109 }
3110 
3111 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3112 				       const struct nlmsghdr *nlh,
3113 				       struct nlattr **tb,
3114 				       struct netlink_ext_ack *extack)
3115 {
3116 	struct rtmsg *rtm;
3117 	int i, err;
3118 
3119 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3120 		NL_SET_ERR_MSG(extack,
3121 			       "ipv4: Invalid header for route get request");
3122 		return -EINVAL;
3123 	}
3124 
3125 	if (!netlink_strict_get_check(skb))
3126 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3127 					      rtm_ipv4_policy, extack);
3128 
3129 	rtm = nlmsg_data(nlh);
3130 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3131 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3132 	    rtm->rtm_table || rtm->rtm_protocol ||
3133 	    rtm->rtm_scope || rtm->rtm_type) {
3134 		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3135 		return -EINVAL;
3136 	}
3137 
3138 	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3139 			       RTM_F_LOOKUP_TABLE |
3140 			       RTM_F_FIB_MATCH)) {
3141 		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3142 		return -EINVAL;
3143 	}
3144 
3145 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3146 					    rtm_ipv4_policy, extack);
3147 	if (err)
3148 		return err;
3149 
3150 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3151 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3152 		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3153 		return -EINVAL;
3154 	}
3155 
3156 	for (i = 0; i <= RTA_MAX; i++) {
3157 		if (!tb[i])
3158 			continue;
3159 
3160 		switch (i) {
3161 		case RTA_IIF:
3162 		case RTA_OIF:
3163 		case RTA_SRC:
3164 		case RTA_DST:
3165 		case RTA_IP_PROTO:
3166 		case RTA_SPORT:
3167 		case RTA_DPORT:
3168 		case RTA_MARK:
3169 		case RTA_UID:
3170 			break;
3171 		default:
3172 			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3173 			return -EINVAL;
3174 		}
3175 	}
3176 
3177 	return 0;
3178 }
3179 
3180 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3181 			     struct netlink_ext_ack *extack)
3182 {
3183 	struct net *net = sock_net(in_skb->sk);
3184 	struct nlattr *tb[RTA_MAX+1];
3185 	u32 table_id = RT_TABLE_MAIN;
3186 	__be16 sport = 0, dport = 0;
3187 	struct fib_result res = {};
3188 	u8 ip_proto = IPPROTO_UDP;
3189 	struct rtable *rt = NULL;
3190 	struct sk_buff *skb;
3191 	struct rtmsg *rtm;
3192 	struct flowi4 fl4 = {};
3193 	__be32 dst = 0;
3194 	__be32 src = 0;
3195 	kuid_t uid;
3196 	u32 iif;
3197 	int err;
3198 	int mark;
3199 
3200 	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3201 	if (err < 0)
3202 		return err;
3203 
3204 	rtm = nlmsg_data(nlh);
3205 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3206 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3207 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3208 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3209 	if (tb[RTA_UID])
3210 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3211 	else
3212 		uid = (iif ? INVALID_UID : current_uid());
3213 
3214 	if (tb[RTA_IP_PROTO]) {
3215 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3216 						  &ip_proto, AF_INET, extack);
3217 		if (err)
3218 			return err;
3219 	}
3220 
3221 	if (tb[RTA_SPORT])
3222 		sport = nla_get_be16(tb[RTA_SPORT]);
3223 
3224 	if (tb[RTA_DPORT])
3225 		dport = nla_get_be16(tb[RTA_DPORT]);
3226 
3227 	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3228 	if (!skb)
3229 		return -ENOBUFS;
3230 
3231 	fl4.daddr = dst;
3232 	fl4.saddr = src;
3233 	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3234 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3235 	fl4.flowi4_mark = mark;
3236 	fl4.flowi4_uid = uid;
3237 	if (sport)
3238 		fl4.fl4_sport = sport;
3239 	if (dport)
3240 		fl4.fl4_dport = dport;
3241 	fl4.flowi4_proto = ip_proto;
3242 
3243 	rcu_read_lock();
3244 
3245 	if (iif) {
3246 		struct net_device *dev;
3247 
3248 		dev = dev_get_by_index_rcu(net, iif);
3249 		if (!dev) {
3250 			err = -ENODEV;
3251 			goto errout_rcu;
3252 		}
3253 
3254 		fl4.flowi4_iif = iif; /* for rt_fill_info */
3255 		skb->dev	= dev;
3256 		skb->mark	= mark;
3257 		err = ip_route_input_rcu(skb, dst, src,
3258 					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3259 					 &res);
3260 
3261 		rt = skb_rtable(skb);
3262 		if (err == 0 && rt->dst.error)
3263 			err = -rt->dst.error;
3264 	} else {
3265 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3266 		skb->dev = net->loopback_dev;
3267 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3268 		err = 0;
3269 		if (IS_ERR(rt))
3270 			err = PTR_ERR(rt);
3271 		else
3272 			skb_dst_set(skb, &rt->dst);
3273 	}
3274 
3275 	if (err)
3276 		goto errout_rcu;
3277 
3278 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3279 		rt->rt_flags |= RTCF_NOTIFY;
3280 
3281 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3282 		table_id = res.table ? res.table->tb_id : 0;
3283 
3284 	/* reset skb for netlink reply msg */
3285 	skb_trim(skb, 0);
3286 	skb_reset_network_header(skb);
3287 	skb_reset_transport_header(skb);
3288 	skb_reset_mac_header(skb);
3289 
3290 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3291 		struct fib_rt_info fri;
3292 
3293 		if (!res.fi) {
3294 			err = fib_props[res.type].error;
3295 			if (!err)
3296 				err = -EHOSTUNREACH;
3297 			goto errout_rcu;
3298 		}
3299 		fri.fi = res.fi;
3300 		fri.tb_id = table_id;
3301 		fri.dst = res.prefix;
3302 		fri.dst_len = res.prefixlen;
3303 		fri.tos = fl4.flowi4_tos;
3304 		fri.type = rt->rt_type;
3305 		fri.offload = 0;
3306 		fri.trap = 0;
3307 		fri.offload_failed = 0;
3308 		if (res.fa_head) {
3309 			struct fib_alias *fa;
3310 
3311 			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3312 				u8 slen = 32 - fri.dst_len;
3313 
3314 				if (fa->fa_slen == slen &&
3315 				    fa->tb_id == fri.tb_id &&
3316 				    fa->fa_tos == fri.tos &&
3317 				    fa->fa_info == res.fi &&
3318 				    fa->fa_type == fri.type) {
3319 					fri.offload = fa->offload;
3320 					fri.trap = fa->trap;
3321 					break;
3322 				}
3323 			}
3324 		}
3325 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3326 				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3327 	} else {
3328 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3329 				   NETLINK_CB(in_skb).portid,
3330 				   nlh->nlmsg_seq, 0);
3331 	}
3332 	if (err < 0)
3333 		goto errout_rcu;
3334 
3335 	rcu_read_unlock();
3336 
3337 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3338 
3339 errout_free:
3340 	return err;
3341 errout_rcu:
3342 	rcu_read_unlock();
3343 	kfree_skb(skb);
3344 	goto errout_free;
3345 }
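
/* Illustrative note (not from the kernel sources): this handler serves
 * RTM_GETROUTE requests, i.e. what iproute2's "ip route get" sends, e.g.
 *
 *	$ ip route get 192.0.2.1
 *	$ ip route get 192.0.2.1 from 198.51.100.2 iif eth0
 *
 * When RTA_IIF is present the input path is exercised via ip_route_input_rcu();
 * otherwise an output lookup is done with ip_route_output_key_hash_rcu().  With
 * RTM_F_FIB_MATCH the matching FIB entry is reported instead of the resulting
 * dst.
 */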
3346 
3347 void ip_rt_multicast_event(struct in_device *in_dev)
3348 {
3349 	rt_cache_flush(dev_net(in_dev->dev));
3350 }
3351 
3352 #ifdef CONFIG_SYSCTL
3353 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3354 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3355 static int ip_rt_gc_elasticity __read_mostly	= 8;
3356 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3357 
3358 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3359 		void *buffer, size_t *lenp, loff_t *ppos)
3360 {
3361 	struct net *net = (struct net *)__ctl->extra1;
3362 
3363 	if (write) {
3364 		rt_cache_flush(net);
3365 		fnhe_genid_bump(net);
3366 		return 0;
3367 	}
3368 
3369 	return -EINVAL;
3370 }
3371 
3372 static struct ctl_table ipv4_route_table[] = {
3373 	{
3374 		.procname	= "gc_thresh",
3375 		.data		= &ipv4_dst_ops.gc_thresh,
3376 		.maxlen		= sizeof(int),
3377 		.mode		= 0644,
3378 		.proc_handler	= proc_dointvec,
3379 	},
3380 	{
3381 		.procname	= "max_size",
3382 		.data		= &ip_rt_max_size,
3383 		.maxlen		= sizeof(int),
3384 		.mode		= 0644,
3385 		.proc_handler	= proc_dointvec,
3386 	},
3387 	{
3388 		/*  Deprecated. Use gc_min_interval_ms */
3389 
3390 		.procname	= "gc_min_interval",
3391 		.data		= &ip_rt_gc_min_interval,
3392 		.maxlen		= sizeof(int),
3393 		.mode		= 0644,
3394 		.proc_handler	= proc_dointvec_jiffies,
3395 	},
3396 	{
3397 		.procname	= "gc_min_interval_ms",
3398 		.data		= &ip_rt_gc_min_interval,
3399 		.maxlen		= sizeof(int),
3400 		.mode		= 0644,
3401 		.proc_handler	= proc_dointvec_ms_jiffies,
3402 	},
3403 	{
3404 		.procname	= "gc_timeout",
3405 		.data		= &ip_rt_gc_timeout,
3406 		.maxlen		= sizeof(int),
3407 		.mode		= 0644,
3408 		.proc_handler	= proc_dointvec_jiffies,
3409 	},
3410 	{
3411 		.procname	= "gc_interval",
3412 		.data		= &ip_rt_gc_interval,
3413 		.maxlen		= sizeof(int),
3414 		.mode		= 0644,
3415 		.proc_handler	= proc_dointvec_jiffies,
3416 	},
3417 	{
3418 		.procname	= "redirect_load",
3419 		.data		= &ip_rt_redirect_load,
3420 		.maxlen		= sizeof(int),
3421 		.mode		= 0644,
3422 		.proc_handler	= proc_dointvec,
3423 	},
3424 	{
3425 		.procname	= "redirect_number",
3426 		.data		= &ip_rt_redirect_number,
3427 		.maxlen		= sizeof(int),
3428 		.mode		= 0644,
3429 		.proc_handler	= proc_dointvec,
3430 	},
3431 	{
3432 		.procname	= "redirect_silence",
3433 		.data		= &ip_rt_redirect_silence,
3434 		.maxlen		= sizeof(int),
3435 		.mode		= 0644,
3436 		.proc_handler	= proc_dointvec,
3437 	},
3438 	{
3439 		.procname	= "error_cost",
3440 		.data		= &ip_rt_error_cost,
3441 		.maxlen		= sizeof(int),
3442 		.mode		= 0644,
3443 		.proc_handler	= proc_dointvec,
3444 	},
3445 	{
3446 		.procname	= "error_burst",
3447 		.data		= &ip_rt_error_burst,
3448 		.maxlen		= sizeof(int),
3449 		.mode		= 0644,
3450 		.proc_handler	= proc_dointvec,
3451 	},
3452 	{
3453 		.procname	= "gc_elasticity",
3454 		.data		= &ip_rt_gc_elasticity,
3455 		.maxlen		= sizeof(int),
3456 		.mode		= 0644,
3457 		.proc_handler	= proc_dointvec,
3458 	},
3459 	{
3460 		.procname	= "mtu_expires",
3461 		.data		= &ip_rt_mtu_expires,
3462 		.maxlen		= sizeof(int),
3463 		.mode		= 0644,
3464 		.proc_handler	= proc_dointvec_jiffies,
3465 	},
3466 	{
3467 		.procname	= "min_pmtu",
3468 		.data		= &ip_rt_min_pmtu,
3469 		.maxlen		= sizeof(int),
3470 		.mode		= 0644,
3471 		.proc_handler	= proc_dointvec_minmax,
3472 		.extra1		= &ip_min_valid_pmtu,
3473 	},
3474 	{
3475 		.procname	= "min_adv_mss",
3476 		.data		= &ip_rt_min_advmss,
3477 		.maxlen		= sizeof(int),
3478 		.mode		= 0644,
3479 		.proc_handler	= proc_dointvec,
3480 	},
3481 	{ }
3482 };
3483 
3484 static const char ipv4_route_flush_procname[] = "flush";
3485 
3486 static struct ctl_table ipv4_route_flush_table[] = {
3487 	{
3488 		.procname	= ipv4_route_flush_procname,
3489 		.maxlen		= sizeof(int),
3490 		.mode		= 0200,
3491 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3492 	},
3493 	{ },
3494 };
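
/* Illustrative note (not from the kernel sources): the two tables above appear
 * under /proc/sys/net/ipv4/route/.  Hypothetical examples:
 *
 *	# raise the lower bound used when learning PMTU values
 *	sysctl -w net.ipv4.route.min_pmtu=1000
 *
 *	# flush learned routing state for this namespace (write-only entry)
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * The flush entry is registered per network namespace by
 * sysctl_route_net_init() below; ipv4_route_table itself is registered once
 * for init_net by ip_static_sysctl_init().
 */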
3495 
3496 static __net_init int sysctl_route_net_init(struct net *net)
3497 {
3498 	struct ctl_table *tbl;
3499 
3500 	tbl = ipv4_route_flush_table;
3501 	if (!net_eq(net, &init_net)) {
3502 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3503 		if (!tbl)
3504 			goto err_dup;
3505 
3506 		/* Don't export non-whitelisted sysctls to unprivileged users */
3507 		if (net->user_ns != &init_user_ns) {
3508 			if (tbl[0].procname != ipv4_route_flush_procname)
3509 				tbl[0].procname = NULL;
3510 		}
3511 	}
3512 	tbl[0].extra1 = net;
3513 
3514 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3515 	if (!net->ipv4.route_hdr)
3516 		goto err_reg;
3517 	return 0;
3518 
3519 err_reg:
3520 	if (tbl != ipv4_route_flush_table)
3521 		kfree(tbl);
3522 err_dup:
3523 	return -ENOMEM;
3524 }
3525 
3526 static __net_exit void sysctl_route_net_exit(struct net *net)
3527 {
3528 	struct ctl_table *tbl;
3529 
3530 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3531 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3532 	BUG_ON(tbl == ipv4_route_flush_table);
3533 	kfree(tbl);
3534 }
3535 
3536 static __net_initdata struct pernet_operations sysctl_route_ops = {
3537 	.init = sysctl_route_net_init,
3538 	.exit = sysctl_route_net_exit,
3539 };
3540 #endif
3541 
3542 static __net_init int rt_genid_init(struct net *net)
3543 {
3544 	atomic_set(&net->ipv4.rt_genid, 0);
3545 	atomic_set(&net->fnhe_genid, 0);
3546 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3547 	return 0;
3548 }
3549 
3550 static __net_initdata struct pernet_operations rt_genid_ops = {
3551 	.init = rt_genid_init,
3552 };
3553 
3554 static int __net_init ipv4_inetpeer_init(struct net *net)
3555 {
3556 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3557 
3558 	if (!bp)
3559 		return -ENOMEM;
3560 	inet_peer_base_init(bp);
3561 	net->ipv4.peers = bp;
3562 	return 0;
3563 }
3564 
3565 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3566 {
3567 	struct inet_peer_base *bp = net->ipv4.peers;
3568 
3569 	net->ipv4.peers = NULL;
3570 	inetpeer_invalidate_tree(bp);
3571 	kfree(bp);
3572 }
3573 
3574 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3575 	.init	=	ipv4_inetpeer_init,
3576 	.exit	=	ipv4_inetpeer_exit,
3577 };
3578 
3579 #ifdef CONFIG_IP_ROUTE_CLASSID
3580 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3581 #endif /* CONFIG_IP_ROUTE_CLASSID */
3582 
3583 int __init ip_rt_init(void)
3584 {
3585 	int cpu;
3586 
3587 	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3588 				  GFP_KERNEL);
3589 	if (!ip_idents)
3590 		panic("IP: failed to allocate ip_idents\n");
3591 
3592 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3593 
3594 	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3595 	if (!ip_tstamps)
3596 		panic("IP: failed to allocate ip_tstamps\n");
3597 
3598 	for_each_possible_cpu(cpu) {
3599 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3600 
3601 		INIT_LIST_HEAD(&ul->head);
3602 		spin_lock_init(&ul->lock);
3603 	}
3604 #ifdef CONFIG_IP_ROUTE_CLASSID
3605 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3606 	if (!ip_rt_acct)
3607 		panic("IP: failed to allocate ip_rt_acct\n");
3608 #endif
3609 
3610 	ipv4_dst_ops.kmem_cachep =
3611 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3612 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3613 
3614 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3615 
3616 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3617 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3618 
3619 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3620 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3621 
3622 	ipv4_dst_ops.gc_thresh = ~0;
3623 	ip_rt_max_size = INT_MAX;
3624 
3625 	devinet_init();
3626 	ip_fib_init();
3627 
3628 	if (ip_rt_proc_init())
3629 		pr_err("Unable to create route proc files\n");
3630 #ifdef CONFIG_XFRM
3631 	xfrm_init();
3632 	xfrm4_init();
3633 #endif
3634 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3635 		      RTNL_FLAG_DOIT_UNLOCKED);
3636 
3637 #ifdef CONFIG_SYSCTL
3638 	register_pernet_subsys(&sysctl_route_ops);
3639 #endif
3640 	register_pernet_subsys(&rt_genid_ops);
3641 	register_pernet_subsys(&ipv4_inetpeer_ops);
3642 	return 0;
3643 }
3644 
3645 #ifdef CONFIG_SYSCTL
3646 /*
3647  * We really need to sanitize the damn ipv4 init order, then all
3648  * this nonsense will go away.
3649  */
3650 void __init ip_static_sysctl_init(void)
3651 {
3652 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3653 }
3654 #endif
3655