xref: /openbmc/linux/net/ipv4/route.c (revision 8ffdff6a)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		ROUTE - implementation of the IP router.
8  *
9  * Authors:	Ross Biro
10  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *		Alan Cox	:	Verify area fixes.
17  *		Alan Cox	:	cli() protects routing changes
18  *		Rui Oliveira	:	ICMP routing table updates
19  *		(rco@di.uminho.pt)	Routing table insertion and update
20  *		Linus Torvalds	:	Rewrote bits to be sensible
21  *		Alan Cox	:	Added BSD route gw semantics
22  *		Alan Cox	:	Super /proc >4K
23  *		Alan Cox	:	MTU in route table
24  *		Alan Cox	: 	MSS actually. Also added the window
25  *					clamper.
26  *		Sam Lantinga	:	Fixed route matching in rt_del()
27  *		Alan Cox	:	Routing cache support.
28  *		Alan Cox	:	Removed compatibility cruft.
29  *		Alan Cox	:	RTF_REJECT support.
30  *		Alan Cox	:	TCP irtt support.
31  *		Jonathan Naylor	:	Added Metric support.
32  *	Miquel van Smoorenburg	:	BSD API fixes.
33  *	Miquel van Smoorenburg	:	Metrics.
34  *		Alan Cox	:	Use __u32 properly
35  *		Alan Cox	:	Aligned routing errors more closely with BSD;
36  *					our system is still very different.
37  *		Alan Cox	:	Faster /proc handling
38  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39  *					routing caches and better behaviour.
40  *
41  *		Olaf Erb	:	irtt wasn't being copied right.
42  *		Bjorn Ekwall	:	Kerneld route support.
43  *		Alan Cox	:	Multicast fixed (I hope)
44  * 		Pavel Krauz	:	Limited broadcast fixed
45  *		Mike McLagan	:	Routing by source
46  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47  *					route.c and rewritten from scratch.
48  *		Andi Kleen	:	Load-limit warning messages.
49  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53  *		Marc Boucher	:	routing by fwmark
54  *	Robert Olsson		:	Added rt_cache statistics
55  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
59  */
60 
61 #define pr_fmt(fmt) "IPv4: " fmt
62 
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/string.h>
70 #include <linux/socket.h>
71 #include <linux/sockios.h>
72 #include <linux/errno.h>
73 #include <linux/in.h>
74 #include <linux/inet.h>
75 #include <linux/netdevice.h>
76 #include <linux/proc_fs.h>
77 #include <linux/init.h>
78 #include <linux/skbuff.h>
79 #include <linux/inetdevice.h>
80 #include <linux/igmp.h>
81 #include <linux/pkt_sched.h>
82 #include <linux/mroute.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/random.h>
85 #include <linux/rcupdate.h>
86 #include <linux/times.h>
87 #include <linux/slab.h>
88 #include <linux/jhash.h>
89 #include <net/dst.h>
90 #include <net/dst_metadata.h>
91 #include <net/net_namespace.h>
92 #include <net/protocol.h>
93 #include <net/ip.h>
94 #include <net/route.h>
95 #include <net/inetpeer.h>
96 #include <net/sock.h>
97 #include <net/ip_fib.h>
98 #include <net/nexthop.h>
99 #include <net/arp.h>
100 #include <net/tcp.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #include <net/lwtunnel.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 #include <net/secure_seq.h>
110 #include <net/ip_tunnels.h>
111 #include <net/l3mdev.h>
112 
113 #include "fib_lookup.h"
114 
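/* Added note: mask a flow's TOS down to the bits that matter for routing
 * (IPTOS_RT_MASK), while keeping the legacy RTO_ONLINK hint if it was set.
 */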
115 #define RT_FL_TOS(oldflp4) \
116 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117 
118 #define RT_GC_TIMEOUT (300*HZ)
119 
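/* Added note: tuning knobs for redirect rate limiting, ICMP error cost and
 * PMTU handling.  Most of these defaults are exposed as ipv4 route sysctls
 * (registered later in this file) and can be changed at run time.
 */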
120 static int ip_rt_max_size;
121 static int ip_rt_redirect_number __read_mostly	= 9;
122 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
123 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
124 static int ip_rt_error_cost __read_mostly	= HZ;
125 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
126 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
127 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
128 static int ip_rt_min_advmss __read_mostly	= 256;
129 
130 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
131 
132 /*
133  *	Interface to generic destination cache.
134  */
135 
136 INDIRECT_CALLABLE_SCOPE
137 struct dst_entry	*ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
139 INDIRECT_CALLABLE_SCOPE
140 unsigned int		ipv4_mtu(const struct dst_entry *dst);
141 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
142 static void		 ipv4_link_failure(struct sk_buff *skb);
143 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
144 					   struct sk_buff *skb, u32 mtu,
145 					   bool confirm_neigh);
146 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 					struct sk_buff *skb);
148 static void		ipv4_dst_destroy(struct dst_entry *dst);
149 
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152 	WARN_ON(1);
153 	return NULL;
154 }
155 
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 					   struct sk_buff *skb,
158 					   const void *daddr);
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160 
161 static struct dst_ops ipv4_dst_ops = {
162 	.family =		AF_INET,
163 	.check =		ipv4_dst_check,
164 	.default_advmss =	ipv4_default_advmss,
165 	.mtu =			ipv4_mtu,
166 	.cow_metrics =		ipv4_cow_metrics,
167 	.destroy =		ipv4_dst_destroy,
168 	.negative_advice =	ipv4_negative_advice,
169 	.link_failure =		ipv4_link_failure,
170 	.update_pmtu =		ip_rt_update_pmtu,
171 	.redirect =		ip_do_redirect,
172 	.local_out =		__ip_local_out,
173 	.neigh_lookup =		ipv4_neigh_lookup,
174 	.confirm_neigh =	ipv4_confirm_neigh,
175 };
176 
177 #define ECN_OR_COST(class)	TC_PRIO_##class
178 
179 const __u8 ip_tos2prio[16] = {
180 	TC_PRIO_BESTEFFORT,
181 	ECN_OR_COST(BESTEFFORT),
182 	TC_PRIO_BESTEFFORT,
183 	ECN_OR_COST(BESTEFFORT),
184 	TC_PRIO_BULK,
185 	ECN_OR_COST(BULK),
186 	TC_PRIO_BULK,
187 	ECN_OR_COST(BULK),
188 	TC_PRIO_INTERACTIVE,
189 	ECN_OR_COST(INTERACTIVE),
190 	TC_PRIO_INTERACTIVE,
191 	ECN_OR_COST(INTERACTIVE),
192 	TC_PRIO_INTERACTIVE_BULK,
193 	ECN_OR_COST(INTERACTIVE_BULK),
194 	TC_PRIO_INTERACTIVE_BULK,
195 	ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
198 
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201 
202 #ifdef CONFIG_PROC_FS
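/* Added note: /proc/net/rt_cache.  The old per-flow IPv4 routing cache this
 * file used to dump is long gone, so the seq_file below is kept only for
 * compatibility and emits nothing but the legacy header line.
 */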
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205 	if (*pos)
206 		return NULL;
207 	return SEQ_START_TOKEN;
208 }
209 
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212 	++*pos;
213 	return NULL;
214 }
215 
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219 
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222 	if (v == SEQ_START_TOKEN)
223 		seq_printf(seq, "%-127s\n",
224 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 			   "HHUptod\tSpecDst");
227 	return 0;
228 }
229 
230 static const struct seq_operations rt_cache_seq_ops = {
231 	.start  = rt_cache_seq_start,
232 	.next   = rt_cache_seq_next,
233 	.stop   = rt_cache_seq_stop,
234 	.show   = rt_cache_seq_show,
235 };
236 
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239 	return seq_open(file, &rt_cache_seq_ops);
240 }
241 
242 static const struct proc_ops rt_cache_proc_ops = {
243 	.proc_open	= rt_cache_seq_open,
244 	.proc_read	= seq_read,
245 	.proc_lseek	= seq_lseek,
246 	.proc_release	= seq_release,
247 };
248 
249 
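/* Added note: /proc/net/stat/rt_cache, the per-cpu rt_cache_stat counters.
 * *pos is used as 1 + cpu index so iteration resumes at the next possible
 * CPU; SEQ_START_TOKEN selects the header line.
 */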
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252 	int cpu;
253 
254 	if (*pos == 0)
255 		return SEQ_START_TOKEN;
256 
257 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 		if (!cpu_possible(cpu))
259 			continue;
260 		*pos = cpu+1;
261 		return &per_cpu(rt_cache_stat, cpu);
262 	}
263 	return NULL;
264 }
265 
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268 	int cpu;
269 
270 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 		if (!cpu_possible(cpu))
272 			continue;
273 		*pos = cpu+1;
274 		return &per_cpu(rt_cache_stat, cpu);
275 	}
276 	(*pos)++;
277 	return NULL;
278 
279 }
280 
281 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
282 {
283 
284 }
285 
286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287 {
288 	struct rt_cache_stat *st = v;
289 
290 	if (v == SEQ_START_TOKEN) {
291 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
292 		return 0;
293 	}
294 
295 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
296 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
297 		   dst_entries_get_slow(&ipv4_dst_ops),
298 		   0, /* st->in_hit */
299 		   st->in_slow_tot,
300 		   st->in_slow_mc,
301 		   st->in_no_route,
302 		   st->in_brd,
303 		   st->in_martian_dst,
304 		   st->in_martian_src,
305 
306 		   0, /* st->out_hit */
307 		   st->out_slow_tot,
308 		   st->out_slow_mc,
309 
310 		   0, /* st->gc_total */
311 		   0, /* st->gc_ignored */
312 		   0, /* st->gc_goal_miss */
313 		   0, /* st->gc_dst_overflow */
314 		   0, /* st->in_hlist_search */
315 		   0  /* st->out_hlist_search */
316 		);
317 	return 0;
318 }
319 
320 static const struct seq_operations rt_cpu_seq_ops = {
321 	.start  = rt_cpu_seq_start,
322 	.next   = rt_cpu_seq_next,
323 	.stop   = rt_cpu_seq_stop,
324 	.show   = rt_cpu_seq_show,
325 };
326 
327 
328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329 {
330 	return seq_open(file, &rt_cpu_seq_ops);
331 }
332 
333 static const struct proc_ops rt_cpu_proc_ops = {
334 	.proc_open	= rt_cpu_seq_open,
335 	.proc_read	= seq_read,
336 	.proc_lseek	= seq_lseek,
337 	.proc_release	= seq_release,
338 };
339 
340 #ifdef CONFIG_IP_ROUTE_CLASSID
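/* Added note: /proc/net/rt_acct.  Fold the per-cpu ip_rt_acct counters into
 * a single 256-entry snapshot (one slot per routing realm) and dump it raw.
 */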
341 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 {
343 	struct ip_rt_acct *dst, *src;
344 	unsigned int i, j;
345 
346 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 	if (!dst)
348 		return -ENOMEM;
349 
350 	for_each_possible_cpu(i) {
351 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352 		for (j = 0; j < 256; j++) {
353 			dst[j].o_bytes   += src[j].o_bytes;
354 			dst[j].o_packets += src[j].o_packets;
355 			dst[j].i_bytes   += src[j].i_bytes;
356 			dst[j].i_packets += src[j].i_packets;
357 		}
358 	}
359 
360 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361 	kfree(dst);
362 	return 0;
363 }
364 #endif
365 
366 static int __net_init ip_rt_do_proc_init(struct net *net)
367 {
368 	struct proc_dir_entry *pde;
369 
370 	pde = proc_create("rt_cache", 0444, net->proc_net,
371 			  &rt_cache_proc_ops);
372 	if (!pde)
373 		goto err1;
374 
375 	pde = proc_create("rt_cache", 0444,
376 			  net->proc_net_stat, &rt_cpu_proc_ops);
377 	if (!pde)
378 		goto err2;
379 
380 #ifdef CONFIG_IP_ROUTE_CLASSID
381 	pde = proc_create_single("rt_acct", 0, net->proc_net,
382 			rt_acct_proc_show);
383 	if (!pde)
384 		goto err3;
385 #endif
386 	return 0;
387 
388 #ifdef CONFIG_IP_ROUTE_CLASSID
389 err3:
390 	remove_proc_entry("rt_cache", net->proc_net_stat);
391 #endif
392 err2:
393 	remove_proc_entry("rt_cache", net->proc_net);
394 err1:
395 	return -ENOMEM;
396 }
397 
398 static void __net_exit ip_rt_do_proc_exit(struct net *net)
399 {
400 	remove_proc_entry("rt_cache", net->proc_net_stat);
401 	remove_proc_entry("rt_cache", net->proc_net);
402 #ifdef CONFIG_IP_ROUTE_CLASSID
403 	remove_proc_entry("rt_acct", net->proc_net);
404 #endif
405 }
406 
407 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
408 	.init = ip_rt_do_proc_init,
409 	.exit = ip_rt_do_proc_exit,
410 };
411 
412 static int __init ip_rt_proc_init(void)
413 {
414 	return register_pernet_subsys(&ip_rt_proc_ops);
415 }
416 
417 #else
418 static inline int ip_rt_proc_init(void)
419 {
420 	return 0;
421 }
422 #endif /* CONFIG_PROC_FS */
423 
424 static inline bool rt_is_expired(const struct rtable *rth)
425 {
426 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
427 }
428 
429 void rt_cache_flush(struct net *net)
430 {
431 	rt_genid_bump_ipv4(net);
432 }
433 
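/* Added note: resolve (and take a reference on) the neighbour for a route.
 * Prefer the cached IPv4/IPv6 gateway; otherwise key on the packet's, or
 * the caller-supplied, destination address.
 */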
434 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
435 					   struct sk_buff *skb,
436 					   const void *daddr)
437 {
438 	const struct rtable *rt = container_of(dst, struct rtable, dst);
439 	struct net_device *dev = dst->dev;
440 	struct neighbour *n;
441 
442 	rcu_read_lock_bh();
443 
444 	if (likely(rt->rt_gw_family == AF_INET)) {
445 		n = ip_neigh_gw4(dev, rt->rt_gw4);
446 	} else if (rt->rt_gw_family == AF_INET6) {
447 		n = ip_neigh_gw6(dev, &rt->rt_gw6);
448 	} else {
449 		__be32 pkey;
450 
451 		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
452 		n = ip_neigh_gw4(dev, pkey);
453 	}
454 
455 	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
456 		n = NULL;
457 
458 	rcu_read_unlock_bh();
459 
460 	return n;
461 }
462 
463 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
464 {
465 	const struct rtable *rt = container_of(dst, struct rtable, dst);
466 	struct net_device *dev = dst->dev;
467 	const __be32 *pkey = daddr;
468 
469 	if (rt->rt_gw_family == AF_INET) {
470 		pkey = (const __be32 *)&rt->rt_gw4;
471 	} else if (rt->rt_gw_family == AF_INET6) {
472 		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
473 	} else if (!daddr ||
474 		 (rt->rt_flags &
475 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
476 		return;
477 	}
478 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
479 }
480 
481 #define IP_IDENTS_SZ 2048u
482 
483 static atomic_t *ip_idents __read_mostly;
484 static u32 *ip_tstamps __read_mostly;
485 
486 /* In order to protect privacy, we add a perturbation to identifiers
487  * if one generator is seldom used. This makes it hard for an attacker
488  * to infer how many packets were sent between two points in time.
489  */
490 u32 ip_idents_reserve(u32 hash, int segs)
491 {
492 	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
493 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
494 	u32 old = READ_ONCE(*p_tstamp);
495 	u32 now = (u32)jiffies;
496 	u32 delta = 0;
497 
498 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
499 		delta = prandom_u32_max(now - old);
500 
501 	/* If UBSAN reports an error here, please make sure your compiler
502 	 * supports -fno-strict-overflow before reporting it; that was a bug
503 	 * in UBSAN, and it has been fixed in GCC 8.
504 	 */
505 	return atomic_add_return(segs + delta, p_id) - segs;
506 }
507 EXPORT_SYMBOL(ip_idents_reserve);
508 
509 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
510 {
511 	u32 hash, id;
512 
513 	/* Note: the following key initialization is racy, but this is okay. */
514 	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
515 		get_random_bytes(&net->ipv4.ip_id_key,
516 				 sizeof(net->ipv4.ip_id_key));
517 
518 	hash = siphash_3u32((__force u32)iph->daddr,
519 			    (__force u32)iph->saddr,
520 			    iph->protocol,
521 			    &net->ipv4.ip_id_key);
522 	id = ip_idents_reserve(hash, segs);
523 	iph->id = htons(id);
524 }
525 EXPORT_SYMBOL(__ip_select_ident);
526 
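/* Added note: build an output flow key from an IP header.  If a socket is
 * supplied, its bound device, mark, TOS and protocol override the
 * per-packet values.
 */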
527 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
528 			     const struct sock *sk,
529 			     const struct iphdr *iph,
530 			     int oif, u8 tos,
531 			     u8 prot, u32 mark, int flow_flags)
532 {
533 	if (sk) {
534 		const struct inet_sock *inet = inet_sk(sk);
535 
536 		oif = sk->sk_bound_dev_if;
537 		mark = sk->sk_mark;
538 		tos = RT_CONN_FLAGS(sk);
539 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
540 	}
541 	flowi4_init_output(fl4, oif, mark, tos,
542 			   RT_SCOPE_UNIVERSE, prot,
543 			   flow_flags,
544 			   iph->daddr, iph->saddr, 0, 0,
545 			   sock_net_uid(net, sk));
546 }
547 
548 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
549 			       const struct sock *sk)
550 {
551 	const struct net *net = dev_net(skb->dev);
552 	const struct iphdr *iph = ip_hdr(skb);
553 	int oif = skb->dev->ifindex;
554 	u8 tos = RT_TOS(iph->tos);
555 	u8 prot = iph->protocol;
556 	u32 mark = skb->mark;
557 
558 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
559 }
560 
561 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
562 {
563 	const struct inet_sock *inet = inet_sk(sk);
564 	const struct ip_options_rcu *inet_opt;
565 	__be32 daddr = inet->inet_daddr;
566 
567 	rcu_read_lock();
568 	inet_opt = rcu_dereference(inet->inet_opt);
569 	if (inet_opt && inet_opt->opt.srr)
570 		daddr = inet_opt->opt.faddr;
571 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
572 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
573 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
574 			   inet_sk_flowi_flags(sk),
575 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
576 	rcu_read_unlock();
577 }
578 
579 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
580 				 const struct sk_buff *skb)
581 {
582 	if (skb)
583 		build_skb_flow_key(fl4, skb, sk);
584 	else
585 		build_sk_flow_key(fl4, sk);
586 }
587 
588 static DEFINE_SPINLOCK(fnhe_lock);
589 
590 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
591 {
592 	struct rtable *rt;
593 
594 	rt = rcu_dereference(fnhe->fnhe_rth_input);
595 	if (rt) {
596 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
597 		dst_dev_put(&rt->dst);
598 		dst_release(&rt->dst);
599 	}
600 	rt = rcu_dereference(fnhe->fnhe_rth_output);
601 	if (rt) {
602 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
603 		dst_dev_put(&rt->dst);
604 		dst_release(&rt->dst);
605 	}
606 }
607 
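/* Added note: reclaim the least recently stamped exception in this bucket,
 * flushing any routes cached on it so the entry can be reused.
 */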
608 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
609 {
610 	struct fib_nh_exception *fnhe, *oldest;
611 
612 	oldest = rcu_dereference(hash->chain);
613 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
614 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
615 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
616 			oldest = fnhe;
617 	}
618 	fnhe_flush_routes(oldest);
619 	return oldest;
620 }
621 
622 static inline u32 fnhe_hashfun(__be32 daddr)
623 {
624 	static u32 fnhe_hashrnd __read_mostly;
625 	u32 hval;
626 
627 	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
628 	hval = jhash_1word((__force u32)daddr, fnhe_hashrnd);
629 	return hash_32(hval, FNHE_HASH_SHIFT);
630 }
631 
632 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
633 {
634 	rt->rt_pmtu = fnhe->fnhe_pmtu;
635 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
636 	rt->dst.expires = fnhe->fnhe_expires;
637 
638 	if (fnhe->fnhe_gw) {
639 		rt->rt_flags |= RTCF_REDIRECTED;
640 		rt->rt_uses_gateway = 1;
641 		rt->rt_gw_family = AF_INET;
642 		rt->rt_gw4 = fnhe->fnhe_gw;
643 	}
644 }
645 
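/* Added note: record learned per-destination state (redirect gateway and/or
 * PMTU) as a nexthop exception.  An existing entry is refreshed in place;
 * a newly created one marks the nexthop's cached routes obsolete so they
 * are revalidated against the exception.
 */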
646 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
647 				  __be32 gw, u32 pmtu, bool lock,
648 				  unsigned long expires)
649 {
650 	struct fnhe_hash_bucket *hash;
651 	struct fib_nh_exception *fnhe;
652 	struct rtable *rt;
653 	u32 genid, hval;
654 	unsigned int i;
655 	int depth;
656 
657 	genid = fnhe_genid(dev_net(nhc->nhc_dev));
658 	hval = fnhe_hashfun(daddr);
659 
660 	spin_lock_bh(&fnhe_lock);
661 
662 	hash = rcu_dereference(nhc->nhc_exceptions);
663 	if (!hash) {
664 		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
665 		if (!hash)
666 			goto out_unlock;
667 		rcu_assign_pointer(nhc->nhc_exceptions, hash);
668 	}
669 
670 	hash += hval;
671 
672 	depth = 0;
673 	for (fnhe = rcu_dereference(hash->chain); fnhe;
674 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
675 		if (fnhe->fnhe_daddr == daddr)
676 			break;
677 		depth++;
678 	}
679 
680 	if (fnhe) {
681 		if (fnhe->fnhe_genid != genid)
682 			fnhe->fnhe_genid = genid;
683 		if (gw)
684 			fnhe->fnhe_gw = gw;
685 		if (pmtu) {
686 			fnhe->fnhe_pmtu = pmtu;
687 			fnhe->fnhe_mtu_locked = lock;
688 		}
689 		fnhe->fnhe_expires = max(1UL, expires);
690 		/* Update all cached dsts too */
691 		rt = rcu_dereference(fnhe->fnhe_rth_input);
692 		if (rt)
693 			fill_route_from_fnhe(rt, fnhe);
694 		rt = rcu_dereference(fnhe->fnhe_rth_output);
695 		if (rt)
696 			fill_route_from_fnhe(rt, fnhe);
697 	} else {
698 		if (depth > FNHE_RECLAIM_DEPTH)
699 			fnhe = fnhe_oldest(hash);
700 		else {
701 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
702 			if (!fnhe)
703 				goto out_unlock;
704 
705 			fnhe->fnhe_next = hash->chain;
706 			rcu_assign_pointer(hash->chain, fnhe);
707 		}
708 		fnhe->fnhe_genid = genid;
709 		fnhe->fnhe_daddr = daddr;
710 		fnhe->fnhe_gw = gw;
711 		fnhe->fnhe_pmtu = pmtu;
712 		fnhe->fnhe_mtu_locked = lock;
713 		fnhe->fnhe_expires = max(1UL, expires);
714 
715 		/* Exception created; mark the cached routes for the nexthop
716 		 * stale, so anyone caching it rechecks if this exception
717 		 * applies to them.
718 		 */
719 		rt = rcu_dereference(nhc->nhc_rth_input);
720 		if (rt)
721 			rt->dst.obsolete = DST_OBSOLETE_KILL;
722 
723 		for_each_possible_cpu(i) {
724 			struct rtable __rcu **prt;
725 			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
726 			rt = rcu_dereference(*prt);
727 			if (rt)
728 				rt->dst.obsolete = DST_OBSOLETE_KILL;
729 		}
730 	}
731 
732 	fnhe->fnhe_stamp = jiffies;
733 
734 out_unlock:
735 	spin_unlock_bh(&fnhe_lock);
736 }
737 
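/* Added note: process an ICMP redirect.  Sanity check the advertised
 * gateway, make sure it resolves as a neighbour, then store it as a nexthop
 * exception and optionally mark the current cached route for replacement.
 */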
738 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
739 			     bool kill_route)
740 {
741 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
742 	__be32 old_gw = ip_hdr(skb)->saddr;
743 	struct net_device *dev = skb->dev;
744 	struct in_device *in_dev;
745 	struct fib_result res;
746 	struct neighbour *n;
747 	struct net *net;
748 
749 	switch (icmp_hdr(skb)->code & 7) {
750 	case ICMP_REDIR_NET:
751 	case ICMP_REDIR_NETTOS:
752 	case ICMP_REDIR_HOST:
753 	case ICMP_REDIR_HOSTTOS:
754 		break;
755 
756 	default:
757 	rth->rt_is_input = 1;
758 	}
759 
760 	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
761 		return;
762 
763 	in_dev = __in_dev_get_rcu(dev);
764 	if (!in_dev)
765 		return;
766 
767 	net = dev_net(dev);
768 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
769 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
770 	    ipv4_is_zeronet(new_gw))
771 		goto reject_redirect;
772 
773 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
774 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
775 			goto reject_redirect;
776 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
777 			goto reject_redirect;
778 	} else {
779 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
780 			goto reject_redirect;
781 	}
782 
783 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
784 	if (!n)
785 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
786 	if (!IS_ERR(n)) {
787 		if (!(n->nud_state & NUD_VALID)) {
788 			neigh_event_send(n, NULL);
789 		} else {
790 			if (fib_lookup(net, fl4, &res, 0) == 0) {
791 				struct fib_nh_common *nhc;
792 
793 				fib_select_path(net, &res, fl4, skb);
794 				nhc = FIB_RES_NHC(res);
795 				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
796 						0, false,
797 						jiffies + ip_rt_gc_timeout);
798 			}
799 			if (kill_route)
800 				rt->dst.obsolete = DST_OBSOLETE_KILL;
801 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
802 		}
803 		neigh_release(n);
804 	}
805 	return;
806 
807 reject_redirect:
808 #ifdef CONFIG_IP_ROUTE_VERBOSE
809 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
810 		const struct iphdr *iph = (const struct iphdr *) skb->data;
811 		__be32 daddr = iph->daddr;
812 		__be32 saddr = iph->saddr;
813 
814 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
815 				     "  Advised path = %pI4 -> %pI4\n",
816 				     &old_gw, dev->name, &new_gw,
817 				     &saddr, &daddr);
818 	}
819 #endif
820 	;
821 }
822 
823 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
824 {
825 	struct rtable *rt;
826 	struct flowi4 fl4;
827 	const struct iphdr *iph = (const struct iphdr *) skb->data;
828 	struct net *net = dev_net(skb->dev);
829 	int oif = skb->dev->ifindex;
830 	u8 tos = RT_TOS(iph->tos);
831 	u8 prot = iph->protocol;
832 	u32 mark = skb->mark;
833 
834 	rt = (struct rtable *) dst;
835 
836 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
837 	__ip_do_redirect(rt, skb, &fl4, true);
838 }
839 
840 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
841 {
842 	struct rtable *rt = (struct rtable *)dst;
843 	struct dst_entry *ret = dst;
844 
845 	if (rt) {
846 		if (dst->obsolete > 0) {
847 			ip_rt_put(rt);
848 			ret = NULL;
849 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
850 			   rt->dst.expires) {
851 			ip_rt_put(rt);
852 			ret = NULL;
853 		}
854 	}
855 	return ret;
856 }
857 
858 /*
859  * Algorithm:
860  *	1. The first ip_rt_redirect_number redirects are sent
861  *	   with exponential backoff, then we stop sending them altogether,
862  *	   assuming that the host ignores our redirects.
863  *	2. If we did not see packets requiring redirects
864  *	   during ip_rt_redirect_silence, we assume that the host
865  *	   forgot the redirected route and start sending redirects again.
866  *
867  * This algorithm is much cheaper and more intelligent than dumb load limiting
868  * in icmp.c.
869  *
870  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
871  * and "frag. need" (breaks PMTU discovery) in icmp.c.
872  */
873 
874 void ip_rt_send_redirect(struct sk_buff *skb)
875 {
876 	struct rtable *rt = skb_rtable(skb);
877 	struct in_device *in_dev;
878 	struct inet_peer *peer;
879 	struct net *net;
880 	int log_martians;
881 	int vif;
882 
883 	rcu_read_lock();
884 	in_dev = __in_dev_get_rcu(rt->dst.dev);
885 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
886 		rcu_read_unlock();
887 		return;
888 	}
889 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
890 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
891 	rcu_read_unlock();
892 
893 	net = dev_net(rt->dst.dev);
894 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
895 	if (!peer) {
896 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
897 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
898 		return;
899 	}
900 
901 	/* No redirected packets during ip_rt_redirect_silence;
902 	 * reset the algorithm.
903 	 */
904 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
905 		peer->rate_tokens = 0;
906 		peer->n_redirects = 0;
907 	}
908 
909 	/* Too many ignored redirects; do not send anything and
910 	 * set dst.rate_last to the last seen redirected packet.
911 	 */
912 	if (peer->n_redirects >= ip_rt_redirect_number) {
913 		peer->rate_last = jiffies;
914 		goto out_put_peer;
915 	}
916 
917 	/* Check for load limit; set rate_last to the latest sent
918 	 * redirect.
919 	 */
920 	if (peer->n_redirects == 0 ||
921 	    time_after(jiffies,
922 		       (peer->rate_last +
923 			(ip_rt_redirect_load << peer->n_redirects)))) {
924 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
925 
926 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
927 		peer->rate_last = jiffies;
928 		++peer->n_redirects;
929 #ifdef CONFIG_IP_ROUTE_VERBOSE
930 		if (log_martians &&
931 		    peer->n_redirects == ip_rt_redirect_number)
932 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
933 					     &ip_hdr(skb)->saddr, inet_iif(skb),
934 					     &ip_hdr(skb)->daddr, &gw);
935 #endif
936 	}
937 out_put_peer:
938 	inet_putpeer(peer);
939 }
940 
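/* Added note: send the ICMP destination-unreachable error for a packet we
 * could not route, rate limited per source address via the inet_peer token
 * bucket (ip_rt_error_cost / ip_rt_error_burst), then free the skb.
 */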
941 static int ip_error(struct sk_buff *skb)
942 {
943 	struct rtable *rt = skb_rtable(skb);
944 	struct net_device *dev = skb->dev;
945 	struct in_device *in_dev;
946 	struct inet_peer *peer;
947 	unsigned long now;
948 	struct net *net;
949 	bool send;
950 	int code;
951 
952 	if (netif_is_l3_master(skb->dev)) {
953 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
954 		if (!dev)
955 			goto out;
956 	}
957 
958 	in_dev = __in_dev_get_rcu(dev);
959 
960 	/* IP on this device is disabled. */
961 	if (!in_dev)
962 		goto out;
963 
964 	net = dev_net(rt->dst.dev);
965 	if (!IN_DEV_FORWARD(in_dev)) {
966 		switch (rt->dst.error) {
967 		case EHOSTUNREACH:
968 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
969 			break;
970 
971 		case ENETUNREACH:
972 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
973 			break;
974 		}
975 		goto out;
976 	}
977 
978 	switch (rt->dst.error) {
979 	case EINVAL:
980 	default:
981 		goto out;
982 	case EHOSTUNREACH:
983 		code = ICMP_HOST_UNREACH;
984 		break;
985 	case ENETUNREACH:
986 		code = ICMP_NET_UNREACH;
987 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
988 		break;
989 	case EACCES:
990 		code = ICMP_PKT_FILTERED;
991 		break;
992 	}
993 
994 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
995 			       l3mdev_master_ifindex(skb->dev), 1);
996 
997 	send = true;
998 	if (peer) {
999 		now = jiffies;
1000 		peer->rate_tokens += now - peer->rate_last;
1001 		if (peer->rate_tokens > ip_rt_error_burst)
1002 			peer->rate_tokens = ip_rt_error_burst;
1003 		peer->rate_last = now;
1004 		if (peer->rate_tokens >= ip_rt_error_cost)
1005 			peer->rate_tokens -= ip_rt_error_cost;
1006 		else
1007 			send = false;
1008 		inet_putpeer(peer);
1009 	}
1010 	if (send)
1011 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1012 
1013 out:	kfree_skb(skb);
1014 	return 0;
1015 }
1016 
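/* Added note: learn a reduced path MTU for this flow.  Values below
 * ip_rt_min_pmtu are clamped and lock the MTU; the result is stored as a
 * nexthop exception that expires after ip_rt_mtu_expires.
 */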
1017 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1018 {
1019 	struct dst_entry *dst = &rt->dst;
1020 	struct net *net = dev_net(dst->dev);
1021 	struct fib_result res;
1022 	bool lock = false;
1023 	u32 old_mtu;
1024 
1025 	if (ip_mtu_locked(dst))
1026 		return;
1027 
1028 	old_mtu = ipv4_mtu(dst);
1029 	if (old_mtu < mtu)
1030 		return;
1031 
1032 	if (mtu < ip_rt_min_pmtu) {
1033 		lock = true;
1034 		mtu = min(old_mtu, ip_rt_min_pmtu);
1035 	}
1036 
1037 	if (rt->rt_pmtu == mtu && !lock &&
1038 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1039 		return;
1040 
1041 	rcu_read_lock();
1042 	if (fib_lookup(net, fl4, &res, 0) == 0) {
1043 		struct fib_nh_common *nhc;
1044 
1045 		fib_select_path(net, &res, fl4, NULL);
1046 		nhc = FIB_RES_NHC(res);
1047 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1048 				      jiffies + ip_rt_mtu_expires);
1049 	}
1050 	rcu_read_unlock();
1051 }
1052 
1053 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1054 			      struct sk_buff *skb, u32 mtu,
1055 			      bool confirm_neigh)
1056 {
1057 	struct rtable *rt = (struct rtable *) dst;
1058 	struct flowi4 fl4;
1059 
1060 	ip_rt_build_flow_key(&fl4, sk, skb);
1061 
1062 	/* Don't make lookup fail for bridged encapsulations */
1063 	if (skb && netif_is_any_bridge_port(skb->dev))
1064 		fl4.flowi4_oif = 0;
1065 
1066 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1067 }
1068 
1069 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1070 		      int oif, u8 protocol)
1071 {
1072 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1073 	struct flowi4 fl4;
1074 	struct rtable *rt;
1075 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1076 
1077 	__build_flow_key(net, &fl4, NULL, iph, oif,
1078 			 RT_TOS(iph->tos), protocol, mark, 0);
1079 	rt = __ip_route_output_key(net, &fl4);
1080 	if (!IS_ERR(rt)) {
1081 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1082 		ip_rt_put(rt);
1083 	}
1084 }
1085 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1086 
1087 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1088 {
1089 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1090 	struct flowi4 fl4;
1091 	struct rtable *rt;
1092 
1093 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1094 
1095 	if (!fl4.flowi4_mark)
1096 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1097 
1098 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1099 	if (!IS_ERR(rt)) {
1100 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1101 		ip_rt_put(rt);
1102 	}
1103 }
1104 
1105 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1106 {
1107 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1108 	struct flowi4 fl4;
1109 	struct rtable *rt;
1110 	struct dst_entry *odst = NULL;
1111 	bool new = false;
1112 	struct net *net = sock_net(sk);
1113 
1114 	bh_lock_sock(sk);
1115 
1116 	if (!ip_sk_accept_pmtu(sk))
1117 		goto out;
1118 
1119 	odst = sk_dst_get(sk);
1120 
1121 	if (sock_owned_by_user(sk) || !odst) {
1122 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1123 		goto out;
1124 	}
1125 
1126 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1127 
1128 	rt = (struct rtable *)odst;
1129 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1130 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1131 		if (IS_ERR(rt))
1132 			goto out;
1133 
1134 		new = true;
1135 	}
1136 
1137 	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1138 
1139 	if (!dst_check(&rt->dst, 0)) {
1140 		if (new)
1141 			dst_release(&rt->dst);
1142 
1143 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1144 		if (IS_ERR(rt))
1145 			goto out;
1146 
1147 		new = true;
1148 	}
1149 
1150 	if (new)
1151 		sk_dst_set(sk, &rt->dst);
1152 
1153 out:
1154 	bh_unlock_sock(sk);
1155 	dst_release(odst);
1156 }
1157 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1158 
1159 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1160 		   int oif, u8 protocol)
1161 {
1162 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1163 	struct flowi4 fl4;
1164 	struct rtable *rt;
1165 
1166 	__build_flow_key(net, &fl4, NULL, iph, oif,
1167 			 RT_TOS(iph->tos), protocol, 0, 0);
1168 	rt = __ip_route_output_key(net, &fl4);
1169 	if (!IS_ERR(rt)) {
1170 		__ip_do_redirect(rt, skb, &fl4, false);
1171 		ip_rt_put(rt);
1172 	}
1173 }
1174 EXPORT_SYMBOL_GPL(ipv4_redirect);
1175 
1176 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1177 {
1178 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1179 	struct flowi4 fl4;
1180 	struct rtable *rt;
1181 	struct net *net = sock_net(sk);
1182 
1183 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1184 	rt = __ip_route_output_key(net, &fl4);
1185 	if (!IS_ERR(rt)) {
1186 		__ip_do_redirect(rt, skb, &fl4, false);
1187 		ip_rt_put(rt);
1188 	}
1189 }
1190 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1191 
1192 INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
1193 							 u32 cookie)
1194 {
1195 	struct rtable *rt = (struct rtable *) dst;
1196 
1197 	/* All IPV4 dsts are created with ->obsolete set to the value
1198 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1199 	 * into this function always.
1200 	 *
1201 	 * When a PMTU/redirect information update invalidates a route,
1202 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1203 	 * DST_OBSOLETE_DEAD.
1204 	 */
1205 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1206 		return NULL;
1207 	return dst;
1208 }
1209 EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
1210 
1211 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1212 {
1213 	struct ip_options opt;
1214 	int res;
1215 
1216 	/* Recompile ip options since IPCB may not be valid anymore.
1217 	 * Also check we have a reasonable ipv4 header.
1218 	 */
1219 	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1220 	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1221 		return;
1222 
1223 	memset(&opt, 0, sizeof(opt));
1224 	if (ip_hdr(skb)->ihl > 5) {
1225 		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1226 			return;
1227 		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1228 
1229 		rcu_read_lock();
1230 		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1231 		rcu_read_unlock();
1232 
1233 		if (res)
1234 			return;
1235 	}
1236 	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1237 }
1238 
1239 static void ipv4_link_failure(struct sk_buff *skb)
1240 {
1241 	struct rtable *rt;
1242 
1243 	ipv4_send_dest_unreach(skb);
1244 
1245 	rt = skb_rtable(skb);
1246 	if (rt)
1247 		dst_set_expires(&rt->dst, 0);
1248 }
1249 
1250 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1251 {
1252 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1253 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1254 		 skb->dev ? skb->dev->name : "?");
1255 	kfree_skb(skb);
1256 	WARN_ON(1);
1257 	return 0;
1258 }
1259 
1260 /*
1261    We do not cache the source address of the outgoing interface,
1262    because it is used only by the IP RR, TS and SRR options,
1263    so it is out of the fast path.
1264 
1265    BTW remember: "addr" is allowed to be unaligned
1266    in IP options!
1267  */
1268 
1269 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1270 {
1271 	__be32 src;
1272 
1273 	if (rt_is_output_route(rt))
1274 		src = ip_hdr(skb)->saddr;
1275 	else {
1276 		struct fib_result res;
1277 		struct iphdr *iph = ip_hdr(skb);
1278 		struct flowi4 fl4 = {
1279 			.daddr = iph->daddr,
1280 			.saddr = iph->saddr,
1281 			.flowi4_tos = RT_TOS(iph->tos),
1282 			.flowi4_oif = rt->dst.dev->ifindex,
1283 			.flowi4_iif = skb->dev->ifindex,
1284 			.flowi4_mark = skb->mark,
1285 		};
1286 
1287 		rcu_read_lock();
1288 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1289 			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1290 		else
1291 			src = inet_select_addr(rt->dst.dev,
1292 					       rt_nexthop(rt, iph->daddr),
1293 					       RT_SCOPE_UNIVERSE);
1294 		rcu_read_unlock();
1295 	}
1296 	memcpy(addr, &src, 4);
1297 }
1298 
1299 #ifdef CONFIG_IP_ROUTE_CLASSID
1300 static void set_class_tag(struct rtable *rt, u32 tag)
1301 {
1302 	if (!(rt->dst.tclassid & 0xFFFF))
1303 		rt->dst.tclassid |= tag & 0xFFFF;
1304 	if (!(rt->dst.tclassid & 0xFFFF0000))
1305 		rt->dst.tclassid |= tag & 0xFFFF0000;
1306 }
1307 #endif
1308 
1309 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1310 {
1311 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1312 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1313 				    ip_rt_min_advmss);
1314 
1315 	return min(advmss, IPV4_MAX_PMTU - header_size);
1316 }
1317 
1318 INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
1319 {
1320 	const struct rtable *rt = (const struct rtable *)dst;
1321 	unsigned int mtu = rt->rt_pmtu;
1322 
1323 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1324 		mtu = dst_metric_raw(dst, RTAX_MTU);
1325 
1326 	if (mtu)
1327 		return mtu;
1328 
1329 	mtu = READ_ONCE(dst->dev->mtu);
1330 
1331 	if (unlikely(ip_mtu_locked(dst))) {
1332 		if (rt->rt_uses_gateway && mtu > 576)
1333 			mtu = 576;
1334 	}
1335 
1336 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1337 
1338 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1339 }
1340 EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
1341 
1342 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1343 {
1344 	struct fnhe_hash_bucket *hash;
1345 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1346 	u32 hval = fnhe_hashfun(daddr);
1347 
1348 	spin_lock_bh(&fnhe_lock);
1349 
1350 	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1351 					 lockdep_is_held(&fnhe_lock));
1352 	hash += hval;
1353 
1354 	fnhe_p = &hash->chain;
1355 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1356 	while (fnhe) {
1357 		if (fnhe->fnhe_daddr == daddr) {
1358 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1359 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1360 			/* set fnhe_daddr to 0 to ensure it won't bind with
1361 			 * new dsts in rt_bind_exception().
1362 			 */
1363 			fnhe->fnhe_daddr = 0;
1364 			fnhe_flush_routes(fnhe);
1365 			kfree_rcu(fnhe, rcu);
1366 			break;
1367 		}
1368 		fnhe_p = &fnhe->fnhe_next;
1369 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1370 						 lockdep_is_held(&fnhe_lock));
1371 	}
1372 
1373 	spin_unlock_bh(&fnhe_lock);
1374 }
1375 
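/* Added note: look up the nexthop exception for daddr, removing it instead
 * if it has already expired.
 */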
1376 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1377 					       __be32 daddr)
1378 {
1379 	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1380 	struct fib_nh_exception *fnhe;
1381 	u32 hval;
1382 
1383 	if (!hash)
1384 		return NULL;
1385 
1386 	hval = fnhe_hashfun(daddr);
1387 
1388 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1389 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1390 		if (fnhe->fnhe_daddr == daddr) {
1391 			if (fnhe->fnhe_expires &&
1392 			    time_after(jiffies, fnhe->fnhe_expires)) {
1393 				ip_del_fnhe(nhc, daddr);
1394 				break;
1395 			}
1396 			return fnhe;
1397 		}
1398 	}
1399 	return NULL;
1400 }
1401 
1402 /* MTU selection:
1403  * 1. mtu on route is locked - use it
1404  * 2. mtu from nexthop exception
1405  * 3. mtu from egress device
1406  */
1407 
1408 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1409 {
1410 	struct fib_nh_common *nhc = res->nhc;
1411 	struct net_device *dev = nhc->nhc_dev;
1412 	struct fib_info *fi = res->fi;
1413 	u32 mtu = 0;
1414 
1415 	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1416 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1417 		mtu = fi->fib_mtu;
1418 
1419 	if (likely(!mtu)) {
1420 		struct fib_nh_exception *fnhe;
1421 
1422 		fnhe = find_exception(nhc, daddr);
1423 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1424 			mtu = fnhe->fnhe_pmtu;
1425 	}
1426 
1427 	if (likely(!mtu))
1428 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1429 
1430 	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1431 }
1432 
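/* Added note: attach a route to its nexthop exception.  Copy the learned
 * gateway/PMTU into the route and, if caching is allowed, make it the
 * exception's cached input or output route.
 */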
1433 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1434 			      __be32 daddr, const bool do_cache)
1435 {
1436 	bool ret = false;
1437 
1438 	spin_lock_bh(&fnhe_lock);
1439 
1440 	if (daddr == fnhe->fnhe_daddr) {
1441 		struct rtable __rcu **porig;
1442 		struct rtable *orig;
1443 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1444 
1445 		if (rt_is_input_route(rt))
1446 			porig = &fnhe->fnhe_rth_input;
1447 		else
1448 			porig = &fnhe->fnhe_rth_output;
1449 		orig = rcu_dereference(*porig);
1450 
1451 		if (fnhe->fnhe_genid != genid) {
1452 			fnhe->fnhe_genid = genid;
1453 			fnhe->fnhe_gw = 0;
1454 			fnhe->fnhe_pmtu = 0;
1455 			fnhe->fnhe_expires = 0;
1456 			fnhe->fnhe_mtu_locked = false;
1457 			fnhe_flush_routes(fnhe);
1458 			orig = NULL;
1459 		}
1460 		fill_route_from_fnhe(rt, fnhe);
1461 		if (!rt->rt_gw4) {
1462 			rt->rt_gw4 = daddr;
1463 			rt->rt_gw_family = AF_INET;
1464 		}
1465 
1466 		if (do_cache) {
1467 			dst_hold(&rt->dst);
1468 			rcu_assign_pointer(*porig, rt);
1469 			if (orig) {
1470 				dst_dev_put(&orig->dst);
1471 				dst_release(&orig->dst);
1472 			}
1473 			ret = true;
1474 		}
1475 
1476 		fnhe->fnhe_stamp = jiffies;
1477 	}
1478 	spin_unlock_bh(&fnhe_lock);
1479 
1480 	return ret;
1481 }
1482 
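/* Added note: try to install a route as the nexthop's cached input route or
 * per-cpu output route.  cmpxchg() guards against a concurrent update, and
 * any route we displace is moved to the uncached list.
 */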
1483 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1484 {
1485 	struct rtable *orig, *prev, **p;
1486 	bool ret = true;
1487 
1488 	if (rt_is_input_route(rt)) {
1489 		p = (struct rtable **)&nhc->nhc_rth_input;
1490 	} else {
1491 		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1492 	}
1493 	orig = *p;
1494 
1495 	/* hold dst before doing cmpxchg() to avoid race condition
1496 	 * on this dst
1497 	 */
1498 	dst_hold(&rt->dst);
1499 	prev = cmpxchg(p, orig, rt);
1500 	if (prev == orig) {
1501 		if (orig) {
1502 			rt_add_uncached_list(orig);
1503 			dst_release(&orig->dst);
1504 		}
1505 	} else {
1506 		dst_release(&rt->dst);
1507 		ret = false;
1508 	}
1509 
1510 	return ret;
1511 }
1512 
1513 struct uncached_list {
1514 	spinlock_t		lock;
1515 	struct list_head	head;
1516 };
1517 
1518 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1519 
1520 void rt_add_uncached_list(struct rtable *rt)
1521 {
1522 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1523 
1524 	rt->rt_uncached_list = ul;
1525 
1526 	spin_lock_bh(&ul->lock);
1527 	list_add_tail(&rt->rt_uncached, &ul->head);
1528 	spin_unlock_bh(&ul->lock);
1529 }
1530 
1531 void rt_del_uncached_list(struct rtable *rt)
1532 {
1533 	if (!list_empty(&rt->rt_uncached)) {
1534 		struct uncached_list *ul = rt->rt_uncached_list;
1535 
1536 		spin_lock_bh(&ul->lock);
1537 		list_del(&rt->rt_uncached);
1538 		spin_unlock_bh(&ul->lock);
1539 	}
1540 }
1541 
1542 static void ipv4_dst_destroy(struct dst_entry *dst)
1543 {
1544 	struct rtable *rt = (struct rtable *)dst;
1545 
1546 	ip_dst_metrics_put(dst);
1547 	rt_del_uncached_list(rt);
1548 }
1549 
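/* Added note: a device is going away.  Repoint every uncached route that
 * still uses it at blackhole_netdev so the original device's reference can
 * be released.
 */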
1550 void rt_flush_dev(struct net_device *dev)
1551 {
1552 	struct rtable *rt;
1553 	int cpu;
1554 
1555 	for_each_possible_cpu(cpu) {
1556 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1557 
1558 		spin_lock_bh(&ul->lock);
1559 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1560 			if (rt->dst.dev != dev)
1561 				continue;
1562 			rt->dst.dev = blackhole_netdev;
1563 			dev_hold(rt->dst.dev);
1564 			dev_put(dev);
1565 		}
1566 		spin_unlock_bh(&ul->lock);
1567 	}
1568 }
1569 
1570 static bool rt_cache_valid(const struct rtable *rt)
1571 {
1572 	return	rt &&
1573 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1574 		!rt_is_expired(rt);
1575 }
1576 
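/* Added note: fill in the nexthop-derived fields of a new route (gateway,
 * metrics, tclassid, lwtunnel state) and try to cache it on the FIB nexthop
 * or the matching exception; routes that cannot be cached go on the
 * uncached list instead.
 */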
1577 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1578 			   const struct fib_result *res,
1579 			   struct fib_nh_exception *fnhe,
1580 			   struct fib_info *fi, u16 type, u32 itag,
1581 			   const bool do_cache)
1582 {
1583 	bool cached = false;
1584 
1585 	if (fi) {
1586 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1587 
1588 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1589 			rt->rt_uses_gateway = 1;
1590 			rt->rt_gw_family = nhc->nhc_gw_family;
1591 			/* only INET and INET6 are supported */
1592 			if (likely(nhc->nhc_gw_family == AF_INET))
1593 				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1594 			else
1595 				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1596 		}
1597 
1598 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1599 
1600 #ifdef CONFIG_IP_ROUTE_CLASSID
1601 		if (nhc->nhc_family == AF_INET) {
1602 			struct fib_nh *nh;
1603 
1604 			nh = container_of(nhc, struct fib_nh, nh_common);
1605 			rt->dst.tclassid = nh->nh_tclassid;
1606 		}
1607 #endif
1608 		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1609 		if (unlikely(fnhe))
1610 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1611 		else if (do_cache)
1612 			cached = rt_cache_route(nhc, rt);
1613 		if (unlikely(!cached)) {
1614 			/* Routes we intend to cache in a nexthop exception or
1615 			 * FIB nexthop have the DST_NOCACHE bit clear.
1616 			 * However, if we are unsuccessful at storing this
1617 			 * route into the cache we really need to set it.
1618 			 */
1619 			if (!rt->rt_gw4) {
1620 				rt->rt_gw_family = AF_INET;
1621 				rt->rt_gw4 = daddr;
1622 			}
1623 			rt_add_uncached_list(rt);
1624 		}
1625 	} else
1626 		rt_add_uncached_list(rt);
1627 
1628 #ifdef CONFIG_IP_ROUTE_CLASSID
1629 #ifdef CONFIG_IP_MULTIPLE_TABLES
1630 	set_class_tag(rt, res->tclassid);
1631 #endif
1632 	set_class_tag(rt, itag);
1633 #endif
1634 }
1635 
1636 struct rtable *rt_dst_alloc(struct net_device *dev,
1637 			    unsigned int flags, u16 type,
1638 			    bool nopolicy, bool noxfrm)
1639 {
1640 	struct rtable *rt;
1641 
1642 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1643 		       (nopolicy ? DST_NOPOLICY : 0) |
1644 		       (noxfrm ? DST_NOXFRM : 0));
1645 
1646 	if (rt) {
1647 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1648 		rt->rt_flags = flags;
1649 		rt->rt_type = type;
1650 		rt->rt_is_input = 0;
1651 		rt->rt_iif = 0;
1652 		rt->rt_pmtu = 0;
1653 		rt->rt_mtu_locked = 0;
1654 		rt->rt_uses_gateway = 0;
1655 		rt->rt_gw_family = 0;
1656 		rt->rt_gw4 = 0;
1657 		INIT_LIST_HEAD(&rt->rt_uncached);
1658 
1659 		rt->dst.output = ip_output;
1660 		if (flags & RTCF_LOCAL)
1661 			rt->dst.input = ip_local_deliver;
1662 	}
1663 
1664 	return rt;
1665 }
1666 EXPORT_SYMBOL(rt_dst_alloc);
1667 
1668 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1669 {
1670 	struct rtable *new_rt;
1671 
1672 	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1673 			   rt->dst.flags);
1674 
1675 	if (new_rt) {
1676 		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1677 		new_rt->rt_flags = rt->rt_flags;
1678 		new_rt->rt_type = rt->rt_type;
1679 		new_rt->rt_is_input = rt->rt_is_input;
1680 		new_rt->rt_iif = rt->rt_iif;
1681 		new_rt->rt_pmtu = rt->rt_pmtu;
1682 		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1683 		new_rt->rt_gw_family = rt->rt_gw_family;
1684 		if (rt->rt_gw_family == AF_INET)
1685 			new_rt->rt_gw4 = rt->rt_gw4;
1686 		else if (rt->rt_gw_family == AF_INET6)
1687 			new_rt->rt_gw6 = rt->rt_gw6;
1688 		INIT_LIST_HEAD(&new_rt->rt_uncached);
1689 
1690 		new_rt->dst.input = rt->dst.input;
1691 		new_rt->dst.output = rt->dst.output;
1692 		new_rt->dst.error = rt->dst.error;
1693 		new_rt->dst.lastuse = jiffies;
1694 		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1695 	}
1696 	return new_rt;
1697 }
1698 EXPORT_SYMBOL(rt_dst_clone);
1699 
1700 /* called in rcu_read_lock() section */
1701 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1702 			  u8 tos, struct net_device *dev,
1703 			  struct in_device *in_dev, u32 *itag)
1704 {
1705 	int err;
1706 
1707 	/* Primary sanity checks. */
1708 	if (!in_dev)
1709 		return -EINVAL;
1710 
1711 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1712 	    skb->protocol != htons(ETH_P_IP))
1713 		return -EINVAL;
1714 
1715 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1716 		return -EINVAL;
1717 
1718 	if (ipv4_is_zeronet(saddr)) {
1719 		if (!ipv4_is_local_multicast(daddr) &&
1720 		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1721 			return -EINVAL;
1722 	} else {
1723 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1724 					  in_dev, itag);
1725 		if (err < 0)
1726 			return err;
1727 	}
1728 	return 0;
1729 }
1730 
1731 /* called in rcu_read_lock() section */
1732 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1733 			     u8 tos, struct net_device *dev, int our)
1734 {
1735 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1736 	unsigned int flags = RTCF_MULTICAST;
1737 	struct rtable *rth;
1738 	u32 itag = 0;
1739 	int err;
1740 
1741 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1742 	if (err)
1743 		return err;
1744 
1745 	if (our)
1746 		flags |= RTCF_LOCAL;
1747 
1748 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1749 			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
1750 	if (!rth)
1751 		return -ENOBUFS;
1752 
1753 #ifdef CONFIG_IP_ROUTE_CLASSID
1754 	rth->dst.tclassid = itag;
1755 #endif
1756 	rth->dst.output = ip_rt_bug;
1757 	rth->rt_is_input= 1;
1758 
1759 #ifdef CONFIG_IP_MROUTE
1760 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1761 		rth->dst.input = ip_mr_input;
1762 #endif
1763 	RT_CACHE_STAT_INC(in_slow_mc);
1764 
1765 	skb_dst_set(skb, &rth->dst);
1766 	return 0;
1767 }
1768 
1769 
1770 static void ip_handle_martian_source(struct net_device *dev,
1771 				     struct in_device *in_dev,
1772 				     struct sk_buff *skb,
1773 				     __be32 daddr,
1774 				     __be32 saddr)
1775 {
1776 	RT_CACHE_STAT_INC(in_martian_src);
1777 #ifdef CONFIG_IP_ROUTE_VERBOSE
1778 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1779 		/*
1780 		 *	RFC1812 recommendation: if the source is martian,
1781 		 *	the only hint is the MAC header.
1782 		 */
1783 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1784 			&daddr, &saddr, dev->name);
1785 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1786 			print_hex_dump(KERN_WARNING, "ll header: ",
1787 				       DUMP_PREFIX_OFFSET, 16, 1,
1788 				       skb_mac_header(skb),
1789 				       dev->hard_header_len, false);
1790 		}
1791 	}
1792 #endif
1793 }
1794 
1795 /* called in rcu_read_lock() section */
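/* Added note: build the forwarding route for one FIB lookup result, reusing
 * a cached route from the nexthop or its exception when possible.
 */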
1796 static int __mkroute_input(struct sk_buff *skb,
1797 			   const struct fib_result *res,
1798 			   struct in_device *in_dev,
1799 			   __be32 daddr, __be32 saddr, u32 tos)
1800 {
1801 	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1802 	struct net_device *dev = nhc->nhc_dev;
1803 	struct fib_nh_exception *fnhe;
1804 	struct rtable *rth;
1805 	int err;
1806 	struct in_device *out_dev;
1807 	bool do_cache;
1808 	u32 itag = 0;
1809 
1810 	/* get a working reference to the output device */
1811 	out_dev = __in_dev_get_rcu(dev);
1812 	if (!out_dev) {
1813 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1814 		return -EINVAL;
1815 	}
1816 
1817 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1818 				  in_dev->dev, in_dev, &itag);
1819 	if (err < 0) {
1820 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1821 					 saddr);
1822 
1823 		goto cleanup;
1824 	}
1825 
1826 	do_cache = res->fi && !itag;
1827 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1828 	    skb->protocol == htons(ETH_P_IP)) {
1829 		__be32 gw;
1830 
1831 		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1832 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1833 		    inet_addr_onlink(out_dev, saddr, gw))
1834 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1835 	}
1836 
1837 	if (skb->protocol != htons(ETH_P_IP)) {
1838 		/* Not IP (i.e. ARP). Do not create a route if it is
1839 		 * invalid for proxy arp. DNAT routes are always valid.
1840 		 *
1841 		 * The proxy arp feature has been extended to allow ARP
1842 		 * replies back on the same interface, to support
1843 		 * Private VLAN switch technologies. See arp.c.
1844 		 */
1845 		if (out_dev == in_dev &&
1846 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1847 			err = -EINVAL;
1848 			goto cleanup;
1849 		}
1850 	}
1851 
1852 	fnhe = find_exception(nhc, daddr);
1853 	if (do_cache) {
1854 		if (fnhe)
1855 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1856 		else
1857 			rth = rcu_dereference(nhc->nhc_rth_input);
1858 		if (rt_cache_valid(rth)) {
1859 			skb_dst_set_noref(skb, &rth->dst);
1860 			goto out;
1861 		}
1862 	}
1863 
1864 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1865 			   IN_DEV_ORCONF(in_dev, NOPOLICY),
1866 			   IN_DEV_ORCONF(out_dev, NOXFRM));
1867 	if (!rth) {
1868 		err = -ENOBUFS;
1869 		goto cleanup;
1870 	}
1871 
1872 	rth->rt_is_input = 1;
1873 	RT_CACHE_STAT_INC(in_slow_tot);
1874 
1875 	rth->dst.input = ip_forward;
1876 
1877 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1878 		       do_cache);
1879 	lwtunnel_set_redirect(&rth->dst);
1880 	skb_dst_set(skb, &rth->dst);
1881 out:
1882 	err = 0;
1883  cleanup:
1884 	return err;
1885 }
1886 
1887 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1888 /* To make ICMP packets follow the right flow, the multipath hash is
1889  * calculated from the inner IP addresses.
1890  */
1891 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1892 				 struct flow_keys *hash_keys)
1893 {
1894 	const struct iphdr *outer_iph = ip_hdr(skb);
1895 	const struct iphdr *key_iph = outer_iph;
1896 	const struct iphdr *inner_iph;
1897 	const struct icmphdr *icmph;
1898 	struct iphdr _inner_iph;
1899 	struct icmphdr _icmph;
1900 
1901 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1902 		goto out;
1903 
1904 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1905 		goto out;
1906 
1907 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1908 				   &_icmph);
1909 	if (!icmph)
1910 		goto out;
1911 
1912 	if (!icmp_is_err(icmph->type))
1913 		goto out;
1914 
1915 	inner_iph = skb_header_pointer(skb,
1916 				       outer_iph->ihl * 4 + sizeof(_icmph),
1917 				       sizeof(_inner_iph), &_inner_iph);
1918 	if (!inner_iph)
1919 		goto out;
1920 
1921 	key_iph = inner_iph;
1922 out:
1923 	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1924 	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1925 }
1926 
1927 /* if skb is set it will be used and fl4 can be NULL */
1928 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1929 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1930 {
1931 	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1932 	struct flow_keys hash_keys;
1933 	u32 mhash;
1934 
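	/* The sysctl fib_multipath_hash_policy selects how the hash is built:
	 * 0 hashes the L3 addresses only, 1 hashes the L4 five-tuple
	 * (addresses, ports, protocol), and 2 dissects the packet and prefers
	 * the innermost addresses when encapsulation is present.
	 */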
1935 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1936 	case 0:
1937 		memset(&hash_keys, 0, sizeof(hash_keys));
1938 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1939 		if (skb) {
1940 			ip_multipath_l3_keys(skb, &hash_keys);
1941 		} else {
1942 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1943 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1944 		}
1945 		break;
1946 	case 1:
1947 		/* skb is currently provided only when forwarding */
1948 		if (skb) {
1949 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1950 			struct flow_keys keys;
1951 
1952 			/* short-circuit if we already have L4 hash present */
1953 			if (skb->l4_hash)
1954 				return skb_get_hash_raw(skb) >> 1;
1955 
1956 			memset(&hash_keys, 0, sizeof(hash_keys));
1957 
1958 			if (!flkeys) {
1959 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1960 				flkeys = &keys;
1961 			}
1962 
1963 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1964 			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1965 			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1966 			hash_keys.ports.src = flkeys->ports.src;
1967 			hash_keys.ports.dst = flkeys->ports.dst;
1968 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1969 		} else {
1970 			memset(&hash_keys, 0, sizeof(hash_keys));
1971 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1972 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1973 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1974 			hash_keys.ports.src = fl4->fl4_sport;
1975 			hash_keys.ports.dst = fl4->fl4_dport;
1976 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1977 		}
1978 		break;
1979 	case 2:
1980 		memset(&hash_keys, 0, sizeof(hash_keys));
1981 		/* skb is currently provided only when forwarding */
1982 		if (skb) {
1983 			struct flow_keys keys;
1984 
1985 			skb_flow_dissect_flow_keys(skb, &keys, 0);
1986 			/* Inner can be v4 or v6 */
1987 			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1988 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1989 				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1990 				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1991 			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1992 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1993 				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1994 				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1995 				hash_keys.tags.flow_label = keys.tags.flow_label;
1996 				hash_keys.basic.ip_proto = keys.basic.ip_proto;
1997 			} else {
1998 				/* Same as case 0 */
1999 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2000 				ip_multipath_l3_keys(skb, &hash_keys);
2001 			}
2002 		} else {
2003 			/* Same as case 0 */
2004 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2005 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2006 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2007 		}
2008 		break;
2009 	}
2010 	mhash = flow_hash_from_keys(&hash_keys);
2011 
2012 	if (multipath_hash)
2013 		mhash = jhash_2words(mhash, multipath_hash, 0);
2014 
2015 	return mhash >> 1;
2016 }
2017 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2018 
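/* Pick a nexthop (via the multipath hash when the fib entry has more than
 * one path) and then build the input route cache entry for it.
 */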
2019 static int ip_mkroute_input(struct sk_buff *skb,
2020 			    struct fib_result *res,
2021 			    struct in_device *in_dev,
2022 			    __be32 daddr, __be32 saddr, u32 tos,
2023 			    struct flow_keys *hkeys)
2024 {
2025 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2026 	if (res->fi && fib_info_num_path(res->fi) > 1) {
2027 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2028 
2029 		fib_select_multipath(res, h);
2030 	}
2031 #endif
2032 
2033 	/* create a routing cache entry */
2034 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2035 }
2036 
2037 /* Implements the same saddr-related checks as ip_route_input_slow(),
2038  * assuming daddr is valid and the destination is not a local broadcast one.
2039  * Uses the provided hint instead of performing a route lookup.
2040  */
2041 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2042 		      u8 tos, struct net_device *dev,
2043 		      const struct sk_buff *hint)
2044 {
2045 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2046 	struct rtable *rt = skb_rtable(hint);
2047 	struct net *net = dev_net(dev);
2048 	int err = -EINVAL;
2049 	u32 tag = 0;
2050 
2051 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2052 		goto martian_source;
2053 
2054 	if (ipv4_is_zeronet(saddr))
2055 		goto martian_source;
2056 
2057 	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2058 		goto martian_source;
2059 
2060 	if (rt->rt_type != RTN_LOCAL)
2061 		goto skip_validate_source;
2062 
2063 	tos &= IPTOS_RT_MASK;
2064 	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2065 	if (err < 0)
2066 		goto martian_source;
2067 
2068 skip_validate_source:
2069 	skb_dst_copy(skb, hint);
2070 	return 0;
2071 
2072 martian_source:
2073 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2074 	return err;
2075 }
2076 
2077 /*
2078  *	NOTE. We drop all packets that have a local source
2079  *	address, because every properly looped-back packet
2080  *	must already have the correct destination attached by the output routine.
2081  *	Changes to the enforced policies must also be applied to
2082  *	ip_route_use_hint().
2083  *
2084  *	Such an approach solves two big problems:
2085  *	1. Non-simplex devices are handled properly.
2086  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2087  *	Called with rcu_read_lock().
2088  */
2089 
2090 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2091 			       u8 tos, struct net_device *dev,
2092 			       struct fib_result *res)
2093 {
2094 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2095 	struct flow_keys *flkeys = NULL, _flkeys;
2096 	struct net    *net = dev_net(dev);
2097 	struct ip_tunnel_info *tun_info;
2098 	int		err = -EINVAL;
2099 	unsigned int	flags = 0;
2100 	u32		itag = 0;
2101 	struct rtable	*rth;
2102 	struct flowi4	fl4;
2103 	bool do_cache = true;
2104 
2105 	/* IP on this device is disabled. */
2106 
2107 	if (!in_dev)
2108 		goto out;
2109 
2110 	/* Check for the weirdest martians, which cannot be detected
2111 	   by fib_lookup.
2112 	 */
2113 
2114 	tun_info = skb_tunnel_info(skb);
2115 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2116 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2117 	else
2118 		fl4.flowi4_tun_key.tun_id = 0;
2119 	skb_dst_drop(skb);
2120 
2121 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2122 		goto martian_source;
2123 
2124 	res->fi = NULL;
2125 	res->table = NULL;
2126 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2127 		goto brd_input;
2128 
2129 	/* Accept zero addresses only for limited broadcast;
2130 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2131 	 */
2132 	if (ipv4_is_zeronet(saddr))
2133 		goto martian_source;
2134 
2135 	if (ipv4_is_zeronet(daddr))
2136 		goto martian_destination;
2137 
2138 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2139 	 * and calls it at most once when daddr and/or saddr are loopback addresses
2140 	 */
2141 	if (ipv4_is_loopback(daddr)) {
2142 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2143 			goto martian_destination;
2144 	} else if (ipv4_is_loopback(saddr)) {
2145 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2146 			goto martian_source;
2147 	}
2148 
2149 	/*
2150 	 *	Now we are ready to route the packet.
2151 	 */
2152 	fl4.flowi4_oif = 0;
2153 	fl4.flowi4_iif = dev->ifindex;
2154 	fl4.flowi4_mark = skb->mark;
2155 	fl4.flowi4_tos = tos;
2156 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2157 	fl4.flowi4_flags = 0;
2158 	fl4.daddr = daddr;
2159 	fl4.saddr = saddr;
2160 	fl4.flowi4_uid = sock_net_uid(net, NULL);
2161 	fl4.flowi4_multipath_hash = 0;
2162 
2163 	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2164 		flkeys = &_flkeys;
2165 	} else {
2166 		fl4.flowi4_proto = 0;
2167 		fl4.fl4_sport = 0;
2168 		fl4.fl4_dport = 0;
2169 	}
2170 
2171 	err = fib_lookup(net, &fl4, res, 0);
2172 	if (err != 0) {
2173 		if (!IN_DEV_FORWARD(in_dev))
2174 			err = -EHOSTUNREACH;
2175 		goto no_route;
2176 	}
2177 
2178 	if (res->type == RTN_BROADCAST) {
2179 		if (IN_DEV_BFORWARD(in_dev))
2180 			goto make_route;
2181 		/* not do cache if bc_forwarding is enabled */
2182 		/* do not cache if bc_forwarding is enabled */
2183 			do_cache = false;
2184 		goto brd_input;
2185 	}
2186 
2187 	if (res->type == RTN_LOCAL) {
2188 		err = fib_validate_source(skb, saddr, daddr, tos,
2189 					  0, dev, in_dev, &itag);
2190 		if (err < 0)
2191 			goto martian_source;
2192 		goto local_input;
2193 	}
2194 
2195 	if (!IN_DEV_FORWARD(in_dev)) {
2196 		err = -EHOSTUNREACH;
2197 		goto no_route;
2198 	}
2199 	if (res->type != RTN_UNICAST)
2200 		goto martian_destination;
2201 
2202 make_route:
2203 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2204 out:	return err;
2205 
2206 brd_input:
2207 	if (skb->protocol != htons(ETH_P_IP))
2208 		goto e_inval;
2209 
2210 	if (!ipv4_is_zeronet(saddr)) {
2211 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2212 					  in_dev, &itag);
2213 		if (err < 0)
2214 			goto martian_source;
2215 	}
2216 	flags |= RTCF_BROADCAST;
2217 	res->type = RTN_BROADCAST;
2218 	RT_CACHE_STAT_INC(in_brd);
2219 
2220 local_input:
2221 	do_cache &= res->fi && !itag;
2222 	if (do_cache) {
2223 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2224 
2225 		rth = rcu_dereference(nhc->nhc_rth_input);
2226 		if (rt_cache_valid(rth)) {
2227 			skb_dst_set_noref(skb, &rth->dst);
2228 			err = 0;
2229 			goto out;
2230 		}
2231 	}
2232 
2233 	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2234 			   flags | RTCF_LOCAL, res->type,
2235 			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
2236 	if (!rth)
2237 		goto e_nobufs;
2238 
2239 	rth->dst.output = ip_rt_bug;
2240 #ifdef CONFIG_IP_ROUTE_CLASSID
2241 	rth->dst.tclassid = itag;
2242 #endif
2243 	rth->rt_is_input = 1;
2244 
2245 	RT_CACHE_STAT_INC(in_slow_tot);
2246 	if (res->type == RTN_UNREACHABLE) {
2247 		rth->dst.input = ip_error;
2248 		rth->dst.error = -err;
2249 		rth->rt_flags &= ~RTCF_LOCAL;
2250 	}
2251 
2252 	if (do_cache) {
2253 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2254 
2255 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2256 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2257 			WARN_ON(rth->dst.input == lwtunnel_input);
2258 			rth->dst.lwtstate->orig_input = rth->dst.input;
2259 			rth->dst.input = lwtunnel_input;
2260 		}
2261 
2262 		if (unlikely(!rt_cache_route(nhc, rth)))
2263 			rt_add_uncached_list(rth);
2264 	}
2265 	skb_dst_set(skb, &rth->dst);
2266 	err = 0;
2267 	goto out;
2268 
2269 no_route:
2270 	RT_CACHE_STAT_INC(in_no_route);
2271 	res->type = RTN_UNREACHABLE;
2272 	res->fi = NULL;
2273 	res->table = NULL;
2274 	goto local_input;
2275 
2276 	/*
2277 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2278 	 */
2279 martian_destination:
2280 	RT_CACHE_STAT_INC(in_martian_dst);
2281 #ifdef CONFIG_IP_ROUTE_VERBOSE
2282 	if (IN_DEV_LOG_MARTIANS(in_dev))
2283 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2284 				     &daddr, &saddr, dev->name);
2285 #endif
2286 
2287 e_inval:
2288 	err = -EINVAL;
2289 	goto out;
2290 
2291 e_nobufs:
2292 	err = -ENOBUFS;
2293 	goto out;
2294 
2295 martian_source:
2296 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2297 	goto out;
2298 }
2299 
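/* Entry point for input route lookups that do not need the fib_result:
 * takes rcu_read_lock() around ip_route_input_rcu() and returns only the
 * error code; the resulting dst is attached to the skb.
 */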
2300 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2301 			 u8 tos, struct net_device *dev)
2302 {
2303 	struct fib_result res;
2304 	int err;
2305 
2306 	tos &= IPTOS_RT_MASK;
2307 	rcu_read_lock();
2308 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2309 	rcu_read_unlock();
2310 
2311 	return err;
2312 }
2313 EXPORT_SYMBOL(ip_route_input_noref);
2314 
2315 /* called with rcu_read_lock held */
2316 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2317 		       u8 tos, struct net_device *dev, struct fib_result *res)
2318 {
2319 	/* Multicast recognition logic was moved from the route cache to here.
2320 	   The problem was that too many Ethernet cards have broken/missing
2321 	   hardware multicast filters :-( As a result, a host on a multicast
2322 	   network acquires a lot of useless route cache entries, e.g. for
2323 	   SDR messages from all over the world. Now we try to get rid of them.
2324 	   Really, provided the software IP multicast filter is organized
2325 	   reasonably (at least, hashed), it does not result in a slowdown
2326 	   compared with route cache reject entries.
2327 	   Note that multicast routers are not affected, because a
2328 	   route cache entry is created eventually.
2329 	 */
2330 	if (ipv4_is_multicast(daddr)) {
2331 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2332 		int our = 0;
2333 		int err = -EINVAL;
2334 
2335 		if (!in_dev)
2336 			return err;
2337 		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2338 				      ip_hdr(skb)->protocol);
2339 
2340 		/* check l3 master if no match yet */
2341 		if (!our && netif_is_l3_slave(dev)) {
2342 			struct in_device *l3_in_dev;
2343 
2344 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2345 			if (l3_in_dev)
2346 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2347 						      ip_hdr(skb)->protocol);
2348 		}
2349 
2350 		if (our
2351 #ifdef CONFIG_IP_MROUTE
2352 			||
2353 		    (!ipv4_is_local_multicast(daddr) &&
2354 		     IN_DEV_MFORWARD(in_dev))
2355 #endif
2356 		   ) {
2357 			err = ip_route_input_mc(skb, daddr, saddr,
2358 						tos, dev, our);
2359 		}
2360 		return err;
2361 	}
2362 
2363 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2364 }
2365 
2366 /* called with rcu_read_lock() */
2367 static struct rtable *__mkroute_output(const struct fib_result *res,
2368 				       const struct flowi4 *fl4, int orig_oif,
2369 				       struct net_device *dev_out,
2370 				       unsigned int flags)
2371 {
2372 	struct fib_info *fi = res->fi;
2373 	struct fib_nh_exception *fnhe;
2374 	struct in_device *in_dev;
2375 	u16 type = res->type;
2376 	struct rtable *rth;
2377 	bool do_cache;
2378 
2379 	in_dev = __in_dev_get_rcu(dev_out);
2380 	if (!in_dev)
2381 		return ERR_PTR(-EINVAL);
2382 
2383 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2384 		if (ipv4_is_loopback(fl4->saddr) &&
2385 		    !(dev_out->flags & IFF_LOOPBACK) &&
2386 		    !netif_is_l3_master(dev_out))
2387 			return ERR_PTR(-EINVAL);
2388 
2389 	if (ipv4_is_lbcast(fl4->daddr))
2390 		type = RTN_BROADCAST;
2391 	else if (ipv4_is_multicast(fl4->daddr))
2392 		type = RTN_MULTICAST;
2393 	else if (ipv4_is_zeronet(fl4->daddr))
2394 		return ERR_PTR(-EINVAL);
2395 
2396 	if (dev_out->flags & IFF_LOOPBACK)
2397 		flags |= RTCF_LOCAL;
2398 
2399 	do_cache = true;
2400 	if (type == RTN_BROADCAST) {
2401 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2402 		fi = NULL;
2403 	} else if (type == RTN_MULTICAST) {
2404 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2405 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2406 				     fl4->flowi4_proto))
2407 			flags &= ~RTCF_LOCAL;
2408 		else
2409 			do_cache = false;
2410 		/* If a multicast route does not exist, use the
2411 		 * default one, but do not use a gateway in this case.
2412 		 * Yes, it is a hack.
2413 		 */
2414 		if (fi && res->prefixlen < 4)
2415 			fi = NULL;
2416 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2417 		   (orig_oif != dev_out->ifindex)) {
2418 		/* For local routes that require a particular output interface
2419 		 * we do not want to cache the result.  Caching the result
2420 		 * causes incorrect behaviour when there are multiple source
2421 		 * addresses on the interface; the end result is that if the
2422 		 * intended recipient is waiting on that interface for the
2423 		 * packet, it won't be received, because it will be delivered on
2424 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2425 		 * be set to the loopback interface as well.
2426 		 */
2427 		do_cache = false;
2428 	}
2429 
2430 	fnhe = NULL;
2431 	do_cache &= fi != NULL;
2432 	if (fi) {
2433 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2434 		struct rtable __rcu **prth;
2435 
2436 		fnhe = find_exception(nhc, fl4->daddr);
2437 		if (!do_cache)
2438 			goto add;
2439 		if (fnhe) {
2440 			prth = &fnhe->fnhe_rth_output;
2441 		} else {
2442 			if (unlikely(fl4->flowi4_flags &
2443 				     FLOWI_FLAG_KNOWN_NH &&
2444 				     !(nhc->nhc_gw_family &&
2445 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2446 				do_cache = false;
2447 				goto add;
2448 			}
2449 			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2450 		}
2451 		rth = rcu_dereference(*prth);
2452 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2453 			return rth;
2454 	}
2455 
2456 add:
2457 	rth = rt_dst_alloc(dev_out, flags, type,
2458 			   IN_DEV_ORCONF(in_dev, NOPOLICY),
2459 			   IN_DEV_ORCONF(in_dev, NOXFRM));
2460 	if (!rth)
2461 		return ERR_PTR(-ENOBUFS);
2462 
2463 	rth->rt_iif = orig_oif;
2464 
2465 	RT_CACHE_STAT_INC(out_slow_tot);
2466 
2467 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2468 		if (flags & RTCF_LOCAL &&
2469 		    !(dev_out->flags & IFF_LOOPBACK)) {
2470 			rth->dst.output = ip_mc_output;
2471 			RT_CACHE_STAT_INC(out_slow_mc);
2472 		}
2473 #ifdef CONFIG_IP_MROUTE
2474 		if (type == RTN_MULTICAST) {
2475 			if (IN_DEV_MFORWARD(in_dev) &&
2476 			    !ipv4_is_local_multicast(fl4->daddr)) {
2477 				rth->dst.input = ip_mr_input;
2478 				rth->dst.output = ip_mc_output;
2479 			}
2480 		}
2481 #endif
2482 	}
2483 
2484 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2485 	lwtunnel_set_redirect(&rth->dst);
2486 
2487 	return rth;
2488 }
2489 
2490 /*
2491  * Major route resolver routine.
2492  */
2493 
2494 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2495 					const struct sk_buff *skb)
2496 {
2497 	__u8 tos = RT_FL_TOS(fl4);
2498 	struct fib_result res = {
2499 		.type		= RTN_UNSPEC,
2500 		.fi		= NULL,
2501 		.table		= NULL,
2502 		.tclassid	= 0,
2503 	};
2504 	struct rtable *rth;
2505 
2506 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2507 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2508 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2509 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2510 
2511 	rcu_read_lock();
2512 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2513 	rcu_read_unlock();
2514 
2515 	return rth;
2516 }
2517 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2518 
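/* Core of the output route resolver, called with rcu_read_lock() held:
 * sanity-check saddr, resolve the output device and source address for
 * multicast/broadcast and oif-bound flows, fall back to a local route when
 * daddr is empty, otherwise consult the FIB and build the route via
 * __mkroute_output().
 */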
2519 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2520 					    struct fib_result *res,
2521 					    const struct sk_buff *skb)
2522 {
2523 	struct net_device *dev_out = NULL;
2524 	int orig_oif = fl4->flowi4_oif;
2525 	unsigned int flags = 0;
2526 	struct rtable *rth;
2527 	int err;
2528 
2529 	if (fl4->saddr) {
2530 		if (ipv4_is_multicast(fl4->saddr) ||
2531 		    ipv4_is_lbcast(fl4->saddr) ||
2532 		    ipv4_is_zeronet(fl4->saddr)) {
2533 			rth = ERR_PTR(-EINVAL);
2534 			goto out;
2535 		}
2536 
2537 		rth = ERR_PTR(-ENETUNREACH);
2538 
2539 		/* I removed the check for oif == dev_out->oif here.
2540 		   It was wrong for two reasons:
2541 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2542 		      is assigned to multiple interfaces.
2543 		   2. Moreover, we are allowed to send packets with the saddr
2544 		      of another iface. --ANK
2545 		 */
2546 
2547 		if (fl4->flowi4_oif == 0 &&
2548 		    (ipv4_is_multicast(fl4->daddr) ||
2549 		     ipv4_is_lbcast(fl4->daddr))) {
2550 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2551 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2552 			if (!dev_out)
2553 				goto out;
2554 
2555 			/* Special hack: the user can direct multicasts
2556 			   and limited broadcast via the necessary interface
2557 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2558 			   This hack is not just for fun, it allows
2559 			   vic, vat and friends to work.
2560 			   They bind a socket to loopback, set the ttl to zero
2561 			   and expect that it will work.
2562 			   From the viewpoint of the routing cache they are broken,
2563 			   because we are not allowed to build a multicast path
2564 			   with a loopback source addr (look, the routing cache
2565 			   cannot know that the ttl is zero, so that the packet
2566 			   will not leave this host and the route is valid).
2567 			   Luckily, this hack is a good workaround.
2568 			 */
2569 
2570 			fl4->flowi4_oif = dev_out->ifindex;
2571 			goto make_route;
2572 		}
2573 
2574 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2575 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2576 			if (!__ip_dev_find(net, fl4->saddr, false))
2577 				goto out;
2578 		}
2579 	}
2580 
2581 
2582 	if (fl4->flowi4_oif) {
2583 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2584 		rth = ERR_PTR(-ENODEV);
2585 		if (!dev_out)
2586 			goto out;
2587 
2588 		/* RACE: Check return value of inet_select_addr instead. */
2589 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2590 			rth = ERR_PTR(-ENETUNREACH);
2591 			goto out;
2592 		}
2593 		if (ipv4_is_local_multicast(fl4->daddr) ||
2594 		    ipv4_is_lbcast(fl4->daddr) ||
2595 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2596 			if (!fl4->saddr)
2597 				fl4->saddr = inet_select_addr(dev_out, 0,
2598 							      RT_SCOPE_LINK);
2599 			goto make_route;
2600 		}
2601 		if (!fl4->saddr) {
2602 			if (ipv4_is_multicast(fl4->daddr))
2603 				fl4->saddr = inet_select_addr(dev_out, 0,
2604 							      fl4->flowi4_scope);
2605 			else if (!fl4->daddr)
2606 				fl4->saddr = inet_select_addr(dev_out, 0,
2607 							      RT_SCOPE_HOST);
2608 		}
2609 	}
2610 
2611 	if (!fl4->daddr) {
2612 		fl4->daddr = fl4->saddr;
2613 		if (!fl4->daddr)
2614 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2615 		dev_out = net->loopback_dev;
2616 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2617 		res->type = RTN_LOCAL;
2618 		flags |= RTCF_LOCAL;
2619 		goto make_route;
2620 	}
2621 
2622 	err = fib_lookup(net, fl4, res, 0);
2623 	if (err) {
2624 		res->fi = NULL;
2625 		res->table = NULL;
2626 		if (fl4->flowi4_oif &&
2627 		    (ipv4_is_multicast(fl4->daddr) ||
2628 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2629 			/* Apparently, the routing tables are wrong. Assume
2630 			   that the destination is on-link.
2631 
2632 			   WHY? DW.
2633 			   Because we are allowed to send to an iface
2634 			   even if it has NO routes and NO assigned
2635 			   addresses. When oif is specified, the routing
2636 			   tables are looked up with only one purpose:
2637 			   to catch whether the destination is gatewayed, rather
2638 			   than direct. Moreover, if MSG_DONTROUTE is set,
2639 			   we send the packet, ignoring both routing tables
2640 			   and ifaddr state. --ANK
2641 
2642 
2643 			   We could do this even if oif is unknown,
2644 			   likely as IPv6 does, but we do not.
2645 			 */
2646 
2647 			if (fl4->saddr == 0)
2648 				fl4->saddr = inet_select_addr(dev_out, 0,
2649 							      RT_SCOPE_LINK);
2650 			res->type = RTN_UNICAST;
2651 			goto make_route;
2652 		}
2653 		rth = ERR_PTR(err);
2654 		goto out;
2655 	}
2656 
2657 	if (res->type == RTN_LOCAL) {
2658 		if (!fl4->saddr) {
2659 			if (res->fi->fib_prefsrc)
2660 				fl4->saddr = res->fi->fib_prefsrc;
2661 			else
2662 				fl4->saddr = fl4->daddr;
2663 		}
2664 
2665 		/* L3 master device is the loopback for that domain */
2666 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2667 			net->loopback_dev;
2668 
2669 		/* make sure orig_oif points to fib result device even
2670 		 * though packet rx/tx happens over loopback or l3mdev
2671 		 */
2672 		orig_oif = FIB_RES_OIF(*res);
2673 
2674 		fl4->flowi4_oif = dev_out->ifindex;
2675 		flags |= RTCF_LOCAL;
2676 		goto make_route;
2677 	}
2678 
2679 	fib_select_path(net, res, fl4, skb);
2680 
2681 	dev_out = FIB_RES_DEV(*res);
2682 
2683 make_route:
2684 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2685 
2686 out:
2687 	return rth;
2688 }
2689 
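/* Blackhole dst_ops and ipv4_blackhole_route() below: clone the identifying
 * fields of an existing route into a dst whose input/output handlers simply
 * discard packets.  The xfrm code is the usual consumer, e.g. while IPsec
 * states are still being resolved, though other callers are possible.
 */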
2690 static struct dst_ops ipv4_dst_blackhole_ops = {
2691 	.family			= AF_INET,
2692 	.default_advmss		= ipv4_default_advmss,
2693 	.neigh_lookup		= ipv4_neigh_lookup,
2694 	.check			= dst_blackhole_check,
2695 	.cow_metrics		= dst_blackhole_cow_metrics,
2696 	.update_pmtu		= dst_blackhole_update_pmtu,
2697 	.redirect		= dst_blackhole_redirect,
2698 	.mtu			= dst_blackhole_mtu,
2699 };
2700 
2701 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2702 {
2703 	struct rtable *ort = (struct rtable *) dst_orig;
2704 	struct rtable *rt;
2705 
2706 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2707 	if (rt) {
2708 		struct dst_entry *new = &rt->dst;
2709 
2710 		new->__use = 1;
2711 		new->input = dst_discard;
2712 		new->output = dst_discard_out;
2713 
2714 		new->dev = net->loopback_dev;
2715 		if (new->dev)
2716 			dev_hold(new->dev);
2717 
2718 		rt->rt_is_input = ort->rt_is_input;
2719 		rt->rt_iif = ort->rt_iif;
2720 		rt->rt_pmtu = ort->rt_pmtu;
2721 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2722 
2723 		rt->rt_genid = rt_genid_ipv4(net);
2724 		rt->rt_flags = ort->rt_flags;
2725 		rt->rt_type = ort->rt_type;
2726 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2727 		rt->rt_gw_family = ort->rt_gw_family;
2728 		if (rt->rt_gw_family == AF_INET)
2729 			rt->rt_gw4 = ort->rt_gw4;
2730 		else if (rt->rt_gw_family == AF_INET6)
2731 			rt->rt_gw6 = ort->rt_gw6;
2732 
2733 		INIT_LIST_HEAD(&rt->rt_uncached);
2734 	}
2735 
2736 	dst_release(dst_orig);
2737 
2738 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2739 }
2740 
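/* Resolve an output route and, when a protocol is given, pass the result
 * through xfrm_lookup_route() so transformation policy is applied to the
 * returned dst.
 */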
2741 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2742 				    const struct sock *sk)
2743 {
2744 	struct rtable *rt = __ip_route_output_key(net, flp4);
2745 
2746 	if (IS_ERR(rt))
2747 		return rt;
2748 
2749 	if (flp4->flowi4_proto) {
2750 		flp4->flowi4_oif = rt->dst.dev->ifindex;
2751 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2752 							flowi4_to_flowi(flp4),
2753 							sk, 0);
2754 	}
2755 
2756 	return rt;
2757 }
2758 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2759 
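/* Resolve an output route for tunnel traffic described by @info, optionally
 * consulting the tunnel's dst_cache first, and reject routes that would loop
 * back through the tunnel device itself.  The chosen source address is
 * returned through @saddr.
 */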
2760 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2761 				      struct net_device *dev,
2762 				      struct net *net, __be32 *saddr,
2763 				      const struct ip_tunnel_info *info,
2764 				      u8 protocol, bool use_cache)
2765 {
2766 #ifdef CONFIG_DST_CACHE
2767 	struct dst_cache *dst_cache;
2768 #endif
2769 	struct rtable *rt = NULL;
2770 	struct flowi4 fl4;
2771 	__u8 tos;
2772 
2773 #ifdef CONFIG_DST_CACHE
2774 	dst_cache = (struct dst_cache *)&info->dst_cache;
2775 	if (use_cache) {
2776 		rt = dst_cache_get_ip4(dst_cache, saddr);
2777 		if (rt)
2778 			return rt;
2779 	}
2780 #endif
2781 	memset(&fl4, 0, sizeof(fl4));
2782 	fl4.flowi4_mark = skb->mark;
2783 	fl4.flowi4_proto = protocol;
2784 	fl4.daddr = info->key.u.ipv4.dst;
2785 	fl4.saddr = info->key.u.ipv4.src;
2786 	tos = info->key.tos;
2787 	fl4.flowi4_tos = RT_TOS(tos);
2788 
2789 	rt = ip_route_output_key(net, &fl4);
2790 	if (IS_ERR(rt)) {
2791 		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2792 		return ERR_PTR(-ENETUNREACH);
2793 	}
2794 	if (rt->dst.dev == dev) { /* is this necessary? */
2795 		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2796 		ip_rt_put(rt);
2797 		return ERR_PTR(-ELOOP);
2798 	}
2799 #ifdef CONFIG_DST_CACHE
2800 	if (use_cache)
2801 		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2802 #endif
2803 	*saddr = fl4.saddr;
2804 	return rt;
2805 }
2806 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2807 
2808 /* called with rcu_read_lock held */
2809 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2810 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2811 			struct sk_buff *skb, u32 portid, u32 seq,
2812 			unsigned int flags)
2813 {
2814 	struct rtmsg *r;
2815 	struct nlmsghdr *nlh;
2816 	unsigned long expires = 0;
2817 	u32 error;
2818 	u32 metrics[RTAX_MAX];
2819 
2820 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2821 	if (!nlh)
2822 		return -EMSGSIZE;
2823 
2824 	r = nlmsg_data(nlh);
2825 	r->rtm_family	 = AF_INET;
2826 	r->rtm_dst_len	= 32;
2827 	r->rtm_src_len	= 0;
2828 	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2829 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2830 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2831 		goto nla_put_failure;
2832 	r->rtm_type	= rt->rt_type;
2833 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2834 	r->rtm_protocol = RTPROT_UNSPEC;
2835 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2836 	if (rt->rt_flags & RTCF_NOTIFY)
2837 		r->rtm_flags |= RTM_F_NOTIFY;
2838 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2839 		r->rtm_flags |= RTCF_DOREDIRECT;
2840 
2841 	if (nla_put_in_addr(skb, RTA_DST, dst))
2842 		goto nla_put_failure;
2843 	if (src) {
2844 		r->rtm_src_len = 32;
2845 		if (nla_put_in_addr(skb, RTA_SRC, src))
2846 			goto nla_put_failure;
2847 	}
2848 	if (rt->dst.dev &&
2849 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2850 		goto nla_put_failure;
2851 	if (rt->dst.lwtstate &&
2852 	    lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
2853 		goto nla_put_failure;
2854 #ifdef CONFIG_IP_ROUTE_CLASSID
2855 	if (rt->dst.tclassid &&
2856 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2857 		goto nla_put_failure;
2858 #endif
2859 	if (fl4 && !rt_is_input_route(rt) &&
2860 	    fl4->saddr != src) {
2861 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2862 			goto nla_put_failure;
2863 	}
2864 	if (rt->rt_uses_gateway) {
2865 		if (rt->rt_gw_family == AF_INET &&
2866 		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2867 			goto nla_put_failure;
2868 		} else if (rt->rt_gw_family == AF_INET6) {
2869 			int alen = sizeof(struct in6_addr);
2870 			struct nlattr *nla;
2871 			struct rtvia *via;
2872 
2873 			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2874 			if (!nla)
2875 				goto nla_put_failure;
2876 
2877 			via = nla_data(nla);
2878 			via->rtvia_family = AF_INET6;
2879 			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2880 		}
2881 	}
2882 
2883 	expires = rt->dst.expires;
2884 	if (expires) {
2885 		unsigned long now = jiffies;
2886 
2887 		if (time_before(now, expires))
2888 			expires -= now;
2889 		else
2890 			expires = 0;
2891 	}
2892 
2893 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2894 	if (rt->rt_pmtu && expires)
2895 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2896 	if (rt->rt_mtu_locked && expires)
2897 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2898 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2899 		goto nla_put_failure;
2900 
2901 	if (fl4) {
2902 		if (fl4->flowi4_mark &&
2903 		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2904 			goto nla_put_failure;
2905 
2906 		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2907 		    nla_put_u32(skb, RTA_UID,
2908 				from_kuid_munged(current_user_ns(),
2909 						 fl4->flowi4_uid)))
2910 			goto nla_put_failure;
2911 
2912 		if (rt_is_input_route(rt)) {
2913 #ifdef CONFIG_IP_MROUTE
2914 			if (ipv4_is_multicast(dst) &&
2915 			    !ipv4_is_local_multicast(dst) &&
2916 			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2917 				int err = ipmr_get_route(net, skb,
2918 							 fl4->saddr, fl4->daddr,
2919 							 r, portid);
2920 
2921 				if (err <= 0) {
2922 					if (err == 0)
2923 						return 0;
2924 					goto nla_put_failure;
2925 				}
2926 			} else
2927 #endif
2928 				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2929 					goto nla_put_failure;
2930 		}
2931 	}
2932 
2933 	error = rt->dst.error;
2934 
2935 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2936 		goto nla_put_failure;
2937 
2938 	nlmsg_end(skb, nlh);
2939 	return 0;
2940 
2941 nla_put_failure:
2942 	nlmsg_cancel(skb, nlh);
2943 	return -EMSGSIZE;
2944 }
2945 
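/* Walk one nexthop-exception hash bucket and dump every exception that is
 * still current (matching genid, not expired) and has a cached route,
 * honouring the dump offset in @fa_index/@fa_start.
 */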
2946 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2947 			    struct netlink_callback *cb, u32 table_id,
2948 			    struct fnhe_hash_bucket *bucket, int genid,
2949 			    int *fa_index, int fa_start, unsigned int flags)
2950 {
2951 	int i;
2952 
2953 	for (i = 0; i < FNHE_HASH_SIZE; i++) {
2954 		struct fib_nh_exception *fnhe;
2955 
2956 		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2957 		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2958 			struct rtable *rt;
2959 			int err;
2960 
2961 			if (*fa_index < fa_start)
2962 				goto next;
2963 
2964 			if (fnhe->fnhe_genid != genid)
2965 				goto next;
2966 
2967 			if (fnhe->fnhe_expires &&
2968 			    time_after(jiffies, fnhe->fnhe_expires))
2969 				goto next;
2970 
2971 			rt = rcu_dereference(fnhe->fnhe_rth_input);
2972 			if (!rt)
2973 				rt = rcu_dereference(fnhe->fnhe_rth_output);
2974 			if (!rt)
2975 				goto next;
2976 
2977 			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2978 					   table_id, NULL, skb,
2979 					   NETLINK_CB(cb->skb).portid,
2980 					   cb->nlh->nlmsg_seq, flags);
2981 			if (err)
2982 				return err;
2983 next:
2984 			(*fa_index)++;
2985 		}
2986 	}
2987 
2988 	return 0;
2989 }
2990 
2991 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2992 		       u32 table_id, struct fib_info *fi,
2993 		       int *fa_index, int fa_start, unsigned int flags)
2994 {
2995 	struct net *net = sock_net(cb->skb->sk);
2996 	int nhsel, genid = fnhe_genid(net);
2997 
2998 	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
2999 		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3000 		struct fnhe_hash_bucket *bucket;
3001 		int err;
3002 
3003 		if (nhc->nhc_flags & RTNH_F_DEAD)
3004 			continue;
3005 
3006 		rcu_read_lock();
3007 		bucket = rcu_dereference(nhc->nhc_exceptions);
3008 		err = 0;
3009 		if (bucket)
3010 			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3011 					       genid, fa_index, fa_start,
3012 					       flags);
3013 		rcu_read_unlock();
3014 		if (err)
3015 			return err;
3016 	}
3017 
3018 	return 0;
3019 }
3020 
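/* Build a dummy skb carrying minimal IP and transport headers so that an
 * RTM_GETROUTE request can be pushed through the real input path as if it
 * were a received packet.
 */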
3021 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3022 						   u8 ip_proto, __be16 sport,
3023 						   __be16 dport)
3024 {
3025 	struct sk_buff *skb;
3026 	struct iphdr *iph;
3027 
3028 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3029 	if (!skb)
3030 		return NULL;
3031 
3032 	/* Reserve room for dummy headers; this skb can pass
3033 	 * through a good chunk of the routing engine.
3034 	 */
3035 	skb_reset_mac_header(skb);
3036 	skb_reset_network_header(skb);
3037 	skb->protocol = htons(ETH_P_IP);
3038 	iph = skb_put(skb, sizeof(struct iphdr));
3039 	iph->protocol = ip_proto;
3040 	iph->saddr = src;
3041 	iph->daddr = dst;
3042 	iph->version = 0x4;
3043 	iph->frag_off = 0;
3044 	iph->ihl = 0x5;
3045 	skb_set_transport_header(skb, skb->len);
3046 
3047 	switch (iph->protocol) {
3048 	case IPPROTO_UDP: {
3049 		struct udphdr *udph;
3050 
3051 		udph = skb_put_zero(skb, sizeof(struct udphdr));
3052 		udph->source = sport;
3053 		udph->dest = dport;
3054 		udph->len = sizeof(struct udphdr);
3055 		udph->check = 0;
3056 		break;
3057 	}
3058 	case IPPROTO_TCP: {
3059 		struct tcphdr *tcph;
3060 
3061 		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3062 		tcph->source	= sport;
3063 		tcph->dest	= dport;
3064 		tcph->doff	= sizeof(struct tcphdr) / 4;
3065 		tcph->rst = 1;
3066 		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3067 					    src, dst, 0);
3068 		break;
3069 	}
3070 	case IPPROTO_ICMP: {
3071 		struct icmphdr *icmph;
3072 
3073 		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3074 		icmph->type = ICMP_ECHO;
3075 		icmph->code = 0;
3076 	}
3077 	}
3078 
3079 	return skb;
3080 }
3081 
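/* Strictly validate an RTM_GETROUTE request: header length, prefix lengths,
 * flags and the set of allowed attributes.  Sockets that have not opted in
 * to strict checking fall back to the plain deprecated parse.
 */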
3082 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3083 				       const struct nlmsghdr *nlh,
3084 				       struct nlattr **tb,
3085 				       struct netlink_ext_ack *extack)
3086 {
3087 	struct rtmsg *rtm;
3088 	int i, err;
3089 
3090 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3091 		NL_SET_ERR_MSG(extack,
3092 			       "ipv4: Invalid header for route get request");
3093 		return -EINVAL;
3094 	}
3095 
3096 	if (!netlink_strict_get_check(skb))
3097 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3098 					      rtm_ipv4_policy, extack);
3099 
3100 	rtm = nlmsg_data(nlh);
3101 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3102 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3103 	    rtm->rtm_table || rtm->rtm_protocol ||
3104 	    rtm->rtm_scope || rtm->rtm_type) {
3105 		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3106 		return -EINVAL;
3107 	}
3108 
3109 	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3110 			       RTM_F_LOOKUP_TABLE |
3111 			       RTM_F_FIB_MATCH)) {
3112 		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3113 		return -EINVAL;
3114 	}
3115 
3116 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3117 					    rtm_ipv4_policy, extack);
3118 	if (err)
3119 		return err;
3120 
3121 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3122 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3123 		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3124 		return -EINVAL;
3125 	}
3126 
3127 	for (i = 0; i <= RTA_MAX; i++) {
3128 		if (!tb[i])
3129 			continue;
3130 
3131 		switch (i) {
3132 		case RTA_IIF:
3133 		case RTA_OIF:
3134 		case RTA_SRC:
3135 		case RTA_DST:
3136 		case RTA_IP_PROTO:
3137 		case RTA_SPORT:
3138 		case RTA_DPORT:
3139 		case RTA_MARK:
3140 		case RTA_UID:
3141 			break;
3142 		default:
3143 			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3144 			return -EINVAL;
3145 		}
3146 	}
3147 
3148 	return 0;
3149 }
3150 
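/* RTM_GETROUTE handler: performs an input route lookup when RTA_IIF is
 * given, otherwise an output lookup, and replies with either the matched
 * FIB entry (RTM_F_FIB_MATCH) or the resulting cloned route.
 */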
3151 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3152 			     struct netlink_ext_ack *extack)
3153 {
3154 	struct net *net = sock_net(in_skb->sk);
3155 	struct nlattr *tb[RTA_MAX+1];
3156 	u32 table_id = RT_TABLE_MAIN;
3157 	__be16 sport = 0, dport = 0;
3158 	struct fib_result res = {};
3159 	u8 ip_proto = IPPROTO_UDP;
3160 	struct rtable *rt = NULL;
3161 	struct sk_buff *skb;
3162 	struct rtmsg *rtm;
3163 	struct flowi4 fl4 = {};
3164 	__be32 dst = 0;
3165 	__be32 src = 0;
3166 	kuid_t uid;
3167 	u32 iif;
3168 	int err;
3169 	int mark;
3170 
3171 	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3172 	if (err < 0)
3173 		return err;
3174 
3175 	rtm = nlmsg_data(nlh);
3176 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3177 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3178 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3179 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3180 	if (tb[RTA_UID])
3181 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3182 	else
3183 		uid = (iif ? INVALID_UID : current_uid());
3184 
3185 	if (tb[RTA_IP_PROTO]) {
3186 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3187 						  &ip_proto, AF_INET, extack);
3188 		if (err)
3189 			return err;
3190 	}
3191 
3192 	if (tb[RTA_SPORT])
3193 		sport = nla_get_be16(tb[RTA_SPORT]);
3194 
3195 	if (tb[RTA_DPORT])
3196 		dport = nla_get_be16(tb[RTA_DPORT]);
3197 
3198 	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3199 	if (!skb)
3200 		return -ENOBUFS;
3201 
3202 	fl4.daddr = dst;
3203 	fl4.saddr = src;
3204 	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3205 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3206 	fl4.flowi4_mark = mark;
3207 	fl4.flowi4_uid = uid;
3208 	if (sport)
3209 		fl4.fl4_sport = sport;
3210 	if (dport)
3211 		fl4.fl4_dport = dport;
3212 	fl4.flowi4_proto = ip_proto;
3213 
3214 	rcu_read_lock();
3215 
3216 	if (iif) {
3217 		struct net_device *dev;
3218 
3219 		dev = dev_get_by_index_rcu(net, iif);
3220 		if (!dev) {
3221 			err = -ENODEV;
3222 			goto errout_rcu;
3223 		}
3224 
3225 		fl4.flowi4_iif = iif; /* for rt_fill_info */
3226 		skb->dev	= dev;
3227 		skb->mark	= mark;
3228 		err = ip_route_input_rcu(skb, dst, src,
3229 					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3230 					 &res);
3231 
3232 		rt = skb_rtable(skb);
3233 		if (err == 0 && rt->dst.error)
3234 			err = -rt->dst.error;
3235 	} else {
3236 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3237 		skb->dev = net->loopback_dev;
3238 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3239 		err = 0;
3240 		if (IS_ERR(rt))
3241 			err = PTR_ERR(rt);
3242 		else
3243 			skb_dst_set(skb, &rt->dst);
3244 	}
3245 
3246 	if (err)
3247 		goto errout_rcu;
3248 
3249 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3250 		rt->rt_flags |= RTCF_NOTIFY;
3251 
3252 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3253 		table_id = res.table ? res.table->tb_id : 0;
3254 
3255 	/* reset skb for netlink reply msg */
3256 	skb_trim(skb, 0);
3257 	skb_reset_network_header(skb);
3258 	skb_reset_transport_header(skb);
3259 	skb_reset_mac_header(skb);
3260 
3261 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3262 		struct fib_rt_info fri;
3263 
3264 		if (!res.fi) {
3265 			err = fib_props[res.type].error;
3266 			if (!err)
3267 				err = -EHOSTUNREACH;
3268 			goto errout_rcu;
3269 		}
3270 		fri.fi = res.fi;
3271 		fri.tb_id = table_id;
3272 		fri.dst = res.prefix;
3273 		fri.dst_len = res.prefixlen;
3274 		fri.tos = fl4.flowi4_tos;
3275 		fri.type = rt->rt_type;
3276 		fri.offload = 0;
3277 		fri.trap = 0;
3278 		fri.offload_failed = 0;
3279 		if (res.fa_head) {
3280 			struct fib_alias *fa;
3281 
3282 			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3283 				u8 slen = 32 - fri.dst_len;
3284 
3285 				if (fa->fa_slen == slen &&
3286 				    fa->tb_id == fri.tb_id &&
3287 				    fa->fa_tos == fri.tos &&
3288 				    fa->fa_info == res.fi &&
3289 				    fa->fa_type == fri.type) {
3290 					fri.offload = fa->offload;
3291 					fri.trap = fa->trap;
3292 					break;
3293 				}
3294 			}
3295 		}
3296 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3297 				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3298 	} else {
3299 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3300 				   NETLINK_CB(in_skb).portid,
3301 				   nlh->nlmsg_seq, 0);
3302 	}
3303 	if (err < 0)
3304 		goto errout_rcu;
3305 
3306 	rcu_read_unlock();
3307 
3308 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3309 
3310 errout_free:
3311 	return err;
3312 errout_rcu:
3313 	rcu_read_unlock();
3314 	kfree_skb(skb);
3315 	goto errout_free;
3316 }
3317 
3318 void ip_rt_multicast_event(struct in_device *in_dev)
3319 {
3320 	rt_cache_flush(dev_net(in_dev->dev));
3321 }
3322 
3323 #ifdef CONFIG_SYSCTL
3324 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3325 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3326 static int ip_rt_gc_elasticity __read_mostly	= 8;
3327 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3328 
3329 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3330 		void *buffer, size_t *lenp, loff_t *ppos)
3331 {
3332 	struct net *net = (struct net *)__ctl->extra1;
3333 
3334 	if (write) {
3335 		rt_cache_flush(net);
3336 		fnhe_genid_bump(net);
3337 		return 0;
3338 	}
3339 
3340 	return -EINVAL;
3341 }
3342 
3343 static struct ctl_table ipv4_route_table[] = {
3344 	{
3345 		.procname	= "gc_thresh",
3346 		.data		= &ipv4_dst_ops.gc_thresh,
3347 		.maxlen		= sizeof(int),
3348 		.mode		= 0644,
3349 		.proc_handler	= proc_dointvec,
3350 	},
3351 	{
3352 		.procname	= "max_size",
3353 		.data		= &ip_rt_max_size,
3354 		.maxlen		= sizeof(int),
3355 		.mode		= 0644,
3356 		.proc_handler	= proc_dointvec,
3357 	},
3358 	{
3359 		/*  Deprecated. Use gc_min_interval_ms */
3360 
3361 		.procname	= "gc_min_interval",
3362 		.data		= &ip_rt_gc_min_interval,
3363 		.maxlen		= sizeof(int),
3364 		.mode		= 0644,
3365 		.proc_handler	= proc_dointvec_jiffies,
3366 	},
3367 	{
3368 		.procname	= "gc_min_interval_ms",
3369 		.data		= &ip_rt_gc_min_interval,
3370 		.maxlen		= sizeof(int),
3371 		.mode		= 0644,
3372 		.proc_handler	= proc_dointvec_ms_jiffies,
3373 	},
3374 	{
3375 		.procname	= "gc_timeout",
3376 		.data		= &ip_rt_gc_timeout,
3377 		.maxlen		= sizeof(int),
3378 		.mode		= 0644,
3379 		.proc_handler	= proc_dointvec_jiffies,
3380 	},
3381 	{
3382 		.procname	= "gc_interval",
3383 		.data		= &ip_rt_gc_interval,
3384 		.maxlen		= sizeof(int),
3385 		.mode		= 0644,
3386 		.proc_handler	= proc_dointvec_jiffies,
3387 	},
3388 	{
3389 		.procname	= "redirect_load",
3390 		.data		= &ip_rt_redirect_load,
3391 		.maxlen		= sizeof(int),
3392 		.mode		= 0644,
3393 		.proc_handler	= proc_dointvec,
3394 	},
3395 	{
3396 		.procname	= "redirect_number",
3397 		.data		= &ip_rt_redirect_number,
3398 		.maxlen		= sizeof(int),
3399 		.mode		= 0644,
3400 		.proc_handler	= proc_dointvec,
3401 	},
3402 	{
3403 		.procname	= "redirect_silence",
3404 		.data		= &ip_rt_redirect_silence,
3405 		.maxlen		= sizeof(int),
3406 		.mode		= 0644,
3407 		.proc_handler	= proc_dointvec,
3408 	},
3409 	{
3410 		.procname	= "error_cost",
3411 		.data		= &ip_rt_error_cost,
3412 		.maxlen		= sizeof(int),
3413 		.mode		= 0644,
3414 		.proc_handler	= proc_dointvec,
3415 	},
3416 	{
3417 		.procname	= "error_burst",
3418 		.data		= &ip_rt_error_burst,
3419 		.maxlen		= sizeof(int),
3420 		.mode		= 0644,
3421 		.proc_handler	= proc_dointvec,
3422 	},
3423 	{
3424 		.procname	= "gc_elasticity",
3425 		.data		= &ip_rt_gc_elasticity,
3426 		.maxlen		= sizeof(int),
3427 		.mode		= 0644,
3428 		.proc_handler	= proc_dointvec,
3429 	},
3430 	{
3431 		.procname	= "mtu_expires",
3432 		.data		= &ip_rt_mtu_expires,
3433 		.maxlen		= sizeof(int),
3434 		.mode		= 0644,
3435 		.proc_handler	= proc_dointvec_jiffies,
3436 	},
3437 	{
3438 		.procname	= "min_pmtu",
3439 		.data		= &ip_rt_min_pmtu,
3440 		.maxlen		= sizeof(int),
3441 		.mode		= 0644,
3442 		.proc_handler	= proc_dointvec_minmax,
3443 		.extra1		= &ip_min_valid_pmtu,
3444 	},
3445 	{
3446 		.procname	= "min_adv_mss",
3447 		.data		= &ip_rt_min_advmss,
3448 		.maxlen		= sizeof(int),
3449 		.mode		= 0644,
3450 		.proc_handler	= proc_dointvec,
3451 	},
3452 	{ }
3453 };
3454 
3455 static const char ipv4_route_flush_procname[] = "flush";
3456 
3457 static struct ctl_table ipv4_route_flush_table[] = {
3458 	{
3459 		.procname	= ipv4_route_flush_procname,
3460 		.maxlen		= sizeof(int),
3461 		.mode		= 0200,
3462 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3463 	},
3464 	{ },
3465 };
3466 
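/* Register the per-namespace "flush" sysctl.  Non-init namespaces get their
 * own copy of the table, and namespaces owned by an unprivileged user ns
 * keep only the whitelisted "flush" entry.
 */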
3467 static __net_init int sysctl_route_net_init(struct net *net)
3468 {
3469 	struct ctl_table *tbl;
3470 
3471 	tbl = ipv4_route_flush_table;
3472 	if (!net_eq(net, &init_net)) {
3473 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3474 		if (!tbl)
3475 			goto err_dup;
3476 
3477 		/* Don't export non-whitelisted sysctls to unprivileged users */
3478 		if (net->user_ns != &init_user_ns) {
3479 			if (tbl[0].procname != ipv4_route_flush_procname)
3480 				tbl[0].procname = NULL;
3481 		}
3482 	}
3483 	tbl[0].extra1 = net;
3484 
3485 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3486 	if (!net->ipv4.route_hdr)
3487 		goto err_reg;
3488 	return 0;
3489 
3490 err_reg:
3491 	if (tbl != ipv4_route_flush_table)
3492 		kfree(tbl);
3493 err_dup:
3494 	return -ENOMEM;
3495 }
3496 
3497 static __net_exit void sysctl_route_net_exit(struct net *net)
3498 {
3499 	struct ctl_table *tbl;
3500 
3501 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3502 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3503 	BUG_ON(tbl == ipv4_route_flush_table);
3504 	kfree(tbl);
3505 }
3506 
3507 static __net_initdata struct pernet_operations sysctl_route_ops = {
3508 	.init = sysctl_route_net_init,
3509 	.exit = sysctl_route_net_exit,
3510 };
3511 #endif
3512 
3513 static __net_init int rt_genid_init(struct net *net)
3514 {
3515 	atomic_set(&net->ipv4.rt_genid, 0);
3516 	atomic_set(&net->fnhe_genid, 0);
3517 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3518 	return 0;
3519 }
3520 
3521 static __net_initdata struct pernet_operations rt_genid_ops = {
3522 	.init = rt_genid_init,
3523 };
3524 
3525 static int __net_init ipv4_inetpeer_init(struct net *net)
3526 {
3527 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3528 
3529 	if (!bp)
3530 		return -ENOMEM;
3531 	inet_peer_base_init(bp);
3532 	net->ipv4.peers = bp;
3533 	return 0;
3534 }
3535 
3536 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3537 {
3538 	struct inet_peer_base *bp = net->ipv4.peers;
3539 
3540 	net->ipv4.peers = NULL;
3541 	inetpeer_invalidate_tree(bp);
3542 	kfree(bp);
3543 }
3544 
3545 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3546 	.init	=	ipv4_inetpeer_init,
3547 	.exit	=	ipv4_inetpeer_exit,
3548 };
3549 
3550 #ifdef CONFIG_IP_ROUTE_CLASSID
3551 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3552 #endif /* CONFIG_IP_ROUTE_CLASSID */
3553 
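/* Boot-time initialization of the routing subsystem: allocate the IP ident
 * and timestamp arrays, the per-cpu uncached route lists and the dst slab
 * cache, then wire up proc files, the RTM_GETROUTE handler and the
 * per-netns sysctl/genid/inetpeer operations.
 */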
3554 int __init ip_rt_init(void)
3555 {
3556 	int cpu;
3557 
3558 	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3559 				  GFP_KERNEL);
3560 	if (!ip_idents)
3561 		panic("IP: failed to allocate ip_idents\n");
3562 
3563 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3564 
3565 	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3566 	if (!ip_tstamps)
3567 		panic("IP: failed to allocate ip_tstamps\n");
3568 
3569 	for_each_possible_cpu(cpu) {
3570 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3571 
3572 		INIT_LIST_HEAD(&ul->head);
3573 		spin_lock_init(&ul->lock);
3574 	}
3575 #ifdef CONFIG_IP_ROUTE_CLASSID
3576 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3577 	if (!ip_rt_acct)
3578 		panic("IP: failed to allocate ip_rt_acct\n");
3579 #endif
3580 
3581 	ipv4_dst_ops.kmem_cachep =
3582 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3583 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3584 
3585 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3586 
3587 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3588 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3589 
3590 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3591 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3592 
3593 	ipv4_dst_ops.gc_thresh = ~0;
3594 	ip_rt_max_size = INT_MAX;
3595 
3596 	devinet_init();
3597 	ip_fib_init();
3598 
3599 	if (ip_rt_proc_init())
3600 		pr_err("Unable to create route proc files\n");
3601 #ifdef CONFIG_XFRM
3602 	xfrm_init();
3603 	xfrm4_init();
3604 #endif
3605 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3606 		      RTNL_FLAG_DOIT_UNLOCKED);
3607 
3608 #ifdef CONFIG_SYSCTL
3609 	register_pernet_subsys(&sysctl_route_ops);
3610 #endif
3611 	register_pernet_subsys(&rt_genid_ops);
3612 	register_pernet_subsys(&ipv4_inetpeer_ops);
3613 	return 0;
3614 }
3615 
3616 #ifdef CONFIG_SYSCTL
3617 /*
3618  * We really need to sanitize the damn ipv4 init order, then all
3619  * this nonsense will go away.
3620  */
3621 void __init ip_static_sysctl_init(void)
3622 {
3623 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3624 }
3625 #endif
3626