xref: /openbmc/linux/net/ipv4/route.c (revision 50df3be7)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		ROUTE - implementation of the IP router.
8  *
9  * Authors:	Ross Biro
10  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *		Alan Cox	:	Verify area fixes.
17  *		Alan Cox	:	cli() protects routing changes
18  *		Rui Oliveira	:	ICMP routing table updates
19  *		(rco@di.uminho.pt)	Routing table insertion and update
20  *		Linus Torvalds	:	Rewrote bits to be sensible
21  *		Alan Cox	:	Added BSD route gw semantics
22  *		Alan Cox	:	Super /proc >4K
23  *		Alan Cox	:	MTU in route table
24  *		Alan Cox	: 	MSS actually. Also added the window
25  *					clamper.
26  *		Sam Lantinga	:	Fixed route matching in rt_del()
27  *		Alan Cox	:	Routing cache support.
28  *		Alan Cox	:	Removed compatibility cruft.
29  *		Alan Cox	:	RTF_REJECT support.
30  *		Alan Cox	:	TCP irtt support.
31  *		Jonathan Naylor	:	Added Metric support.
32  *	Miquel van Smoorenburg	:	BSD API fixes.
33  *	Miquel van Smoorenburg	:	Metrics.
34  *		Alan Cox	:	Use __u32 properly
35  *		Alan Cox	:	Aligned routing errors more closely with BSD,
36  *					though our system is still very different.
37  *		Alan Cox	:	Faster /proc handling
38  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39  *					routing caches and better behaviour.
40  *
41  *		Olaf Erb	:	irtt wasn't being copied right.
42  *		Bjorn Ekwall	:	Kerneld route support.
43  *		Alan Cox	:	Multicast fixed (I hope)
44  * 		Pavel Krauz	:	Limited broadcast fixed
45  *		Mike McLagan	:	Routing by source
46  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47  *					route.c and rewritten from scratch.
48  *		Andi Kleen	:	Load-limit warning messages.
49  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53  *		Marc Boucher	:	routing by fwmark
54  *	Robert Olsson		:	Added rt_cache statistics
55  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
59  */
60 
61 #define pr_fmt(fmt) "IPv4: " fmt
62 
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/string.h>
70 #include <linux/socket.h>
71 #include <linux/sockios.h>
72 #include <linux/errno.h>
73 #include <linux/in.h>
74 #include <linux/inet.h>
75 #include <linux/netdevice.h>
76 #include <linux/proc_fs.h>
77 #include <linux/init.h>
78 #include <linux/skbuff.h>
79 #include <linux/inetdevice.h>
80 #include <linux/igmp.h>
81 #include <linux/pkt_sched.h>
82 #include <linux/mroute.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/random.h>
85 #include <linux/rcupdate.h>
86 #include <linux/times.h>
87 #include <linux/slab.h>
88 #include <linux/jhash.h>
89 #include <net/dst.h>
90 #include <net/dst_metadata.h>
91 #include <net/net_namespace.h>
92 #include <net/protocol.h>
93 #include <net/ip.h>
94 #include <net/route.h>
95 #include <net/inetpeer.h>
96 #include <net/sock.h>
97 #include <net/ip_fib.h>
98 #include <net/nexthop.h>
99 #include <net/arp.h>
100 #include <net/tcp.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #include <net/lwtunnel.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 #include <net/secure_seq.h>
110 #include <net/ip_tunnels.h>
111 #include <net/l3mdev.h>
112 
113 #include "fib_lookup.h"
114 
115 #define RT_FL_TOS(oldflp4) \
116 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117 
118 #define RT_GC_TIMEOUT (300*HZ)
119 
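/* Default values for the tunables below; most of them can be changed at
 * run time through the net.ipv4.route.* sysctls registered later in this
 * file (e.g. redirect_load, mtu_expires, min_pmtu).
 */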
120 static int ip_rt_max_size;
121 static int ip_rt_redirect_number __read_mostly	= 9;
122 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
123 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
124 static int ip_rt_error_cost __read_mostly	= HZ;
125 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
126 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
127 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
128 static int ip_rt_min_advmss __read_mostly	= 256;
129 
130 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
131 
132 /*
133  *	Interface to generic destination cache.
134  */
135 
136 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
137 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
138 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
139 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
140 static void		 ipv4_link_failure(struct sk_buff *skb);
141 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
142 					   struct sk_buff *skb, u32 mtu);
143 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
144 					struct sk_buff *skb);
145 static void		ipv4_dst_destroy(struct dst_entry *dst);
146 
147 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
148 {
149 	WARN_ON(1);
150 	return NULL;
151 }
152 
153 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
154 					   struct sk_buff *skb,
155 					   const void *daddr);
156 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
157 
158 static struct dst_ops ipv4_dst_ops = {
159 	.family =		AF_INET,
160 	.check =		ipv4_dst_check,
161 	.default_advmss =	ipv4_default_advmss,
162 	.mtu =			ipv4_mtu,
163 	.cow_metrics =		ipv4_cow_metrics,
164 	.destroy =		ipv4_dst_destroy,
165 	.negative_advice =	ipv4_negative_advice,
166 	.link_failure =		ipv4_link_failure,
167 	.update_pmtu =		ip_rt_update_pmtu,
168 	.redirect =		ip_do_redirect,
169 	.local_out =		__ip_local_out,
170 	.neigh_lookup =		ipv4_neigh_lookup,
171 	.confirm_neigh =	ipv4_confirm_neigh,
172 };
173 
174 #define ECN_OR_COST(class)	TC_PRIO_##class
175 
176 const __u8 ip_tos2prio[16] = {
177 	TC_PRIO_BESTEFFORT,
178 	ECN_OR_COST(BESTEFFORT),
179 	TC_PRIO_BESTEFFORT,
180 	ECN_OR_COST(BESTEFFORT),
181 	TC_PRIO_BULK,
182 	ECN_OR_COST(BULK),
183 	TC_PRIO_BULK,
184 	ECN_OR_COST(BULK),
185 	TC_PRIO_INTERACTIVE,
186 	ECN_OR_COST(INTERACTIVE),
187 	TC_PRIO_INTERACTIVE,
188 	ECN_OR_COST(INTERACTIVE),
189 	TC_PRIO_INTERACTIVE_BULK,
190 	ECN_OR_COST(INTERACTIVE_BULK),
191 	TC_PRIO_INTERACTIVE_BULK,
192 	ECN_OR_COST(INTERACTIVE_BULK)
193 };
194 EXPORT_SYMBOL(ip_tos2prio);
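/* rt_tos2priority() in <net/route.h> maps an IPv4 TOS byte to one of the
 * priorities above by indexing ip_tos2prio[IPTOS_TOS(tos) >> 1], e.g.
 * IPTOS_LOWDELAY (0x10) selects TC_PRIO_INTERACTIVE.
 */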
195 
196 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
197 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
198 
199 #ifdef CONFIG_PROC_FS
200 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
201 {
202 	if (*pos)
203 		return NULL;
204 	return SEQ_START_TOKEN;
205 }
206 
207 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
208 {
209 	++*pos;
210 	return NULL;
211 }
212 
213 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
214 {
215 }
216 
217 static int rt_cache_seq_show(struct seq_file *seq, void *v)
218 {
219 	if (v == SEQ_START_TOKEN)
220 		seq_printf(seq, "%-127s\n",
221 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
222 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
223 			   "HHUptod\tSpecDst");
224 	return 0;
225 }
226 
227 static const struct seq_operations rt_cache_seq_ops = {
228 	.start  = rt_cache_seq_start,
229 	.next   = rt_cache_seq_next,
230 	.stop   = rt_cache_seq_stop,
231 	.show   = rt_cache_seq_show,
232 };
233 
234 static int rt_cache_seq_open(struct inode *inode, struct file *file)
235 {
236 	return seq_open(file, &rt_cache_seq_ops);
237 }
238 
239 static const struct file_operations rt_cache_seq_fops = {
240 	.open	 = rt_cache_seq_open,
241 	.read	 = seq_read,
242 	.llseek	 = seq_lseek,
243 	.release = seq_release,
244 };
245 
246 
247 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
248 {
249 	int cpu;
250 
251 	if (*pos == 0)
252 		return SEQ_START_TOKEN;
253 
254 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
255 		if (!cpu_possible(cpu))
256 			continue;
257 		*pos = cpu+1;
258 		return &per_cpu(rt_cache_stat, cpu);
259 	}
260 	return NULL;
261 }
262 
263 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
264 {
265 	int cpu;
266 
267 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
268 		if (!cpu_possible(cpu))
269 			continue;
270 		*pos = cpu+1;
271 		return &per_cpu(rt_cache_stat, cpu);
272 	}
273 	return NULL;
274 
275 }
276 
277 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
278 {
279 
280 }
281 
282 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
283 {
284 	struct rt_cache_stat *st = v;
285 
286 	if (v == SEQ_START_TOKEN) {
287 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
288 		return 0;
289 	}
290 
291 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
292 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
293 		   dst_entries_get_slow(&ipv4_dst_ops),
294 		   0, /* st->in_hit */
295 		   st->in_slow_tot,
296 		   st->in_slow_mc,
297 		   st->in_no_route,
298 		   st->in_brd,
299 		   st->in_martian_dst,
300 		   st->in_martian_src,
301 
302 		   0, /* st->out_hit */
303 		   st->out_slow_tot,
304 		   st->out_slow_mc,
305 
306 		   0, /* st->gc_total */
307 		   0, /* st->gc_ignored */
308 		   0, /* st->gc_goal_miss */
309 		   0, /* st->gc_dst_overflow */
310 		   0, /* st->in_hlist_search */
311 		   0  /* st->out_hlist_search */
312 		);
313 	return 0;
314 }
315 
316 static const struct seq_operations rt_cpu_seq_ops = {
317 	.start  = rt_cpu_seq_start,
318 	.next   = rt_cpu_seq_next,
319 	.stop   = rt_cpu_seq_stop,
320 	.show   = rt_cpu_seq_show,
321 };
322 
323 
324 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
325 {
326 	return seq_open(file, &rt_cpu_seq_ops);
327 }
328 
329 static const struct file_operations rt_cpu_seq_fops = {
330 	.open	 = rt_cpu_seq_open,
331 	.read	 = seq_read,
332 	.llseek	 = seq_lseek,
333 	.release = seq_release,
334 };
335 
336 #ifdef CONFIG_IP_ROUTE_CLASSID
337 static int rt_acct_proc_show(struct seq_file *m, void *v)
338 {
339 	struct ip_rt_acct *dst, *src;
340 	unsigned int i, j;
341 
342 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
343 	if (!dst)
344 		return -ENOMEM;
345 
346 	for_each_possible_cpu(i) {
347 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
348 		for (j = 0; j < 256; j++) {
349 			dst[j].o_bytes   += src[j].o_bytes;
350 			dst[j].o_packets += src[j].o_packets;
351 			dst[j].i_bytes   += src[j].i_bytes;
352 			dst[j].i_packets += src[j].i_packets;
353 		}
354 	}
355 
356 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
357 	kfree(dst);
358 	return 0;
359 }
360 #endif
361 
362 static int __net_init ip_rt_do_proc_init(struct net *net)
363 {
364 	struct proc_dir_entry *pde;
365 
366 	pde = proc_create("rt_cache", 0444, net->proc_net,
367 			  &rt_cache_seq_fops);
368 	if (!pde)
369 		goto err1;
370 
371 	pde = proc_create("rt_cache", 0444,
372 			  net->proc_net_stat, &rt_cpu_seq_fops);
373 	if (!pde)
374 		goto err2;
375 
376 #ifdef CONFIG_IP_ROUTE_CLASSID
377 	pde = proc_create_single("rt_acct", 0, net->proc_net,
378 			rt_acct_proc_show);
379 	if (!pde)
380 		goto err3;
381 #endif
382 	return 0;
383 
384 #ifdef CONFIG_IP_ROUTE_CLASSID
385 err3:
386 	remove_proc_entry("rt_cache", net->proc_net_stat);
387 #endif
388 err2:
389 	remove_proc_entry("rt_cache", net->proc_net);
390 err1:
391 	return -ENOMEM;
392 }
393 
394 static void __net_exit ip_rt_do_proc_exit(struct net *net)
395 {
396 	remove_proc_entry("rt_cache", net->proc_net_stat);
397 	remove_proc_entry("rt_cache", net->proc_net);
398 #ifdef CONFIG_IP_ROUTE_CLASSID
399 	remove_proc_entry("rt_acct", net->proc_net);
400 #endif
401 }
402 
403 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
404 	.init = ip_rt_do_proc_init,
405 	.exit = ip_rt_do_proc_exit,
406 };
407 
408 static int __init ip_rt_proc_init(void)
409 {
410 	return register_pernet_subsys(&ip_rt_proc_ops);
411 }
412 
413 #else
414 static inline int ip_rt_proc_init(void)
415 {
416 	return 0;
417 }
418 #endif /* CONFIG_PROC_FS */
419 
420 static inline bool rt_is_expired(const struct rtable *rth)
421 {
422 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
423 }
424 
425 void rt_cache_flush(struct net *net)
426 {
427 	rt_genid_bump_ipv4(net);
428 }
429 
430 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
431 					   struct sk_buff *skb,
432 					   const void *daddr)
433 {
434 	const struct rtable *rt = container_of(dst, struct rtable, dst);
435 	struct net_device *dev = dst->dev;
436 	struct neighbour *n;
437 
438 	rcu_read_lock_bh();
439 
440 	if (likely(rt->rt_gw_family == AF_INET)) {
441 		n = ip_neigh_gw4(dev, rt->rt_gw4);
442 	} else if (rt->rt_gw_family == AF_INET6) {
443 		n = ip_neigh_gw6(dev, &rt->rt_gw6);
444 	} else {
445 		__be32 pkey;
446 
447 		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
448 		n = ip_neigh_gw4(dev, pkey);
449 	}
450 
451 	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
452 		n = NULL;
453 
454 	rcu_read_unlock_bh();
455 
456 	return n;
457 }
458 
459 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
460 {
461 	const struct rtable *rt = container_of(dst, struct rtable, dst);
462 	struct net_device *dev = dst->dev;
463 	const __be32 *pkey = daddr;
464 
465 	if (rt->rt_gw_family == AF_INET) {
466 		pkey = (const __be32 *)&rt->rt_gw4;
467 	} else if (rt->rt_gw_family == AF_INET6) {
468 		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
469 	} else if (!daddr ||
470 		 (rt->rt_flags &
471 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
472 		return;
473 	}
474 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
475 }
476 
477 #define IP_IDENTS_SZ 2048u
478 
479 static atomic_t *ip_idents __read_mostly;
480 static u32 *ip_tstamps __read_mostly;
481 
482 /* In order to protect privacy, we add a perturbation to identifiers
483  * if one generator is seldom used. This makes it hard for an attacker
484  * to infer how many packets were sent between two points in time.
485  */
486 u32 ip_idents_reserve(u32 hash, int segs)
487 {
488 	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
489 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
490 	u32 old = READ_ONCE(*p_tstamp);
491 	u32 now = (u32)jiffies;
492 	u32 new, delta = 0;
493 
494 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
495 		delta = prandom_u32_max(now - old);
496 
497 	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
498 	do {
499 		old = (u32)atomic_read(p_id);
500 		new = old + delta + segs;
501 	} while (atomic_cmpxchg(p_id, old, new) != old);
502 
503 	return new - segs;
504 }
505 EXPORT_SYMBOL(ip_idents_reserve);
506 
507 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
508 {
509 	u32 hash, id;
510 
511 	/* The lazy key init below is racy, but that is okay: both writers store random bytes. */
512 	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
513 		get_random_bytes(&net->ipv4.ip_id_key,
514 				 sizeof(net->ipv4.ip_id_key));
515 
516 	hash = siphash_3u32((__force u32)iph->daddr,
517 			    (__force u32)iph->saddr,
518 			    iph->protocol,
519 			    &net->ipv4.ip_id_key);
520 	id = ip_idents_reserve(hash, segs);
521 	iph->id = htons(id);
522 }
523 EXPORT_SYMBOL(__ip_select_ident);
524 
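/* Fill @fl4 from an IPv4 header.  When a socket is given, its bound device,
 * mark, connection TOS and protocol take precedence over the per-packet
 * values passed in by the caller.
 */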
525 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
526 			     const struct sock *sk,
527 			     const struct iphdr *iph,
528 			     int oif, u8 tos,
529 			     u8 prot, u32 mark, int flow_flags)
530 {
531 	if (sk) {
532 		const struct inet_sock *inet = inet_sk(sk);
533 
534 		oif = sk->sk_bound_dev_if;
535 		mark = sk->sk_mark;
536 		tos = RT_CONN_FLAGS(sk);
537 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
538 	}
539 	flowi4_init_output(fl4, oif, mark, tos,
540 			   RT_SCOPE_UNIVERSE, prot,
541 			   flow_flags,
542 			   iph->daddr, iph->saddr, 0, 0,
543 			   sock_net_uid(net, sk));
544 }
545 
546 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
547 			       const struct sock *sk)
548 {
549 	const struct net *net = dev_net(skb->dev);
550 	const struct iphdr *iph = ip_hdr(skb);
551 	int oif = skb->dev->ifindex;
552 	u8 tos = RT_TOS(iph->tos);
553 	u8 prot = iph->protocol;
554 	u32 mark = skb->mark;
555 
556 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
557 }
558 
559 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
560 {
561 	const struct inet_sock *inet = inet_sk(sk);
562 	const struct ip_options_rcu *inet_opt;
563 	__be32 daddr = inet->inet_daddr;
564 
565 	rcu_read_lock();
566 	inet_opt = rcu_dereference(inet->inet_opt);
567 	if (inet_opt && inet_opt->opt.srr)
568 		daddr = inet_opt->opt.faddr;
569 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
570 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
571 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
572 			   inet_sk_flowi_flags(sk),
573 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
574 	rcu_read_unlock();
575 }
576 
577 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
578 				 const struct sk_buff *skb)
579 {
580 	if (skb)
581 		build_skb_flow_key(fl4, skb, sk);
582 	else
583 		build_sk_flow_key(fl4, sk);
584 }
585 
586 static DEFINE_SPINLOCK(fnhe_lock);
587 
588 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
589 {
590 	struct rtable *rt;
591 
592 	rt = rcu_dereference(fnhe->fnhe_rth_input);
593 	if (rt) {
594 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
595 		dst_dev_put(&rt->dst);
596 		dst_release(&rt->dst);
597 	}
598 	rt = rcu_dereference(fnhe->fnhe_rth_output);
599 	if (rt) {
600 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
601 		dst_dev_put(&rt->dst);
602 		dst_release(&rt->dst);
603 	}
604 }
605 
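/* Return the least recently updated exception in a full hash bucket after
 * dropping its cached routes; the caller recycles the entry in place.
 */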
606 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
607 {
608 	struct fib_nh_exception *fnhe, *oldest;
609 
610 	oldest = rcu_dereference(hash->chain);
611 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
612 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
613 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
614 			oldest = fnhe;
615 	}
616 	fnhe_flush_routes(oldest);
617 	return oldest;
618 }
619 
620 static inline u32 fnhe_hashfun(__be32 daddr)
621 {
622 	static u32 fnhe_hashrnd __read_mostly;
623 	u32 hval;
624 
625 	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
626 	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
627 	return hash_32(hval, FNHE_HASH_SHIFT);
628 }
629 
630 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
631 {
632 	rt->rt_pmtu = fnhe->fnhe_pmtu;
633 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
634 	rt->dst.expires = fnhe->fnhe_expires;
635 
636 	if (fnhe->fnhe_gw) {
637 		rt->rt_flags |= RTCF_REDIRECTED;
638 		rt->rt_uses_gateway = 1;
639 		rt->rt_gw_family = AF_INET;
640 		rt->rt_gw4 = fnhe->fnhe_gw;
641 	}
642 }
643 
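/* Record or refresh a per-destination nexthop exception (a redirected
 * gateway and/or a learned PMTU).  Exceptions live in a small hash keyed by
 * @daddr; when a chain grows past FNHE_RECLAIM_DEPTH the oldest entry is
 * recycled.  When a new exception is created, cached routes hanging off the
 * nexthop are marked DST_OBSOLETE_KILL so their users re-validate; an
 * existing exception just updates its cached routes in place.  Runs under
 * fnhe_lock.
 */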
644 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
645 				  __be32 gw, u32 pmtu, bool lock,
646 				  unsigned long expires)
647 {
648 	struct fnhe_hash_bucket *hash;
649 	struct fib_nh_exception *fnhe;
650 	struct rtable *rt;
651 	u32 genid, hval;
652 	unsigned int i;
653 	int depth;
654 
655 	genid = fnhe_genid(dev_net(nhc->nhc_dev));
656 	hval = fnhe_hashfun(daddr);
657 
658 	spin_lock_bh(&fnhe_lock);
659 
660 	hash = rcu_dereference(nhc->nhc_exceptions);
661 	if (!hash) {
662 		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
663 		if (!hash)
664 			goto out_unlock;
665 		rcu_assign_pointer(nhc->nhc_exceptions, hash);
666 	}
667 
668 	hash += hval;
669 
670 	depth = 0;
671 	for (fnhe = rcu_dereference(hash->chain); fnhe;
672 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
673 		if (fnhe->fnhe_daddr == daddr)
674 			break;
675 		depth++;
676 	}
677 
678 	if (fnhe) {
679 		if (fnhe->fnhe_genid != genid)
680 			fnhe->fnhe_genid = genid;
681 		if (gw)
682 			fnhe->fnhe_gw = gw;
683 		if (pmtu) {
684 			fnhe->fnhe_pmtu = pmtu;
685 			fnhe->fnhe_mtu_locked = lock;
686 		}
687 		fnhe->fnhe_expires = max(1UL, expires);
688 		/* Update all cached dsts too */
689 		rt = rcu_dereference(fnhe->fnhe_rth_input);
690 		if (rt)
691 			fill_route_from_fnhe(rt, fnhe);
692 		rt = rcu_dereference(fnhe->fnhe_rth_output);
693 		if (rt)
694 			fill_route_from_fnhe(rt, fnhe);
695 	} else {
696 		if (depth > FNHE_RECLAIM_DEPTH)
697 			fnhe = fnhe_oldest(hash);
698 		else {
699 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
700 			if (!fnhe)
701 				goto out_unlock;
702 
703 			fnhe->fnhe_next = hash->chain;
704 			rcu_assign_pointer(hash->chain, fnhe);
705 		}
706 		fnhe->fnhe_genid = genid;
707 		fnhe->fnhe_daddr = daddr;
708 		fnhe->fnhe_gw = gw;
709 		fnhe->fnhe_pmtu = pmtu;
710 		fnhe->fnhe_mtu_locked = lock;
711 		fnhe->fnhe_expires = max(1UL, expires);
712 
713 		/* Exception created; mark the cached routes for the nexthop
714 		 * stale, so anyone caching it rechecks if this exception
715 		 * applies to them.
716 		 */
717 		rt = rcu_dereference(nhc->nhc_rth_input);
718 		if (rt)
719 			rt->dst.obsolete = DST_OBSOLETE_KILL;
720 
721 		for_each_possible_cpu(i) {
722 			struct rtable __rcu **prt;
723 			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
724 			rt = rcu_dereference(*prt);
725 			if (rt)
726 				rt->dst.obsolete = DST_OBSOLETE_KILL;
727 		}
728 	}
729 
730 	fnhe->fnhe_stamp = jiffies;
731 
732 out_unlock:
733 	spin_unlock_bh(&fnhe_lock);
734 }
735 
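/* Handle an ICMP redirect for @rt.  The advertised gateway is sanity
 * checked (not multicast/broadcast/zeronet, on-link or at least a unicast
 * address, and resolvable as a neighbour); if it passes, the redirect is
 * recorded as a nexthop exception rather than by rewriting the FIB.
 * Rejected redirects are logged when route verbosity is enabled.
 */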
736 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
737 			     bool kill_route)
738 {
739 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
740 	__be32 old_gw = ip_hdr(skb)->saddr;
741 	struct net_device *dev = skb->dev;
742 	struct in_device *in_dev;
743 	struct fib_result res;
744 	struct neighbour *n;
745 	struct net *net;
746 
747 	switch (icmp_hdr(skb)->code & 7) {
748 	case ICMP_REDIR_NET:
749 	case ICMP_REDIR_NETTOS:
750 	case ICMP_REDIR_HOST:
751 	case ICMP_REDIR_HOSTTOS:
752 		break;
753 
754 	default:
755 		return;
756 	}
757 
758 	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
759 		return;
760 
761 	in_dev = __in_dev_get_rcu(dev);
762 	if (!in_dev)
763 		return;
764 
765 	net = dev_net(dev);
766 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
767 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
768 	    ipv4_is_zeronet(new_gw))
769 		goto reject_redirect;
770 
771 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
772 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
773 			goto reject_redirect;
774 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
775 			goto reject_redirect;
776 	} else {
777 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
778 			goto reject_redirect;
779 	}
780 
781 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
782 	if (!n)
783 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
784 	if (!IS_ERR(n)) {
785 		if (!(n->nud_state & NUD_VALID)) {
786 			neigh_event_send(n, NULL);
787 		} else {
788 			if (fib_lookup(net, fl4, &res, 0) == 0) {
789 				struct fib_nh_common *nhc = FIB_RES_NHC(res);
790 
791 				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
792 						0, false,
793 						jiffies + ip_rt_gc_timeout);
794 			}
795 			if (kill_route)
796 				rt->dst.obsolete = DST_OBSOLETE_KILL;
797 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
798 		}
799 		neigh_release(n);
800 	}
801 	return;
802 
803 reject_redirect:
804 #ifdef CONFIG_IP_ROUTE_VERBOSE
805 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
806 		const struct iphdr *iph = (const struct iphdr *) skb->data;
807 		__be32 daddr = iph->daddr;
808 		__be32 saddr = iph->saddr;
809 
810 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
811 				     "  Advised path = %pI4 -> %pI4\n",
812 				     &old_gw, dev->name, &new_gw,
813 				     &saddr, &daddr);
814 	}
815 #endif
816 	;
817 }
818 
819 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
820 {
821 	struct rtable *rt;
822 	struct flowi4 fl4;
823 	const struct iphdr *iph = (const struct iphdr *) skb->data;
824 	struct net *net = dev_net(skb->dev);
825 	int oif = skb->dev->ifindex;
826 	u8 tos = RT_TOS(iph->tos);
827 	u8 prot = iph->protocol;
828 	u32 mark = skb->mark;
829 
830 	rt = (struct rtable *) dst;
831 
832 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
833 	__ip_do_redirect(rt, skb, &fl4, true);
834 }
835 
836 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
837 {
838 	struct rtable *rt = (struct rtable *)dst;
839 	struct dst_entry *ret = dst;
840 
841 	if (rt) {
842 		if (dst->obsolete > 0) {
843 			ip_rt_put(rt);
844 			ret = NULL;
845 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
846 			   rt->dst.expires) {
847 			ip_rt_put(rt);
848 			ret = NULL;
849 		}
850 	}
851 	return ret;
852 }
853 
854 /*
855  * Algorithm:
856  *	1. The first ip_rt_redirect_number redirects are sent
857  *	   with exponential backoff, after which we stop sending them
858  *	   altogether, assuming that the host ignores our redirects.
859  *	2. If we did not see packets requiring redirects
860  *	   during ip_rt_redirect_silence, we assume that the host
861  *	   has forgotten the redirected route and start sending redirects again.
862  *
863  * This algorithm is much cheaper and more intelligent than dumb load limiting
864  * in icmp.c.
865  *
866  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
867  * and "frag. need" (breaks PMTU discovery) in icmp.c.
868  */
869 
870 void ip_rt_send_redirect(struct sk_buff *skb)
871 {
872 	struct rtable *rt = skb_rtable(skb);
873 	struct in_device *in_dev;
874 	struct inet_peer *peer;
875 	struct net *net;
876 	int log_martians;
877 	int vif;
878 
879 	rcu_read_lock();
880 	in_dev = __in_dev_get_rcu(rt->dst.dev);
881 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
882 		rcu_read_unlock();
883 		return;
884 	}
885 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
886 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
887 	rcu_read_unlock();
888 
889 	net = dev_net(rt->dst.dev);
890 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
891 	if (!peer) {
892 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
893 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
894 		return;
895 	}
896 
897 	/* No redirected packets during ip_rt_redirect_silence;
898 	 * reset the algorithm.
899 	 */
900 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
901 		peer->rate_tokens = 0;
902 		peer->n_redirects = 0;
903 	}
904 
905 	/* Too many ignored redirects; do not send anything.
906 	 * Set peer->rate_last to the time of the last seen redirected packet.
907 	 */
908 	if (peer->n_redirects >= ip_rt_redirect_number) {
909 		peer->rate_last = jiffies;
910 		goto out_put_peer;
911 	}
912 
913 	/* Check for load limit; set rate_last to the latest sent
914 	 * redirect.
915 	 */
916 	if (peer->rate_tokens == 0 ||
917 	    time_after(jiffies,
918 		       (peer->rate_last +
919 			(ip_rt_redirect_load << peer->rate_tokens)))) {
920 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
921 
922 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
923 		peer->rate_last = jiffies;
924 		++peer->rate_tokens;
925 		++peer->n_redirects;
926 #ifdef CONFIG_IP_ROUTE_VERBOSE
927 		if (log_martians &&
928 		    peer->rate_tokens == ip_rt_redirect_number)
929 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
930 					     &ip_hdr(skb)->saddr, inet_iif(skb),
931 					     &ip_hdr(skb)->daddr, &gw);
932 #endif
933 	}
934 out_put_peer:
935 	inet_putpeer(peer);
936 }
937 
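/* Input handler for error routes: map dst.error onto an ICMP destination
 * unreachable code and send it, rate limited per source address via the
 * inetpeer token bucket.  The skb is always consumed.
 */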
938 static int ip_error(struct sk_buff *skb)
939 {
940 	struct rtable *rt = skb_rtable(skb);
941 	struct net_device *dev = skb->dev;
942 	struct in_device *in_dev;
943 	struct inet_peer *peer;
944 	unsigned long now;
945 	struct net *net;
946 	bool send;
947 	int code;
948 
949 	if (netif_is_l3_master(skb->dev)) {
950 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
951 		if (!dev)
952 			goto out;
953 	}
954 
955 	in_dev = __in_dev_get_rcu(dev);
956 
957 	/* IP on this device is disabled. */
958 	if (!in_dev)
959 		goto out;
960 
961 	net = dev_net(rt->dst.dev);
962 	if (!IN_DEV_FORWARD(in_dev)) {
963 		switch (rt->dst.error) {
964 		case EHOSTUNREACH:
965 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
966 			break;
967 
968 		case ENETUNREACH:
969 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
970 			break;
971 		}
972 		goto out;
973 	}
974 
975 	switch (rt->dst.error) {
976 	case EINVAL:
977 	default:
978 		goto out;
979 	case EHOSTUNREACH:
980 		code = ICMP_HOST_UNREACH;
981 		break;
982 	case ENETUNREACH:
983 		code = ICMP_NET_UNREACH;
984 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
985 		break;
986 	case EACCES:
987 		code = ICMP_PKT_FILTERED;
988 		break;
989 	}
990 
991 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
992 			       l3mdev_master_ifindex(skb->dev), 1);
993 
994 	send = true;
995 	if (peer) {
996 		now = jiffies;
997 		peer->rate_tokens += now - peer->rate_last;
998 		if (peer->rate_tokens > ip_rt_error_burst)
999 			peer->rate_tokens = ip_rt_error_burst;
1000 		peer->rate_last = now;
1001 		if (peer->rate_tokens >= ip_rt_error_cost)
1002 			peer->rate_tokens -= ip_rt_error_cost;
1003 		else
1004 			send = false;
1005 		inet_putpeer(peer);
1006 	}
1007 	if (send)
1008 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1009 
1010 out:	kfree_skb(skb);
1011 	return 0;
1012 }
1013 
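/* Record a freshly learned path MTU as a nexthop exception.  Values below
 * ip_rt_min_pmtu are not applied as-is: the MTU is clamped and the entry is
 * locked, so a forged "fragmentation needed" cannot push the PMTU towards
 * zero.  The exception times out after ip_rt_mtu_expires.
 */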
1014 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1015 {
1016 	struct dst_entry *dst = &rt->dst;
1017 	u32 old_mtu = ipv4_mtu(dst);
1018 	struct fib_result res;
1019 	bool lock = false;
1020 
1021 	if (ip_mtu_locked(dst))
1022 		return;
1023 
1024 	if (old_mtu < mtu)
1025 		return;
1026 
1027 	if (mtu < ip_rt_min_pmtu) {
1028 		lock = true;
1029 		mtu = min(old_mtu, ip_rt_min_pmtu);
1030 	}
1031 
1032 	if (rt->rt_pmtu == mtu && !lock &&
1033 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1034 		return;
1035 
1036 	rcu_read_lock();
1037 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1038 		struct fib_nh_common *nhc = FIB_RES_NHC(res);
1039 
1040 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1041 				      jiffies + ip_rt_mtu_expires);
1042 	}
1043 	rcu_read_unlock();
1044 }
1045 
1046 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1047 			      struct sk_buff *skb, u32 mtu)
1048 {
1049 	struct rtable *rt = (struct rtable *) dst;
1050 	struct flowi4 fl4;
1051 
1052 	ip_rt_build_flow_key(&fl4, sk, skb);
1053 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1054 }
1055 
1056 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1057 		      int oif, u8 protocol)
1058 {
1059 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1060 	struct flowi4 fl4;
1061 	struct rtable *rt;
1062 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1063 
1064 	__build_flow_key(net, &fl4, NULL, iph, oif,
1065 			 RT_TOS(iph->tos), protocol, mark, 0);
1066 	rt = __ip_route_output_key(net, &fl4);
1067 	if (!IS_ERR(rt)) {
1068 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1069 		ip_rt_put(rt);
1070 	}
1071 }
1072 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1073 
1074 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1075 {
1076 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1077 	struct flowi4 fl4;
1078 	struct rtable *rt;
1079 
1080 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1081 
1082 	if (!fl4.flowi4_mark)
1083 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1084 
1085 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1086 	if (!IS_ERR(rt)) {
1087 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1088 		ip_rt_put(rt);
1089 	}
1090 }
1091 
1092 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1093 {
1094 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1095 	struct flowi4 fl4;
1096 	struct rtable *rt;
1097 	struct dst_entry *odst = NULL;
1098 	bool new = false;
1099 	struct net *net = sock_net(sk);
1100 
1101 	bh_lock_sock(sk);
1102 
1103 	if (!ip_sk_accept_pmtu(sk))
1104 		goto out;
1105 
1106 	odst = sk_dst_get(sk);
1107 
1108 	if (sock_owned_by_user(sk) || !odst) {
1109 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1110 		goto out;
1111 	}
1112 
1113 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1114 
1115 	rt = (struct rtable *)odst;
1116 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1117 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1118 		if (IS_ERR(rt))
1119 			goto out;
1120 
1121 		new = true;
1122 	}
1123 
1124 	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1125 
1126 	if (!dst_check(&rt->dst, 0)) {
1127 		if (new)
1128 			dst_release(&rt->dst);
1129 
1130 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1131 		if (IS_ERR(rt))
1132 			goto out;
1133 
1134 		new = true;
1135 	}
1136 
1137 	if (new)
1138 		sk_dst_set(sk, &rt->dst);
1139 
1140 out:
1141 	bh_unlock_sock(sk);
1142 	dst_release(odst);
1143 }
1144 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1145 
1146 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1147 		   int oif, u8 protocol)
1148 {
1149 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1150 	struct flowi4 fl4;
1151 	struct rtable *rt;
1152 
1153 	__build_flow_key(net, &fl4, NULL, iph, oif,
1154 			 RT_TOS(iph->tos), protocol, 0, 0);
1155 	rt = __ip_route_output_key(net, &fl4);
1156 	if (!IS_ERR(rt)) {
1157 		__ip_do_redirect(rt, skb, &fl4, false);
1158 		ip_rt_put(rt);
1159 	}
1160 }
1161 EXPORT_SYMBOL_GPL(ipv4_redirect);
1162 
1163 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1164 {
1165 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1166 	struct flowi4 fl4;
1167 	struct rtable *rt;
1168 	struct net *net = sock_net(sk);
1169 
1170 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1171 	rt = __ip_route_output_key(net, &fl4);
1172 	if (!IS_ERR(rt)) {
1173 		__ip_do_redirect(rt, skb, &fl4, false);
1174 		ip_rt_put(rt);
1175 	}
1176 }
1177 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1178 
1179 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1180 {
1181 	struct rtable *rt = (struct rtable *) dst;
1182 
1183 	/* All IPV4 dsts are created with ->obsolete set to the value
1184 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1185 	 * into this function always.
1186 	 *
1187 	 * When a PMTU/redirect information update invalidates a route,
1188 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1189 	 * DST_OBSOLETE_DEAD.
1190 	 */
1191 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1192 		return NULL;
1193 	return dst;
1194 }
1195 
1196 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1197 {
1198 	struct ip_options opt;
1199 	int res;
1200 
1201 	/* Recompile ip options since IPCB may not be valid anymore.
1202 	 * Also check we have a reasonable ipv4 header.
1203 	 */
1204 	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1205 	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1206 		return;
1207 
1208 	memset(&opt, 0, sizeof(opt));
1209 	if (ip_hdr(skb)->ihl > 5) {
1210 		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1211 			return;
1212 		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1213 
1214 		rcu_read_lock();
1215 		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1216 		rcu_read_unlock();
1217 
1218 		if (res)
1219 			return;
1220 	}
1221 	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1222 }
1223 
1224 static void ipv4_link_failure(struct sk_buff *skb)
1225 {
1226 	struct rtable *rt;
1227 
1228 	ipv4_send_dest_unreach(skb);
1229 
1230 	rt = skb_rtable(skb);
1231 	if (rt)
1232 		dst_set_expires(&rt->dst, 0);
1233 }
1234 
1235 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1236 {
1237 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1238 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1239 		 skb->dev ? skb->dev->name : "?");
1240 	kfree_skb(skb);
1241 	WARN_ON(1);
1242 	return 0;
1243 }
1244 
1245 /*
1246    We do not cache the source address of the outgoing interface,
1247    because it is used only by the IP RR, TS and SRR options,
1248    so it is out of the fast path.
1249 
1250    BTW remember: "addr" is allowed to be unaligned
1251    in IP options!
1252  */
1253 
1254 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1255 {
1256 	__be32 src;
1257 
1258 	if (rt_is_output_route(rt))
1259 		src = ip_hdr(skb)->saddr;
1260 	else {
1261 		struct fib_result res;
1262 		struct iphdr *iph = ip_hdr(skb);
1263 		struct flowi4 fl4 = {
1264 			.daddr = iph->daddr,
1265 			.saddr = iph->saddr,
1266 			.flowi4_tos = RT_TOS(iph->tos),
1267 			.flowi4_oif = rt->dst.dev->ifindex,
1268 			.flowi4_iif = skb->dev->ifindex,
1269 			.flowi4_mark = skb->mark,
1270 		};
1271 
1272 		rcu_read_lock();
1273 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1274 			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1275 		else
1276 			src = inet_select_addr(rt->dst.dev,
1277 					       rt_nexthop(rt, iph->daddr),
1278 					       RT_SCOPE_UNIVERSE);
1279 		rcu_read_unlock();
1280 	}
1281 	memcpy(addr, &src, 4);
1282 }
1283 
1284 #ifdef CONFIG_IP_ROUTE_CLASSID
1285 static void set_class_tag(struct rtable *rt, u32 tag)
1286 {
1287 	if (!(rt->dst.tclassid & 0xFFFF))
1288 		rt->dst.tclassid |= tag & 0xFFFF;
1289 	if (!(rt->dst.tclassid & 0xFFFF0000))
1290 		rt->dst.tclassid |= tag & 0xFFFF0000;
1291 }
1292 #endif
1293 
1294 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1295 {
1296 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1297 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1298 				    ip_rt_min_advmss);
1299 
1300 	return min(advmss, IPV4_MAX_PMTU - header_size);
1301 }
1302 
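/* dst_ops->mtu hook: prefer a still valid per-route PMTU, then an explicit
 * RTAX_MTU metric, then the device MTU (clamped to 576 for locked routes
 * that go via a gateway), minus any lightweight tunnel headroom.
 */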
1303 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1304 {
1305 	const struct rtable *rt = (const struct rtable *) dst;
1306 	unsigned int mtu = rt->rt_pmtu;
1307 
1308 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1309 		mtu = dst_metric_raw(dst, RTAX_MTU);
1310 
1311 	if (mtu)
1312 		return mtu;
1313 
1314 	mtu = READ_ONCE(dst->dev->mtu);
1315 
1316 	if (unlikely(ip_mtu_locked(dst))) {
1317 		if (rt->rt_uses_gateway && mtu > 576)
1318 			mtu = 576;
1319 	}
1320 
1321 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1322 
1323 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1324 }
1325 
1326 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1327 {
1328 	struct fnhe_hash_bucket *hash;
1329 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1330 	u32 hval = fnhe_hashfun(daddr);
1331 
1332 	spin_lock_bh(&fnhe_lock);
1333 
1334 	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1335 					 lockdep_is_held(&fnhe_lock));
1336 	hash += hval;
1337 
1338 	fnhe_p = &hash->chain;
1339 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1340 	while (fnhe) {
1341 		if (fnhe->fnhe_daddr == daddr) {
1342 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1343 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1344 			/* set fnhe_daddr to 0 to ensure it won't bind with
1345 			 * new dsts in rt_bind_exception().
1346 			 */
1347 			fnhe->fnhe_daddr = 0;
1348 			fnhe_flush_routes(fnhe);
1349 			kfree_rcu(fnhe, rcu);
1350 			break;
1351 		}
1352 		fnhe_p = &fnhe->fnhe_next;
1353 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1354 						 lockdep_is_held(&fnhe_lock));
1355 	}
1356 
1357 	spin_unlock_bh(&fnhe_lock);
1358 }
1359 
1360 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1361 					       __be32 daddr)
1362 {
1363 	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1364 	struct fib_nh_exception *fnhe;
1365 	u32 hval;
1366 
1367 	if (!hash)
1368 		return NULL;
1369 
1370 	hval = fnhe_hashfun(daddr);
1371 
1372 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1373 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1374 		if (fnhe->fnhe_daddr == daddr) {
1375 			if (fnhe->fnhe_expires &&
1376 			    time_after(jiffies, fnhe->fnhe_expires)) {
1377 				ip_del_fnhe(nhc, daddr);
1378 				break;
1379 			}
1380 			return fnhe;
1381 		}
1382 	}
1383 	return NULL;
1384 }
1385 
1386 /* MTU selection:
1387  * 1. mtu on route is locked - use it
1388  * 2. mtu from nexthop exception
1389  * 3. mtu from egress device
1390  */
1391 
1392 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1393 {
1394 	struct fib_nh_common *nhc = res->nhc;
1395 	struct net_device *dev = nhc->nhc_dev;
1396 	struct fib_info *fi = res->fi;
1397 	u32 mtu = 0;
1398 
1399 	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1400 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1401 		mtu = fi->fib_mtu;
1402 
1403 	if (likely(!mtu)) {
1404 		struct fib_nh_exception *fnhe;
1405 
1406 		fnhe = find_exception(nhc, daddr);
1407 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1408 			mtu = fnhe->fnhe_pmtu;
1409 	}
1410 
1411 	if (likely(!mtu))
1412 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1413 
1414 	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1415 }
1416 
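/* Bind @rt to the nexthop exception @fnhe so later lookups can reuse it.
 * If the exception predates the current fnhe genid, its learned gateway,
 * PMTU and expiry are cleared first.  Returns true when the route was
 * stored in the exception's input or output slot.
 */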
1417 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1418 			      __be32 daddr, const bool do_cache)
1419 {
1420 	bool ret = false;
1421 
1422 	spin_lock_bh(&fnhe_lock);
1423 
1424 	if (daddr == fnhe->fnhe_daddr) {
1425 		struct rtable __rcu **porig;
1426 		struct rtable *orig;
1427 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1428 
1429 		if (rt_is_input_route(rt))
1430 			porig = &fnhe->fnhe_rth_input;
1431 		else
1432 			porig = &fnhe->fnhe_rth_output;
1433 		orig = rcu_dereference(*porig);
1434 
1435 		if (fnhe->fnhe_genid != genid) {
1436 			fnhe->fnhe_genid = genid;
1437 			fnhe->fnhe_gw = 0;
1438 			fnhe->fnhe_pmtu = 0;
1439 			fnhe->fnhe_expires = 0;
1440 			fnhe->fnhe_mtu_locked = false;
1441 			fnhe_flush_routes(fnhe);
1442 			orig = NULL;
1443 		}
1444 		fill_route_from_fnhe(rt, fnhe);
1445 		if (!rt->rt_gw4) {
1446 			rt->rt_gw4 = daddr;
1447 			rt->rt_gw_family = AF_INET;
1448 		}
1449 
1450 		if (do_cache) {
1451 			dst_hold(&rt->dst);
1452 			rcu_assign_pointer(*porig, rt);
1453 			if (orig) {
1454 				dst_dev_put(&orig->dst);
1455 				dst_release(&orig->dst);
1456 			}
1457 			ret = true;
1458 		}
1459 
1460 		fnhe->fnhe_stamp = jiffies;
1461 	}
1462 	spin_unlock_bh(&fnhe_lock);
1463 
1464 	return ret;
1465 }
1466 
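/* Publish @rt in the nexthop cache using cmpxchg(): input routes go into
 * nhc_rth_input, output routes into this CPU's nhc_pcpu_rth_output slot.
 * Returns false if another CPU won the race and the route stays uncached.
 */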
1467 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1468 {
1469 	struct rtable *orig, *prev, **p;
1470 	bool ret = true;
1471 
1472 	if (rt_is_input_route(rt)) {
1473 		p = (struct rtable **)&nhc->nhc_rth_input;
1474 	} else {
1475 		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1476 	}
1477 	orig = *p;
1478 
1479 	/* hold dst before doing cmpxchg() to avoid race condition
1480 	 * on this dst
1481 	 */
1482 	dst_hold(&rt->dst);
1483 	prev = cmpxchg(p, orig, rt);
1484 	if (prev == orig) {
1485 		if (orig) {
1486 			dst_dev_put(&orig->dst);
1487 			dst_release(&orig->dst);
1488 		}
1489 	} else {
1490 		dst_release(&rt->dst);
1491 		ret = false;
1492 	}
1493 
1494 	return ret;
1495 }
1496 
1497 struct uncached_list {
1498 	spinlock_t		lock;
1499 	struct list_head	head;
1500 };
1501 
1502 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1503 
1504 void rt_add_uncached_list(struct rtable *rt)
1505 {
1506 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1507 
1508 	rt->rt_uncached_list = ul;
1509 
1510 	spin_lock_bh(&ul->lock);
1511 	list_add_tail(&rt->rt_uncached, &ul->head);
1512 	spin_unlock_bh(&ul->lock);
1513 }
1514 
1515 void rt_del_uncached_list(struct rtable *rt)
1516 {
1517 	if (!list_empty(&rt->rt_uncached)) {
1518 		struct uncached_list *ul = rt->rt_uncached_list;
1519 
1520 		spin_lock_bh(&ul->lock);
1521 		list_del(&rt->rt_uncached);
1522 		spin_unlock_bh(&ul->lock);
1523 	}
1524 }
1525 
1526 static void ipv4_dst_destroy(struct dst_entry *dst)
1527 {
1528 	struct rtable *rt = (struct rtable *)dst;
1529 
1530 	ip_dst_metrics_put(dst);
1531 	rt_del_uncached_list(rt);
1532 }
1533 
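/* Called when a device is going away (e.g. on unregister): repoint any
 * uncached routes that still use it at blackhole_netdev, transferring the
 * device reference so the original device can be freed.
 */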
1534 void rt_flush_dev(struct net_device *dev)
1535 {
1536 	struct rtable *rt;
1537 	int cpu;
1538 
1539 	for_each_possible_cpu(cpu) {
1540 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1541 
1542 		spin_lock_bh(&ul->lock);
1543 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1544 			if (rt->dst.dev != dev)
1545 				continue;
1546 			rt->dst.dev = blackhole_netdev;
1547 			dev_hold(rt->dst.dev);
1548 			dev_put(dev);
1549 		}
1550 		spin_unlock_bh(&ul->lock);
1551 	}
1552 }
1553 
1554 static bool rt_cache_valid(const struct rtable *rt)
1555 {
1556 	return	rt &&
1557 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1558 		!rt_is_expired(rt);
1559 }
1560 
1561 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1562 			   const struct fib_result *res,
1563 			   struct fib_nh_exception *fnhe,
1564 			   struct fib_info *fi, u16 type, u32 itag,
1565 			   const bool do_cache)
1566 {
1567 	bool cached = false;
1568 
1569 	if (fi) {
1570 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1571 
1572 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1573 			rt->rt_uses_gateway = 1;
1574 			rt->rt_gw_family = nhc->nhc_gw_family;
1575 			/* only INET and INET6 are supported */
1576 			if (likely(nhc->nhc_gw_family == AF_INET))
1577 				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1578 			else
1579 				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1580 		}
1581 
1582 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1583 
1584 #ifdef CONFIG_IP_ROUTE_CLASSID
1585 		if (nhc->nhc_family == AF_INET) {
1586 			struct fib_nh *nh;
1587 
1588 			nh = container_of(nhc, struct fib_nh, nh_common);
1589 			rt->dst.tclassid = nh->nh_tclassid;
1590 		}
1591 #endif
1592 		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1593 		if (unlikely(fnhe))
1594 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1595 		else if (do_cache)
1596 			cached = rt_cache_route(nhc, rt);
1597 		if (unlikely(!cached)) {
1598 			/* We intended to cache this route in the nexthop
1599 			 * exception or FIB nexthop but failed.  Make sure a
1600 			 * gateway address is set and track the route on the
1601 			 * uncached list instead.
1602 			 */
1603 			if (!rt->rt_gw4) {
1604 				rt->rt_gw_family = AF_INET;
1605 				rt->rt_gw4 = daddr;
1606 			}
1607 			rt_add_uncached_list(rt);
1608 		}
1609 	} else
1610 		rt_add_uncached_list(rt);
1611 
1612 #ifdef CONFIG_IP_ROUTE_CLASSID
1613 #ifdef CONFIG_IP_MULTIPLE_TABLES
1614 	set_class_tag(rt, res->tclassid);
1615 #endif
1616 	set_class_tag(rt, itag);
1617 #endif
1618 }
1619 
1620 struct rtable *rt_dst_alloc(struct net_device *dev,
1621 			    unsigned int flags, u16 type,
1622 			    bool nopolicy, bool noxfrm, bool will_cache)
1623 {
1624 	struct rtable *rt;
1625 
1626 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1627 		       (will_cache ? 0 : DST_HOST) |
1628 		       (nopolicy ? DST_NOPOLICY : 0) |
1629 		       (noxfrm ? DST_NOXFRM : 0));
1630 
1631 	if (rt) {
1632 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1633 		rt->rt_flags = flags;
1634 		rt->rt_type = type;
1635 		rt->rt_is_input = 0;
1636 		rt->rt_iif = 0;
1637 		rt->rt_pmtu = 0;
1638 		rt->rt_mtu_locked = 0;
1639 		rt->rt_uses_gateway = 0;
1640 		rt->rt_gw_family = 0;
1641 		rt->rt_gw4 = 0;
1642 		INIT_LIST_HEAD(&rt->rt_uncached);
1643 
1644 		rt->dst.output = ip_output;
1645 		if (flags & RTCF_LOCAL)
1646 			rt->dst.input = ip_local_deliver;
1647 	}
1648 
1649 	return rt;
1650 }
1651 EXPORT_SYMBOL(rt_dst_alloc);
1652 
1653 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1654 {
1655 	struct rtable *new_rt;
1656 
1657 	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1658 			   rt->dst.flags);
1659 
1660 	if (new_rt) {
1661 		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1662 		new_rt->rt_flags = rt->rt_flags;
1663 		new_rt->rt_type = rt->rt_type;
1664 		new_rt->rt_is_input = rt->rt_is_input;
1665 		new_rt->rt_iif = rt->rt_iif;
1666 		new_rt->rt_pmtu = rt->rt_pmtu;
1667 		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1668 		new_rt->rt_gw_family = rt->rt_gw_family;
1669 		if (rt->rt_gw_family == AF_INET)
1670 			new_rt->rt_gw4 = rt->rt_gw4;
1671 		else if (rt->rt_gw_family == AF_INET6)
1672 			new_rt->rt_gw6 = rt->rt_gw6;
1673 		INIT_LIST_HEAD(&new_rt->rt_uncached);
1674 
1675 		new_rt->dst.flags |= DST_HOST;
1676 		new_rt->dst.input = rt->dst.input;
1677 		new_rt->dst.output = rt->dst.output;
1678 		new_rt->dst.error = rt->dst.error;
1679 		new_rt->dst.lastuse = jiffies;
1680 		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1681 	}
1682 	return new_rt;
1683 }
1684 EXPORT_SYMBOL(rt_dst_clone);
1685 
1686 /* called in rcu_read_lock() section */
1687 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1688 			  u8 tos, struct net_device *dev,
1689 			  struct in_device *in_dev, u32 *itag)
1690 {
1691 	int err;
1692 
1693 	/* Primary sanity checks. */
1694 	if (!in_dev)
1695 		return -EINVAL;
1696 
1697 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1698 	    skb->protocol != htons(ETH_P_IP))
1699 		return -EINVAL;
1700 
1701 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1702 		return -EINVAL;
1703 
1704 	if (ipv4_is_zeronet(saddr)) {
1705 		if (!ipv4_is_local_multicast(daddr) &&
1706 		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1707 			return -EINVAL;
1708 	} else {
1709 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1710 					  in_dev, itag);
1711 		if (err < 0)
1712 			return err;
1713 	}
1714 	return 0;
1715 }
1716 
1717 /* called in rcu_read_lock() section */
1718 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1719 			     u8 tos, struct net_device *dev, int our)
1720 {
1721 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1722 	unsigned int flags = RTCF_MULTICAST;
1723 	struct rtable *rth;
1724 	u32 itag = 0;
1725 	int err;
1726 
1727 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1728 	if (err)
1729 		return err;
1730 
1731 	if (our)
1732 		flags |= RTCF_LOCAL;
1733 
1734 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1735 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1736 	if (!rth)
1737 		return -ENOBUFS;
1738 
1739 #ifdef CONFIG_IP_ROUTE_CLASSID
1740 	rth->dst.tclassid = itag;
1741 #endif
1742 	rth->dst.output = ip_rt_bug;
1743 	rth->rt_is_input = 1;
1744 
1745 #ifdef CONFIG_IP_MROUTE
1746 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1747 		rth->dst.input = ip_mr_input;
1748 #endif
1749 	RT_CACHE_STAT_INC(in_slow_mc);
1750 
1751 	skb_dst_set(skb, &rth->dst);
1752 	return 0;
1753 }
1754 
1755 
1756 static void ip_handle_martian_source(struct net_device *dev,
1757 				     struct in_device *in_dev,
1758 				     struct sk_buff *skb,
1759 				     __be32 daddr,
1760 				     __be32 saddr)
1761 {
1762 	RT_CACHE_STAT_INC(in_martian_src);
1763 #ifdef CONFIG_IP_ROUTE_VERBOSE
1764 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1765 		/*
1766 		 *	RFC 1812 recommendation: if the source is martian,
1767 		 *	the only useful hint we can log is the MAC header.
1768 		 */
1769 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1770 			&daddr, &saddr, dev->name);
1771 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1772 			print_hex_dump(KERN_WARNING, "ll header: ",
1773 				       DUMP_PREFIX_OFFSET, 16, 1,
1774 				       skb_mac_header(skb),
1775 				       dev->hard_header_len, false);
1776 		}
1777 	}
1778 #endif
1779 }
1780 
1781 /* called in rcu_read_lock() section */
1782 static int __mkroute_input(struct sk_buff *skb,
1783 			   const struct fib_result *res,
1784 			   struct in_device *in_dev,
1785 			   __be32 daddr, __be32 saddr, u32 tos)
1786 {
1787 	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1788 	struct net_device *dev = nhc->nhc_dev;
1789 	struct fib_nh_exception *fnhe;
1790 	struct rtable *rth;
1791 	int err;
1792 	struct in_device *out_dev;
1793 	bool do_cache;
1794 	u32 itag = 0;
1795 
1796 	/* look up the output device's in_device under RCU (no reference taken) */
1797 	out_dev = __in_dev_get_rcu(dev);
1798 	if (!out_dev) {
1799 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1800 		return -EINVAL;
1801 	}
1802 
1803 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1804 				  in_dev->dev, in_dev, &itag);
1805 	if (err < 0) {
1806 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1807 					 saddr);
1808 
1809 		goto cleanup;
1810 	}
1811 
1812 	do_cache = res->fi && !itag;
1813 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1814 	    skb->protocol == htons(ETH_P_IP)) {
1815 		__be32 gw;
1816 
1817 		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1818 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1819 		    inet_addr_onlink(out_dev, saddr, gw))
1820 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1821 	}
1822 
1823 	if (skb->protocol != htons(ETH_P_IP)) {
1824 		/* Not IP (i.e. ARP). Do not create route, if it is
1825 		 * invalid for proxy arp. DNAT routes are always valid.
1826 		 *
1827 		 * The proxy ARP feature has been extended to allow ARP
1828 		 * replies back out of the same interface, to support
1829 		 * private VLAN switch technologies. See arp.c.
1830 		 */
1831 		if (out_dev == in_dev &&
1832 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1833 			err = -EINVAL;
1834 			goto cleanup;
1835 		}
1836 	}
1837 
1838 	fnhe = find_exception(nhc, daddr);
1839 	if (do_cache) {
1840 		if (fnhe)
1841 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1842 		else
1843 			rth = rcu_dereference(nhc->nhc_rth_input);
1844 		if (rt_cache_valid(rth)) {
1845 			skb_dst_set_noref(skb, &rth->dst);
1846 			goto out;
1847 		}
1848 	}
1849 
1850 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1851 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1852 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1853 	if (!rth) {
1854 		err = -ENOBUFS;
1855 		goto cleanup;
1856 	}
1857 
1858 	rth->rt_is_input = 1;
1859 	RT_CACHE_STAT_INC(in_slow_tot);
1860 
1861 	rth->dst.input = ip_forward;
1862 
1863 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1864 		       do_cache);
1865 	lwtunnel_set_redirect(&rth->dst);
1866 	skb_dst_set(skb, &rth->dst);
1867 out:
1868 	err = 0;
1869  cleanup:
1870 	return err;
1871 }
1872 
1873 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1874 /* To make ICMP error packets follow the flow they refer to, the multipath
1875  * hash is calculated from the addresses of the inner (quoted) IP header.
1876  */
1877 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1878 				 struct flow_keys *hash_keys)
1879 {
1880 	const struct iphdr *outer_iph = ip_hdr(skb);
1881 	const struct iphdr *key_iph = outer_iph;
1882 	const struct iphdr *inner_iph;
1883 	const struct icmphdr *icmph;
1884 	struct iphdr _inner_iph;
1885 	struct icmphdr _icmph;
1886 
1887 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1888 		goto out;
1889 
1890 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1891 		goto out;
1892 
1893 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1894 				   &_icmph);
1895 	if (!icmph)
1896 		goto out;
1897 
1898 	if (icmph->type != ICMP_DEST_UNREACH &&
1899 	    icmph->type != ICMP_REDIRECT &&
1900 	    icmph->type != ICMP_TIME_EXCEEDED &&
1901 	    icmph->type != ICMP_PARAMETERPROB)
1902 		goto out;
1903 
1904 	inner_iph = skb_header_pointer(skb,
1905 				       outer_iph->ihl * 4 + sizeof(_icmph),
1906 				       sizeof(_inner_iph), &_inner_iph);
1907 	if (!inner_iph)
1908 		goto out;
1909 
1910 	key_iph = inner_iph;
1911 out:
1912 	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1913 	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1914 }
1915 
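/* The hash layout is chosen by the net.ipv4.fib_multipath_hash_policy
 * sysctl: 0 hashes the L3 addresses (using the inner addresses for ICMP
 * errors), 1 hashes the L4 five-tuple, and 2 hashes the inner L3 header of
 * encapsulated packets when forwarding, falling back to L3 otherwise.
 */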
1916 /* if skb is set it will be used and fl4 can be NULL */
1917 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1918 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1919 {
1920 	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1921 	struct flow_keys hash_keys;
1922 	u32 mhash;
1923 
1924 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1925 	case 0:
1926 		memset(&hash_keys, 0, sizeof(hash_keys));
1927 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1928 		if (skb) {
1929 			ip_multipath_l3_keys(skb, &hash_keys);
1930 		} else {
1931 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1932 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1933 		}
1934 		break;
1935 	case 1:
1936 		/* skb is currently provided only when forwarding */
1937 		if (skb) {
1938 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1939 			struct flow_keys keys;
1940 
1941 			/* short-circuit if we already have L4 hash present */
1942 			if (skb->l4_hash)
1943 				return skb_get_hash_raw(skb) >> 1;
1944 
1945 			memset(&hash_keys, 0, sizeof(hash_keys));
1946 
1947 			if (!flkeys) {
1948 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1949 				flkeys = &keys;
1950 			}
1951 
1952 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1953 			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1954 			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1955 			hash_keys.ports.src = flkeys->ports.src;
1956 			hash_keys.ports.dst = flkeys->ports.dst;
1957 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1958 		} else {
1959 			memset(&hash_keys, 0, sizeof(hash_keys));
1960 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1961 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1962 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1963 			hash_keys.ports.src = fl4->fl4_sport;
1964 			hash_keys.ports.dst = fl4->fl4_dport;
1965 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1966 		}
1967 		break;
1968 	case 2:
1969 		memset(&hash_keys, 0, sizeof(hash_keys));
1970 		/* skb is currently provided only when forwarding */
1971 		if (skb) {
1972 			struct flow_keys keys;
1973 
1974 			skb_flow_dissect_flow_keys(skb, &keys, 0);
1975 			/* Inner can be v4 or v6 */
1976 			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1977 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1978 				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1979 				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1980 			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1981 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1982 				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1983 				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1984 				hash_keys.tags.flow_label = keys.tags.flow_label;
1985 				hash_keys.basic.ip_proto = keys.basic.ip_proto;
1986 			} else {
1987 				/* Same as case 0 */
1988 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1989 				ip_multipath_l3_keys(skb, &hash_keys);
1990 			}
1991 		} else {
1992 			/* Same as case 0 */
1993 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1994 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1995 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1996 		}
1997 		break;
1998 	}
1999 	mhash = flow_hash_from_keys(&hash_keys);
2000 
2001 	if (multipath_hash)
2002 		mhash = jhash_2words(mhash, multipath_hash, 0);
2003 
2004 	return mhash >> 1;
2005 }
2006 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
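/* The switch above corresponds to the per-netns multipath hash policy:
 * 0 hashes on L3 (source/destination addresses, using the inner header for
 * ICMP errors), 1 hashes on the L4 5-tuple, and 2 hashes on the (possibly
 * inner) L3 header found by the flow dissector.  A minimal usage sketch,
 * assuming the usual sysctl spelling for this knob:
 *
 *	sysctl -w net.ipv4.fib_multipath_hash_policy=1
 */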
2007 
2008 static int ip_mkroute_input(struct sk_buff *skb,
2009 			    struct fib_result *res,
2010 			    struct in_device *in_dev,
2011 			    __be32 daddr, __be32 saddr, u32 tos,
2012 			    struct flow_keys *hkeys)
2013 {
2014 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2015 	if (res->fi && fib_info_num_path(res->fi) > 1) {
2016 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2017 
2018 		fib_select_multipath(res, h);
2019 	}
2020 #endif
2021 
2022 	/* create a routing cache entry */
2023 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2024 }
2025 
2026 /*
2027  *	NOTE. We drop all packets that have a local source
2028  *	address, because every properly looped-back packet
2029  *	must already have the correct destination attached by the output routine.
2030  *
2031  *	Such an approach solves two big problems:
2032  *	1. Non-simplex devices are handled properly.
2033  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2034  *	Called with rcu_read_lock().
2035  */
2036 
2037 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2038 			       u8 tos, struct net_device *dev,
2039 			       struct fib_result *res)
2040 {
2041 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2042 	struct flow_keys *flkeys = NULL, _flkeys;
2043 	struct net    *net = dev_net(dev);
2044 	struct ip_tunnel_info *tun_info;
2045 	int		err = -EINVAL;
2046 	unsigned int	flags = 0;
2047 	u32		itag = 0;
2048 	struct rtable	*rth;
2049 	struct flowi4	fl4;
2050 	bool do_cache = true;
2051 
2052 	/* IP on this device is disabled. */
2053 
2054 	if (!in_dev)
2055 		goto out;
2056 
2057 	/* Check for the weirdest martians, which may not be detected
2058 	   by fib_lookup.
2059 	 */
2060 
2061 	tun_info = skb_tunnel_info(skb);
2062 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2063 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2064 	else
2065 		fl4.flowi4_tun_key.tun_id = 0;
2066 	skb_dst_drop(skb);
2067 
2068 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2069 		goto martian_source;
2070 
2071 	res->fi = NULL;
2072 	res->table = NULL;
2073 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2074 		goto brd_input;
2075 
2076 	/* Accept zero addresses only for limited broadcast;
2077 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2078 	 */
2079 	if (ipv4_is_zeronet(saddr))
2080 		goto martian_source;
2081 
2082 	if (ipv4_is_zeronet(daddr))
2083 		goto martian_destination;
2084 
2085 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2086 	 * and calls it at most once if daddr and/or saddr are loopback addresses.
2087 	 */
2088 	if (ipv4_is_loopback(daddr)) {
2089 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2090 			goto martian_destination;
2091 	} else if (ipv4_is_loopback(saddr)) {
2092 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2093 			goto martian_source;
2094 	}
2095 
2096 	/*
2097 	 *	Now we are ready to route the packet.
2098 	 */
2099 	fl4.flowi4_oif = 0;
2100 	fl4.flowi4_iif = dev->ifindex;
2101 	fl4.flowi4_mark = skb->mark;
2102 	fl4.flowi4_tos = tos;
2103 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2104 	fl4.flowi4_flags = 0;
2105 	fl4.daddr = daddr;
2106 	fl4.saddr = saddr;
2107 	fl4.flowi4_uid = sock_net_uid(net, NULL);
2108 
2109 	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2110 		flkeys = &_flkeys;
2111 	} else {
2112 		fl4.flowi4_proto = 0;
2113 		fl4.fl4_sport = 0;
2114 		fl4.fl4_dport = 0;
2115 	}
2116 
2117 	err = fib_lookup(net, &fl4, res, 0);
2118 	if (err != 0) {
2119 		if (!IN_DEV_FORWARD(in_dev))
2120 			err = -EHOSTUNREACH;
2121 		goto no_route;
2122 	}
2123 
2124 	if (res->type == RTN_BROADCAST) {
2125 		if (IN_DEV_BFORWARD(in_dev))
2126 			goto make_route;
2127 		/* do not cache if bc_forwarding is enabled */
2128 		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2129 			do_cache = false;
2130 		goto brd_input;
2131 	}
2132 
2133 	if (res->type == RTN_LOCAL) {
2134 		err = fib_validate_source(skb, saddr, daddr, tos,
2135 					  0, dev, in_dev, &itag);
2136 		if (err < 0)
2137 			goto martian_source;
2138 		goto local_input;
2139 	}
2140 
2141 	if (!IN_DEV_FORWARD(in_dev)) {
2142 		err = -EHOSTUNREACH;
2143 		goto no_route;
2144 	}
2145 	if (res->type != RTN_UNICAST)
2146 		goto martian_destination;
2147 
2148 make_route:
2149 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2150 out:	return err;
2151 
2152 brd_input:
2153 	if (skb->protocol != htons(ETH_P_IP))
2154 		goto e_inval;
2155 
2156 	if (!ipv4_is_zeronet(saddr)) {
2157 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2158 					  in_dev, &itag);
2159 		if (err < 0)
2160 			goto martian_source;
2161 	}
2162 	flags |= RTCF_BROADCAST;
2163 	res->type = RTN_BROADCAST;
2164 	RT_CACHE_STAT_INC(in_brd);
2165 
2166 local_input:
2167 	do_cache &= res->fi && !itag;
2168 	if (do_cache) {
2169 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2170 
2171 		rth = rcu_dereference(nhc->nhc_rth_input);
2172 		if (rt_cache_valid(rth)) {
2173 			skb_dst_set_noref(skb, &rth->dst);
2174 			err = 0;
2175 			goto out;
2176 		}
2177 	}
2178 
2179 	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2180 			   flags | RTCF_LOCAL, res->type,
2181 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2182 	if (!rth)
2183 		goto e_nobufs;
2184 
2185 	rth->dst.output = ip_rt_bug;
2186 #ifdef CONFIG_IP_ROUTE_CLASSID
2187 	rth->dst.tclassid = itag;
2188 #endif
2189 	rth->rt_is_input = 1;
2190 
2191 	RT_CACHE_STAT_INC(in_slow_tot);
2192 	if (res->type == RTN_UNREACHABLE) {
2193 		rth->dst.input = ip_error;
2194 		rth->dst.error = -err;
2195 		rth->rt_flags &= ~RTCF_LOCAL;
2196 	}
2197 
2198 	if (do_cache) {
2199 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2200 
2201 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2202 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2203 			WARN_ON(rth->dst.input == lwtunnel_input);
2204 			rth->dst.lwtstate->orig_input = rth->dst.input;
2205 			rth->dst.input = lwtunnel_input;
2206 		}
2207 
2208 		if (unlikely(!rt_cache_route(nhc, rth)))
2209 			rt_add_uncached_list(rth);
2210 	}
2211 	skb_dst_set(skb, &rth->dst);
2212 	err = 0;
2213 	goto out;
2214 
2215 no_route:
2216 	RT_CACHE_STAT_INC(in_no_route);
2217 	res->type = RTN_UNREACHABLE;
2218 	res->fi = NULL;
2219 	res->table = NULL;
2220 	goto local_input;
2221 
2222 	/*
2223 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2224 	 */
2225 martian_destination:
2226 	RT_CACHE_STAT_INC(in_martian_dst);
2227 #ifdef CONFIG_IP_ROUTE_VERBOSE
2228 	if (IN_DEV_LOG_MARTIANS(in_dev))
2229 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2230 				     &daddr, &saddr, dev->name);
2231 #endif
2232 
2233 e_inval:
2234 	err = -EINVAL;
2235 	goto out;
2236 
2237 e_nobufs:
2238 	err = -ENOBUFS;
2239 	goto out;
2240 
2241 martian_source:
2242 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2243 	goto out;
2244 }
2245 
2246 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2247 			 u8 tos, struct net_device *dev)
2248 {
2249 	struct fib_result res;
2250 	int err;
2251 
2252 	tos &= IPTOS_RT_MASK;
2253 	rcu_read_lock();
2254 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2255 	rcu_read_unlock();
2256 
2257 	return err;
2258 }
2259 EXPORT_SYMBOL(ip_route_input_noref);
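/* A minimal usage sketch (illustrative, not taken from this file): an
 * input-path caller typically resolves the route for a freshly received
 * packet roughly as follows, assuming iph points at the validated IPv4
 * header:
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 */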
2260 
2261 /* called with rcu_read_lock held */
2262 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2263 		       u8 tos, struct net_device *dev, struct fib_result *res)
2264 {
2265 	/* Multicast recognition logic was moved from the route cache to here.
2266 	   The problem was that too many Ethernet cards have broken/missing
2267 	   hardware multicast filters :-( As a result, a host on a multicast
2268 	   network acquires a lot of useless route cache entries, e.g. for
2269 	   SDR messages from all over the world. Now we try to get rid of them.
2270 	   Really, provided the software IP multicast filter is organized
2271 	   reasonably (at least, hashed), it does not result in a slowdown
2272 	   compared with route cache reject entries.
2273 	   Note that multicast routers are not affected, because a
2274 	   route cache entry is created eventually.
2275 	 */
2276 	if (ipv4_is_multicast(daddr)) {
2277 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2278 		int our = 0;
2279 		int err = -EINVAL;
2280 
2281 		if (!in_dev)
2282 			return err;
2283 		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2284 				      ip_hdr(skb)->protocol);
2285 
2286 		/* check l3 master if no match yet */
2287 		if (!our && netif_is_l3_slave(dev)) {
2288 			struct in_device *l3_in_dev;
2289 
2290 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2291 			if (l3_in_dev)
2292 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2293 						      ip_hdr(skb)->protocol);
2294 		}
2295 
2296 		if (our
2297 #ifdef CONFIG_IP_MROUTE
2298 			||
2299 		    (!ipv4_is_local_multicast(daddr) &&
2300 		     IN_DEV_MFORWARD(in_dev))
2301 #endif
2302 		   ) {
2303 			err = ip_route_input_mc(skb, daddr, saddr,
2304 						tos, dev, our);
2305 		}
2306 		return err;
2307 	}
2308 
2309 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2310 }
2311 
2312 /* called with rcu_read_lock() */
2313 static struct rtable *__mkroute_output(const struct fib_result *res,
2314 				       const struct flowi4 *fl4, int orig_oif,
2315 				       struct net_device *dev_out,
2316 				       unsigned int flags)
2317 {
2318 	struct fib_info *fi = res->fi;
2319 	struct fib_nh_exception *fnhe;
2320 	struct in_device *in_dev;
2321 	u16 type = res->type;
2322 	struct rtable *rth;
2323 	bool do_cache;
2324 
2325 	in_dev = __in_dev_get_rcu(dev_out);
2326 	if (!in_dev)
2327 		return ERR_PTR(-EINVAL);
2328 
2329 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2330 		if (ipv4_is_loopback(fl4->saddr) &&
2331 		    !(dev_out->flags & IFF_LOOPBACK) &&
2332 		    !netif_is_l3_master(dev_out))
2333 			return ERR_PTR(-EINVAL);
2334 
2335 	if (ipv4_is_lbcast(fl4->daddr))
2336 		type = RTN_BROADCAST;
2337 	else if (ipv4_is_multicast(fl4->daddr))
2338 		type = RTN_MULTICAST;
2339 	else if (ipv4_is_zeronet(fl4->daddr))
2340 		return ERR_PTR(-EINVAL);
2341 
2342 	if (dev_out->flags & IFF_LOOPBACK)
2343 		flags |= RTCF_LOCAL;
2344 
2345 	do_cache = true;
2346 	if (type == RTN_BROADCAST) {
2347 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2348 		fi = NULL;
2349 	} else if (type == RTN_MULTICAST) {
2350 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2351 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2352 				     fl4->flowi4_proto))
2353 			flags &= ~RTCF_LOCAL;
2354 		else
2355 			do_cache = false;
2356 		/* If a multicast route does not exist, use
2357 		 * the default one, but do not use a gateway in this case.
2358 		 * Yes, it is a hack.
2359 		 */
2360 		if (fi && res->prefixlen < 4)
2361 			fi = NULL;
2362 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2363 		   (orig_oif != dev_out->ifindex)) {
2364 		/* For local routes that require a particular output interface
2365 		 * we do not want to cache the result.  Caching the result
2366 		 * causes incorrect behaviour when there are multiple source
2367 		 * addresses on the interface; the end result is that if the
2368 		 * intended recipient is waiting on that interface for the
2369 		 * packet, it won't receive it because it will be delivered on
2370 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2371 		 * be set to the loopback interface as well.
2372 		 */
2373 		do_cache = false;
2374 	}
2375 
2376 	fnhe = NULL;
2377 	do_cache &= fi != NULL;
2378 	if (fi) {
2379 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2380 		struct rtable __rcu **prth;
2381 
2382 		fnhe = find_exception(nhc, fl4->daddr);
2383 		if (!do_cache)
2384 			goto add;
2385 		if (fnhe) {
2386 			prth = &fnhe->fnhe_rth_output;
2387 		} else {
2388 			if (unlikely(fl4->flowi4_flags &
2389 				     FLOWI_FLAG_KNOWN_NH &&
2390 				     !(nhc->nhc_gw_family &&
2391 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2392 				do_cache = false;
2393 				goto add;
2394 			}
2395 			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2396 		}
2397 		rth = rcu_dereference(*prth);
2398 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2399 			return rth;
2400 	}
2401 
2402 add:
2403 	rth = rt_dst_alloc(dev_out, flags, type,
2404 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2405 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2406 			   do_cache);
2407 	if (!rth)
2408 		return ERR_PTR(-ENOBUFS);
2409 
2410 	rth->rt_iif = orig_oif;
2411 
2412 	RT_CACHE_STAT_INC(out_slow_tot);
2413 
2414 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2415 		if (flags & RTCF_LOCAL &&
2416 		    !(dev_out->flags & IFF_LOOPBACK)) {
2417 			rth->dst.output = ip_mc_output;
2418 			RT_CACHE_STAT_INC(out_slow_mc);
2419 		}
2420 #ifdef CONFIG_IP_MROUTE
2421 		if (type == RTN_MULTICAST) {
2422 			if (IN_DEV_MFORWARD(in_dev) &&
2423 			    !ipv4_is_local_multicast(fl4->daddr)) {
2424 				rth->dst.input = ip_mr_input;
2425 				rth->dst.output = ip_mc_output;
2426 			}
2427 		}
2428 #endif
2429 	}
2430 
2431 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2432 	lwtunnel_set_redirect(&rth->dst);
2433 
2434 	return rth;
2435 }
2436 
2437 /*
2438  * Major route resolver routine.
2439  */
2440 
2441 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2442 					const struct sk_buff *skb)
2443 {
2444 	__u8 tos = RT_FL_TOS(fl4);
2445 	struct fib_result res = {
2446 		.type		= RTN_UNSPEC,
2447 		.fi		= NULL,
2448 		.table		= NULL,
2449 		.tclassid	= 0,
2450 	};
2451 	struct rtable *rth;
2452 
2453 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2454 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2455 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2456 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2457 
2458 	rcu_read_lock();
2459 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2460 	rcu_read_unlock();
2461 
2462 	return rth;
2463 }
2464 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2465 
2466 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2467 					    struct fib_result *res,
2468 					    const struct sk_buff *skb)
2469 {
2470 	struct net_device *dev_out = NULL;
2471 	int orig_oif = fl4->flowi4_oif;
2472 	unsigned int flags = 0;
2473 	struct rtable *rth;
2474 	int err = -ENETUNREACH;
2475 
2476 	if (fl4->saddr) {
2477 		rth = ERR_PTR(-EINVAL);
2478 		if (ipv4_is_multicast(fl4->saddr) ||
2479 		    ipv4_is_lbcast(fl4->saddr) ||
2480 		    ipv4_is_zeronet(fl4->saddr))
2481 			goto out;
2482 
2483 		/* I removed the check for oif == dev_out->oif here.
2484 		   It was wrong for two reasons:
2485 		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2486 		      is assigned to multiple interfaces.
2487 		   2. Moreover, we are allowed to send packets with a saddr
2488 		      of another iface. --ANK
2489 		 */
2490 
2491 		if (fl4->flowi4_oif == 0 &&
2492 		    (ipv4_is_multicast(fl4->daddr) ||
2493 		     ipv4_is_lbcast(fl4->daddr))) {
2494 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2495 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2496 			if (!dev_out)
2497 				goto out;
2498 
2499 			/* Special hack: the user can direct multicasts
2500 			   and limited broadcast via the necessary interface
2501 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2502 			   This hack is not just for fun, it allows
2503 			   vic, vat and friends to work.
2504 			   They bind the socket to loopback, set the ttl to zero
2505 			   and expect that it will work.
2506 			   From the viewpoint of the routing cache they are broken,
2507 			   because we are not allowed to build a multicast path
2508 			   with a loopback source addr (look, the routing cache
2509 			   cannot know that the ttl is zero, so that the packet
2510 			   will not leave this host and the route is valid).
2511 			   Luckily, this hack is a good workaround.
2512 			 */
2513 
2514 			fl4->flowi4_oif = dev_out->ifindex;
2515 			goto make_route;
2516 		}
2517 
2518 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2519 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2520 			if (!__ip_dev_find(net, fl4->saddr, false))
2521 				goto out;
2522 		}
2523 	}
2524 
2525 
2526 	if (fl4->flowi4_oif) {
2527 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2528 		rth = ERR_PTR(-ENODEV);
2529 		if (!dev_out)
2530 			goto out;
2531 
2532 		/* RACE: Check return value of inet_select_addr instead. */
2533 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2534 			rth = ERR_PTR(-ENETUNREACH);
2535 			goto out;
2536 		}
2537 		if (ipv4_is_local_multicast(fl4->daddr) ||
2538 		    ipv4_is_lbcast(fl4->daddr) ||
2539 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2540 			if (!fl4->saddr)
2541 				fl4->saddr = inet_select_addr(dev_out, 0,
2542 							      RT_SCOPE_LINK);
2543 			goto make_route;
2544 		}
2545 		if (!fl4->saddr) {
2546 			if (ipv4_is_multicast(fl4->daddr))
2547 				fl4->saddr = inet_select_addr(dev_out, 0,
2548 							      fl4->flowi4_scope);
2549 			else if (!fl4->daddr)
2550 				fl4->saddr = inet_select_addr(dev_out, 0,
2551 							      RT_SCOPE_HOST);
2552 		}
2553 	}
2554 
2555 	if (!fl4->daddr) {
2556 		fl4->daddr = fl4->saddr;
2557 		if (!fl4->daddr)
2558 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2559 		dev_out = net->loopback_dev;
2560 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2561 		res->type = RTN_LOCAL;
2562 		flags |= RTCF_LOCAL;
2563 		goto make_route;
2564 	}
2565 
2566 	err = fib_lookup(net, fl4, res, 0);
2567 	if (err) {
2568 		res->fi = NULL;
2569 		res->table = NULL;
2570 		if (fl4->flowi4_oif &&
2571 		    (ipv4_is_multicast(fl4->daddr) ||
2572 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2573 			/* Apparently, the routing tables are wrong. Assume
2574 			   that the destination is on-link.
2575 
2576 			   WHY? DW.
2577 			   Because we are allowed to send to an iface
2578 			   even if it has NO routes and NO assigned
2579 			   addresses. When oif is specified, the routing
2580 			   tables are looked up with only one purpose:
2581 			   to catch whether the destination is gatewayed, rather than
2582 			   direct. Moreover, if MSG_DONTROUTE is set,
2583 			   we send the packet, ignoring both routing tables
2584 			   and ifaddr state. --ANK
2585 
2586 
2587 			   We could do this even when oif is unknown
2588 			   (as IPv6 likely does), but we do not.
2589 			 */
2590 
2591 			if (fl4->saddr == 0)
2592 				fl4->saddr = inet_select_addr(dev_out, 0,
2593 							      RT_SCOPE_LINK);
2594 			res->type = RTN_UNICAST;
2595 			goto make_route;
2596 		}
2597 		rth = ERR_PTR(err);
2598 		goto out;
2599 	}
2600 
2601 	if (res->type == RTN_LOCAL) {
2602 		if (!fl4->saddr) {
2603 			if (res->fi->fib_prefsrc)
2604 				fl4->saddr = res->fi->fib_prefsrc;
2605 			else
2606 				fl4->saddr = fl4->daddr;
2607 		}
2608 
2609 		/* L3 master device is the loopback for that domain */
2610 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2611 			net->loopback_dev;
2612 
2613 		/* make sure orig_oif points to fib result device even
2614 		 * though packet rx/tx happens over loopback or l3mdev
2615 		 */
2616 		orig_oif = FIB_RES_OIF(*res);
2617 
2618 		fl4->flowi4_oif = dev_out->ifindex;
2619 		flags |= RTCF_LOCAL;
2620 		goto make_route;
2621 	}
2622 
2623 	fib_select_path(net, res, fl4, skb);
2624 
2625 	dev_out = FIB_RES_DEV(*res);
2626 	fl4->flowi4_oif = dev_out->ifindex;
2627 
2628 
2629 make_route:
2630 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2631 
2632 out:
2633 	return rth;
2634 }
2635 
2636 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2637 {
2638 	return NULL;
2639 }
2640 
2641 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2642 {
2643 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2644 
2645 	return mtu ? : dst->dev->mtu;
2646 }
2647 
2648 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2649 					  struct sk_buff *skb, u32 mtu)
2650 {
2651 }
2652 
2653 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2654 				       struct sk_buff *skb)
2655 {
2656 }
2657 
2658 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2659 					  unsigned long old)
2660 {
2661 	return NULL;
2662 }
2663 
2664 static struct dst_ops ipv4_dst_blackhole_ops = {
2665 	.family			=	AF_INET,
2666 	.check			=	ipv4_blackhole_dst_check,
2667 	.mtu			=	ipv4_blackhole_mtu,
2668 	.default_advmss		=	ipv4_default_advmss,
2669 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2670 	.redirect		=	ipv4_rt_blackhole_redirect,
2671 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2672 	.neigh_lookup		=	ipv4_neigh_lookup,
2673 };
2674 
2675 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2676 {
2677 	struct rtable *ort = (struct rtable *) dst_orig;
2678 	struct rtable *rt;
2679 
2680 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2681 	if (rt) {
2682 		struct dst_entry *new = &rt->dst;
2683 
2684 		new->__use = 1;
2685 		new->input = dst_discard;
2686 		new->output = dst_discard_out;
2687 
2688 		new->dev = net->loopback_dev;
2689 		if (new->dev)
2690 			dev_hold(new->dev);
2691 
2692 		rt->rt_is_input = ort->rt_is_input;
2693 		rt->rt_iif = ort->rt_iif;
2694 		rt->rt_pmtu = ort->rt_pmtu;
2695 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2696 
2697 		rt->rt_genid = rt_genid_ipv4(net);
2698 		rt->rt_flags = ort->rt_flags;
2699 		rt->rt_type = ort->rt_type;
2700 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2701 		rt->rt_gw_family = ort->rt_gw_family;
2702 		if (rt->rt_gw_family == AF_INET)
2703 			rt->rt_gw4 = ort->rt_gw4;
2704 		else if (rt->rt_gw_family == AF_INET6)
2705 			rt->rt_gw6 = ort->rt_gw6;
2706 
2707 		INIT_LIST_HEAD(&rt->rt_uncached);
2708 	}
2709 
2710 	dst_release(dst_orig);
2711 
2712 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2713 }
2714 
2715 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2716 				    const struct sock *sk)
2717 {
2718 	struct rtable *rt = __ip_route_output_key(net, flp4);
2719 
2720 	if (IS_ERR(rt))
2721 		return rt;
2722 
2723 	if (flp4->flowi4_proto)
2724 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2725 							flowi4_to_flowi(flp4),
2726 							sk, 0);
2727 
2728 	return rt;
2729 }
2730 EXPORT_SYMBOL_GPL(ip_route_output_flow);
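/* A minimal usage sketch (illustrative only): an output-path caller fills in
 * a flow key and resolves a route, releasing it with ip_rt_put() when done:
 *
 *	struct flowi4 fl4 = {
 *		.daddr	      = daddr,
 *		.saddr	      = saddr,
 *		.flowi4_proto = IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */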
2731 
2732 /* called with rcu_read_lock held */
2733 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2734 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2735 			struct sk_buff *skb, u32 portid, u32 seq,
2736 			unsigned int flags)
2737 {
2738 	struct rtmsg *r;
2739 	struct nlmsghdr *nlh;
2740 	unsigned long expires = 0;
2741 	u32 error;
2742 	u32 metrics[RTAX_MAX];
2743 
2744 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2745 	if (!nlh)
2746 		return -EMSGSIZE;
2747 
2748 	r = nlmsg_data(nlh);
2749 	r->rtm_family	 = AF_INET;
2750 	r->rtm_dst_len	= 32;
2751 	r->rtm_src_len	= 0;
2752 	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2753 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2754 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2755 		goto nla_put_failure;
2756 	r->rtm_type	= rt->rt_type;
2757 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2758 	r->rtm_protocol = RTPROT_UNSPEC;
2759 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2760 	if (rt->rt_flags & RTCF_NOTIFY)
2761 		r->rtm_flags |= RTM_F_NOTIFY;
2762 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2763 		r->rtm_flags |= RTCF_DOREDIRECT;
2764 
2765 	if (nla_put_in_addr(skb, RTA_DST, dst))
2766 		goto nla_put_failure;
2767 	if (src) {
2768 		r->rtm_src_len = 32;
2769 		if (nla_put_in_addr(skb, RTA_SRC, src))
2770 			goto nla_put_failure;
2771 	}
2772 	if (rt->dst.dev &&
2773 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2774 		goto nla_put_failure;
2775 #ifdef CONFIG_IP_ROUTE_CLASSID
2776 	if (rt->dst.tclassid &&
2777 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2778 		goto nla_put_failure;
2779 #endif
2780 	if (fl4 && !rt_is_input_route(rt) &&
2781 	    fl4->saddr != src) {
2782 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2783 			goto nla_put_failure;
2784 	}
2785 	if (rt->rt_uses_gateway) {
2786 		if (rt->rt_gw_family == AF_INET &&
2787 		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2788 			goto nla_put_failure;
2789 		} else if (rt->rt_gw_family == AF_INET6) {
2790 			int alen = sizeof(struct in6_addr);
2791 			struct nlattr *nla;
2792 			struct rtvia *via;
2793 
2794 			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2795 			if (!nla)
2796 				goto nla_put_failure;
2797 
2798 			via = nla_data(nla);
2799 			via->rtvia_family = AF_INET6;
2800 			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2801 		}
2802 	}
2803 
2804 	expires = rt->dst.expires;
2805 	if (expires) {
2806 		unsigned long now = jiffies;
2807 
2808 		if (time_before(now, expires))
2809 			expires -= now;
2810 		else
2811 			expires = 0;
2812 	}
2813 
2814 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2815 	if (rt->rt_pmtu && expires)
2816 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2817 	if (rt->rt_mtu_locked && expires)
2818 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2819 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2820 		goto nla_put_failure;
2821 
2822 	if (fl4) {
2823 		if (fl4->flowi4_mark &&
2824 		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2825 			goto nla_put_failure;
2826 
2827 		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2828 		    nla_put_u32(skb, RTA_UID,
2829 				from_kuid_munged(current_user_ns(),
2830 						 fl4->flowi4_uid)))
2831 			goto nla_put_failure;
2832 
2833 		if (rt_is_input_route(rt)) {
2834 #ifdef CONFIG_IP_MROUTE
2835 			if (ipv4_is_multicast(dst) &&
2836 			    !ipv4_is_local_multicast(dst) &&
2837 			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2838 				int err = ipmr_get_route(net, skb,
2839 							 fl4->saddr, fl4->daddr,
2840 							 r, portid);
2841 
2842 				if (err <= 0) {
2843 					if (err == 0)
2844 						return 0;
2845 					goto nla_put_failure;
2846 				}
2847 			} else
2848 #endif
2849 				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2850 					goto nla_put_failure;
2851 		}
2852 	}
2853 
2854 	error = rt->dst.error;
2855 
2856 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2857 		goto nla_put_failure;
2858 
2859 	nlmsg_end(skb, nlh);
2860 	return 0;
2861 
2862 nla_put_failure:
2863 	nlmsg_cancel(skb, nlh);
2864 	return -EMSGSIZE;
2865 }
2866 
2867 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2868 			    struct netlink_callback *cb, u32 table_id,
2869 			    struct fnhe_hash_bucket *bucket, int genid,
2870 			    int *fa_index, int fa_start, unsigned int flags)
2871 {
2872 	int i;
2873 
2874 	for (i = 0; i < FNHE_HASH_SIZE; i++) {
2875 		struct fib_nh_exception *fnhe;
2876 
2877 		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2878 		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2879 			struct rtable *rt;
2880 			int err;
2881 
2882 			if (*fa_index < fa_start)
2883 				goto next;
2884 
2885 			if (fnhe->fnhe_genid != genid)
2886 				goto next;
2887 
2888 			if (fnhe->fnhe_expires &&
2889 			    time_after(jiffies, fnhe->fnhe_expires))
2890 				goto next;
2891 
2892 			rt = rcu_dereference(fnhe->fnhe_rth_input);
2893 			if (!rt)
2894 				rt = rcu_dereference(fnhe->fnhe_rth_output);
2895 			if (!rt)
2896 				goto next;
2897 
2898 			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2899 					   table_id, NULL, skb,
2900 					   NETLINK_CB(cb->skb).portid,
2901 					   cb->nlh->nlmsg_seq, flags);
2902 			if (err)
2903 				return err;
2904 next:
2905 			(*fa_index)++;
2906 		}
2907 	}
2908 
2909 	return 0;
2910 }
2911 
2912 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2913 		       u32 table_id, struct fib_info *fi,
2914 		       int *fa_index, int fa_start, unsigned int flags)
2915 {
2916 	struct net *net = sock_net(cb->skb->sk);
2917 	int nhsel, genid = fnhe_genid(net);
2918 
2919 	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
2920 		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
2921 		struct fnhe_hash_bucket *bucket;
2922 		int err;
2923 
2924 		if (nhc->nhc_flags & RTNH_F_DEAD)
2925 			continue;
2926 
2927 		rcu_read_lock();
2928 		bucket = rcu_dereference(nhc->nhc_exceptions);
2929 		err = 0;
2930 		if (bucket)
2931 			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
2932 					       genid, fa_index, fa_start,
2933 					       flags);
2934 		rcu_read_unlock();
2935 		if (err)
2936 			return err;
2937 	}
2938 
2939 	return 0;
2940 }
2941 
2942 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2943 						   u8 ip_proto, __be16 sport,
2944 						   __be16 dport)
2945 {
2946 	struct sk_buff *skb;
2947 	struct iphdr *iph;
2948 
2949 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2950 	if (!skb)
2951 		return NULL;
2952 
2953 	/* Reserve room for dummy headers; this skb can pass
2954 	 * through a good chunk of the routing engine.
2955 	 */
2956 	skb_reset_mac_header(skb);
2957 	skb_reset_network_header(skb);
2958 	skb->protocol = htons(ETH_P_IP);
2959 	iph = skb_put(skb, sizeof(struct iphdr));
2960 	iph->protocol = ip_proto;
2961 	iph->saddr = src;
2962 	iph->daddr = dst;
2963 	iph->version = 0x4;
2964 	iph->frag_off = 0;
2965 	iph->ihl = 0x5;
2966 	skb_set_transport_header(skb, skb->len);
2967 
2968 	switch (iph->protocol) {
2969 	case IPPROTO_UDP: {
2970 		struct udphdr *udph;
2971 
2972 		udph = skb_put_zero(skb, sizeof(struct udphdr));
2973 		udph->source = sport;
2974 		udph->dest = dport;
2975 		udph->len = sizeof(struct udphdr);
2976 		udph->check = 0;
2977 		break;
2978 	}
2979 	case IPPROTO_TCP: {
2980 		struct tcphdr *tcph;
2981 
2982 		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2983 		tcph->source	= sport;
2984 		tcph->dest	= dport;
2985 		tcph->doff	= sizeof(struct tcphdr) / 4;
2986 		tcph->rst = 1;
2987 		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2988 					    src, dst, 0);
2989 		break;
2990 	}
2991 	case IPPROTO_ICMP: {
2992 		struct icmphdr *icmph;
2993 
2994 		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2995 		icmph->type = ICMP_ECHO;
2996 		icmph->code = 0;
2997 	}
2998 	}
2999 
3000 	return skb;
3001 }
3002 
3003 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3004 				       const struct nlmsghdr *nlh,
3005 				       struct nlattr **tb,
3006 				       struct netlink_ext_ack *extack)
3007 {
3008 	struct rtmsg *rtm;
3009 	int i, err;
3010 
3011 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3012 		NL_SET_ERR_MSG(extack,
3013 			       "ipv4: Invalid header for route get request");
3014 		return -EINVAL;
3015 	}
3016 
3017 	if (!netlink_strict_get_check(skb))
3018 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3019 					      rtm_ipv4_policy, extack);
3020 
3021 	rtm = nlmsg_data(nlh);
3022 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3023 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3024 	    rtm->rtm_table || rtm->rtm_protocol ||
3025 	    rtm->rtm_scope || rtm->rtm_type) {
3026 		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3027 		return -EINVAL;
3028 	}
3029 
3030 	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3031 			       RTM_F_LOOKUP_TABLE |
3032 			       RTM_F_FIB_MATCH)) {
3033 		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3034 		return -EINVAL;
3035 	}
3036 
3037 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3038 					    rtm_ipv4_policy, extack);
3039 	if (err)
3040 		return err;
3041 
3042 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3043 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3044 		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3045 		return -EINVAL;
3046 	}
3047 
3048 	for (i = 0; i <= RTA_MAX; i++) {
3049 		if (!tb[i])
3050 			continue;
3051 
3052 		switch (i) {
3053 		case RTA_IIF:
3054 		case RTA_OIF:
3055 		case RTA_SRC:
3056 		case RTA_DST:
3057 		case RTA_IP_PROTO:
3058 		case RTA_SPORT:
3059 		case RTA_DPORT:
3060 		case RTA_MARK:
3061 		case RTA_UID:
3062 			break;
3063 		default:
3064 			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3065 			return -EINVAL;
3066 		}
3067 	}
3068 
3069 	return 0;
3070 }
3071 
3072 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3073 			     struct netlink_ext_ack *extack)
3074 {
3075 	struct net *net = sock_net(in_skb->sk);
3076 	struct nlattr *tb[RTA_MAX+1];
3077 	u32 table_id = RT_TABLE_MAIN;
3078 	__be16 sport = 0, dport = 0;
3079 	struct fib_result res = {};
3080 	u8 ip_proto = IPPROTO_UDP;
3081 	struct rtable *rt = NULL;
3082 	struct sk_buff *skb;
3083 	struct rtmsg *rtm;
3084 	struct flowi4 fl4 = {};
3085 	__be32 dst = 0;
3086 	__be32 src = 0;
3087 	kuid_t uid;
3088 	u32 iif;
3089 	int err;
3090 	int mark;
3091 
3092 	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3093 	if (err < 0)
3094 		return err;
3095 
3096 	rtm = nlmsg_data(nlh);
3097 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3098 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3099 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3100 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3101 	if (tb[RTA_UID])
3102 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3103 	else
3104 		uid = (iif ? INVALID_UID : current_uid());
3105 
3106 	if (tb[RTA_IP_PROTO]) {
3107 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3108 						  &ip_proto, AF_INET, extack);
3109 		if (err)
3110 			return err;
3111 	}
3112 
3113 	if (tb[RTA_SPORT])
3114 		sport = nla_get_be16(tb[RTA_SPORT]);
3115 
3116 	if (tb[RTA_DPORT])
3117 		dport = nla_get_be16(tb[RTA_DPORT]);
3118 
3119 	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3120 	if (!skb)
3121 		return -ENOBUFS;
3122 
3123 	fl4.daddr = dst;
3124 	fl4.saddr = src;
3125 	fl4.flowi4_tos = rtm->rtm_tos;
3126 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3127 	fl4.flowi4_mark = mark;
3128 	fl4.flowi4_uid = uid;
3129 	if (sport)
3130 		fl4.fl4_sport = sport;
3131 	if (dport)
3132 		fl4.fl4_dport = dport;
3133 	fl4.flowi4_proto = ip_proto;
3134 
3135 	rcu_read_lock();
3136 
3137 	if (iif) {
3138 		struct net_device *dev;
3139 
3140 		dev = dev_get_by_index_rcu(net, iif);
3141 		if (!dev) {
3142 			err = -ENODEV;
3143 			goto errout_rcu;
3144 		}
3145 
3146 		fl4.flowi4_iif = iif; /* for rt_fill_info */
3147 		skb->dev	= dev;
3148 		skb->mark	= mark;
3149 		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3150 					 dev, &res);
3151 
3152 		rt = skb_rtable(skb);
3153 		if (err == 0 && rt->dst.error)
3154 			err = -rt->dst.error;
3155 	} else {
3156 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3157 		skb->dev = net->loopback_dev;
3158 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3159 		err = 0;
3160 		if (IS_ERR(rt))
3161 			err = PTR_ERR(rt);
3162 		else
3163 			skb_dst_set(skb, &rt->dst);
3164 	}
3165 
3166 	if (err)
3167 		goto errout_rcu;
3168 
3169 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3170 		rt->rt_flags |= RTCF_NOTIFY;
3171 
3172 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3173 		table_id = res.table ? res.table->tb_id : 0;
3174 
3175 	/* reset skb for netlink reply msg */
3176 	skb_trim(skb, 0);
3177 	skb_reset_network_header(skb);
3178 	skb_reset_transport_header(skb);
3179 	skb_reset_mac_header(skb);
3180 
3181 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3182 		if (!res.fi) {
3183 			err = fib_props[res.type].error;
3184 			if (!err)
3185 				err = -EHOSTUNREACH;
3186 			goto errout_rcu;
3187 		}
3188 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3189 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3190 				    rt->rt_type, res.prefix, res.prefixlen,
3191 				    fl4.flowi4_tos, res.fi, 0);
3192 	} else {
3193 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3194 				   NETLINK_CB(in_skb).portid,
3195 				   nlh->nlmsg_seq, 0);
3196 	}
3197 	if (err < 0)
3198 		goto errout_rcu;
3199 
3200 	rcu_read_unlock();
3201 
3202 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3203 
3204 errout_free:
3205 	return err;
3206 errout_rcu:
3207 	rcu_read_unlock();
3208 	kfree_skb(skb);
3209 	goto errout_free;
3210 }
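/* Userspace counterpart (illustrative): the RTM_GETROUTE requests handled
 * above are what iproute2 issues for commands such as
 *
 *	ip route get 192.0.2.1 from 198.51.100.7 iif eth0
 *	ip route get 192.0.2.1 fibmatch
 *
 * where "fibmatch" sets RTM_F_FIB_MATCH and the addresses/interface are
 * placeholders.  The attributes accepted by inet_rtm_valid_getroute_req()
 * (RTA_SRC, RTA_DST, RTA_IIF, RTA_OIF, RTA_SPORT, RTA_DPORT, RTA_IP_PROTO,
 * RTA_MARK, RTA_UID) map to the corresponding command-line options.
 */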
3211 
3212 void ip_rt_multicast_event(struct in_device *in_dev)
3213 {
3214 	rt_cache_flush(dev_net(in_dev->dev));
3215 }
3216 
3217 #ifdef CONFIG_SYSCTL
3218 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3219 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3220 static int ip_rt_gc_elasticity __read_mostly	= 8;
3221 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3222 
3223 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3224 					void __user *buffer,
3225 					size_t *lenp, loff_t *ppos)
3226 {
3227 	struct net *net = (struct net *)__ctl->extra1;
3228 
3229 	if (write) {
3230 		rt_cache_flush(net);
3231 		fnhe_genid_bump(net);
3232 		return 0;
3233 	}
3234 
3235 	return -EINVAL;
3236 }
3237 
3238 static struct ctl_table ipv4_route_table[] = {
3239 	{
3240 		.procname	= "gc_thresh",
3241 		.data		= &ipv4_dst_ops.gc_thresh,
3242 		.maxlen		= sizeof(int),
3243 		.mode		= 0644,
3244 		.proc_handler	= proc_dointvec,
3245 	},
3246 	{
3247 		.procname	= "max_size",
3248 		.data		= &ip_rt_max_size,
3249 		.maxlen		= sizeof(int),
3250 		.mode		= 0644,
3251 		.proc_handler	= proc_dointvec,
3252 	},
3253 	{
3254 		/*  Deprecated. Use gc_min_interval_ms */
3255 
3256 		.procname	= "gc_min_interval",
3257 		.data		= &ip_rt_gc_min_interval,
3258 		.maxlen		= sizeof(int),
3259 		.mode		= 0644,
3260 		.proc_handler	= proc_dointvec_jiffies,
3261 	},
3262 	{
3263 		.procname	= "gc_min_interval_ms",
3264 		.data		= &ip_rt_gc_min_interval,
3265 		.maxlen		= sizeof(int),
3266 		.mode		= 0644,
3267 		.proc_handler	= proc_dointvec_ms_jiffies,
3268 	},
3269 	{
3270 		.procname	= "gc_timeout",
3271 		.data		= &ip_rt_gc_timeout,
3272 		.maxlen		= sizeof(int),
3273 		.mode		= 0644,
3274 		.proc_handler	= proc_dointvec_jiffies,
3275 	},
3276 	{
3277 		.procname	= "gc_interval",
3278 		.data		= &ip_rt_gc_interval,
3279 		.maxlen		= sizeof(int),
3280 		.mode		= 0644,
3281 		.proc_handler	= proc_dointvec_jiffies,
3282 	},
3283 	{
3284 		.procname	= "redirect_load",
3285 		.data		= &ip_rt_redirect_load,
3286 		.maxlen		= sizeof(int),
3287 		.mode		= 0644,
3288 		.proc_handler	= proc_dointvec,
3289 	},
3290 	{
3291 		.procname	= "redirect_number",
3292 		.data		= &ip_rt_redirect_number,
3293 		.maxlen		= sizeof(int),
3294 		.mode		= 0644,
3295 		.proc_handler	= proc_dointvec,
3296 	},
3297 	{
3298 		.procname	= "redirect_silence",
3299 		.data		= &ip_rt_redirect_silence,
3300 		.maxlen		= sizeof(int),
3301 		.mode		= 0644,
3302 		.proc_handler	= proc_dointvec,
3303 	},
3304 	{
3305 		.procname	= "error_cost",
3306 		.data		= &ip_rt_error_cost,
3307 		.maxlen		= sizeof(int),
3308 		.mode		= 0644,
3309 		.proc_handler	= proc_dointvec,
3310 	},
3311 	{
3312 		.procname	= "error_burst",
3313 		.data		= &ip_rt_error_burst,
3314 		.maxlen		= sizeof(int),
3315 		.mode		= 0644,
3316 		.proc_handler	= proc_dointvec,
3317 	},
3318 	{
3319 		.procname	= "gc_elasticity",
3320 		.data		= &ip_rt_gc_elasticity,
3321 		.maxlen		= sizeof(int),
3322 		.mode		= 0644,
3323 		.proc_handler	= proc_dointvec,
3324 	},
3325 	{
3326 		.procname	= "mtu_expires",
3327 		.data		= &ip_rt_mtu_expires,
3328 		.maxlen		= sizeof(int),
3329 		.mode		= 0644,
3330 		.proc_handler	= proc_dointvec_jiffies,
3331 	},
3332 	{
3333 		.procname	= "min_pmtu",
3334 		.data		= &ip_rt_min_pmtu,
3335 		.maxlen		= sizeof(int),
3336 		.mode		= 0644,
3337 		.proc_handler	= proc_dointvec_minmax,
3338 		.extra1		= &ip_min_valid_pmtu,
3339 	},
3340 	{
3341 		.procname	= "min_adv_mss",
3342 		.data		= &ip_rt_min_advmss,
3343 		.maxlen		= sizeof(int),
3344 		.mode		= 0644,
3345 		.proc_handler	= proc_dointvec,
3346 	},
3347 	{ }
3348 };
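/* These knobs are registered under "net/ipv4/route" (see
 * sysctl_route_net_init() and ip_static_sysctl_init() below), so they appear
 * to userspace roughly as follows; values are purely illustrative:
 *
 *	sysctl net.ipv4.route.gc_timeout
 *	sysctl -w net.ipv4.route.min_pmtu=552
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * Writing to "flush" (the per-netns table below) invokes
 * ipv4_sysctl_rtcache_flush().
 */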
3349 
3350 static const char ipv4_route_flush_procname[] = "flush";
3351 
3352 static struct ctl_table ipv4_route_flush_table[] = {
3353 	{
3354 		.procname	= ipv4_route_flush_procname,
3355 		.maxlen		= sizeof(int),
3356 		.mode		= 0200,
3357 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3358 	},
3359 	{ },
3360 };
3361 
3362 static __net_init int sysctl_route_net_init(struct net *net)
3363 {
3364 	struct ctl_table *tbl;
3365 
3366 	tbl = ipv4_route_flush_table;
3367 	if (!net_eq(net, &init_net)) {
3368 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3369 		if (!tbl)
3370 			goto err_dup;
3371 
3372 		/* Don't export non-whitelisted sysctls to unprivileged users */
3373 		if (net->user_ns != &init_user_ns) {
3374 			if (tbl[0].procname != ipv4_route_flush_procname)
3375 				tbl[0].procname = NULL;
3376 		}
3377 	}
3378 	tbl[0].extra1 = net;
3379 
3380 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3381 	if (!net->ipv4.route_hdr)
3382 		goto err_reg;
3383 	return 0;
3384 
3385 err_reg:
3386 	if (tbl != ipv4_route_flush_table)
3387 		kfree(tbl);
3388 err_dup:
3389 	return -ENOMEM;
3390 }
3391 
3392 static __net_exit void sysctl_route_net_exit(struct net *net)
3393 {
3394 	struct ctl_table *tbl;
3395 
3396 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3397 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3398 	BUG_ON(tbl == ipv4_route_flush_table);
3399 	kfree(tbl);
3400 }
3401 
3402 static __net_initdata struct pernet_operations sysctl_route_ops = {
3403 	.init = sysctl_route_net_init,
3404 	.exit = sysctl_route_net_exit,
3405 };
3406 #endif
3407 
3408 static __net_init int rt_genid_init(struct net *net)
3409 {
3410 	atomic_set(&net->ipv4.rt_genid, 0);
3411 	atomic_set(&net->fnhe_genid, 0);
3412 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3413 	return 0;
3414 }
3415 
3416 static __net_initdata struct pernet_operations rt_genid_ops = {
3417 	.init = rt_genid_init,
3418 };
3419 
3420 static int __net_init ipv4_inetpeer_init(struct net *net)
3421 {
3422 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3423 
3424 	if (!bp)
3425 		return -ENOMEM;
3426 	inet_peer_base_init(bp);
3427 	net->ipv4.peers = bp;
3428 	return 0;
3429 }
3430 
3431 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3432 {
3433 	struct inet_peer_base *bp = net->ipv4.peers;
3434 
3435 	net->ipv4.peers = NULL;
3436 	inetpeer_invalidate_tree(bp);
3437 	kfree(bp);
3438 }
3439 
3440 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3441 	.init	=	ipv4_inetpeer_init,
3442 	.exit	=	ipv4_inetpeer_exit,
3443 };
3444 
3445 #ifdef CONFIG_IP_ROUTE_CLASSID
3446 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3447 #endif /* CONFIG_IP_ROUTE_CLASSID */
3448 
3449 int __init ip_rt_init(void)
3450 {
3451 	int cpu;
3452 
3453 	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3454 				  GFP_KERNEL);
3455 	if (!ip_idents)
3456 		panic("IP: failed to allocate ip_idents\n");
3457 
3458 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3459 
3460 	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3461 	if (!ip_tstamps)
3462 		panic("IP: failed to allocate ip_tstamps\n");
3463 
3464 	for_each_possible_cpu(cpu) {
3465 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3466 
3467 		INIT_LIST_HEAD(&ul->head);
3468 		spin_lock_init(&ul->lock);
3469 	}
3470 #ifdef CONFIG_IP_ROUTE_CLASSID
3471 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3472 	if (!ip_rt_acct)
3473 		panic("IP: failed to allocate ip_rt_acct\n");
3474 #endif
3475 
3476 	ipv4_dst_ops.kmem_cachep =
3477 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3478 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3479 
3480 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3481 
3482 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3483 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3484 
3485 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3486 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3487 
3488 	ipv4_dst_ops.gc_thresh = ~0;
3489 	ip_rt_max_size = INT_MAX;
3490 
3491 	devinet_init();
3492 	ip_fib_init();
3493 
3494 	if (ip_rt_proc_init())
3495 		pr_err("Unable to create route proc files\n");
3496 #ifdef CONFIG_XFRM
3497 	xfrm_init();
3498 	xfrm4_init();
3499 #endif
3500 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3501 		      RTNL_FLAG_DOIT_UNLOCKED);
3502 
3503 #ifdef CONFIG_SYSCTL
3504 	register_pernet_subsys(&sysctl_route_ops);
3505 #endif
3506 	register_pernet_subsys(&rt_genid_ops);
3507 	register_pernet_subsys(&ipv4_inetpeer_ops);
3508 	return 0;
3509 }
3510 
3511 #ifdef CONFIG_SYSCTL
3512 /*
3513  * We really need to sanitize the damn ipv4 init order, then all
3514  * this nonsense will go away.
3515  */
3516 void __init ip_static_sysctl_init(void)
3517 {
3518 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3519 }
3520 #endif
3521