xref: /openbmc/linux/net/ipv4/route.c (revision 8dde5715)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		ROUTE - implementation of the IP router.
8  *
9  * Authors:	Ross Biro
10  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *		Alan Cox	:	Verify area fixes.
17  *		Alan Cox	:	cli() protects routing changes
18  *		Rui Oliveira	:	ICMP routing table updates
19  *		(rco@di.uminho.pt)	Routing table insertion and update
20  *		Linus Torvalds	:	Rewrote bits to be sensible
21  *		Alan Cox	:	Added BSD route gw semantics
22  *		Alan Cox	:	Super /proc >4K
23  *		Alan Cox	:	MTU in route table
24  *		Alan Cox	: 	MSS actually. Also added the window
25  *					clamper.
26  *		Sam Lantinga	:	Fixed route matching in rt_del()
27  *		Alan Cox	:	Routing cache support.
28  *		Alan Cox	:	Removed compatibility cruft.
29  *		Alan Cox	:	RTF_REJECT support.
30  *		Alan Cox	:	TCP irtt support.
31  *		Jonathan Naylor	:	Added Metric support.
32  *	Miquel van Smoorenburg	:	BSD API fixes.
33  *	Miquel van Smoorenburg	:	Metrics.
34  *		Alan Cox	:	Use __u32 properly
35  *		Alan Cox	:	Aligned routing errors more closely with BSD
36  *					our system is still very different.
37  *		Alan Cox	:	Faster /proc handling
38  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39  *					routing caches and better behaviour.
40  *
41  *		Olaf Erb	:	irtt wasn't being copied right.
42  *		Bjorn Ekwall	:	Kerneld route support.
43  *		Alan Cox	:	Multicast fixed (I hope)
44  * 		Pavel Krauz	:	Limited broadcast fixed
45  *		Mike McLagan	:	Routing by source
46  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47  *					route.c and rewritten from scratch.
48  *		Andi Kleen	:	Load-limit warning messages.
49  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53  *		Marc Boucher	:	routing by fwmark
54  *	Robert Olsson		:	Added rt_cache statistics
55  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
59  */
60 
61 #define pr_fmt(fmt) "IPv4: " fmt
62 
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/string.h>
70 #include <linux/socket.h>
71 #include <linux/sockios.h>
72 #include <linux/errno.h>
73 #include <linux/in.h>
74 #include <linux/inet.h>
75 #include <linux/netdevice.h>
76 #include <linux/proc_fs.h>
77 #include <linux/init.h>
78 #include <linux/skbuff.h>
79 #include <linux/inetdevice.h>
80 #include <linux/igmp.h>
81 #include <linux/pkt_sched.h>
82 #include <linux/mroute.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/random.h>
85 #include <linux/rcupdate.h>
86 #include <linux/times.h>
87 #include <linux/slab.h>
88 #include <linux/jhash.h>
89 #include <net/dst.h>
90 #include <net/dst_metadata.h>
91 #include <net/net_namespace.h>
92 #include <net/protocol.h>
93 #include <net/ip.h>
94 #include <net/route.h>
95 #include <net/inetpeer.h>
96 #include <net/sock.h>
97 #include <net/ip_fib.h>
98 #include <net/arp.h>
99 #include <net/tcp.h>
100 #include <net/icmp.h>
101 #include <net/xfrm.h>
102 #include <net/lwtunnel.h>
103 #include <net/netevent.h>
104 #include <net/rtnetlink.h>
105 #ifdef CONFIG_SYSCTL
106 #include <linux/sysctl.h>
107 #endif
108 #include <net/secure_seq.h>
109 #include <net/ip_tunnels.h>
110 #include <net/l3mdev.h>
111 
112 #include "fib_lookup.h"
113 
/* Extract the routing-relevant TOS bits (plus the RTO_ONLINK flag) from a
 * flowi4 key.
 */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Default value of ip_rt_gc_timeout, in jiffies (300 seconds). */
#define RT_GC_TIMEOUT (300*HZ)

/* Tunables of the IPv4 routing code.  Time values are in jiffies. */
static int ip_rt_max_size;	/* NOTE(review): not referenced in this part of the file */
/* Redirects sent to one peer before ip_rt_send_redirect() gives up. */
static int ip_rt_redirect_number __read_mostly	= 9;
/* Base interval of the redirect backoff in ip_rt_send_redirect(). */
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
/* Quiet period after which ip_rt_send_redirect() resets its per-peer state. */
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
/* Tokens consumed by ip_error() for each ICMP error it sends. */
static int ip_rt_error_cost __read_mostly	= HZ;
/* Upper bound on the tokens a peer may accumulate in ip_error(). */
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
/* Lifetime given to a learned path MTU by __ip_rt_update_pmtu(). */
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
/* Reported MTUs below this are clamped and locked by __ip_rt_update_pmtu(). */
static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;	/* NOTE(review): user is outside this part of the file */

/* Lifetime given to a redirect-learned exception by __ip_do_redirect(). */
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
130 
131 /*
132  *	Interface to generic destination cache.
133  */
134 
/* Forward declarations of the callbacks wired into ipv4_dst_ops below. */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		ipv4_dst_destroy(struct dst_entry *dst);
145 
/* ->cow_metrics() callback for IPv4 dsts.
 *
 * This callback is not expected to be reached: it emits a kernel warning
 * and reports failure by returning NULL.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
151 
/* Neighbour callbacks; defined further down, declared here for the table. */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

/* Operations table binding the IPv4 routing code to the generic
 * destination cache (AF_INET family).
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};
172 
/* Expands to the plain TC_PRIO_<class> constant; used to tag the
 * odd-indexed entries of the table below.
 */
#define ECN_OR_COST(class)	TC_PRIO_##class

/* 16-entry lookup table yielding a TC_PRIO_* packet priority.  Entries come
 * in pairs: each even index and the odd index that follows it map to the
 * same priority class.
 * NOTE(review): the index is presumably derived from the IP TOS field by the
 * users of this table, which live outside this file.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
194 
/* Per-CPU routing statistics, dumped by rt_cpu_seq_show(). */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Bump one counter of the current CPU's rt_cache_stat. */
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
197 
198 #ifdef CONFIG_PROC_FS
199 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
200 {
201 	if (*pos)
202 		return NULL;
203 	return SEQ_START_TOKEN;
204 }
205 
206 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
207 {
208 	++*pos;
209 	return NULL;
210 }
211 
/* seq_file ->stop() for the "rt_cache" proc file; nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
215 
216 static int rt_cache_seq_show(struct seq_file *seq, void *v)
217 {
218 	if (v == SEQ_START_TOKEN)
219 		seq_printf(seq, "%-127s\n",
220 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
221 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
222 			   "HHUptod\tSpecDst");
223 	return 0;
224 }
225 
/* Iterator callbacks of the header-only "rt_cache" proc file. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

/* ->open(): attach the rt_cache_seq_ops iterator to @file. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

/* File operations registered for "rt_cache" by ip_rt_do_proc_init(). */
static const struct file_operations rt_cache_seq_fops = {
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
244 
245 
246 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
247 {
248 	int cpu;
249 
250 	if (*pos == 0)
251 		return SEQ_START_TOKEN;
252 
253 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
254 		if (!cpu_possible(cpu))
255 			continue;
256 		*pos = cpu+1;
257 		return &per_cpu(rt_cache_stat, cpu);
258 	}
259 	return NULL;
260 }
261 
262 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
263 {
264 	int cpu;
265 
266 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
267 		if (!cpu_possible(cpu))
268 			continue;
269 		*pos = cpu+1;
270 		return &per_cpu(rt_cache_stat, cpu);
271 	}
272 	return NULL;
273 
274 }
275 
/* seq_file ->stop() for the per-CPU statistics file; nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
280 
281 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
282 {
283 	struct rt_cache_stat *st = v;
284 
285 	if (v == SEQ_START_TOKEN) {
286 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
287 		return 0;
288 	}
289 
290 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
291 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
292 		   dst_entries_get_slow(&ipv4_dst_ops),
293 		   0, /* st->in_hit */
294 		   st->in_slow_tot,
295 		   st->in_slow_mc,
296 		   st->in_no_route,
297 		   st->in_brd,
298 		   st->in_martian_dst,
299 		   st->in_martian_src,
300 
301 		   0, /* st->out_hit */
302 		   st->out_slow_tot,
303 		   st->out_slow_mc,
304 
305 		   0, /* st->gc_total */
306 		   0, /* st->gc_ignored */
307 		   0, /* st->gc_goal_miss */
308 		   0, /* st->gc_dst_overflow */
309 		   0, /* st->in_hlist_search */
310 		   0  /* st->out_hlist_search */
311 		);
312 	return 0;
313 }
314 
/* Iterator callbacks of the per-CPU statistics proc file. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


/* ->open(): attach the rt_cpu_seq_ops iterator to @file. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

/* File operations registered under proc_net_stat by ip_rt_do_proc_init(). */
static const struct file_operations rt_cpu_seq_fops = {
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
334 
335 #ifdef CONFIG_IP_ROUTE_CLASSID
336 static int rt_acct_proc_show(struct seq_file *m, void *v)
337 {
338 	struct ip_rt_acct *dst, *src;
339 	unsigned int i, j;
340 
341 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
342 	if (!dst)
343 		return -ENOMEM;
344 
345 	for_each_possible_cpu(i) {
346 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
347 		for (j = 0; j < 256; j++) {
348 			dst[j].o_bytes   += src[j].o_bytes;
349 			dst[j].o_packets += src[j].o_packets;
350 			dst[j].i_bytes   += src[j].i_bytes;
351 			dst[j].i_packets += src[j].i_packets;
352 		}
353 	}
354 
355 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
356 	kfree(dst);
357 	return 0;
358 }
359 #endif
360 
361 static int __net_init ip_rt_do_proc_init(struct net *net)
362 {
363 	struct proc_dir_entry *pde;
364 
365 	pde = proc_create("rt_cache", 0444, net->proc_net,
366 			  &rt_cache_seq_fops);
367 	if (!pde)
368 		goto err1;
369 
370 	pde = proc_create("rt_cache", 0444,
371 			  net->proc_net_stat, &rt_cpu_seq_fops);
372 	if (!pde)
373 		goto err2;
374 
375 #ifdef CONFIG_IP_ROUTE_CLASSID
376 	pde = proc_create_single("rt_acct", 0, net->proc_net,
377 			rt_acct_proc_show);
378 	if (!pde)
379 		goto err3;
380 #endif
381 	return 0;
382 
383 #ifdef CONFIG_IP_ROUTE_CLASSID
384 err3:
385 	remove_proc_entry("rt_cache", net->proc_net_stat);
386 #endif
387 err2:
388 	remove_proc_entry("rt_cache", net->proc_net);
389 err1:
390 	return -ENOMEM;
391 }
392 
/* Per-namespace exit: remove the proc entries that ip_rt_do_proc_init()
 * created for @net.
 */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
401 
/* Per-network-namespace hooks for the routing proc entries. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

/* Register the proc hooks above; returns the result of
 * register_pernet_subsys().
 */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
411 
412 #else
/* Stub used when CONFIG_PROC_FS is disabled: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
417 #endif /* CONFIG_PROC_FS */
418 
419 static inline bool rt_is_expired(const struct rtable *rth)
420 {
421 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
422 }
423 
/* Bump the IPv4 route generation id of @net.  Routes carrying the previous
 * id are then reported as expired by rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
428 
429 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
430 					   struct sk_buff *skb,
431 					   const void *daddr)
432 {
433 	const struct rtable *rt = container_of(dst, struct rtable, dst);
434 	struct net_device *dev = dst->dev;
435 	struct neighbour *n;
436 
437 	rcu_read_lock_bh();
438 
439 	if (likely(rt->rt_gw_family == AF_INET)) {
440 		n = ip_neigh_gw4(dev, rt->rt_gw4);
441 	} else if (rt->rt_gw_family == AF_INET6) {
442 		n = ip_neigh_gw6(dev, &rt->rt_gw6);
443         } else {
444 		__be32 pkey;
445 
446 		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
447 		n = ip_neigh_gw4(dev, pkey);
448 	}
449 
450 	if (n && !refcount_inc_not_zero(&n->refcnt))
451 		n = NULL;
452 
453 	rcu_read_unlock_bh();
454 
455 	return n;
456 }
457 
458 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
459 {
460 	const struct rtable *rt = container_of(dst, struct rtable, dst);
461 	struct net_device *dev = dst->dev;
462 	const __be32 *pkey = daddr;
463 
464 	if (rt->rt_gw_family == AF_INET) {
465 		pkey = (const __be32 *)&rt->rt_gw4;
466 	} else if (rt->rt_gw_family == AF_INET6) {
467 		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
468 	} else if (!daddr ||
469 		 (rt->rt_flags &
470 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
471 		return;
472 	}
473 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
474 }
475 
/* Number of slots in each of the two arrays below. */
#define IP_IDENTS_SZ 2048u

/* IP ID generators, one per slot; a slot is picked by hash % IP_IDENTS_SZ. */
static atomic_t *ip_idents __read_mostly;
/* Per-slot jiffies stamp of the last use of the matching generator. */
static u32 *ip_tstamps __read_mostly;
480 
/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 *
 * @hash: selects the generator slot (hash % IP_IDENTS_SZ)
 * @segs: number of consecutive identifiers to reserve
 *
 * Returns the first identifier of the reserved range.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	/* Only the caller that wins the timestamp update applies a random
	 * skip, bounded by the number of jiffies the slot sat unused.
	 */
	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	/* 'new' is one past the reserved range; step back to its start. */
	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
505 
506 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
507 {
508 	u32 hash, id;
509 
510 	/* Note the following code is not safe, but this is okay. */
511 	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
512 		get_random_bytes(&net->ipv4.ip_id_key,
513 				 sizeof(net->ipv4.ip_id_key));
514 
515 	hash = siphash_3u32((__force u32)iph->daddr,
516 			    (__force u32)iph->saddr,
517 			    iph->protocol,
518 			    &net->ipv4.ip_id_key);
519 	id = ip_idents_reserve(hash, segs);
520 	iph->id = htons(id);
521 }
522 EXPORT_SYMBOL(__ip_select_ident);
523 
/* Fill @fl4 with an output flow key for the addresses found in @iph.
 *
 * @net:        namespace, used to resolve the flow's uid
 * @fl4:        flow key to initialise
 * @sk:         optional socket; when set, its bound device, mark, TOS flags
 *              and protocol replace @oif, @mark, @tos and @prot
 * @iph:        header supplying destination and source address
 * @oif:        output interface index (used when @sk is NULL)
 * @tos:        type of service (used when @sk is NULL)
 * @prot:       protocol (used when @sk is NULL)
 * @mark:       packet mark (used when @sk is NULL)
 * @flow_flags: flags passed through to the flow key
 */
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		/* Sockets building their own IP header are keyed as raw. */
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}
544 
545 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
546 			       const struct sock *sk)
547 {
548 	const struct net *net = dev_net(skb->dev);
549 	const struct iphdr *iph = ip_hdr(skb);
550 	int oif = skb->dev->ifindex;
551 	u8 tos = RT_TOS(iph->tos);
552 	u8 prot = iph->protocol;
553 	u32 mark = skb->mark;
554 
555 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
556 }
557 
558 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
559 {
560 	const struct inet_sock *inet = inet_sk(sk);
561 	const struct ip_options_rcu *inet_opt;
562 	__be32 daddr = inet->inet_daddr;
563 
564 	rcu_read_lock();
565 	inet_opt = rcu_dereference(inet->inet_opt);
566 	if (inet_opt && inet_opt->opt.srr)
567 		daddr = inet_opt->opt.faddr;
568 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
569 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
570 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
571 			   inet_sk_flowi_flags(sk),
572 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
573 	rcu_read_unlock();
574 }
575 
/* Build the flow key @fl4 from @skb when a packet is available, otherwise
 * from the socket @sk alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}
584 
/* Taken by update_or_create_fnhe() around all changes to the next-hop
 * exception hash.
 */
static DEFINE_SPINLOCK(fnhe_lock);
586 
587 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
588 {
589 	struct rtable *rt;
590 
591 	rt = rcu_dereference(fnhe->fnhe_rth_input);
592 	if (rt) {
593 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
594 		dst_dev_put(&rt->dst);
595 		dst_release(&rt->dst);
596 	}
597 	rt = rcu_dereference(fnhe->fnhe_rth_output);
598 	if (rt) {
599 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
600 		dst_dev_put(&rt->dst);
601 		dst_release(&rt->dst);
602 	}
603 }
604 
605 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
606 {
607 	struct fib_nh_exception *fnhe, *oldest;
608 
609 	oldest = rcu_dereference(hash->chain);
610 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
611 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
612 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
613 			oldest = fnhe;
614 	}
615 	fnhe_flush_routes(oldest);
616 	return oldest;
617 }
618 
619 static inline u32 fnhe_hashfun(__be32 daddr)
620 {
621 	static u32 fnhe_hashrnd __read_mostly;
622 	u32 hval;
623 
624 	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
625 	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
626 	return hash_32(hval, FNHE_HASH_SHIFT);
627 }
628 
629 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
630 {
631 	rt->rt_pmtu = fnhe->fnhe_pmtu;
632 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
633 	rt->dst.expires = fnhe->fnhe_expires;
634 
635 	if (fnhe->fnhe_gw) {
636 		rt->rt_flags |= RTCF_REDIRECTED;
637 		rt->rt_gw_family = AF_INET;
638 		rt->rt_gw4 = fnhe->fnhe_gw;
639 	}
640 }
641 
/* Record a next-hop exception (redirect gateway and/or path MTU) for the
 * destination @daddr on the next hop @nhc.
 *
 * @nhc:     next hop owning the exception hash
 * @daddr:   destination the exception applies to
 * @gw:      new gateway, or 0 to leave an existing gateway untouched
 * @pmtu:    new path MTU, or 0 to leave an existing MTU untouched
 * @lock:    MTU lock flag stored together with @pmtu
 * @expires: expiry time in jiffies; stored as at least 1
 *
 * Runs entirely under fnhe_lock with bottom halves disabled.  Allocation
 * failures (GFP_ATOMIC) are silent: the exception is simply not recorded.
 */
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
				  __be32 gw, u32 pmtu, bool lock,
				  unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nhc->nhc_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	/* The hash table is allocated lazily on the first exception. */
	hash = rcu_dereference(nhc->nhc_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nhc->nhc_exceptions, hash);
	}

	hash += hval;

	/* Look for an existing entry, counting the entries walked past. */
	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		/* Chain too long: recycle its oldest entry instead of
		 * growing it further.
		 */
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			/* Link the new entry at the head of the chain. */
			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nhc->nhc_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	/* Refresh the stamp that fnhe_oldest() compares. */
	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}
733 
/* Process a received ICMP redirect.
 *
 * @rt:         route the redirect refers to
 * @skb:        the ICMP redirect packet
 * @fl4:        flow key of the redirected flow
 * @kill_route: mark @rt obsolete once the redirect has been accepted
 *
 * The redirect is silently ignored if its code is not one of the four
 * known redirect codes, if @rt does not currently use the sender as its
 * IPv4 gateway, or if the receiving device has no IPv4 configuration.
 * It is rejected (and optionally logged) if the advised gateway fails the
 * sanity checks below.  An accepted redirect is recorded as a next-hop
 * exception, but only once the new gateway's neighbour entry is valid;
 * otherwise neighbour resolution is merely kicked off.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* Only the gateway currently in use may redirect us. */
	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	/* Find or create the neighbour entry of the new gateway. */
	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh_common *nhc = FIB_RES_NHC(res);

				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
						0, false,
						jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	/* The label needs a statement when the block above is compiled out. */
	;
}
816 
817 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
818 {
819 	struct rtable *rt;
820 	struct flowi4 fl4;
821 	const struct iphdr *iph = (const struct iphdr *) skb->data;
822 	struct net *net = dev_net(skb->dev);
823 	int oif = skb->dev->ifindex;
824 	u8 tos = RT_TOS(iph->tos);
825 	u8 prot = iph->protocol;
826 	u32 mark = skb->mark;
827 
828 	rt = (struct rtable *) dst;
829 
830 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
831 	__ip_do_redirect(rt, skb, &fl4, true);
832 }
833 
834 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
835 {
836 	struct rtable *rt = (struct rtable *)dst;
837 	struct dst_entry *ret = dst;
838 
839 	if (rt) {
840 		if (dst->obsolete > 0) {
841 			ip_rt_put(rt);
842 			ret = NULL;
843 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
844 			   rt->dst.expires) {
845 			ip_rt_put(rt);
846 			ret = NULL;
847 		}
848 	}
849 	return ret;
850 }
851 
852 /*
853  * Algorithm:
854  *	1. The first ip_rt_redirect_number redirects are sent
855  *	   with exponential backoff, then we stop sending them at all,
856  *	   assuming that the host ignores our redirects.
857  *	2. If we did not see packets requiring redirects
858  *	   during ip_rt_redirect_silence, we assume that the host
859  *	   forgot redirected route and start to send redirects again.
860  *
861  * This algorithm is much cheaper and more intelligent than dumb load limiting
862  * in icmp.c.
863  *
864  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
865  * and "frag. need" (breaks PMTU discovery) in icmp.c.
866  */
867 
868 void ip_rt_send_redirect(struct sk_buff *skb)
869 {
870 	struct rtable *rt = skb_rtable(skb);
871 	struct in_device *in_dev;
872 	struct inet_peer *peer;
873 	struct net *net;
874 	int log_martians;
875 	int vif;
876 
877 	rcu_read_lock();
878 	in_dev = __in_dev_get_rcu(rt->dst.dev);
879 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
880 		rcu_read_unlock();
881 		return;
882 	}
883 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
884 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
885 	rcu_read_unlock();
886 
887 	net = dev_net(rt->dst.dev);
888 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
889 	if (!peer) {
890 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
891 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
892 		return;
893 	}
894 
895 	/* No redirected packets during ip_rt_redirect_silence;
896 	 * reset the algorithm.
897 	 */
898 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
899 		peer->rate_tokens = 0;
900 		peer->n_redirects = 0;
901 	}
902 
903 	/* Too many ignored redirects; do not send anything
904 	 * set dst.rate_last to the last seen redirected packet.
905 	 */
906 	if (peer->n_redirects >= ip_rt_redirect_number) {
907 		peer->rate_last = jiffies;
908 		goto out_put_peer;
909 	}
910 
911 	/* Check for load limit; set rate_last to the latest sent
912 	 * redirect.
913 	 */
914 	if (peer->rate_tokens == 0 ||
915 	    time_after(jiffies,
916 		       (peer->rate_last +
917 			(ip_rt_redirect_load << peer->rate_tokens)))) {
918 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
919 
920 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
921 		peer->rate_last = jiffies;
922 		++peer->rate_tokens;
923 		++peer->n_redirects;
924 #ifdef CONFIG_IP_ROUTE_VERBOSE
925 		if (log_martians &&
926 		    peer->rate_tokens == ip_rt_redirect_number)
927 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
928 					     &ip_hdr(skb)->saddr, inet_iif(skb),
929 					     &ip_hdr(skb)->daddr, &gw);
930 #endif
931 	}
932 out_put_peer:
933 	inet_putpeer(peer);
934 }
935 
/* Handle a packet whose route carries an error (rt->dst.error).
 *
 * Updates the matching MIB counters and, when forwarding is enabled on the
 * receiving device, answers with an ICMP destination-unreachable message
 * subject to a per-peer token bucket.  The packet is always freed.
 *
 * Always returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* Behind an L3 master device, use the device recorded in the
	 * packet's control block instead.
	 */
	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	/* Not forwarding: only account the error, send nothing. */
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	/* Translate the route error into an ICMP unreachable code. */
	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	/* Token bucket: one token per elapsed jiffy, capped at
	 * ip_rt_error_burst; each message costs ip_rt_error_cost.
	 * Without a peer entry the message is sent unconditionally.
	 */
	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
1011 
/* Record a newly learned path MTU for the flow @fl4.
 *
 * @rt:  route currently used by the flow
 * @fl4: flow key used to find the next hop owning the exception
 * @mtu: path MTU that was reported
 *
 * The report is ignored when the route's MTU is locked, when @mtu is larger
 * than the current MTU, or when the same value is already stored and less
 * than half of its lifetime has passed.  Values below ip_rt_min_pmtu are
 * clamped and stored with the lock flag set.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	u32 old_mtu = ipv4_mtu(dst);
	struct fib_result res;
	bool lock = false;

	if (ip_mtu_locked(dst))
		return;

	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		/* Clamp without ever raising the MTU above its old value. */
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc = FIB_RES_NHC(res);

		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
1043 
1044 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1045 			      struct sk_buff *skb, u32 mtu)
1046 {
1047 	struct rtable *rt = (struct rtable *) dst;
1048 	struct flowi4 fl4;
1049 
1050 	ip_rt_build_flow_key(&fl4, sk, skb);
1051 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1052 }
1053 
1054 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1055 		      int oif, u8 protocol)
1056 {
1057 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1058 	struct flowi4 fl4;
1059 	struct rtable *rt;
1060 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1061 
1062 	__build_flow_key(net, &fl4, NULL, iph, oif,
1063 			 RT_TOS(iph->tos), protocol, mark, 0);
1064 	rt = __ip_route_output_key(net, &fl4);
1065 	if (!IS_ERR(rt)) {
1066 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1067 		ip_rt_put(rt);
1068 	}
1069 }
1070 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1071 
1072 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1073 {
1074 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1075 	struct flowi4 fl4;
1076 	struct rtable *rt;
1077 
1078 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1079 
1080 	if (!fl4.flowi4_mark)
1081 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1082 
1083 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1084 	if (!IS_ERR(rt)) {
1085 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1086 		ip_rt_put(rt);
1087 	}
1088 }
1089 
1090 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1091 {
1092 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1093 	struct flowi4 fl4;
1094 	struct rtable *rt;
1095 	struct dst_entry *odst = NULL;
1096 	bool new = false;
1097 	struct net *net = sock_net(sk);
1098 
1099 	bh_lock_sock(sk);
1100 
1101 	if (!ip_sk_accept_pmtu(sk))
1102 		goto out;
1103 
1104 	odst = sk_dst_get(sk);
1105 
1106 	if (sock_owned_by_user(sk) || !odst) {
1107 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1108 		goto out;
1109 	}
1110 
1111 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1112 
1113 	rt = (struct rtable *)odst;
1114 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1115 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1116 		if (IS_ERR(rt))
1117 			goto out;
1118 
1119 		new = true;
1120 	}
1121 
1122 	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1123 
1124 	if (!dst_check(&rt->dst, 0)) {
1125 		if (new)
1126 			dst_release(&rt->dst);
1127 
1128 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1129 		if (IS_ERR(rt))
1130 			goto out;
1131 
1132 		new = true;
1133 	}
1134 
1135 	if (new)
1136 		sk_dst_set(sk, &rt->dst);
1137 
1138 out:
1139 	bh_unlock_sock(sk);
1140 	dst_release(odst);
1141 }
1142 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1143 
1144 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1145 		   int oif, u8 protocol)
1146 {
1147 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1148 	struct flowi4 fl4;
1149 	struct rtable *rt;
1150 
1151 	__build_flow_key(net, &fl4, NULL, iph, oif,
1152 			 RT_TOS(iph->tos), protocol, 0, 0);
1153 	rt = __ip_route_output_key(net, &fl4);
1154 	if (!IS_ERR(rt)) {
1155 		__ip_do_redirect(rt, skb, &fl4, false);
1156 		ip_rt_put(rt);
1157 	}
1158 }
1159 EXPORT_SYMBOL_GPL(ipv4_redirect);
1160 
1161 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1162 {
1163 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1164 	struct flowi4 fl4;
1165 	struct rtable *rt;
1166 	struct net *net = sock_net(sk);
1167 
1168 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1169 	rt = __ip_route_output_key(net, &fl4);
1170 	if (!IS_ERR(rt)) {
1171 		__ip_do_redirect(rt, skb, &fl4, false);
1172 		ip_rt_put(rt);
1173 	}
1174 }
1175 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1176 
1177 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1178 {
1179 	struct rtable *rt = (struct rtable *) dst;
1180 
1181 	/* All IPV4 dsts are created with ->obsolete set to the value
1182 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1183 	 * into this function always.
1184 	 *
1185 	 * When a PMTU/redirect information update invalidates a route,
1186 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1187 	 * DST_OBSOLETE_DEAD.
1188 	 */
1189 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1190 		return NULL;
1191 	return dst;
1192 }
1193 
1194 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1195 {
1196 	struct ip_options opt;
1197 	int res;
1198 
1199 	/* Recompile ip options since IPCB may not be valid anymore.
1200 	 * Also check we have a reasonable ipv4 header.
1201 	 */
1202 	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1203 	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1204 		return;
1205 
1206 	memset(&opt, 0, sizeof(opt));
1207 	if (ip_hdr(skb)->ihl > 5) {
1208 		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1209 			return;
1210 		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1211 
1212 		rcu_read_lock();
1213 		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1214 		rcu_read_unlock();
1215 
1216 		if (res)
1217 			return;
1218 	}
1219 	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1220 }
1221 
1222 static void ipv4_link_failure(struct sk_buff *skb)
1223 {
1224 	struct rtable *rt;
1225 
1226 	ipv4_send_dest_unreach(skb);
1227 
1228 	rt = skb_rtable(skb);
1229 	if (rt)
1230 		dst_set_expires(&rt->dst, 0);
1231 }
1232 
1233 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1234 {
1235 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1236 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1237 		 skb->dev ? skb->dev->name : "?");
1238 	kfree_skb(skb);
1239 	WARN_ON(1);
1240 	return 0;
1241 }
1242 
1243 /*
1244    We do not cache source address of outgoing interface,
1245    because it is used only by IP RR, TS and SRR options,
1246    so that it out of fast path.
1247 
1248    BTW remember: "addr" is allowed to be not aligned
1249    in IP options!
1250  */
1251 
1252 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1253 {
1254 	__be32 src;
1255 
1256 	if (rt_is_output_route(rt))
1257 		src = ip_hdr(skb)->saddr;
1258 	else {
1259 		struct fib_result res;
1260 		struct iphdr *iph = ip_hdr(skb);
1261 		struct flowi4 fl4 = {
1262 			.daddr = iph->daddr,
1263 			.saddr = iph->saddr,
1264 			.flowi4_tos = RT_TOS(iph->tos),
1265 			.flowi4_oif = rt->dst.dev->ifindex,
1266 			.flowi4_iif = skb->dev->ifindex,
1267 			.flowi4_mark = skb->mark,
1268 		};
1269 
1270 		rcu_read_lock();
1271 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1272 			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1273 		else
1274 			src = inet_select_addr(rt->dst.dev,
1275 					       rt_nexthop(rt, iph->daddr),
1276 					       RT_SCOPE_UNIVERSE);
1277 		rcu_read_unlock();
1278 	}
1279 	memcpy(addr, &src, 4);
1280 }
1281 
1282 #ifdef CONFIG_IP_ROUTE_CLASSID
1283 static void set_class_tag(struct rtable *rt, u32 tag)
1284 {
1285 	if (!(rt->dst.tclassid & 0xFFFF))
1286 		rt->dst.tclassid |= tag & 0xFFFF;
1287 	if (!(rt->dst.tclassid & 0xFFFF0000))
1288 		rt->dst.tclassid |= tag & 0xFFFF0000;
1289 }
1290 #endif
1291 
1292 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1293 {
1294 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1295 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1296 				    ip_rt_min_advmss);
1297 
1298 	return min(advmss, IPV4_MAX_PMTU - header_size);
1299 }
1300 
1301 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1302 {
1303 	const struct rtable *rt = (const struct rtable *) dst;
1304 	unsigned int mtu = rt->rt_pmtu;
1305 
1306 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1307 		mtu = dst_metric_raw(dst, RTAX_MTU);
1308 
1309 	if (mtu)
1310 		return mtu;
1311 
1312 	mtu = READ_ONCE(dst->dev->mtu);
1313 
1314 	if (unlikely(ip_mtu_locked(dst))) {
1315 		if (rt->rt_gw_family && mtu > 576)
1316 			mtu = 576;
1317 	}
1318 
1319 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1320 
1321 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1322 }
1323 
1324 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1325 {
1326 	struct fnhe_hash_bucket *hash;
1327 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1328 	u32 hval = fnhe_hashfun(daddr);
1329 
1330 	spin_lock_bh(&fnhe_lock);
1331 
1332 	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1333 					 lockdep_is_held(&fnhe_lock));
1334 	hash += hval;
1335 
1336 	fnhe_p = &hash->chain;
1337 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1338 	while (fnhe) {
1339 		if (fnhe->fnhe_daddr == daddr) {
1340 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1341 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1342 			/* set fnhe_daddr to 0 to ensure it won't bind with
1343 			 * new dsts in rt_bind_exception().
1344 			 */
1345 			fnhe->fnhe_daddr = 0;
1346 			fnhe_flush_routes(fnhe);
1347 			kfree_rcu(fnhe, rcu);
1348 			break;
1349 		}
1350 		fnhe_p = &fnhe->fnhe_next;
1351 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1352 						 lockdep_is_held(&fnhe_lock));
1353 	}
1354 
1355 	spin_unlock_bh(&fnhe_lock);
1356 }
1357 
1358 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1359 					       __be32 daddr)
1360 {
1361 	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1362 	struct fib_nh_exception *fnhe;
1363 	u32 hval;
1364 
1365 	if (!hash)
1366 		return NULL;
1367 
1368 	hval = fnhe_hashfun(daddr);
1369 
1370 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1371 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1372 		if (fnhe->fnhe_daddr == daddr) {
1373 			if (fnhe->fnhe_expires &&
1374 			    time_after(jiffies, fnhe->fnhe_expires)) {
1375 				ip_del_fnhe(nhc, daddr);
1376 				break;
1377 			}
1378 			return fnhe;
1379 		}
1380 	}
1381 	return NULL;
1382 }
1383 
1384 /* MTU selection:
1385  * 1. mtu on route is locked - use it
1386  * 2. mtu from nexthop exception
1387  * 3. mtu from egress device
1388  */
1389 
1390 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1391 {
1392 	struct fib_nh_common *nhc = res->nhc;
1393 	struct net_device *dev = nhc->nhc_dev;
1394 	struct fib_info *fi = res->fi;
1395 	u32 mtu = 0;
1396 
1397 	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1398 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1399 		mtu = fi->fib_mtu;
1400 
1401 	if (likely(!mtu)) {
1402 		struct fib_nh_exception *fnhe;
1403 
1404 		fnhe = find_exception(nhc, daddr);
1405 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1406 			mtu = fnhe->fnhe_pmtu;
1407 	}
1408 
1409 	if (likely(!mtu))
1410 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1411 
1412 	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1413 }
1414 
1415 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1416 			      __be32 daddr, const bool do_cache)
1417 {
1418 	bool ret = false;
1419 
1420 	spin_lock_bh(&fnhe_lock);
1421 
1422 	if (daddr == fnhe->fnhe_daddr) {
1423 		struct rtable __rcu **porig;
1424 		struct rtable *orig;
1425 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1426 
1427 		if (rt_is_input_route(rt))
1428 			porig = &fnhe->fnhe_rth_input;
1429 		else
1430 			porig = &fnhe->fnhe_rth_output;
1431 		orig = rcu_dereference(*porig);
1432 
1433 		if (fnhe->fnhe_genid != genid) {
1434 			fnhe->fnhe_genid = genid;
1435 			fnhe->fnhe_gw = 0;
1436 			fnhe->fnhe_pmtu = 0;
1437 			fnhe->fnhe_expires = 0;
1438 			fnhe->fnhe_mtu_locked = false;
1439 			fnhe_flush_routes(fnhe);
1440 			orig = NULL;
1441 		}
1442 		fill_route_from_fnhe(rt, fnhe);
1443 		if (!rt->rt_gw4) {
1444 			rt->rt_gw4 = daddr;
1445 			rt->rt_gw_family = AF_INET;
1446 		}
1447 
1448 		if (do_cache) {
1449 			dst_hold(&rt->dst);
1450 			rcu_assign_pointer(*porig, rt);
1451 			if (orig) {
1452 				dst_dev_put(&orig->dst);
1453 				dst_release(&orig->dst);
1454 			}
1455 			ret = true;
1456 		}
1457 
1458 		fnhe->fnhe_stamp = jiffies;
1459 	}
1460 	spin_unlock_bh(&fnhe_lock);
1461 
1462 	return ret;
1463 }
1464 
1465 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1466 {
1467 	struct rtable *orig, *prev, **p;
1468 	bool ret = true;
1469 
1470 	if (rt_is_input_route(rt)) {
1471 		p = (struct rtable **)&nhc->nhc_rth_input;
1472 	} else {
1473 		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1474 	}
1475 	orig = *p;
1476 
1477 	/* hold dst before doing cmpxchg() to avoid race condition
1478 	 * on this dst
1479 	 */
1480 	dst_hold(&rt->dst);
1481 	prev = cmpxchg(p, orig, rt);
1482 	if (prev == orig) {
1483 		if (orig) {
1484 			dst_dev_put(&orig->dst);
1485 			dst_release(&orig->dst);
1486 		}
1487 	} else {
1488 		dst_release(&rt->dst);
1489 		ret = false;
1490 	}
1491 
1492 	return ret;
1493 }
1494 
/*
 * Per-CPU list of routes that were not stored in a nexthop or exception
 * cache slot.  Routes are linked through their rt_uncached member and
 * remember their list in rt_uncached_list (see rt_add_uncached_list()).
 */
1495 struct uncached_list {
	/* taken with spin_lock_bh() around every add, delete and walk */
1496 	spinlock_t		lock;
	/* list of struct rtable, linked via rt_uncached */
1497 	struct list_head	head;
1498 };
1499 
/* one list per CPU; rt_flush_dev() walks the lists of all possible CPUs */
1500 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1501 
1502 void rt_add_uncached_list(struct rtable *rt)
1503 {
1504 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1505 
1506 	rt->rt_uncached_list = ul;
1507 
1508 	spin_lock_bh(&ul->lock);
1509 	list_add_tail(&rt->rt_uncached, &ul->head);
1510 	spin_unlock_bh(&ul->lock);
1511 }
1512 
1513 void rt_del_uncached_list(struct rtable *rt)
1514 {
1515 	if (!list_empty(&rt->rt_uncached)) {
1516 		struct uncached_list *ul = rt->rt_uncached_list;
1517 
1518 		spin_lock_bh(&ul->lock);
1519 		list_del(&rt->rt_uncached);
1520 		spin_unlock_bh(&ul->lock);
1521 	}
1522 }
1523 
/*
 * Destructor for an IPv4 dst: drop the metrics reference and unlink the
 * route from its uncached list, if it is on one.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	ip_dst_metrics_put(dst);
	rt_del_uncached_list((struct rtable *)dst);
}
1531 
1532 void rt_flush_dev(struct net_device *dev)
1533 {
1534 	struct net *net = dev_net(dev);
1535 	struct rtable *rt;
1536 	int cpu;
1537 
1538 	for_each_possible_cpu(cpu) {
1539 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1540 
1541 		spin_lock_bh(&ul->lock);
1542 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1543 			if (rt->dst.dev != dev)
1544 				continue;
1545 			rt->dst.dev = net->loopback_dev;
1546 			dev_hold(rt->dst.dev);
1547 			dev_put(dev);
1548 		}
1549 		spin_unlock_bh(&ul->lock);
1550 	}
1551 }
1552 
1553 static bool rt_cache_valid(const struct rtable *rt)
1554 {
1555 	return	rt &&
1556 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1557 		!rt_is_expired(rt);
1558 }
1559 
1560 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1561 			   const struct fib_result *res,
1562 			   struct fib_nh_exception *fnhe,
1563 			   struct fib_info *fi, u16 type, u32 itag,
1564 			   const bool do_cache)
1565 {
1566 	bool cached = false;
1567 
1568 	if (fi) {
1569 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1570 
1571 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1572 			rt->rt_gw_family = nhc->nhc_gw_family;
1573 			/* only INET and INET6 are supported */
1574 			if (likely(nhc->nhc_gw_family == AF_INET))
1575 				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1576 			else
1577 				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1578 		}
1579 
1580 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1581 
1582 #ifdef CONFIG_IP_ROUTE_CLASSID
1583 		{
1584 			struct fib_nh *nh;
1585 
1586 			nh = container_of(nhc, struct fib_nh, nh_common);
1587 			rt->dst.tclassid = nh->nh_tclassid;
1588 		}
1589 #endif
1590 		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1591 		if (unlikely(fnhe))
1592 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1593 		else if (do_cache)
1594 			cached = rt_cache_route(nhc, rt);
1595 		if (unlikely(!cached)) {
1596 			/* Routes we intend to cache in nexthop exception or
1597 			 * FIB nexthop have the DST_NOCACHE bit clear.
1598 			 * However, if we are unsuccessful at storing this
1599 			 * route into the cache we really need to set it.
1600 			 */
1601 			if (!rt->rt_gw4) {
1602 				rt->rt_gw_family = AF_INET;
1603 				rt->rt_gw4 = daddr;
1604 			}
1605 			rt_add_uncached_list(rt);
1606 		}
1607 	} else
1608 		rt_add_uncached_list(rt);
1609 
1610 #ifdef CONFIG_IP_ROUTE_CLASSID
1611 #ifdef CONFIG_IP_MULTIPLE_TABLES
1612 	set_class_tag(rt, res->tclassid);
1613 #endif
1614 	set_class_tag(rt, itag);
1615 #endif
1616 }
1617 
1618 struct rtable *rt_dst_alloc(struct net_device *dev,
1619 			    unsigned int flags, u16 type,
1620 			    bool nopolicy, bool noxfrm, bool will_cache)
1621 {
1622 	struct rtable *rt;
1623 
1624 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1625 		       (will_cache ? 0 : DST_HOST) |
1626 		       (nopolicy ? DST_NOPOLICY : 0) |
1627 		       (noxfrm ? DST_NOXFRM : 0));
1628 
1629 	if (rt) {
1630 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1631 		rt->rt_flags = flags;
1632 		rt->rt_type = type;
1633 		rt->rt_is_input = 0;
1634 		rt->rt_iif = 0;
1635 		rt->rt_pmtu = 0;
1636 		rt->rt_mtu_locked = 0;
1637 		rt->rt_gw_family = 0;
1638 		rt->rt_gw4 = 0;
1639 		INIT_LIST_HEAD(&rt->rt_uncached);
1640 
1641 		rt->dst.output = ip_output;
1642 		if (flags & RTCF_LOCAL)
1643 			rt->dst.input = ip_local_deliver;
1644 	}
1645 
1646 	return rt;
1647 }
1648 EXPORT_SYMBOL(rt_dst_alloc);
1649 
1650 /* called in rcu_read_lock() section */
1651 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1652 			  u8 tos, struct net_device *dev,
1653 			  struct in_device *in_dev, u32 *itag)
1654 {
1655 	int err;
1656 
1657 	/* Primary sanity checks. */
1658 	if (!in_dev)
1659 		return -EINVAL;
1660 
1661 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1662 	    skb->protocol != htons(ETH_P_IP))
1663 		return -EINVAL;
1664 
1665 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1666 		return -EINVAL;
1667 
1668 	if (ipv4_is_zeronet(saddr)) {
1669 		if (!ipv4_is_local_multicast(daddr) &&
1670 		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1671 			return -EINVAL;
1672 	} else {
1673 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1674 					  in_dev, itag);
1675 		if (err < 0)
1676 			return err;
1677 	}
1678 	return 0;
1679 }
1680 
1681 /* called in rcu_read_lock() section */
1682 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1683 			     u8 tos, struct net_device *dev, int our)
1684 {
1685 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1686 	unsigned int flags = RTCF_MULTICAST;
1687 	struct rtable *rth;
1688 	u32 itag = 0;
1689 	int err;
1690 
1691 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1692 	if (err)
1693 		return err;
1694 
1695 	if (our)
1696 		flags |= RTCF_LOCAL;
1697 
1698 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1699 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1700 	if (!rth)
1701 		return -ENOBUFS;
1702 
1703 #ifdef CONFIG_IP_ROUTE_CLASSID
1704 	rth->dst.tclassid = itag;
1705 #endif
1706 	rth->dst.output = ip_rt_bug;
1707 	rth->rt_is_input= 1;
1708 
1709 #ifdef CONFIG_IP_MROUTE
1710 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1711 		rth->dst.input = ip_mr_input;
1712 #endif
1713 	RT_CACHE_STAT_INC(in_slow_mc);
1714 
1715 	skb_dst_set(skb, &rth->dst);
1716 	return 0;
1717 }
1718 
1719 
1720 static void ip_handle_martian_source(struct net_device *dev,
1721 				     struct in_device *in_dev,
1722 				     struct sk_buff *skb,
1723 				     __be32 daddr,
1724 				     __be32 saddr)
1725 {
1726 	RT_CACHE_STAT_INC(in_martian_src);
1727 #ifdef CONFIG_IP_ROUTE_VERBOSE
1728 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1729 		/*
1730 		 *	RFC1812 recommendation, if source is martian,
1731 		 *	the only hint is MAC header.
1732 		 */
1733 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1734 			&daddr, &saddr, dev->name);
1735 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1736 			print_hex_dump(KERN_WARNING, "ll header: ",
1737 				       DUMP_PREFIX_OFFSET, 16, 1,
1738 				       skb_mac_header(skb),
1739 				       dev->hard_header_len, false);
1740 		}
1741 	}
1742 #endif
1743 }
1744 
1745 /* called in rcu_read_lock() section */
1746 static int __mkroute_input(struct sk_buff *skb,
1747 			   const struct fib_result *res,
1748 			   struct in_device *in_dev,
1749 			   __be32 daddr, __be32 saddr, u32 tos)
1750 {
1751 	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1752 	struct net_device *dev = nhc->nhc_dev;
1753 	struct fib_nh_exception *fnhe;
1754 	struct rtable *rth;
1755 	int err;
1756 	struct in_device *out_dev;
1757 	bool do_cache;
1758 	u32 itag = 0;
1759 
1760 	/* get a working reference to the output device */
1761 	out_dev = __in_dev_get_rcu(dev);
1762 	if (!out_dev) {
1763 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1764 		return -EINVAL;
1765 	}
1766 
1767 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1768 				  in_dev->dev, in_dev, &itag);
1769 	if (err < 0) {
1770 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1771 					 saddr);
1772 
1773 		goto cleanup;
1774 	}
1775 
1776 	do_cache = res->fi && !itag;
1777 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1778 	    skb->protocol == htons(ETH_P_IP)) {
1779 		__be32 gw;
1780 
1781 		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1782 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1783 		    inet_addr_onlink(out_dev, saddr, gw))
1784 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1785 	}
1786 
1787 	if (skb->protocol != htons(ETH_P_IP)) {
1788 		/* Not IP (i.e. ARP). Do not create route, if it is
1789 		 * invalid for proxy arp. DNAT routes are always valid.
1790 		 *
1791 		 * Proxy arp feature have been extended to allow, ARP
1792 		 * replies back to the same interface, to support
1793 		 * Private VLAN switch technologies. See arp.c.
1794 		 */
1795 		if (out_dev == in_dev &&
1796 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1797 			err = -EINVAL;
1798 			goto cleanup;
1799 		}
1800 	}
1801 
1802 	fnhe = find_exception(nhc, daddr);
1803 	if (do_cache) {
1804 		if (fnhe)
1805 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1806 		else
1807 			rth = rcu_dereference(nhc->nhc_rth_input);
1808 		if (rt_cache_valid(rth)) {
1809 			skb_dst_set_noref(skb, &rth->dst);
1810 			goto out;
1811 		}
1812 	}
1813 
1814 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1815 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1816 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1817 	if (!rth) {
1818 		err = -ENOBUFS;
1819 		goto cleanup;
1820 	}
1821 
1822 	rth->rt_is_input = 1;
1823 	RT_CACHE_STAT_INC(in_slow_tot);
1824 
1825 	rth->dst.input = ip_forward;
1826 
1827 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1828 		       do_cache);
1829 	lwtunnel_set_redirect(&rth->dst);
1830 	skb_dst_set(skb, &rth->dst);
1831 out:
1832 	err = 0;
1833  cleanup:
1834 	return err;
1835 }
1836 
1837 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1838 /* To make ICMP packets follow the right flow, the multipath hash is
1839  * calculated from the inner IP addresses.
1840  */
1841 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1842 				 struct flow_keys *hash_keys)
1843 {
1844 	const struct iphdr *outer_iph = ip_hdr(skb);
1845 	const struct iphdr *key_iph = outer_iph;
1846 	const struct iphdr *inner_iph;
1847 	const struct icmphdr *icmph;
1848 	struct iphdr _inner_iph;
1849 	struct icmphdr _icmph;
1850 
1851 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1852 		goto out;
1853 
1854 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1855 		goto out;
1856 
1857 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1858 				   &_icmph);
1859 	if (!icmph)
1860 		goto out;
1861 
1862 	if (icmph->type != ICMP_DEST_UNREACH &&
1863 	    icmph->type != ICMP_REDIRECT &&
1864 	    icmph->type != ICMP_TIME_EXCEEDED &&
1865 	    icmph->type != ICMP_PARAMETERPROB)
1866 		goto out;
1867 
1868 	inner_iph = skb_header_pointer(skb,
1869 				       outer_iph->ihl * 4 + sizeof(_icmph),
1870 				       sizeof(_inner_iph), &_inner_iph);
1871 	if (!inner_iph)
1872 		goto out;
1873 
1874 	key_iph = inner_iph;
1875 out:
1876 	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1877 	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1878 }
1879 
1880 /* if skb is set it will be used and fl4 can be NULL */
1881 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1882 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1883 {
1884 	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1885 	struct flow_keys hash_keys;
1886 	u32 mhash;
1887 
1888 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1889 	case 0:
1890 		memset(&hash_keys, 0, sizeof(hash_keys));
1891 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1892 		if (skb) {
1893 			ip_multipath_l3_keys(skb, &hash_keys);
1894 		} else {
1895 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1896 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1897 		}
1898 		break;
1899 	case 1:
1900 		/* skb is currently provided only when forwarding */
1901 		if (skb) {
1902 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1903 			struct flow_keys keys;
1904 
1905 			/* short-circuit if we already have L4 hash present */
1906 			if (skb->l4_hash)
1907 				return skb_get_hash_raw(skb) >> 1;
1908 
1909 			memset(&hash_keys, 0, sizeof(hash_keys));
1910 
1911 			if (!flkeys) {
1912 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1913 				flkeys = &keys;
1914 			}
1915 
1916 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1917 			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1918 			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1919 			hash_keys.ports.src = flkeys->ports.src;
1920 			hash_keys.ports.dst = flkeys->ports.dst;
1921 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1922 		} else {
1923 			memset(&hash_keys, 0, sizeof(hash_keys));
1924 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1925 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1926 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1927 			hash_keys.ports.src = fl4->fl4_sport;
1928 			hash_keys.ports.dst = fl4->fl4_dport;
1929 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1930 		}
1931 		break;
1932 	}
1933 	mhash = flow_hash_from_keys(&hash_keys);
1934 
1935 	if (multipath_hash)
1936 		mhash = jhash_2words(mhash, multipath_hash, 0);
1937 
1938 	return mhash >> 1;
1939 }
1940 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1941 
1942 static int ip_mkroute_input(struct sk_buff *skb,
1943 			    struct fib_result *res,
1944 			    struct in_device *in_dev,
1945 			    __be32 daddr, __be32 saddr, u32 tos,
1946 			    struct flow_keys *hkeys)
1947 {
1948 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1949 	if (res->fi && res->fi->fib_nhs > 1) {
1950 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1951 
1952 		fib_select_multipath(res, h);
1953 	}
1954 #endif
1955 
1956 	/* create a routing cache entry */
1957 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1958 }
1959 
1960 /*
1961  *	NOTE. We drop all the packets that has local source
1962  *	addresses, because every properly looped back packet
1963  *	must have correct destination already attached by output routine.
1964  *
1965  *	Such approach solves two big problems:
1966  *	1. Not simplex devices are handled properly.
1967  *	2. IP spoofing attempts are filtered with 100% of guarantee.
1968  *	called with rcu_read_lock()
1969  */
1970 
1971 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1972 			       u8 tos, struct net_device *dev,
1973 			       struct fib_result *res)
1974 {
1975 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1976 	struct flow_keys *flkeys = NULL, _flkeys;
1977 	struct net    *net = dev_net(dev);
1978 	struct ip_tunnel_info *tun_info;
1979 	int		err = -EINVAL;
1980 	unsigned int	flags = 0;
1981 	u32		itag = 0;
1982 	struct rtable	*rth;
1983 	struct flowi4	fl4;
1984 	bool do_cache;
1985 
1986 	/* IP on this device is disabled. */
1987 
1988 	if (!in_dev)
1989 		goto out;
1990 
1991 	/* Check for the most weird martians, which can be not detected
1992 	   by fib_lookup.
1993 	 */
1994 
1995 	tun_info = skb_tunnel_info(skb);
1996 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1997 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1998 	else
1999 		fl4.flowi4_tun_key.tun_id = 0;
2000 	skb_dst_drop(skb);
2001 
2002 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2003 		goto martian_source;
2004 
2005 	res->fi = NULL;
2006 	res->table = NULL;
2007 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2008 		goto brd_input;
2009 
2010 	/* Accept zero addresses only to limited broadcast;
2011 	 * I even do not know to fix it or not. Waiting for complains :-)
2012 	 */
2013 	if (ipv4_is_zeronet(saddr))
2014 		goto martian_source;
2015 
2016 	if (ipv4_is_zeronet(daddr))
2017 		goto martian_destination;
2018 
2019 	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2020 	 * and call it once if daddr or/and saddr are loopback addresses
2021 	 */
2022 	if (ipv4_is_loopback(daddr)) {
2023 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2024 			goto martian_destination;
2025 	} else if (ipv4_is_loopback(saddr)) {
2026 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2027 			goto martian_source;
2028 	}
2029 
2030 	/*
2031 	 *	Now we are ready to route packet.
2032 	 */
2033 	fl4.flowi4_oif = 0;
2034 	fl4.flowi4_iif = dev->ifindex;
2035 	fl4.flowi4_mark = skb->mark;
2036 	fl4.flowi4_tos = tos;
2037 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2038 	fl4.flowi4_flags = 0;
2039 	fl4.daddr = daddr;
2040 	fl4.saddr = saddr;
2041 	fl4.flowi4_uid = sock_net_uid(net, NULL);
2042 
2043 	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2044 		flkeys = &_flkeys;
2045 	} else {
2046 		fl4.flowi4_proto = 0;
2047 		fl4.fl4_sport = 0;
2048 		fl4.fl4_dport = 0;
2049 	}
2050 
2051 	err = fib_lookup(net, &fl4, res, 0);
2052 	if (err != 0) {
2053 		if (!IN_DEV_FORWARD(in_dev))
2054 			err = -EHOSTUNREACH;
2055 		goto no_route;
2056 	}
2057 
2058 	if (res->type == RTN_BROADCAST) {
2059 		if (IN_DEV_BFORWARD(in_dev))
2060 			goto make_route;
2061 		goto brd_input;
2062 	}
2063 
2064 	if (res->type == RTN_LOCAL) {
2065 		err = fib_validate_source(skb, saddr, daddr, tos,
2066 					  0, dev, in_dev, &itag);
2067 		if (err < 0)
2068 			goto martian_source;
2069 		goto local_input;
2070 	}
2071 
2072 	if (!IN_DEV_FORWARD(in_dev)) {
2073 		err = -EHOSTUNREACH;
2074 		goto no_route;
2075 	}
2076 	if (res->type != RTN_UNICAST)
2077 		goto martian_destination;
2078 
2079 make_route:
2080 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2081 out:	return err;
2082 
2083 brd_input:
2084 	if (skb->protocol != htons(ETH_P_IP))
2085 		goto e_inval;
2086 
2087 	if (!ipv4_is_zeronet(saddr)) {
2088 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2089 					  in_dev, &itag);
2090 		if (err < 0)
2091 			goto martian_source;
2092 	}
2093 	flags |= RTCF_BROADCAST;
2094 	res->type = RTN_BROADCAST;
2095 	RT_CACHE_STAT_INC(in_brd);
2096 
2097 local_input:
2098 	do_cache = false;
2099 	if (res->fi) {
2100 		if (!itag) {
2101 			struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2102 
2103 			rth = rcu_dereference(nhc->nhc_rth_input);
2104 			if (rt_cache_valid(rth)) {
2105 				skb_dst_set_noref(skb, &rth->dst);
2106 				err = 0;
2107 				goto out;
2108 			}
2109 			do_cache = true;
2110 		}
2111 	}
2112 
2113 	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2114 			   flags | RTCF_LOCAL, res->type,
2115 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2116 	if (!rth)
2117 		goto e_nobufs;
2118 
2119 	rth->dst.output= ip_rt_bug;
2120 #ifdef CONFIG_IP_ROUTE_CLASSID
2121 	rth->dst.tclassid = itag;
2122 #endif
2123 	rth->rt_is_input = 1;
2124 
2125 	RT_CACHE_STAT_INC(in_slow_tot);
2126 	if (res->type == RTN_UNREACHABLE) {
2127 		rth->dst.input= ip_error;
2128 		rth->dst.error= -err;
2129 		rth->rt_flags 	&= ~RTCF_LOCAL;
2130 	}
2131 
2132 	if (do_cache) {
2133 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2134 
2135 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2136 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2137 			WARN_ON(rth->dst.input == lwtunnel_input);
2138 			rth->dst.lwtstate->orig_input = rth->dst.input;
2139 			rth->dst.input = lwtunnel_input;
2140 		}
2141 
2142 		if (unlikely(!rt_cache_route(nhc, rth)))
2143 			rt_add_uncached_list(rth);
2144 	}
2145 	skb_dst_set(skb, &rth->dst);
2146 	err = 0;
2147 	goto out;
2148 
2149 no_route:
2150 	RT_CACHE_STAT_INC(in_no_route);
2151 	res->type = RTN_UNREACHABLE;
2152 	res->fi = NULL;
2153 	res->table = NULL;
2154 	goto local_input;
2155 
2156 	/*
2157 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2158 	 */
2159 martian_destination:
2160 	RT_CACHE_STAT_INC(in_martian_dst);
2161 #ifdef CONFIG_IP_ROUTE_VERBOSE
2162 	if (IN_DEV_LOG_MARTIANS(in_dev))
2163 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2164 				     &daddr, &saddr, dev->name);
2165 #endif
2166 
2167 e_inval:
2168 	err = -EINVAL;
2169 	goto out;
2170 
2171 e_nobufs:
2172 	err = -ENOBUFS;
2173 	goto out;
2174 
2175 martian_source:
2176 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2177 	goto out;
2178 }
2179 
2180 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2181 			 u8 tos, struct net_device *dev)
2182 {
2183 	struct fib_result res;
2184 	int err;
2185 
2186 	tos &= IPTOS_RT_MASK;
2187 	rcu_read_lock();
2188 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2189 	rcu_read_unlock();
2190 
2191 	return err;
2192 }
2193 EXPORT_SYMBOL(ip_route_input_noref);
2194 
2195 /* called with rcu_read_lock held */
2196 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2197 		       u8 tos, struct net_device *dev, struct fib_result *res)
2198 {
2199 	/* Multicast recognition logic is moved from route cache to here.
2200 	   The problem was that too many Ethernet cards have broken/missing
2201 	   hardware multicast filters :-( As result the host on multicasting
2202 	   network acquires a lot of useless route cache entries, sort of
2203 	   SDR messages from all the world. Now we try to get rid of them.
2204 	   Really, provided software IP multicast filter is organized
2205 	   reasonably (at least, hashed), it does not result in a slowdown
2206 	   comparing with route cache reject entries.
2207 	   Note, that multicast routers are not affected, because
2208 	   route cache entry is created eventually.
2209 	 */
2210 	if (ipv4_is_multicast(daddr)) {
2211 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2212 		int our = 0;
2213 		int err = -EINVAL;
2214 
2215 		if (!in_dev)
2216 			return err;
2217 		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2218 				      ip_hdr(skb)->protocol);
2219 
2220 		/* check l3 master if no match yet */
2221 		if (!our && netif_is_l3_slave(dev)) {
2222 			struct in_device *l3_in_dev;
2223 
2224 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2225 			if (l3_in_dev)
2226 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2227 						      ip_hdr(skb)->protocol);
2228 		}
2229 
2230 		if (our
2231 #ifdef CONFIG_IP_MROUTE
2232 			||
2233 		    (!ipv4_is_local_multicast(daddr) &&
2234 		     IN_DEV_MFORWARD(in_dev))
2235 #endif
2236 		   ) {
2237 			err = ip_route_input_mc(skb, daddr, saddr,
2238 						tos, dev, our);
2239 		}
2240 		return err;
2241 	}
2242 
2243 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2244 }
2245 
/*
 * Build, or fetch from cache, the output route for an already resolved flow.
 *
 * @res:      FIB lookup result; res->fi and res->type seed the route
 * @fl4:      flow key (daddr, saddr, flowi4_flags and flowi4_proto are read)
 * @orig_oif: stored in rt_iif of a newly allocated route
 * @dev_out:  egress device; must have an in_device
 * @flags:    initial RTCF_* flags, extended below
 *
 * Returns a cached rtable on which a reference was taken via
 * dst_hold_safe(), a freshly allocated rtable, or an ERR_PTR():
 * -EINVAL when @dev_out has no in_device, when a loopback source address
 * is used on a device that is neither loopback nor an L3 master (unless
 * IN_DEV_ROUTE_LOCALNET is set), or when the destination is in the zero
 * network; -ENOBUFS when rt_dst_alloc() fails.
 */
2246 /* called with rcu_read_lock() */
2247 static struct rtable *__mkroute_output(const struct fib_result *res,
2248 				       const struct flowi4 *fl4, int orig_oif,
2249 				       struct net_device *dev_out,
2250 				       unsigned int flags)
2251 {
2252 	struct fib_info *fi = res->fi;
2253 	struct fib_nh_exception *fnhe;
2254 	struct in_device *in_dev;
2255 	u16 type = res->type;
2256 	struct rtable *rth;
2257 	bool do_cache;
2258 
2259 	in_dev = __in_dev_get_rcu(dev_out);
2260 	if (!in_dev)
2261 		return ERR_PTR(-EINVAL);
2262 
2263 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2264 		if (ipv4_is_loopback(fl4->saddr) &&
2265 		    !(dev_out->flags & IFF_LOOPBACK) &&
2266 		    !netif_is_l3_master(dev_out))
2267 			return ERR_PTR(-EINVAL);
2268 
	/* The destination address overrides the route type from the lookup. */
2269 	if (ipv4_is_lbcast(fl4->daddr))
2270 		type = RTN_BROADCAST;
2271 	else if (ipv4_is_multicast(fl4->daddr))
2272 		type = RTN_MULTICAST;
2273 	else if (ipv4_is_zeronet(fl4->daddr))
2274 		return ERR_PTR(-EINVAL);
2275 
2276 	if (dev_out->flags & IFF_LOOPBACK)
2277 		flags |= RTCF_LOCAL;
2278 
2279 	do_cache = true;
2280 	if (type == RTN_BROADCAST) {
2281 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2282 		fi = NULL;
2283 	} else if (type == RTN_MULTICAST) {
2284 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* RTCF_LOCAL stays set only when ip_check_mc_rcu() matches;
		 * in that case the route is not cached.
		 */
2285 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2286 				     fl4->flowi4_proto))
2287 			flags &= ~RTCF_LOCAL;
2288 		else
2289 			do_cache = false;
2290 		/* If multicast route do not exist use
2291 		 * default one, but do not gateway in this case.
2292 		 * Yes, it is hack.
2293 		 */
2294 		if (fi && res->prefixlen < 4)
2295 			fi = NULL;
2296 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2297 		   (orig_oif != dev_out->ifindex)) {
2298 		/* For local routes that require a particular output interface
2299 		 * we do not want to cache the result.  Caching the result
2300 		 * causes incorrect behaviour when there are multiple source
2301 		 * addresses on the interface, the end result being that if the
2302 		 * intended recipient is waiting on that interface for the
2303 		 * packet he won't receive it because it will be delivered on
2304 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2305 		 * be set to the loopback interface as well.
2306 		 */
2307 		do_cache = false;
2308 	}
2309 
2310 	fnhe = NULL;
	/* Caching is only attempted when a fib_info is still attached. */
2311 	do_cache &= fi != NULL;
2312 	if (fi) {
2313 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2314 		struct rtable __rcu **prth;
2315 
2316 		fnhe = find_exception(nhc, fl4->daddr);
2317 		if (!do_cache)
2318 			goto add;
		/* Cache slot: the exception's output route if one exists,
		 * otherwise the nexthop's per-cpu output route.
		 */
2319 		if (fnhe) {
2320 			prth = &fnhe->fnhe_rth_output;
2321 		} else {
2322 			if (unlikely(fl4->flowi4_flags &
2323 				     FLOWI_FLAG_KNOWN_NH &&
2324 				     !(nhc->nhc_gw_family &&
2325 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2326 				do_cache = false;
2327 				goto add;
2328 			}
2329 			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2330 		}
2331 		rth = rcu_dereference(*prth);
2332 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2333 			return rth;
2334 	}
2335 
2336 add:
	/* No usable cached entry: allocate a new route. */
2337 	rth = rt_dst_alloc(dev_out, flags, type,
2338 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2339 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2340 			   do_cache);
2341 	if (!rth)
2342 		return ERR_PTR(-ENOBUFS);
2343 
2344 	rth->rt_iif = orig_oif;
2345 
2346 	RT_CACHE_STAT_INC(out_slow_tot);
2347 
2348 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2349 		if (flags & RTCF_LOCAL &&
2350 		    !(dev_out->flags & IFF_LOOPBACK)) {
2351 			rth->dst.output = ip_mc_output;
2352 			RT_CACHE_STAT_INC(out_slow_mc);
2353 		}
2354 #ifdef CONFIG_IP_MROUTE
2355 		if (type == RTN_MULTICAST) {
2356 			if (IN_DEV_MFORWARD(in_dev) &&
2357 			    !ipv4_is_local_multicast(fl4->daddr)) {
2358 				rth->dst.input = ip_mr_input;
2359 				rth->dst.output = ip_mc_output;
2360 			}
2361 		}
2362 #endif
2363 	}
2364 
2365 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2366 	lwtunnel_set_redirect(&rth->dst);
2367 
2368 	return rth;
2369 }
2370 
2371 /*
2372  * Major route resolver routine.
2373  */
2374 
2375 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2376 					const struct sk_buff *skb)
2377 {
2378 	__u8 tos = RT_FL_TOS(fl4);
2379 	struct fib_result res = {
2380 		.type		= RTN_UNSPEC,
2381 		.fi		= NULL,
2382 		.table		= NULL,
2383 		.tclassid	= 0,
2384 	};
2385 	struct rtable *rth;
2386 
2387 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2388 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2389 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2390 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2391 
2392 	rcu_read_lock();
2393 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2394 	rcu_read_unlock();
2395 
2396 	return rth;
2397 }
2398 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2399 
/*
 * Resolve the output route for @fl4.  The callers visible in this file
 * invoke it inside an rcu_read_lock() section.
 *
 * @net: network namespace used for device and FIB lookups
 * @fl4: flow key; saddr, daddr and flowi4_oif may be rewritten here
 * @res: FIB result; filled by fib_lookup() and partly overwritten below
 * @skb: passed through to fib_select_path()
 *
 * Returns the route built by __mkroute_output(), or an ERR_PTR():
 *   -EINVAL      source is multicast, limited broadcast or zeronet, or
 *                __ip_dev_find() finds no device for the source address
 *   -ENODEV      flowi4_oif does not name an existing device
 *   -ENETUNREACH the output device is down or has no in_device
 *   the fib_lookup() error when the lookup fails and the on-link
 *   fallback below does not apply
 */
2400 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2401 					    struct fib_result *res,
2402 					    const struct sk_buff *skb)
2403 {
2404 	struct net_device *dev_out = NULL;
2405 	int orig_oif = fl4->flowi4_oif;
2406 	unsigned int flags = 0;
2407 	struct rtable *rth;
2408 	int err = -ENETUNREACH;
2409 
2410 	if (fl4->saddr) {
		/* Error returned by every "goto out" inside this block. */
2411 		rth = ERR_PTR(-EINVAL);
2412 		if (ipv4_is_multicast(fl4->saddr) ||
2413 		    ipv4_is_lbcast(fl4->saddr) ||
2414 		    ipv4_is_zeronet(fl4->saddr))
2415 			goto out;
2416 
2417 		/* I removed check for oif == dev_out->oif here.
2418 		   It was wrong for two reasons:
2419 		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2420 		      is assigned to multiple interfaces.
2421 		   2. Moreover, we are allowed to send packets with saddr
2422 		      of another iface. --ANK
2423 		 */
2424 
2425 		if (fl4->flowi4_oif == 0 &&
2426 		    (ipv4_is_multicast(fl4->daddr) ||
2427 		     ipv4_is_lbcast(fl4->daddr))) {
2428 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2429 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2430 			if (!dev_out)
2431 				goto out;
2432 
2433 			/* Special hack: user can direct multicasts
2434 			   and limited broadcast via necessary interface
2435 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2436 			   This hack is not just for fun, it allows
2437 			   vic,vat and friends to work.
2438 			   They bind socket to loopback, set ttl to zero
2439 			   and expect that it will work.
2440 			   From the viewpoint of routing cache they are broken,
2441 			   because we are not allowed to build multicast path
2442 			   with loopback source addr (look, routing cache
2443 			   cannot know, that ttl is zero, so that packet
2444 			   will not leave this host and route is valid).
2445 			   Luckily, this hack is good workaround.
2446 			 */
2447 
2448 			fl4->flowi4_oif = dev_out->ifindex;
2449 			goto make_route;
2450 		}
2451 
2452 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2453 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2454 			if (!__ip_dev_find(net, fl4->saddr, false))
2455 				goto out;
2456 		}
2457 	}
2458 
2459 
	/* An explicit output interface was requested. */
2460 	if (fl4->flowi4_oif) {
2461 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2462 		rth = ERR_PTR(-ENODEV);
2463 		if (!dev_out)
2464 			goto out;
2465 
2466 		/* RACE: Check return value of inet_select_addr instead. */
2467 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2468 			rth = ERR_PTR(-ENETUNREACH);
2469 			goto out;
2470 		}
		/* Link-local multicast, limited broadcast and IGMP skip the
		 * FIB lookup and go out on the requested device directly.
		 */
2471 		if (ipv4_is_local_multicast(fl4->daddr) ||
2472 		    ipv4_is_lbcast(fl4->daddr) ||
2473 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2474 			if (!fl4->saddr)
2475 				fl4->saddr = inet_select_addr(dev_out, 0,
2476 							      RT_SCOPE_LINK);
2477 			goto make_route;
2478 		}
2479 		if (!fl4->saddr) {
2480 			if (ipv4_is_multicast(fl4->daddr))
2481 				fl4->saddr = inet_select_addr(dev_out, 0,
2482 							      fl4->flowi4_scope);
2483 			else if (!fl4->daddr)
2484 				fl4->saddr = inet_select_addr(dev_out, 0,
2485 							      RT_SCOPE_HOST);
2486 		}
2487 	}
2488 
	/* No destination: route to the source address, or to the loopback
	 * address when that is unset too, via the loopback device.
	 */
2489 	if (!fl4->daddr) {
2490 		fl4->daddr = fl4->saddr;
2491 		if (!fl4->daddr)
2492 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2493 		dev_out = net->loopback_dev;
2494 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2495 		res->type = RTN_LOCAL;
2496 		flags |= RTCF_LOCAL;
2497 		goto make_route;
2498 	}
2499 
2500 	err = fib_lookup(net, fl4, res, 0);
2501 	if (err) {
2502 		res->fi = NULL;
2503 		res->table = NULL;
2504 		if (fl4->flowi4_oif &&
2505 		    (ipv4_is_multicast(fl4->daddr) ||
2506 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2507 			/* Apparently, routing tables are wrong. Assume,
2508 			   that the destination is on link.
2509 
2510 			   WHY? DW.
2511 			   Because we are allowed to send to iface
2512 			   even if it has NO routes and NO assigned
2513 			   addresses. When oif is specified, routing
2514 			   tables are looked up with only one purpose:
2515 			   to catch if destination is gatewayed, rather than
2516 			   direct. Moreover, if MSG_DONTROUTE is set,
2517 			   we send packet, ignoring both routing tables
2518 			   and ifaddr state. --ANK
2519 
2520 
2521 			   We could make it even if oif is unknown,
2522 			   likely IPv6, but we do not.
2523 			 */
2524 
2525 			if (fl4->saddr == 0)
2526 				fl4->saddr = inet_select_addr(dev_out, 0,
2527 							      RT_SCOPE_LINK);
2528 			res->type = RTN_UNICAST;
2529 			goto make_route;
2530 		}
2531 		rth = ERR_PTR(err);
2532 		goto out;
2533 	}
2534 
2535 	if (res->type == RTN_LOCAL) {
		/* Source defaults to the route's prefsrc, else to daddr. */
2536 		if (!fl4->saddr) {
2537 			if (res->fi->fib_prefsrc)
2538 				fl4->saddr = res->fi->fib_prefsrc;
2539 			else
2540 				fl4->saddr = fl4->daddr;
2541 		}
2542 
2543 		/* L3 master device is the loopback for that domain */
2544 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2545 			net->loopback_dev;
2546 
2547 		/* make sure orig_oif points to fib result device even
2548 		 * though packet rx/tx happens over loopback or l3mdev
2549 		 */
2550 		orig_oif = FIB_RES_OIF(*res);
2551 
2552 		fl4->flowi4_oif = dev_out->ifindex;
2553 		flags |= RTCF_LOCAL;
2554 		goto make_route;
2555 	}
2556 
2557 	fib_select_path(net, res, fl4, skb);
2558 
2559 	dev_out = FIB_RES_DEV(*res);
2560 	fl4->flowi4_oif = dev_out->ifindex;
2561 
2562 
2563 make_route:
2564 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2565 
2566 out:
2567 	return rth;
2568 }
2569 
2570 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2571 {
2572 	return NULL;
2573 }
2574 
2575 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2576 {
2577 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2578 
2579 	return mtu ? : dst->dev->mtu;
2580 }
2581 
2582 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2583 					  struct sk_buff *skb, u32 mtu)
2584 {
2585 }
2586 
/* ->redirect handler of ipv4_dst_blackhole_ops: a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
	/* empty body: redirects are ignored */
}
2591 
2592 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2593 					  unsigned long old)
2594 {
2595 	return NULL;
2596 }
2597 
2598 static struct dst_ops ipv4_dst_blackhole_ops = {
2599 	.family			=	AF_INET,
2600 	.check			=	ipv4_blackhole_dst_check,
2601 	.mtu			=	ipv4_blackhole_mtu,
2602 	.default_advmss		=	ipv4_default_advmss,
2603 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2604 	.redirect		=	ipv4_rt_blackhole_redirect,
2605 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2606 	.neigh_lookup		=	ipv4_neigh_lookup,
2607 };
2608 
2609 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2610 {
2611 	struct rtable *ort = (struct rtable *) dst_orig;
2612 	struct rtable *rt;
2613 
2614 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2615 	if (rt) {
2616 		struct dst_entry *new = &rt->dst;
2617 
2618 		new->__use = 1;
2619 		new->input = dst_discard;
2620 		new->output = dst_discard_out;
2621 
2622 		new->dev = net->loopback_dev;
2623 		if (new->dev)
2624 			dev_hold(new->dev);
2625 
2626 		rt->rt_is_input = ort->rt_is_input;
2627 		rt->rt_iif = ort->rt_iif;
2628 		rt->rt_pmtu = ort->rt_pmtu;
2629 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2630 
2631 		rt->rt_genid = rt_genid_ipv4(net);
2632 		rt->rt_flags = ort->rt_flags;
2633 		rt->rt_type = ort->rt_type;
2634 		rt->rt_gw_family = ort->rt_gw_family;
2635 		if (rt->rt_gw_family == AF_INET)
2636 			rt->rt_gw4 = ort->rt_gw4;
2637 		else if (rt->rt_gw_family == AF_INET6)
2638 			rt->rt_gw6 = ort->rt_gw6;
2639 
2640 		INIT_LIST_HEAD(&rt->rt_uncached);
2641 	}
2642 
2643 	dst_release(dst_orig);
2644 
2645 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2646 }
2647 
2648 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2649 				    const struct sock *sk)
2650 {
2651 	struct rtable *rt = __ip_route_output_key(net, flp4);
2652 
2653 	if (IS_ERR(rt))
2654 		return rt;
2655 
2656 	if (flp4->flowi4_proto)
2657 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2658 							flowi4_to_flowi(flp4),
2659 							sk, 0);
2660 
2661 	return rt;
2662 }
2663 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2664 
/*
 * Append an RTM_NEWROUTE netlink message describing @rt to @skb.
 *
 * @net:      namespace, used for the multicast-forwarding check
 * @dst:      destination address, reported as RTA_DST
 * @src:      source address, reported as RTA_SRC only when non-zero
 * @rt:       route being described
 * @table_id: reported as RTA_TABLE; rtm_table carries it only when it is
 *            below 256 and RT_TABLE_COMPAT otherwise
 * @fl4:      flow used for the lookup; supplies tos, mark, uid, iif and
 *            the source address compared against @src for RTA_PREFSRC
 * @skb:      message buffer being filled
 * @portid:   netlink port id for the message header
 * @seq:      netlink sequence number for the message header
 *
 * Returns 0 on success and -EMSGSIZE when the header or an attribute
 * cannot be added; once the header exists, a failure cancels the
 * partially built message.
 */
2665 /* called with rcu_read_lock held */
2666 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2667 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2668 			struct sk_buff *skb, u32 portid, u32 seq)
2669 {
2670 	struct rtmsg *r;
2671 	struct nlmsghdr *nlh;
2672 	unsigned long expires = 0;
2673 	u32 error;
2674 	u32 metrics[RTAX_MAX];
2675 
2676 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2677 	if (!nlh)
2678 		return -EMSGSIZE;
2679 
	/* Fixed rtmsg header. */
2680 	r = nlmsg_data(nlh);
2681 	r->rtm_family	 = AF_INET;
2682 	r->rtm_dst_len	= 32;
2683 	r->rtm_src_len	= 0;
2684 	r->rtm_tos	= fl4->flowi4_tos;
2685 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2686 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2687 		goto nla_put_failure;
2688 	r->rtm_type	= rt->rt_type;
2689 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2690 	r->rtm_protocol = RTPROT_UNSPEC;
	/* The low 16 bits of rt_flags are not exported in rtm_flags. */
2691 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2692 	if (rt->rt_flags & RTCF_NOTIFY)
2693 		r->rtm_flags |= RTM_F_NOTIFY;
2694 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2695 		r->rtm_flags |= RTCF_DOREDIRECT;
2696 
2697 	if (nla_put_in_addr(skb, RTA_DST, dst))
2698 		goto nla_put_failure;
2699 	if (src) {
2700 		r->rtm_src_len = 32;
2701 		if (nla_put_in_addr(skb, RTA_SRC, src))
2702 			goto nla_put_failure;
2703 	}
2704 	if (rt->dst.dev &&
2705 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2706 		goto nla_put_failure;
2707 #ifdef CONFIG_IP_ROUTE_CLASSID
2708 	if (rt->dst.tclassid &&
2709 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2710 		goto nla_put_failure;
2711 #endif
	/* For output routes, report the flow's source address as
	 * RTA_PREFSRC when it differs from @src.
	 */
2712 	if (!rt_is_input_route(rt) &&
2713 	    fl4->saddr != src) {
2714 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2715 			goto nla_put_failure;
2716 	}
	/* Gateway: RTA_GATEWAY for an IPv4 gateway, RTA_VIA (family byte
	 * pair plus the 16-byte address) for an IPv6 gateway.
	 */
2717 	if (rt->rt_gw_family == AF_INET &&
2718 	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2719 		goto nla_put_failure;
2720 	} else if (rt->rt_gw_family == AF_INET6) {
2721 		int alen = sizeof(struct in6_addr);
2722 		struct nlattr *nla;
2723 		struct rtvia *via;
2724 
2725 		nla = nla_reserve(skb, RTA_VIA, alen + 2);
2726 		if (!nla)
2727 			goto nla_put_failure;
2728 
2729 		via = nla_data(nla);
2730 		via->rtvia_family = AF_INET6;
2731 		memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2732 	}
2733 
	/* Convert the absolute expiry into a remaining time; 0 once it
	 * has passed.
	 */
2734 	expires = rt->dst.expires;
2735 	if (expires) {
2736 		unsigned long now = jiffies;
2737 
2738 		if (time_before(now, expires))
2739 			expires -= now;
2740 		else
2741 			expires = 0;
2742 	}
2743 
	/* Report a local copy of the metrics, overriding the MTU and its
	 * lock bit from the route's PMTU state while that has not expired.
	 */
2744 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2745 	if (rt->rt_pmtu && expires)
2746 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2747 	if (rt->rt_mtu_locked && expires)
2748 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2749 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2750 		goto nla_put_failure;
2751 
2752 	if (fl4->flowi4_mark &&
2753 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2754 		goto nla_put_failure;
2755 
2756 	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2757 	    nla_put_u32(skb, RTA_UID,
2758 			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2759 		goto nla_put_failure;
2760 
2761 	error = rt->dst.error;
2762 
2763 	if (rt_is_input_route(rt)) {
2764 #ifdef CONFIG_IP_MROUTE
2765 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2766 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2767 			int err = ipmr_get_route(net, skb,
2768 						 fl4->saddr, fl4->daddr,
2769 						 r, portid);
2770 
			/* NOTE(review): err == 0 returns without calling
			 * nlmsg_end() here; presumably ipmr_get_route()
			 * has dealt with the message - confirm.
			 */
2771 			if (err <= 0) {
2772 				if (err == 0)
2773 					return 0;
2774 				goto nla_put_failure;
2775 			}
2776 		} else
2777 #endif
2778 			if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2779 				goto nla_put_failure;
2780 	}
2781 
2782 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2783 		goto nla_put_failure;
2784 
2785 	nlmsg_end(skb, nlh);
2786 	return 0;
2787 
2788 nla_put_failure:
2789 	nlmsg_cancel(skb, nlh);
2790 	return -EMSGSIZE;
2791 }
2792 
2793 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2794 						   u8 ip_proto, __be16 sport,
2795 						   __be16 dport)
2796 {
2797 	struct sk_buff *skb;
2798 	struct iphdr *iph;
2799 
2800 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2801 	if (!skb)
2802 		return NULL;
2803 
2804 	/* Reserve room for dummy headers, this skb can pass
2805 	 * through good chunk of routing engine.
2806 	 */
2807 	skb_reset_mac_header(skb);
2808 	skb_reset_network_header(skb);
2809 	skb->protocol = htons(ETH_P_IP);
2810 	iph = skb_put(skb, sizeof(struct iphdr));
2811 	iph->protocol = ip_proto;
2812 	iph->saddr = src;
2813 	iph->daddr = dst;
2814 	iph->version = 0x4;
2815 	iph->frag_off = 0;
2816 	iph->ihl = 0x5;
2817 	skb_set_transport_header(skb, skb->len);
2818 
2819 	switch (iph->protocol) {
2820 	case IPPROTO_UDP: {
2821 		struct udphdr *udph;
2822 
2823 		udph = skb_put_zero(skb, sizeof(struct udphdr));
2824 		udph->source = sport;
2825 		udph->dest = dport;
2826 		udph->len = sizeof(struct udphdr);
2827 		udph->check = 0;
2828 		break;
2829 	}
2830 	case IPPROTO_TCP: {
2831 		struct tcphdr *tcph;
2832 
2833 		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2834 		tcph->source	= sport;
2835 		tcph->dest	= dport;
2836 		tcph->doff	= sizeof(struct tcphdr) / 4;
2837 		tcph->rst = 1;
2838 		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2839 					    src, dst, 0);
2840 		break;
2841 	}
2842 	case IPPROTO_ICMP: {
2843 		struct icmphdr *icmph;
2844 
2845 		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2846 		icmph->type = ICMP_ECHO;
2847 		icmph->code = 0;
2848 	}
2849 	}
2850 
2851 	return skb;
2852 }
2853 
2854 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
2855 				       const struct nlmsghdr *nlh,
2856 				       struct nlattr **tb,
2857 				       struct netlink_ext_ack *extack)
2858 {
2859 	struct rtmsg *rtm;
2860 	int i, err;
2861 
2862 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
2863 		NL_SET_ERR_MSG(extack,
2864 			       "ipv4: Invalid header for route get request");
2865 		return -EINVAL;
2866 	}
2867 
2868 	if (!netlink_strict_get_check(skb))
2869 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
2870 					      rtm_ipv4_policy, extack);
2871 
2872 	rtm = nlmsg_data(nlh);
2873 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
2874 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
2875 	    rtm->rtm_table || rtm->rtm_protocol ||
2876 	    rtm->rtm_scope || rtm->rtm_type) {
2877 		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
2878 		return -EINVAL;
2879 	}
2880 
2881 	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
2882 			       RTM_F_LOOKUP_TABLE |
2883 			       RTM_F_FIB_MATCH)) {
2884 		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
2885 		return -EINVAL;
2886 	}
2887 
2888 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
2889 					    rtm_ipv4_policy, extack);
2890 	if (err)
2891 		return err;
2892 
2893 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
2894 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
2895 		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
2896 		return -EINVAL;
2897 	}
2898 
2899 	for (i = 0; i <= RTA_MAX; i++) {
2900 		if (!tb[i])
2901 			continue;
2902 
2903 		switch (i) {
2904 		case RTA_IIF:
2905 		case RTA_OIF:
2906 		case RTA_SRC:
2907 		case RTA_DST:
2908 		case RTA_IP_PROTO:
2909 		case RTA_SPORT:
2910 		case RTA_DPORT:
2911 		case RTA_MARK:
2912 		case RTA_UID:
2913 			break;
2914 		default:
2915 			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
2916 			return -EINVAL;
2917 		}
2918 	}
2919 
2920 	return 0;
2921 }
2922 
2923 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2924 			     struct netlink_ext_ack *extack)
2925 {
2926 	struct net *net = sock_net(in_skb->sk);
2927 	struct nlattr *tb[RTA_MAX+1];
2928 	u32 table_id = RT_TABLE_MAIN;
2929 	__be16 sport = 0, dport = 0;
2930 	struct fib_result res = {};
2931 	u8 ip_proto = IPPROTO_UDP;
2932 	struct rtable *rt = NULL;
2933 	struct sk_buff *skb;
2934 	struct rtmsg *rtm;
2935 	struct flowi4 fl4 = {};
2936 	__be32 dst = 0;
2937 	__be32 src = 0;
2938 	kuid_t uid;
2939 	u32 iif;
2940 	int err;
2941 	int mark;
2942 
2943 	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
2944 	if (err < 0)
2945 		return err;
2946 
2947 	rtm = nlmsg_data(nlh);
2948 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2949 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2950 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2951 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2952 	if (tb[RTA_UID])
2953 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2954 	else
2955 		uid = (iif ? INVALID_UID : current_uid());
2956 
2957 	if (tb[RTA_IP_PROTO]) {
2958 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
2959 						  &ip_proto, AF_INET, extack);
2960 		if (err)
2961 			return err;
2962 	}
2963 
2964 	if (tb[RTA_SPORT])
2965 		sport = nla_get_be16(tb[RTA_SPORT]);
2966 
2967 	if (tb[RTA_DPORT])
2968 		dport = nla_get_be16(tb[RTA_DPORT]);
2969 
2970 	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
2971 	if (!skb)
2972 		return -ENOBUFS;
2973 
2974 	fl4.daddr = dst;
2975 	fl4.saddr = src;
2976 	fl4.flowi4_tos = rtm->rtm_tos;
2977 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2978 	fl4.flowi4_mark = mark;
2979 	fl4.flowi4_uid = uid;
2980 	if (sport)
2981 		fl4.fl4_sport = sport;
2982 	if (dport)
2983 		fl4.fl4_dport = dport;
2984 	fl4.flowi4_proto = ip_proto;
2985 
2986 	rcu_read_lock();
2987 
2988 	if (iif) {
2989 		struct net_device *dev;
2990 
2991 		dev = dev_get_by_index_rcu(net, iif);
2992 		if (!dev) {
2993 			err = -ENODEV;
2994 			goto errout_rcu;
2995 		}
2996 
2997 		fl4.flowi4_iif = iif; /* for rt_fill_info */
2998 		skb->dev	= dev;
2999 		skb->mark	= mark;
3000 		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3001 					 dev, &res);
3002 
3003 		rt = skb_rtable(skb);
3004 		if (err == 0 && rt->dst.error)
3005 			err = -rt->dst.error;
3006 	} else {
3007 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3008 		skb->dev = net->loopback_dev;
3009 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3010 		err = 0;
3011 		if (IS_ERR(rt))
3012 			err = PTR_ERR(rt);
3013 		else
3014 			skb_dst_set(skb, &rt->dst);
3015 	}
3016 
3017 	if (err)
3018 		goto errout_rcu;
3019 
3020 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3021 		rt->rt_flags |= RTCF_NOTIFY;
3022 
3023 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3024 		table_id = res.table ? res.table->tb_id : 0;
3025 
3026 	/* reset skb for netlink reply msg */
3027 	skb_trim(skb, 0);
3028 	skb_reset_network_header(skb);
3029 	skb_reset_transport_header(skb);
3030 	skb_reset_mac_header(skb);
3031 
3032 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3033 		if (!res.fi) {
3034 			err = fib_props[res.type].error;
3035 			if (!err)
3036 				err = -EHOSTUNREACH;
3037 			goto errout_rcu;
3038 		}
3039 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3040 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3041 				    rt->rt_type, res.prefix, res.prefixlen,
3042 				    fl4.flowi4_tos, res.fi, 0);
3043 	} else {
3044 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3045 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
3046 	}
3047 	if (err < 0)
3048 		goto errout_rcu;
3049 
3050 	rcu_read_unlock();
3051 
3052 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3053 
3054 errout_free:
3055 	return err;
3056 errout_rcu:
3057 	rcu_read_unlock();
3058 	kfree_skb(skb);
3059 	goto errout_free;
3060 }
3061 
3062 void ip_rt_multicast_event(struct in_device *in_dev)
3063 {
3064 	rt_cache_flush(dev_net(in_dev->dev));
3065 }
3066 
3067 #ifdef CONFIG_SYSCTL
/* Backing storage for entries of ipv4_route_table below.  The first three
 * are exposed as gc_interval, gc_min_interval / gc_min_interval_ms and
 * gc_elasticity.  ip_min_valid_pmtu is passed as extra1 of the min_pmtu
 * entry (presumably the lower bound enforced by proc_dointvec_minmax -
 * confirm).
 */
3068 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3069 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3070 static int ip_rt_gc_elasticity __read_mostly	= 8;
3071 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3072 
3073 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3074 					void __user *buffer,
3075 					size_t *lenp, loff_t *ppos)
3076 {
3077 	struct net *net = (struct net *)__ctl->extra1;
3078 
3079 	if (write) {
3080 		rt_cache_flush(net);
3081 		fnhe_genid_bump(net);
3082 		return 0;
3083 	}
3084 
3085 	return -EINVAL;
3086 }
3087 
/*
 * Integer sysctl entries for the IPv4 routing tunables.  Every entry is
 * mode 0644 and sizeof(int) wide; the handler decides how the value is
 * interpreted (plain integer, jiffies, ms-to-jiffies, or min/max bounded).
 * gc_min_interval and gc_min_interval_ms expose the same variable through
 * different handlers.  The table ends with an empty sentinel entry.
 */
3088 static struct ctl_table ipv4_route_table[] = {
3089 	{
3090 		.procname	= "gc_thresh",
3091 		.data		= &ipv4_dst_ops.gc_thresh,
3092 		.maxlen		= sizeof(int),
3093 		.mode		= 0644,
3094 		.proc_handler	= proc_dointvec,
3095 	},
3096 	{
3097 		.procname	= "max_size",
3098 		.data		= &ip_rt_max_size,
3099 		.maxlen		= sizeof(int),
3100 		.mode		= 0644,
3101 		.proc_handler	= proc_dointvec,
3102 	},
3103 	{
3104 		/*  Deprecated. Use gc_min_interval_ms */
3105 
3106 		.procname	= "gc_min_interval",
3107 		.data		= &ip_rt_gc_min_interval,
3108 		.maxlen		= sizeof(int),
3109 		.mode		= 0644,
3110 		.proc_handler	= proc_dointvec_jiffies,
3111 	},
3112 	{
		/* Same variable as gc_min_interval, ms_jiffies handler. */
3113 		.procname	= "gc_min_interval_ms",
3114 		.data		= &ip_rt_gc_min_interval,
3115 		.maxlen		= sizeof(int),
3116 		.mode		= 0644,
3117 		.proc_handler	= proc_dointvec_ms_jiffies,
3118 	},
3119 	{
3120 		.procname	= "gc_timeout",
3121 		.data		= &ip_rt_gc_timeout,
3122 		.maxlen		= sizeof(int),
3123 		.mode		= 0644,
3124 		.proc_handler	= proc_dointvec_jiffies,
3125 	},
3126 	{
3127 		.procname	= "gc_interval",
3128 		.data		= &ip_rt_gc_interval,
3129 		.maxlen		= sizeof(int),
3130 		.mode		= 0644,
3131 		.proc_handler	= proc_dointvec_jiffies,
3132 	},
3133 	{
3134 		.procname	= "redirect_load",
3135 		.data		= &ip_rt_redirect_load,
3136 		.maxlen		= sizeof(int),
3137 		.mode		= 0644,
3138 		.proc_handler	= proc_dointvec,
3139 	},
3140 	{
3141 		.procname	= "redirect_number",
3142 		.data		= &ip_rt_redirect_number,
3143 		.maxlen		= sizeof(int),
3144 		.mode		= 0644,
3145 		.proc_handler	= proc_dointvec,
3146 	},
3147 	{
3148 		.procname	= "redirect_silence",
3149 		.data		= &ip_rt_redirect_silence,
3150 		.maxlen		= sizeof(int),
3151 		.mode		= 0644,
3152 		.proc_handler	= proc_dointvec,
3153 	},
3154 	{
3155 		.procname	= "error_cost",
3156 		.data		= &ip_rt_error_cost,
3157 		.maxlen		= sizeof(int),
3158 		.mode		= 0644,
3159 		.proc_handler	= proc_dointvec,
3160 	},
3161 	{
3162 		.procname	= "error_burst",
3163 		.data		= &ip_rt_error_burst,
3164 		.maxlen		= sizeof(int),
3165 		.mode		= 0644,
3166 		.proc_handler	= proc_dointvec,
3167 	},
3168 	{
3169 		.procname	= "gc_elasticity",
3170 		.data		= &ip_rt_gc_elasticity,
3171 		.maxlen		= sizeof(int),
3172 		.mode		= 0644,
3173 		.proc_handler	= proc_dointvec,
3174 	},
3175 	{
3176 		.procname	= "mtu_expires",
3177 		.data		= &ip_rt_mtu_expires,
3178 		.maxlen		= sizeof(int),
3179 		.mode		= 0644,
3180 		.proc_handler	= proc_dointvec_jiffies,
3181 	},
3182 	{
		/* The only entry with a bound: extra1 = &ip_min_valid_pmtu. */
3183 		.procname	= "min_pmtu",
3184 		.data		= &ip_rt_min_pmtu,
3185 		.maxlen		= sizeof(int),
3186 		.mode		= 0644,
3187 		.proc_handler	= proc_dointvec_minmax,
3188 		.extra1		= &ip_min_valid_pmtu,
3189 	},
3190 	{
3191 		.procname	= "min_adv_mss",
3192 		.data		= &ip_rt_min_advmss,
3193 		.maxlen		= sizeof(int),
3194 		.mode		= 0644,
3195 		.proc_handler	= proc_dointvec,
3196 	},
3197 	{ }
3198 };
3199 
/*
 * Write-only (mode 0200) "flush" sysctl handled by
 * ipv4_sysctl_rtcache_flush().  The entry has no .data; its extra1 is set
 * to the owning namespace by sysctl_route_net_init(), which registers this
 * table for init_net and a kmemdup() copy for other namespaces.
 */
3200 static struct ctl_table ipv4_route_flush_table[] = {
3201 	{
3202 		.procname	= "flush",
3203 		.maxlen		= sizeof(int),
3204 		.mode		= 0200,
3205 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3206 	},
3207 	{ },
3208 };
3209 
3210 static __net_init int sysctl_route_net_init(struct net *net)
3211 {
3212 	struct ctl_table *tbl;
3213 
3214 	tbl = ipv4_route_flush_table;
3215 	if (!net_eq(net, &init_net)) {
3216 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3217 		if (!tbl)
3218 			goto err_dup;
3219 
3220 		/* Don't export sysctls to unprivileged users */
3221 		if (net->user_ns != &init_user_ns)
3222 			tbl[0].procname = NULL;
3223 	}
3224 	tbl[0].extra1 = net;
3225 
3226 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3227 	if (!net->ipv4.route_hdr)
3228 		goto err_reg;
3229 	return 0;
3230 
3231 err_reg:
3232 	if (tbl != ipv4_route_flush_table)
3233 		kfree(tbl);
3234 err_dup:
3235 	return -ENOMEM;
3236 }
3237 
3238 static __net_exit void sysctl_route_net_exit(struct net *net)
3239 {
3240 	struct ctl_table *tbl;
3241 
3242 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3243 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3244 	BUG_ON(tbl == ipv4_route_flush_table);
3245 	kfree(tbl);
3246 }
3247 
3248 static __net_initdata struct pernet_operations sysctl_route_ops = {
3249 	.init = sysctl_route_net_init,
3250 	.exit = sysctl_route_net_exit,
3251 };
3252 #endif
3253 
3254 static __net_init int rt_genid_init(struct net *net)
3255 {
3256 	atomic_set(&net->ipv4.rt_genid, 0);
3257 	atomic_set(&net->fnhe_genid, 0);
3258 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3259 	return 0;
3260 }
3261 
3262 static __net_initdata struct pernet_operations rt_genid_ops = {
3263 	.init = rt_genid_init,
3264 };
3265 
3266 static int __net_init ipv4_inetpeer_init(struct net *net)
3267 {
3268 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3269 
3270 	if (!bp)
3271 		return -ENOMEM;
3272 	inet_peer_base_init(bp);
3273 	net->ipv4.peers = bp;
3274 	return 0;
3275 }
3276 
3277 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3278 {
3279 	struct inet_peer_base *bp = net->ipv4.peers;
3280 
3281 	net->ipv4.peers = NULL;
3282 	inetpeer_invalidate_tree(bp);
3283 	kfree(bp);
3284 }
3285 
3286 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3287 	.init	=	ipv4_inetpeer_init,
3288 	.exit	=	ipv4_inetpeer_exit,
3289 };
3290 
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU accounting array, only built with CONFIG_IP_ROUTE_CLASSID.
 * ip_rt_init() allocates it with room for 256 struct ip_rt_acct entries.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3294 
3295 int __init ip_rt_init(void)
3296 {
3297 	int cpu;
3298 
3299 	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3300 				  GFP_KERNEL);
3301 	if (!ip_idents)
3302 		panic("IP: failed to allocate ip_idents\n");
3303 
3304 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3305 
3306 	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3307 	if (!ip_tstamps)
3308 		panic("IP: failed to allocate ip_tstamps\n");
3309 
3310 	for_each_possible_cpu(cpu) {
3311 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3312 
3313 		INIT_LIST_HEAD(&ul->head);
3314 		spin_lock_init(&ul->lock);
3315 	}
3316 #ifdef CONFIG_IP_ROUTE_CLASSID
3317 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3318 	if (!ip_rt_acct)
3319 		panic("IP: failed to allocate ip_rt_acct\n");
3320 #endif
3321 
3322 	ipv4_dst_ops.kmem_cachep =
3323 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3324 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3325 
3326 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3327 
3328 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3329 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3330 
3331 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3332 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3333 
3334 	ipv4_dst_ops.gc_thresh = ~0;
3335 	ip_rt_max_size = INT_MAX;
3336 
3337 	devinet_init();
3338 	ip_fib_init();
3339 
3340 	if (ip_rt_proc_init())
3341 		pr_err("Unable to create route proc files\n");
3342 #ifdef CONFIG_XFRM
3343 	xfrm_init();
3344 	xfrm4_init();
3345 #endif
3346 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3347 		      RTNL_FLAG_DOIT_UNLOCKED);
3348 
3349 #ifdef CONFIG_SYSCTL
3350 	register_pernet_subsys(&sysctl_route_ops);
3351 #endif
3352 	register_pernet_subsys(&rt_genid_ops);
3353 	register_pernet_subsys(&ipv4_inetpeer_ops);
3354 	return 0;
3355 }
3356 
3357 #ifdef CONFIG_SYSCTL
3358 /*
3359  * We really need to sanitize the damn ipv4 init order, then all
3360  * this nonsense will go away.
3361  */
3362 void __init ip_static_sysctl_init(void)
3363 {
3364 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3365 }
3366 #endif
3367