xref: /openbmc/linux/net/ipv4/route.c (revision bc05aa6e)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116 
117 #include "fib_lookup.h"
118 
119 #define RT_FL_TOS(oldflp4) \
120 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
121 
122 #define RT_GC_TIMEOUT (300*HZ)
123 
124 static int ip_rt_max_size;
125 static int ip_rt_redirect_number __read_mostly	= 9;
126 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly	= HZ;
129 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
130 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
131 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly	= 256;
133 
134 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
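/* For reference, with the defaults above: the gap between redirects starts
 * at HZ/50 jiffies (20 ms), the redirect silence window (HZ/50) << 10 is
 * roughly 20 s, learned PMTU exceptions expire after 10 minutes, and a
 * reported PMTU below 512 + 20 + 20 = 552 bytes is clamped to 552 with the
 * route's MTU locked.  See __ip_rt_update_pmtu() and ip_rt_send_redirect()
 * below for how these values are applied.
 */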
135 
136 /*
137  *	Interface to generic destination cache.
138  */
139 
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
142 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144 static void		 ipv4_link_failure(struct sk_buff *skb);
145 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
146 					   struct sk_buff *skb, u32 mtu);
147 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
148 					struct sk_buff *skb);
149 static void		ipv4_dst_destroy(struct dst_entry *dst);
150 
151 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
152 {
153 	WARN_ON(1);
154 	return NULL;
155 }
156 
157 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
158 					   struct sk_buff *skb,
159 					   const void *daddr);
160 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
161 
162 static struct dst_ops ipv4_dst_ops = {
163 	.family =		AF_INET,
164 	.check =		ipv4_dst_check,
165 	.default_advmss =	ipv4_default_advmss,
166 	.mtu =			ipv4_mtu,
167 	.cow_metrics =		ipv4_cow_metrics,
168 	.destroy =		ipv4_dst_destroy,
169 	.negative_advice =	ipv4_negative_advice,
170 	.link_failure =		ipv4_link_failure,
171 	.update_pmtu =		ip_rt_update_pmtu,
172 	.redirect =		ip_do_redirect,
173 	.local_out =		__ip_local_out,
174 	.neigh_lookup =		ipv4_neigh_lookup,
175 	.confirm_neigh =	ipv4_confirm_neigh,
176 };
177 
178 #define ECN_OR_COST(class)	TC_PRIO_##class
179 
180 const __u8 ip_tos2prio[16] = {
181 	TC_PRIO_BESTEFFORT,
182 	ECN_OR_COST(BESTEFFORT),
183 	TC_PRIO_BESTEFFORT,
184 	ECN_OR_COST(BESTEFFORT),
185 	TC_PRIO_BULK,
186 	ECN_OR_COST(BULK),
187 	TC_PRIO_BULK,
188 	ECN_OR_COST(BULK),
189 	TC_PRIO_INTERACTIVE,
190 	ECN_OR_COST(INTERACTIVE),
191 	TC_PRIO_INTERACTIVE,
192 	ECN_OR_COST(INTERACTIVE),
193 	TC_PRIO_INTERACTIVE_BULK,
194 	ECN_OR_COST(INTERACTIVE_BULK),
195 	TC_PRIO_INTERACTIVE_BULK,
196 	ECN_OR_COST(INTERACTIVE_BULK)
197 };
198 EXPORT_SYMBOL(ip_tos2prio);
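/* The table is indexed by the four TOS bits shifted right by one, i.e.
 * ip_tos2prio[IPTOS_TOS(tos) >> 1]; rt_tos2priority() in include/net/route.h
 * is the usual accessor.  For example, IPTOS_LOWDELAY (0x10) lands on
 * index 8 = TC_PRIO_INTERACTIVE and IPTOS_THROUGHPUT (0x08) on
 * index 4 = TC_PRIO_BULK.
 */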
199 
200 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
201 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
202 
203 #ifdef CONFIG_PROC_FS
204 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
205 {
206 	if (*pos)
207 		return NULL;
208 	return SEQ_START_TOKEN;
209 }
210 
211 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
212 {
213 	++*pos;
214 	return NULL;
215 }
216 
217 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
218 {
219 }
220 
221 static int rt_cache_seq_show(struct seq_file *seq, void *v)
222 {
223 	if (v == SEQ_START_TOKEN)
224 		seq_printf(seq, "%-127s\n",
225 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
226 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
227 			   "HHUptod\tSpecDst");
228 	return 0;
229 }
230 
231 static const struct seq_operations rt_cache_seq_ops = {
232 	.start  = rt_cache_seq_start,
233 	.next   = rt_cache_seq_next,
234 	.stop   = rt_cache_seq_stop,
235 	.show   = rt_cache_seq_show,
236 };
237 
238 static int rt_cache_seq_open(struct inode *inode, struct file *file)
239 {
240 	return seq_open(file, &rt_cache_seq_ops);
241 }
242 
243 static const struct file_operations rt_cache_seq_fops = {
244 	.open	 = rt_cache_seq_open,
245 	.read	 = seq_read,
246 	.llseek	 = seq_lseek,
247 	.release = seq_release,
248 };
249 
250 
251 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
252 {
253 	int cpu;
254 
255 	if (*pos == 0)
256 		return SEQ_START_TOKEN;
257 
258 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
259 		if (!cpu_possible(cpu))
260 			continue;
261 		*pos = cpu+1;
262 		return &per_cpu(rt_cache_stat, cpu);
263 	}
264 	return NULL;
265 }
266 
267 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
268 {
269 	int cpu;
270 
271 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
272 		if (!cpu_possible(cpu))
273 			continue;
274 		*pos = cpu+1;
275 		return &per_cpu(rt_cache_stat, cpu);
276 	}
277 	return NULL;
278 
279 }
280 
281 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
282 {
283 
284 }
285 
286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287 {
288 	struct rt_cache_stat *st = v;
289 
290 	if (v == SEQ_START_TOKEN) {
291 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
292 		return 0;
293 	}
294 
295 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
296 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
297 		   dst_entries_get_slow(&ipv4_dst_ops),
298 		   0, /* st->in_hit */
299 		   st->in_slow_tot,
300 		   st->in_slow_mc,
301 		   st->in_no_route,
302 		   st->in_brd,
303 		   st->in_martian_dst,
304 		   st->in_martian_src,
305 
306 		   0, /* st->out_hit */
307 		   st->out_slow_tot,
308 		   st->out_slow_mc,
309 
310 		   0, /* st->gc_total */
311 		   0, /* st->gc_ignored */
312 		   0, /* st->gc_goal_miss */
313 		   0, /* st->gc_dst_overflow */
314 		   0, /* st->in_hlist_search */
315 		   0  /* st->out_hlist_search */
316 		);
317 	return 0;
318 }
319 
320 static const struct seq_operations rt_cpu_seq_ops = {
321 	.start  = rt_cpu_seq_start,
322 	.next   = rt_cpu_seq_next,
323 	.stop   = rt_cpu_seq_stop,
324 	.show   = rt_cpu_seq_show,
325 };
326 
327 
328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329 {
330 	return seq_open(file, &rt_cpu_seq_ops);
331 }
332 
333 static const struct file_operations rt_cpu_seq_fops = {
334 	.open	 = rt_cpu_seq_open,
335 	.read	 = seq_read,
336 	.llseek	 = seq_lseek,
337 	.release = seq_release,
338 };
339 
340 #ifdef CONFIG_IP_ROUTE_CLASSID
341 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 {
343 	struct ip_rt_acct *dst, *src;
344 	unsigned int i, j;
345 
346 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 	if (!dst)
348 		return -ENOMEM;
349 
350 	for_each_possible_cpu(i) {
351 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352 		for (j = 0; j < 256; j++) {
353 			dst[j].o_bytes   += src[j].o_bytes;
354 			dst[j].o_packets += src[j].o_packets;
355 			dst[j].i_bytes   += src[j].i_bytes;
356 			dst[j].i_packets += src[j].i_packets;
357 		}
358 	}
359 
360 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361 	kfree(dst);
362 	return 0;
363 }
364 
365 static int rt_acct_proc_open(struct inode *inode, struct file *file)
366 {
367 	return single_open(file, rt_acct_proc_show, NULL);
368 }
369 
370 static const struct file_operations rt_acct_proc_fops = {
371 	.open		= rt_acct_proc_open,
372 	.read		= seq_read,
373 	.llseek		= seq_lseek,
374 	.release	= single_release,
375 };
376 #endif
377 
378 static int __net_init ip_rt_do_proc_init(struct net *net)
379 {
380 	struct proc_dir_entry *pde;
381 
382 	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
383 			  &rt_cache_seq_fops);
384 	if (!pde)
385 		goto err1;
386 
387 	pde = proc_create("rt_cache", S_IRUGO,
388 			  net->proc_net_stat, &rt_cpu_seq_fops);
389 	if (!pde)
390 		goto err2;
391 
392 #ifdef CONFIG_IP_ROUTE_CLASSID
393 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
394 	if (!pde)
395 		goto err3;
396 #endif
397 	return 0;
398 
399 #ifdef CONFIG_IP_ROUTE_CLASSID
400 err3:
401 	remove_proc_entry("rt_cache", net->proc_net_stat);
402 #endif
403 err2:
404 	remove_proc_entry("rt_cache", net->proc_net);
405 err1:
406 	return -ENOMEM;
407 }
408 
409 static void __net_exit ip_rt_do_proc_exit(struct net *net)
410 {
411 	remove_proc_entry("rt_cache", net->proc_net_stat);
412 	remove_proc_entry("rt_cache", net->proc_net);
413 #ifdef CONFIG_IP_ROUTE_CLASSID
414 	remove_proc_entry("rt_acct", net->proc_net);
415 #endif
416 }
417 
418 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
419 	.init = ip_rt_do_proc_init,
420 	.exit = ip_rt_do_proc_exit,
421 };
422 
423 static int __init ip_rt_proc_init(void)
424 {
425 	return register_pernet_subsys(&ip_rt_proc_ops);
426 }
427 
428 #else
429 static inline int ip_rt_proc_init(void)
430 {
431 	return 0;
432 }
433 #endif /* CONFIG_PROC_FS */
434 
435 static inline bool rt_is_expired(const struct rtable *rth)
436 {
437 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
438 }
439 
440 void rt_cache_flush(struct net *net)
441 {
442 	rt_genid_bump_ipv4(net);
443 }
444 
445 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
446 					   struct sk_buff *skb,
447 					   const void *daddr)
448 {
449 	struct net_device *dev = dst->dev;
450 	const __be32 *pkey = daddr;
451 	const struct rtable *rt;
452 	struct neighbour *n;
453 
454 	rt = (const struct rtable *) dst;
455 	if (rt->rt_gateway)
456 		pkey = (const __be32 *) &rt->rt_gateway;
457 	else if (skb)
458 		pkey = &ip_hdr(skb)->daddr;
459 
460 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
461 	if (n)
462 		return n;
463 	return neigh_create(&arp_tbl, pkey, dev);
464 }
465 
466 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
467 {
468 	struct net_device *dev = dst->dev;
469 	const __be32 *pkey = daddr;
470 	const struct rtable *rt;
471 
472 	rt = (const struct rtable *)dst;
473 	if (rt->rt_gateway)
474 		pkey = (const __be32 *)&rt->rt_gateway;
475 	else if (!daddr ||
476 		 (rt->rt_flags &
477 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
478 		return;
479 
480 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
481 }
482 
483 #define IP_IDENTS_SZ 2048u
484 
485 static atomic_t *ip_idents __read_mostly;
486 static u32 *ip_tstamps __read_mostly;
487 
488 /* In order to protect privacy, we add a perturbation to identifiers
489  * if one generator is seldom used. This makes it hard for an attacker
490  * to infer how many packets were sent between two points in time.
491  */
492 u32 ip_idents_reserve(u32 hash, int segs)
493 {
494 	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
495 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
496 	u32 old = READ_ONCE(*p_tstamp);
497 	u32 now = (u32)jiffies;
498 	u32 new, delta = 0;
499 
500 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
501 		delta = prandom_u32_max(now - old);
502 
503 	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
504 	do {
505 		old = (u32)atomic_read(p_id);
506 		new = old + delta + segs;
507 	} while (atomic_cmpxchg(p_id, old, new) != old);
508 
509 	return new - segs;
510 }
511 EXPORT_SYMBOL(ip_idents_reserve);
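/* Rough example of the perturbation: if this bucket was last used 1000
 * jiffies ago and a caller now reserves three IDs (segs == 3), delta is a
 * random value in [0, 1000) and the call returns old + delta, having bumped
 * the generator to old + delta + 3.  A busy bucket (old == now) gets no
 * perturbation, so its IDs remain strictly sequential.
 */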
512 
513 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
514 {
515 	static u32 ip_idents_hashrnd __read_mostly;
516 	u32 hash, id;
517 
518 	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
519 
520 	hash = jhash_3words((__force u32)iph->daddr,
521 			    (__force u32)iph->saddr,
522 			    iph->protocol ^ net_hash_mix(net),
523 			    ip_idents_hashrnd);
524 	id = ip_idents_reserve(hash, segs);
525 	iph->id = htons(id);
526 }
527 EXPORT_SYMBOL(__ip_select_ident);
528 
529 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
530 			     const struct sock *sk,
531 			     const struct iphdr *iph,
532 			     int oif, u8 tos,
533 			     u8 prot, u32 mark, int flow_flags)
534 {
535 	if (sk) {
536 		const struct inet_sock *inet = inet_sk(sk);
537 
538 		oif = sk->sk_bound_dev_if;
539 		mark = sk->sk_mark;
540 		tos = RT_CONN_FLAGS(sk);
541 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
542 	}
543 	flowi4_init_output(fl4, oif, mark, tos,
544 			   RT_SCOPE_UNIVERSE, prot,
545 			   flow_flags,
546 			   iph->daddr, iph->saddr, 0, 0,
547 			   sock_net_uid(net, sk));
548 }
549 
550 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
551 			       const struct sock *sk)
552 {
553 	const struct net *net = dev_net(skb->dev);
554 	const struct iphdr *iph = ip_hdr(skb);
555 	int oif = skb->dev->ifindex;
556 	u8 tos = RT_TOS(iph->tos);
557 	u8 prot = iph->protocol;
558 	u32 mark = skb->mark;
559 
560 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
561 }
562 
563 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
564 {
565 	const struct inet_sock *inet = inet_sk(sk);
566 	const struct ip_options_rcu *inet_opt;
567 	__be32 daddr = inet->inet_daddr;
568 
569 	rcu_read_lock();
570 	inet_opt = rcu_dereference(inet->inet_opt);
571 	if (inet_opt && inet_opt->opt.srr)
572 		daddr = inet_opt->opt.faddr;
573 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
574 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
575 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
576 			   inet_sk_flowi_flags(sk),
577 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
578 	rcu_read_unlock();
579 }
580 
581 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
582 				 const struct sk_buff *skb)
583 {
584 	if (skb)
585 		build_skb_flow_key(fl4, skb, sk);
586 	else
587 		build_sk_flow_key(fl4, sk);
588 }
589 
590 static DEFINE_SPINLOCK(fnhe_lock);
591 
592 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
593 {
594 	struct rtable *rt;
595 
596 	rt = rcu_dereference(fnhe->fnhe_rth_input);
597 	if (rt) {
598 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
599 		dst_dev_put(&rt->dst);
600 		dst_release(&rt->dst);
601 	}
602 	rt = rcu_dereference(fnhe->fnhe_rth_output);
603 	if (rt) {
604 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
605 		dst_dev_put(&rt->dst);
606 		dst_release(&rt->dst);
607 	}
608 }
609 
610 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
611 {
612 	struct fib_nh_exception *fnhe, *oldest;
613 
614 	oldest = rcu_dereference(hash->chain);
615 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
616 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
617 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
618 			oldest = fnhe;
619 	}
620 	fnhe_flush_routes(oldest);
621 	return oldest;
622 }
623 
624 static inline u32 fnhe_hashfun(__be32 daddr)
625 {
626 	static u32 fnhe_hashrnd __read_mostly;
627 	u32 hval;
628 
629 	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
630 	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
631 	return hash_32(hval, FNHE_HASH_SHIFT);
632 }
633 
634 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
635 {
636 	rt->rt_pmtu = fnhe->fnhe_pmtu;
637 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
638 	rt->dst.expires = fnhe->fnhe_expires;
639 
640 	if (fnhe->fnhe_gw) {
641 		rt->rt_flags |= RTCF_REDIRECTED;
642 		rt->rt_gateway = fnhe->fnhe_gw;
643 		rt->rt_uses_gateway = 1;
644 	}
645 }
646 
647 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
648 				  u32 pmtu, bool lock, unsigned long expires)
649 {
650 	struct fnhe_hash_bucket *hash;
651 	struct fib_nh_exception *fnhe;
652 	struct rtable *rt;
653 	u32 genid, hval;
654 	unsigned int i;
655 	int depth;
656 
657 	genid = fnhe_genid(dev_net(nh->nh_dev));
658 	hval = fnhe_hashfun(daddr);
659 
660 	spin_lock_bh(&fnhe_lock);
661 
662 	hash = rcu_dereference(nh->nh_exceptions);
663 	if (!hash) {
664 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
665 		if (!hash)
666 			goto out_unlock;
667 		rcu_assign_pointer(nh->nh_exceptions, hash);
668 	}
669 
670 	hash += hval;
671 
672 	depth = 0;
673 	for (fnhe = rcu_dereference(hash->chain); fnhe;
674 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
675 		if (fnhe->fnhe_daddr == daddr)
676 			break;
677 		depth++;
678 	}
679 
680 	if (fnhe) {
681 		if (fnhe->fnhe_genid != genid)
682 			fnhe->fnhe_genid = genid;
683 		if (gw)
684 			fnhe->fnhe_gw = gw;
685 		if (pmtu) {
686 			fnhe->fnhe_pmtu = pmtu;
687 			fnhe->fnhe_mtu_locked = lock;
688 		}
689 		fnhe->fnhe_expires = max(1UL, expires);
690 		/* Update all cached dsts too */
691 		rt = rcu_dereference(fnhe->fnhe_rth_input);
692 		if (rt)
693 			fill_route_from_fnhe(rt, fnhe);
694 		rt = rcu_dereference(fnhe->fnhe_rth_output);
695 		if (rt)
696 			fill_route_from_fnhe(rt, fnhe);
697 	} else {
698 		if (depth > FNHE_RECLAIM_DEPTH)
699 			fnhe = fnhe_oldest(hash);
700 		else {
701 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
702 			if (!fnhe)
703 				goto out_unlock;
704 
705 			fnhe->fnhe_next = hash->chain;
706 			rcu_assign_pointer(hash->chain, fnhe);
707 		}
708 		fnhe->fnhe_genid = genid;
709 		fnhe->fnhe_daddr = daddr;
710 		fnhe->fnhe_gw = gw;
711 		fnhe->fnhe_pmtu = pmtu;
712 		fnhe->fnhe_mtu_locked = lock;
713 		fnhe->fnhe_expires = expires;
714 
715 		/* Exception created; mark the cached routes for the nexthop
716 		 * stale, so anyone caching it rechecks if this exception
717 		 * applies to them.
718 		 */
719 		rt = rcu_dereference(nh->nh_rth_input);
720 		if (rt)
721 			rt->dst.obsolete = DST_OBSOLETE_KILL;
722 
723 		for_each_possible_cpu(i) {
724 			struct rtable __rcu **prt;
725 			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
726 			rt = rcu_dereference(*prt);
727 			if (rt)
728 				rt->dst.obsolete = DST_OBSOLETE_KILL;
729 		}
730 	}
731 
732 	fnhe->fnhe_stamp = jiffies;
733 
734 out_unlock:
735 	spin_unlock_bh(&fnhe_lock);
736 }
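/* Both users of this helper live below: __ip_do_redirect() records a new
 * gateway learned from an ICMP redirect (pmtu == 0, expiry of
 * ip_rt_gc_timeout), and __ip_rt_update_pmtu() records a reduced path MTU
 * (gw == 0, expiry of ip_rt_mtu_expires).  The exception is later picked up
 * on lookup via find_exception() and bound with rt_bind_exception().
 */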
737 
738 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
739 			     bool kill_route)
740 {
741 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
742 	__be32 old_gw = ip_hdr(skb)->saddr;
743 	struct net_device *dev = skb->dev;
744 	struct in_device *in_dev;
745 	struct fib_result res;
746 	struct neighbour *n;
747 	struct net *net;
748 
749 	switch (icmp_hdr(skb)->code & 7) {
750 	case ICMP_REDIR_NET:
751 	case ICMP_REDIR_NETTOS:
752 	case ICMP_REDIR_HOST:
753 	case ICMP_REDIR_HOSTTOS:
754 		break;
755 
756 	default:
757 		return;
758 	}
759 
760 	if (rt->rt_gateway != old_gw)
761 		return;
762 
763 	in_dev = __in_dev_get_rcu(dev);
764 	if (!in_dev)
765 		return;
766 
767 	net = dev_net(dev);
768 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
769 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
770 	    ipv4_is_zeronet(new_gw))
771 		goto reject_redirect;
772 
773 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
774 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
775 			goto reject_redirect;
776 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
777 			goto reject_redirect;
778 	} else {
779 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
780 			goto reject_redirect;
781 	}
782 
783 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
784 	if (!n)
785 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
786 	if (!IS_ERR(n)) {
787 		if (!(n->nud_state & NUD_VALID)) {
788 			neigh_event_send(n, NULL);
789 		} else {
790 			if (fib_lookup(net, fl4, &res, 0) == 0) {
791 				struct fib_nh *nh = &FIB_RES_NH(res);
792 
793 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
794 						0, false,
795 						jiffies + ip_rt_gc_timeout);
796 			}
797 			if (kill_route)
798 				rt->dst.obsolete = DST_OBSOLETE_KILL;
799 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
800 		}
801 		neigh_release(n);
802 	}
803 	return;
804 
805 reject_redirect:
806 #ifdef CONFIG_IP_ROUTE_VERBOSE
807 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
808 		const struct iphdr *iph = (const struct iphdr *) skb->data;
809 		__be32 daddr = iph->daddr;
810 		__be32 saddr = iph->saddr;
811 
812 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
813 				     "  Advised path = %pI4 -> %pI4\n",
814 				     &old_gw, dev->name, &new_gw,
815 				     &saddr, &daddr);
816 	}
817 #endif
818 	;
819 }
820 
821 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
822 {
823 	struct rtable *rt;
824 	struct flowi4 fl4;
825 	const struct iphdr *iph = (const struct iphdr *) skb->data;
826 	struct net *net = dev_net(skb->dev);
827 	int oif = skb->dev->ifindex;
828 	u8 tos = RT_TOS(iph->tos);
829 	u8 prot = iph->protocol;
830 	u32 mark = skb->mark;
831 
832 	rt = (struct rtable *) dst;
833 
834 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
835 	__ip_do_redirect(rt, skb, &fl4, true);
836 }
837 
838 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
839 {
840 	struct rtable *rt = (struct rtable *)dst;
841 	struct dst_entry *ret = dst;
842 
843 	if (rt) {
844 		if (dst->obsolete > 0) {
845 			ip_rt_put(rt);
846 			ret = NULL;
847 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
848 			   rt->dst.expires) {
849 			ip_rt_put(rt);
850 			ret = NULL;
851 		}
852 	}
853 	return ret;
854 }
855 
856 /*
857  * Algorithm:
858  *	1. The first ip_rt_redirect_number redirects are sent
859  *	   with exponential backoff, then we stop sending them at all,
860  *	   assuming that the host ignores our redirects.
861  *	2. If we did not see packets requiring redirects
862  *	   during ip_rt_redirect_silence, we assume that the host
863  *		   forgot the redirected route and start sending redirects again.
864  *
865  * This algorithm is much cheaper and more intelligent than dumb load limiting
866  * in icmp.c.
867  *
868  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
869  * and "frag. need" (breaks PMTU discovery) in icmp.c.
870  */
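/*
 * Worked example with the defaults above: the first redirect to a peer goes
 * out immediately; after that the required gap is
 * ip_rt_redirect_load << rate_tokens, i.e. 40 ms, 80 ms, 160 ms, ...
 * (doubling each time).  Once ip_rt_redirect_number (9) redirects have been
 * ignored we go silent, and only ~20 s (ip_rt_redirect_silence) without
 * packets that would trigger a redirect resets rate_tokens and starts the
 * cycle again.
 */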
871 
872 void ip_rt_send_redirect(struct sk_buff *skb)
873 {
874 	struct rtable *rt = skb_rtable(skb);
875 	struct in_device *in_dev;
876 	struct inet_peer *peer;
877 	struct net *net;
878 	int log_martians;
879 	int vif;
880 
881 	rcu_read_lock();
882 	in_dev = __in_dev_get_rcu(rt->dst.dev);
883 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
884 		rcu_read_unlock();
885 		return;
886 	}
887 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
888 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
889 	rcu_read_unlock();
890 
891 	net = dev_net(rt->dst.dev);
892 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
893 	if (!peer) {
894 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
895 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
896 		return;
897 	}
898 
899 	/* No redirected packets during ip_rt_redirect_silence;
900 	 * reset the algorithm.
901 	 */
902 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
903 		peer->rate_tokens = 0;
904 
905 	/* Too many ignored redirects; do not send anything.
906 	 * Set peer->rate_last to the time of the last redirect-triggering packet.
907 	 */
908 	if (peer->rate_tokens >= ip_rt_redirect_number) {
909 		peer->rate_last = jiffies;
910 		goto out_put_peer;
911 	}
912 
913 	/* Check for load limit; set rate_last to the latest sent
914 	 * redirect.
915 	 */
916 	if (peer->rate_tokens == 0 ||
917 	    time_after(jiffies,
918 		       (peer->rate_last +
919 			(ip_rt_redirect_load << peer->rate_tokens)))) {
920 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
921 
922 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
923 		peer->rate_last = jiffies;
924 		++peer->rate_tokens;
925 #ifdef CONFIG_IP_ROUTE_VERBOSE
926 		if (log_martians &&
927 		    peer->rate_tokens == ip_rt_redirect_number)
928 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
929 					     &ip_hdr(skb)->saddr, inet_iif(skb),
930 					     &ip_hdr(skb)->daddr, &gw);
931 #endif
932 	}
933 out_put_peer:
934 	inet_putpeer(peer);
935 }
936 
937 static int ip_error(struct sk_buff *skb)
938 {
939 	struct rtable *rt = skb_rtable(skb);
940 	struct net_device *dev = skb->dev;
941 	struct in_device *in_dev;
942 	struct inet_peer *peer;
943 	unsigned long now;
944 	struct net *net;
945 	bool send;
946 	int code;
947 
948 	if (netif_is_l3_master(skb->dev)) {
949 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
950 		if (!dev)
951 			goto out;
952 	}
953 
954 	in_dev = __in_dev_get_rcu(dev);
955 
956 	/* IP on this device is disabled. */
957 	if (!in_dev)
958 		goto out;
959 
960 	net = dev_net(rt->dst.dev);
961 	if (!IN_DEV_FORWARD(in_dev)) {
962 		switch (rt->dst.error) {
963 		case EHOSTUNREACH:
964 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
965 			break;
966 
967 		case ENETUNREACH:
968 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
969 			break;
970 		}
971 		goto out;
972 	}
973 
974 	switch (rt->dst.error) {
975 	case EINVAL:
976 	default:
977 		goto out;
978 	case EHOSTUNREACH:
979 		code = ICMP_HOST_UNREACH;
980 		break;
981 	case ENETUNREACH:
982 		code = ICMP_NET_UNREACH;
983 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
984 		break;
985 	case EACCES:
986 		code = ICMP_PKT_FILTERED;
987 		break;
988 	}
989 
990 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
991 			       l3mdev_master_ifindex(skb->dev), 1);
992 
993 	send = true;
994 	if (peer) {
995 		now = jiffies;
996 		peer->rate_tokens += now - peer->rate_last;
997 		if (peer->rate_tokens > ip_rt_error_burst)
998 			peer->rate_tokens = ip_rt_error_burst;
999 		peer->rate_last = now;
1000 		if (peer->rate_tokens >= ip_rt_error_cost)
1001 			peer->rate_tokens -= ip_rt_error_cost;
1002 		else
1003 			send = false;
1004 		inet_putpeer(peer);
1005 	}
1006 	if (send)
1007 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1008 
1009 out:	kfree_skb(skb);
1010 	return 0;
1011 }
1012 
1013 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1014 {
1015 	struct dst_entry *dst = &rt->dst;
1016 	struct fib_result res;
1017 	bool lock = false;
1018 
1019 	if (ip_mtu_locked(dst))
1020 		return;
1021 
1022 	if (ipv4_mtu(dst) < mtu)
1023 		return;
1024 
1025 	if (mtu < ip_rt_min_pmtu) {
1026 		lock = true;
1027 		mtu = ip_rt_min_pmtu;
1028 	}
1029 
1030 	if (rt->rt_pmtu == mtu &&
1031 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1032 		return;
1033 
1034 	rcu_read_lock();
1035 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1036 		struct fib_nh *nh = &FIB_RES_NH(res);
1037 
1038 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1039 				      jiffies + ip_rt_mtu_expires);
1040 	}
1041 	rcu_read_unlock();
1042 }
1043 
1044 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1045 			      struct sk_buff *skb, u32 mtu)
1046 {
1047 	struct rtable *rt = (struct rtable *) dst;
1048 	struct flowi4 fl4;
1049 
1050 	ip_rt_build_flow_key(&fl4, sk, skb);
1051 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1052 }
1053 
1054 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1055 		      int oif, u32 mark, u8 protocol, int flow_flags)
1056 {
1057 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1058 	struct flowi4 fl4;
1059 	struct rtable *rt;
1060 
1061 	if (!mark)
1062 		mark = IP4_REPLY_MARK(net, skb->mark);
1063 
1064 	__build_flow_key(net, &fl4, NULL, iph, oif,
1065 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1066 	rt = __ip_route_output_key(net, &fl4);
1067 	if (!IS_ERR(rt)) {
1068 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1069 		ip_rt_put(rt);
1070 	}
1071 }
1072 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
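/* Typical caller (illustrative sketch only; argument names vary per driver):
 * a tunnel's ICMP error handler, on ICMP_DEST_UNREACH/ICMP_FRAG_NEEDED for
 * an encapsulated packet, does roughly
 *
 *	ipv4_update_pmtu(skb, dev_net(skb->dev), info, tunnel->parms.link,
 *			 0, iph->protocol, 0);
 *
 * so that the route toward the inner destination learns the reduced MTU.
 */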
1073 
1074 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1075 {
1076 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1077 	struct flowi4 fl4;
1078 	struct rtable *rt;
1079 
1080 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1081 
1082 	if (!fl4.flowi4_mark)
1083 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1084 
1085 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1086 	if (!IS_ERR(rt)) {
1087 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1088 		ip_rt_put(rt);
1089 	}
1090 }
1091 
1092 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1093 {
1094 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1095 	struct flowi4 fl4;
1096 	struct rtable *rt;
1097 	struct dst_entry *odst = NULL;
1098 	bool new = false;
1099 	struct net *net = sock_net(sk);
1100 
1101 	bh_lock_sock(sk);
1102 
1103 	if (!ip_sk_accept_pmtu(sk))
1104 		goto out;
1105 
1106 	odst = sk_dst_get(sk);
1107 
1108 	if (sock_owned_by_user(sk) || !odst) {
1109 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1110 		goto out;
1111 	}
1112 
1113 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1114 
1115 	rt = (struct rtable *)odst;
1116 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1117 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1118 		if (IS_ERR(rt))
1119 			goto out;
1120 
1121 		new = true;
1122 	}
1123 
1124 	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1125 
1126 	if (!dst_check(&rt->dst, 0)) {
1127 		if (new)
1128 			dst_release(&rt->dst);
1129 
1130 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1131 		if (IS_ERR(rt))
1132 			goto out;
1133 
1134 		new = true;
1135 	}
1136 
1137 	if (new)
1138 		sk_dst_set(sk, &rt->dst);
1139 
1140 out:
1141 	bh_unlock_sock(sk);
1142 	dst_release(odst);
1143 }
1144 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1145 
1146 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1147 		   int oif, u32 mark, u8 protocol, int flow_flags)
1148 {
1149 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1150 	struct flowi4 fl4;
1151 	struct rtable *rt;
1152 
1153 	__build_flow_key(net, &fl4, NULL, iph, oif,
1154 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1155 	rt = __ip_route_output_key(net, &fl4);
1156 	if (!IS_ERR(rt)) {
1157 		__ip_do_redirect(rt, skb, &fl4, false);
1158 		ip_rt_put(rt);
1159 	}
1160 }
1161 EXPORT_SYMBOL_GPL(ipv4_redirect);
1162 
1163 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1164 {
1165 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1166 	struct flowi4 fl4;
1167 	struct rtable *rt;
1168 	struct net *net = sock_net(sk);
1169 
1170 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1171 	rt = __ip_route_output_key(net, &fl4);
1172 	if (!IS_ERR(rt)) {
1173 		__ip_do_redirect(rt, skb, &fl4, false);
1174 		ip_rt_put(rt);
1175 	}
1176 }
1177 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1178 
1179 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1180 {
1181 	struct rtable *rt = (struct rtable *) dst;
1182 
1183 	/* All IPv4 dsts are created with ->obsolete set to the value
1184 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1185 	 * into this function always.
1186 	 *
1187 	 * When a PMTU/redirect information update invalidates a route,
1188 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1189 	 * DST_OBSOLETE_DEAD by dst_free().
1190 	 */
1191 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1192 		return NULL;
1193 	return dst;
1194 }
1195 
1196 static void ipv4_link_failure(struct sk_buff *skb)
1197 {
1198 	struct rtable *rt;
1199 
1200 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1201 
1202 	rt = skb_rtable(skb);
1203 	if (rt)
1204 		dst_set_expires(&rt->dst, 0);
1205 }
1206 
1207 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1208 {
1209 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1210 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1211 		 skb->dev ? skb->dev->name : "?");
1212 	kfree_skb(skb);
1213 	WARN_ON(1);
1214 	return 0;
1215 }
1216 
1217 /*
1218    We do not cache the source address of the outgoing interface,
1219    because it is used only by the IP RR, TS and SRR options,
1220    so it is out of the fast path.
1221 
1222    BTW remember: "addr" is allowed to be unaligned
1223    in IP options!
1224  */
1225 
1226 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1227 {
1228 	__be32 src;
1229 
1230 	if (rt_is_output_route(rt))
1231 		src = ip_hdr(skb)->saddr;
1232 	else {
1233 		struct fib_result res;
1234 		struct flowi4 fl4;
1235 		struct iphdr *iph;
1236 
1237 		iph = ip_hdr(skb);
1238 
1239 		memset(&fl4, 0, sizeof(fl4));
1240 		fl4.daddr = iph->daddr;
1241 		fl4.saddr = iph->saddr;
1242 		fl4.flowi4_tos = RT_TOS(iph->tos);
1243 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1244 		fl4.flowi4_iif = skb->dev->ifindex;
1245 		fl4.flowi4_mark = skb->mark;
1246 
1247 		rcu_read_lock();
1248 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1249 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1250 		else
1251 			src = inet_select_addr(rt->dst.dev,
1252 					       rt_nexthop(rt, iph->daddr),
1253 					       RT_SCOPE_UNIVERSE);
1254 		rcu_read_unlock();
1255 	}
1256 	memcpy(addr, &src, 4);
1257 }
1258 
1259 #ifdef CONFIG_IP_ROUTE_CLASSID
1260 static void set_class_tag(struct rtable *rt, u32 tag)
1261 {
1262 	if (!(rt->dst.tclassid & 0xFFFF))
1263 		rt->dst.tclassid |= tag & 0xFFFF;
1264 	if (!(rt->dst.tclassid & 0xFFFF0000))
1265 		rt->dst.tclassid |= tag & 0xFFFF0000;
1266 }
1267 #endif
1268 
1269 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1270 {
1271 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1272 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1273 				    ip_rt_min_advmss);
1274 
1275 	return min(advmss, IPV4_MAX_PMTU - header_size);
1276 }
1277 
1278 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1279 {
1280 	const struct rtable *rt = (const struct rtable *) dst;
1281 	unsigned int mtu = rt->rt_pmtu;
1282 
1283 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1284 		mtu = dst_metric_raw(dst, RTAX_MTU);
1285 
1286 	if (mtu)
1287 		return mtu;
1288 
1289 	mtu = READ_ONCE(dst->dev->mtu);
1290 
1291 	if (unlikely(ip_mtu_locked(dst))) {
1292 		if (rt->rt_uses_gateway && mtu > 576)
1293 			mtu = 576;
1294 	}
1295 
1296 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1297 
1298 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1299 }
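/* In short, ipv4_mtu() returns a still-valid learned PMTU (rt_pmtu) if one
 * exists, otherwise an explicit RTAX_MTU route metric, and only when neither
 * is set falls back to the device MTU, which is clamped to 576 for locked
 * routes via a gateway, capped at IP_MAX_MTU and reduced by any lwtunnel
 * encapsulation headroom.
 */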
1300 
1301 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1302 {
1303 	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1304 	struct fib_nh_exception *fnhe;
1305 	u32 hval;
1306 
1307 	if (!hash)
1308 		return NULL;
1309 
1310 	hval = fnhe_hashfun(daddr);
1311 
1312 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1313 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1314 		if (fnhe->fnhe_daddr == daddr)
1315 			return fnhe;
1316 	}
1317 	return NULL;
1318 }
1319 
1320 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1321 			      __be32 daddr, const bool do_cache)
1322 {
1323 	bool ret = false;
1324 
1325 	spin_lock_bh(&fnhe_lock);
1326 
1327 	if (daddr == fnhe->fnhe_daddr) {
1328 		struct rtable __rcu **porig;
1329 		struct rtable *orig;
1330 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1331 
1332 		if (rt_is_input_route(rt))
1333 			porig = &fnhe->fnhe_rth_input;
1334 		else
1335 			porig = &fnhe->fnhe_rth_output;
1336 		orig = rcu_dereference(*porig);
1337 
1338 		if (fnhe->fnhe_genid != genid) {
1339 			fnhe->fnhe_genid = genid;
1340 			fnhe->fnhe_gw = 0;
1341 			fnhe->fnhe_pmtu = 0;
1342 			fnhe->fnhe_expires = 0;
1343 			fnhe_flush_routes(fnhe);
1344 			orig = NULL;
1345 		}
1346 		fill_route_from_fnhe(rt, fnhe);
1347 		if (!rt->rt_gateway)
1348 			rt->rt_gateway = daddr;
1349 
1350 		if (do_cache) {
1351 			dst_hold(&rt->dst);
1352 			rcu_assign_pointer(*porig, rt);
1353 			if (orig) {
1354 				dst_dev_put(&orig->dst);
1355 				dst_release(&orig->dst);
1356 			}
1357 			ret = true;
1358 		}
1359 
1360 		fnhe->fnhe_stamp = jiffies;
1361 	}
1362 	spin_unlock_bh(&fnhe_lock);
1363 
1364 	return ret;
1365 }
1366 
1367 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1368 {
1369 	struct rtable *orig, *prev, **p;
1370 	bool ret = true;
1371 
1372 	if (rt_is_input_route(rt)) {
1373 		p = (struct rtable **)&nh->nh_rth_input;
1374 	} else {
1375 		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1376 	}
1377 	orig = *p;
1378 
1379 	/* hold dst before doing cmpxchg() to avoid a race condition
1380 	 * on this dst
1381 	 */
1382 	dst_hold(&rt->dst);
1383 	prev = cmpxchg(p, orig, rt);
1384 	if (prev == orig) {
1385 		if (orig) {
1386 			dst_dev_put(&orig->dst);
1387 			dst_release(&orig->dst);
1388 		}
1389 	} else {
1390 		dst_release(&rt->dst);
1391 		ret = false;
1392 	}
1393 
1394 	return ret;
1395 }
1396 
1397 struct uncached_list {
1398 	spinlock_t		lock;
1399 	struct list_head	head;
1400 };
1401 
1402 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1403 
1404 void rt_add_uncached_list(struct rtable *rt)
1405 {
1406 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1407 
1408 	rt->rt_uncached_list = ul;
1409 
1410 	spin_lock_bh(&ul->lock);
1411 	list_add_tail(&rt->rt_uncached, &ul->head);
1412 	spin_unlock_bh(&ul->lock);
1413 }
1414 
1415 void rt_del_uncached_list(struct rtable *rt)
1416 {
1417 	if (!list_empty(&rt->rt_uncached)) {
1418 		struct uncached_list *ul = rt->rt_uncached_list;
1419 
1420 		spin_lock_bh(&ul->lock);
1421 		list_del(&rt->rt_uncached);
1422 		spin_unlock_bh(&ul->lock);
1423 	}
1424 }
1425 
1426 static void ipv4_dst_destroy(struct dst_entry *dst)
1427 {
1428 	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1429 	struct rtable *rt = (struct rtable *)dst;
1430 
1431 	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1432 		kfree(p);
1433 
1434 	rt_del_uncached_list(rt);
1435 }
1436 
1437 void rt_flush_dev(struct net_device *dev)
1438 {
1439 	struct net *net = dev_net(dev);
1440 	struct rtable *rt;
1441 	int cpu;
1442 
1443 	for_each_possible_cpu(cpu) {
1444 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1445 
1446 		spin_lock_bh(&ul->lock);
1447 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1448 			if (rt->dst.dev != dev)
1449 				continue;
1450 			rt->dst.dev = net->loopback_dev;
1451 			dev_hold(rt->dst.dev);
1452 			dev_put(dev);
1453 		}
1454 		spin_unlock_bh(&ul->lock);
1455 	}
1456 }
1457 
1458 static bool rt_cache_valid(const struct rtable *rt)
1459 {
1460 	return	rt &&
1461 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1462 		!rt_is_expired(rt);
1463 }
1464 
1465 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1466 			   const struct fib_result *res,
1467 			   struct fib_nh_exception *fnhe,
1468 			   struct fib_info *fi, u16 type, u32 itag,
1469 			   const bool do_cache)
1470 {
1471 	bool cached = false;
1472 
1473 	if (fi) {
1474 		struct fib_nh *nh = &FIB_RES_NH(*res);
1475 
1476 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1477 			rt->rt_gateway = nh->nh_gw;
1478 			rt->rt_uses_gateway = 1;
1479 		}
1480 		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1481 		if (fi->fib_metrics != &dst_default_metrics) {
1482 			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1483 			refcount_inc(&fi->fib_metrics->refcnt);
1484 		}
1485 #ifdef CONFIG_IP_ROUTE_CLASSID
1486 		rt->dst.tclassid = nh->nh_tclassid;
1487 #endif
1488 		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1489 		if (unlikely(fnhe))
1490 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1491 		else if (do_cache)
1492 			cached = rt_cache_route(nh, rt);
1493 		if (unlikely(!cached)) {
1494 			/* Routes we intend to cache in nexthop exception or
1495 			 * FIB nexthop have the DST_NOCACHE bit clear.
1496 			 * However, if we are unsuccessful at storing this
1497 			 * route into the cache we really need to set it.
1498 			 */
1499 			if (!rt->rt_gateway)
1500 				rt->rt_gateway = daddr;
1501 			rt_add_uncached_list(rt);
1502 		}
1503 	} else
1504 		rt_add_uncached_list(rt);
1505 
1506 #ifdef CONFIG_IP_ROUTE_CLASSID
1507 #ifdef CONFIG_IP_MULTIPLE_TABLES
1508 	set_class_tag(rt, res->tclassid);
1509 #endif
1510 	set_class_tag(rt, itag);
1511 #endif
1512 }
1513 
1514 struct rtable *rt_dst_alloc(struct net_device *dev,
1515 			    unsigned int flags, u16 type,
1516 			    bool nopolicy, bool noxfrm, bool will_cache)
1517 {
1518 	struct rtable *rt;
1519 
1520 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1521 		       (will_cache ? 0 : DST_HOST) |
1522 		       (nopolicy ? DST_NOPOLICY : 0) |
1523 		       (noxfrm ? DST_NOXFRM : 0));
1524 
1525 	if (rt) {
1526 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1527 		rt->rt_flags = flags;
1528 		rt->rt_type = type;
1529 		rt->rt_is_input = 0;
1530 		rt->rt_iif = 0;
1531 		rt->rt_pmtu = 0;
1532 		rt->rt_mtu_locked = 0;
1533 		rt->rt_gateway = 0;
1534 		rt->rt_uses_gateway = 0;
1535 		rt->rt_table_id = 0;
1536 		INIT_LIST_HEAD(&rt->rt_uncached);
1537 
1538 		rt->dst.output = ip_output;
1539 		if (flags & RTCF_LOCAL)
1540 			rt->dst.input = ip_local_deliver;
1541 	}
1542 
1543 	return rt;
1544 }
1545 EXPORT_SYMBOL(rt_dst_alloc);
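/* Within this file rt_dst_alloc() backs every flavour of route: multicast
 * input (ip_route_input_mc), forwarded input (__mkroute_input) and the
 * local/broadcast input path further below.  will_cache should be true
 * whenever the caller may later store the dst via rt_cache_route() or
 * rt_bind_exception(); otherwise the dst is allocated with DST_HOST set.
 */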
1546 
1547 /* called in rcu_read_lock() section */
1548 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1549 			  u8 tos, struct net_device *dev,
1550 			  struct in_device *in_dev, u32 *itag)
1551 {
1552 	int err;
1553 
1554 	/* Primary sanity checks. */
1555 	if (!in_dev)
1556 		return -EINVAL;
1557 
1558 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1559 	    skb->protocol != htons(ETH_P_IP))
1560 		return -EINVAL;
1561 
1562 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1563 		return -EINVAL;
1564 
1565 	if (ipv4_is_zeronet(saddr)) {
1566 		if (!ipv4_is_local_multicast(daddr))
1567 			return -EINVAL;
1568 	} else {
1569 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1570 					  in_dev, itag);
1571 		if (err < 0)
1572 			return err;
1573 	}
1574 	return 0;
1575 }
1576 
1577 /* called in rcu_read_lock() section */
1578 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1579 			     u8 tos, struct net_device *dev, int our)
1580 {
1581 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1582 	unsigned int flags = RTCF_MULTICAST;
1583 	struct rtable *rth;
1584 	u32 itag = 0;
1585 	int err;
1586 
1587 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1588 	if (err)
1589 		return err;
1590 
1591 	if (our)
1592 		flags |= RTCF_LOCAL;
1593 
1594 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1595 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1596 	if (!rth)
1597 		return -ENOBUFS;
1598 
1599 #ifdef CONFIG_IP_ROUTE_CLASSID
1600 	rth->dst.tclassid = itag;
1601 #endif
1602 	rth->dst.output = ip_rt_bug;
1603 	rth->rt_is_input= 1;
1604 
1605 #ifdef CONFIG_IP_MROUTE
1606 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1607 		rth->dst.input = ip_mr_input;
1608 #endif
1609 	RT_CACHE_STAT_INC(in_slow_mc);
1610 
1611 	skb_dst_set(skb, &rth->dst);
1612 	return 0;
1613 }
1614 
1615 
1616 static void ip_handle_martian_source(struct net_device *dev,
1617 				     struct in_device *in_dev,
1618 				     struct sk_buff *skb,
1619 				     __be32 daddr,
1620 				     __be32 saddr)
1621 {
1622 	RT_CACHE_STAT_INC(in_martian_src);
1623 #ifdef CONFIG_IP_ROUTE_VERBOSE
1624 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1625 		/*
1626 		 *	RFC1812 recommendation: if the source is martian,
1627 		 *	the only hint is the MAC header.
1628 		 */
1629 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1630 			&daddr, &saddr, dev->name);
1631 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1632 			print_hex_dump(KERN_WARNING, "ll header: ",
1633 				       DUMP_PREFIX_OFFSET, 16, 1,
1634 				       skb_mac_header(skb),
1635 				       dev->hard_header_len, true);
1636 		}
1637 	}
1638 #endif
1639 }
1640 
1641 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1642 {
1643 	struct fnhe_hash_bucket *hash;
1644 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1645 	u32 hval = fnhe_hashfun(daddr);
1646 
1647 	spin_lock_bh(&fnhe_lock);
1648 
1649 	hash = rcu_dereference_protected(nh->nh_exceptions,
1650 					 lockdep_is_held(&fnhe_lock));
1651 	hash += hval;
1652 
1653 	fnhe_p = &hash->chain;
1654 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1655 	while (fnhe) {
1656 		if (fnhe->fnhe_daddr == daddr) {
1657 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1658 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1659 			fnhe_flush_routes(fnhe);
1660 			kfree_rcu(fnhe, rcu);
1661 			break;
1662 		}
1663 		fnhe_p = &fnhe->fnhe_next;
1664 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1665 						 lockdep_is_held(&fnhe_lock));
1666 	}
1667 
1668 	spin_unlock_bh(&fnhe_lock);
1669 }
1670 
1671 static void set_lwt_redirect(struct rtable *rth)
1672 {
1673 	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1674 		rth->dst.lwtstate->orig_output = rth->dst.output;
1675 		rth->dst.output = lwtunnel_output;
1676 	}
1677 
1678 	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1679 		rth->dst.lwtstate->orig_input = rth->dst.input;
1680 		rth->dst.input = lwtunnel_input;
1681 	}
1682 }
1683 
1684 /* called in rcu_read_lock() section */
1685 static int __mkroute_input(struct sk_buff *skb,
1686 			   const struct fib_result *res,
1687 			   struct in_device *in_dev,
1688 			   __be32 daddr, __be32 saddr, u32 tos)
1689 {
1690 	struct fib_nh_exception *fnhe;
1691 	struct rtable *rth;
1692 	int err;
1693 	struct in_device *out_dev;
1694 	bool do_cache;
1695 	u32 itag = 0;
1696 
1697 	/* get a working reference to the output device */
1698 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1699 	if (!out_dev) {
1700 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1701 		return -EINVAL;
1702 	}
1703 
1704 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1705 				  in_dev->dev, in_dev, &itag);
1706 	if (err < 0) {
1707 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1708 					 saddr);
1709 
1710 		goto cleanup;
1711 	}
1712 
1713 	do_cache = res->fi && !itag;
1714 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1715 	    skb->protocol == htons(ETH_P_IP) &&
1716 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1717 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1718 		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1719 
1720 	if (skb->protocol != htons(ETH_P_IP)) {
1721 		/* Not IP (i.e. ARP). Do not create a route if it is
1722 		 * invalid for proxy arp. DNAT routes are always valid.
1723 		 *
1724 		 * The proxy arp feature has been extended to allow ARP
1725 		 * replies back to the same interface, to support
1726 		 * Private VLAN switch technologies. See arp.c.
1727 		 */
1728 		if (out_dev == in_dev &&
1729 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1730 			err = -EINVAL;
1731 			goto cleanup;
1732 		}
1733 	}
1734 
1735 	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1736 	if (do_cache) {
1737 		if (fnhe) {
1738 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1739 			if (rth && rth->dst.expires &&
1740 			    time_after(jiffies, rth->dst.expires)) {
1741 				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1742 				fnhe = NULL;
1743 			} else {
1744 				goto rt_cache;
1745 			}
1746 		}
1747 
1748 		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1749 
1750 rt_cache:
1751 		if (rt_cache_valid(rth)) {
1752 			skb_dst_set_noref(skb, &rth->dst);
1753 			goto out;
1754 		}
1755 	}
1756 
1757 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1758 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1759 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1760 	if (!rth) {
1761 		err = -ENOBUFS;
1762 		goto cleanup;
1763 	}
1764 
1765 	rth->rt_is_input = 1;
1766 	if (res->table)
1767 		rth->rt_table_id = res->table->tb_id;
1768 	RT_CACHE_STAT_INC(in_slow_tot);
1769 
1770 	rth->dst.input = ip_forward;
1771 
1772 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1773 		       do_cache);
1774 	set_lwt_redirect(rth);
1775 	skb_dst_set(skb, &rth->dst);
1776 out:
1777 	err = 0;
1778  cleanup:
1779 	return err;
1780 }
1781 
1782 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1783 /* To make ICMP packets follow the right flow, the multipath hash is
1784  * calculated from the inner IP addresses.
1785  */
1786 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1787 				 struct flow_keys *hash_keys)
1788 {
1789 	const struct iphdr *outer_iph = ip_hdr(skb);
1790 	const struct iphdr *inner_iph;
1791 	const struct icmphdr *icmph;
1792 	struct iphdr _inner_iph;
1793 	struct icmphdr _icmph;
1794 
1795 	hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1796 	hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1797 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1798 		return;
1799 
1800 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1801 		return;
1802 
1803 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1804 				   &_icmph);
1805 	if (!icmph)
1806 		return;
1807 
1808 	if (icmph->type != ICMP_DEST_UNREACH &&
1809 	    icmph->type != ICMP_REDIRECT &&
1810 	    icmph->type != ICMP_TIME_EXCEEDED &&
1811 	    icmph->type != ICMP_PARAMETERPROB)
1812 		return;
1813 
1814 	inner_iph = skb_header_pointer(skb,
1815 				       outer_iph->ihl * 4 + sizeof(_icmph),
1816 				       sizeof(_inner_iph), &_inner_iph);
1817 	if (!inner_iph)
1818 		return;
1819 	hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1820 	hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1821 }
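/* Example: a router on the path sends ICMP_TIME_EXCEEDED about a TCP flow
 * A -> B.  The outer header of that ICMP packet is router -> A, but the
 * hash is taken from the embedded A -> B header, so the error hashes onto
 * the same multipath leg as the flow it refers to.
 */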
1822 
1823 /* if skb is set it will be used and fl4 can be NULL */
1824 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1825 		       const struct sk_buff *skb)
1826 {
1827 	struct net *net = fi->fib_net;
1828 	struct flow_keys hash_keys;
1829 	u32 mhash;
1830 
1831 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1832 	case 0:
1833 		memset(&hash_keys, 0, sizeof(hash_keys));
1834 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1835 		if (skb) {
1836 			ip_multipath_l3_keys(skb, &hash_keys);
1837 		} else {
1838 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1839 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1840 		}
1841 		break;
1842 	case 1:
1843 		/* skb is currently provided only when forwarding */
1844 		if (skb) {
1845 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1846 			struct flow_keys keys;
1847 
1848 			/* short-circuit if we already have L4 hash present */
1849 			if (skb->l4_hash)
1850 				return skb_get_hash_raw(skb) >> 1;
1851 			memset(&hash_keys, 0, sizeof(hash_keys));
1852 			skb_flow_dissect_flow_keys(skb, &keys, flag);
1853 
1854 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1855 			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1856 			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1857 			hash_keys.ports.src = keys.ports.src;
1858 			hash_keys.ports.dst = keys.ports.dst;
1859 			hash_keys.basic.ip_proto = keys.basic.ip_proto;
1860 		} else {
1861 			memset(&hash_keys, 0, sizeof(hash_keys));
1862 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1863 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1864 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1865 			hash_keys.ports.src = fl4->fl4_sport;
1866 			hash_keys.ports.dst = fl4->fl4_dport;
1867 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1868 		}
1869 		break;
1870 	}
1871 	mhash = flow_hash_from_keys(&hash_keys);
1872 
1873 	return mhash >> 1;
1874 }
1875 EXPORT_SYMBOL_GPL(fib_multipath_hash);
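/* Policy 0 above is the traditional L3 hash (source and destination address
 * only, with the ICMP special case handled by ip_multipath_l3_keys());
 * policy 1 also mixes in the L4 ports and protocol.  The policy is selected
 * per namespace via the fib_multipath_hash_policy sysctl
 * (net.ipv4.fib_multipath_hash_policy).
 */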
1876 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1877 
1878 static int ip_mkroute_input(struct sk_buff *skb,
1879 			    struct fib_result *res,
1880 			    struct in_device *in_dev,
1881 			    __be32 daddr, __be32 saddr, u32 tos)
1882 {
1883 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1884 	if (res->fi && res->fi->fib_nhs > 1) {
1885 		int h = fib_multipath_hash(res->fi, NULL, skb);
1886 
1887 		fib_select_multipath(res, h);
1888 	}
1889 #endif
1890 
1891 	/* create a routing cache entry */
1892 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1893 }
1894 
1895 /*
1896  *	NOTE. We drop all packets that have a local source
1897  *	address, because every properly looped-back packet
1898  *	must already have the correct destination attached by the output routine.
1899  *
1900  *	Such an approach solves two big problems:
1901  *	1. Non-simplex devices are handled properly.
1902  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1903  *	called with rcu_read_lock()
1904  */
1905 
1906 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1907 			       u8 tos, struct net_device *dev,
1908 			       struct fib_result *res)
1909 {
1910 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1911 	struct ip_tunnel_info *tun_info;
1912 	struct flowi4	fl4;
1913 	unsigned int	flags = 0;
1914 	u32		itag = 0;
1915 	struct rtable	*rth;
1916 	int		err = -EINVAL;
1917 	struct net    *net = dev_net(dev);
1918 	bool do_cache;
1919 
1920 	/* IP on this device is disabled. */
1921 
1922 	if (!in_dev)
1923 		goto out;
1924 
1925 	/* Check for the weirdest martians, which cannot be detected
1926 	   by fib_lookup.
1927 	 */
1928 
1929 	tun_info = skb_tunnel_info(skb);
1930 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1931 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1932 	else
1933 		fl4.flowi4_tun_key.tun_id = 0;
1934 	skb_dst_drop(skb);
1935 
1936 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1937 		goto martian_source;
1938 
1939 	res->fi = NULL;
1940 	res->table = NULL;
1941 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1942 		goto brd_input;
1943 
1944 	/* Accept zero addresses only for limited broadcast;
1945 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1946 	 */
1947 	if (ipv4_is_zeronet(saddr))
1948 		goto martian_source;
1949 
1950 	if (ipv4_is_zeronet(daddr))
1951 		goto martian_destination;
1952 
1953 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1954 	 * calling it only once if daddr and/or saddr are loopback addresses
1955 	 */
1956 	if (ipv4_is_loopback(daddr)) {
1957 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1958 			goto martian_destination;
1959 	} else if (ipv4_is_loopback(saddr)) {
1960 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1961 			goto martian_source;
1962 	}
1963 
1964 	/*
1965 	 *	Now we are ready to route packet.
1966 	 */
1967 	fl4.flowi4_oif = 0;
1968 	fl4.flowi4_iif = dev->ifindex;
1969 	fl4.flowi4_mark = skb->mark;
1970 	fl4.flowi4_tos = tos;
1971 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1972 	fl4.flowi4_flags = 0;
1973 	fl4.daddr = daddr;
1974 	fl4.saddr = saddr;
1975 	fl4.flowi4_uid = sock_net_uid(net, NULL);
1976 	err = fib_lookup(net, &fl4, res, 0);
1977 	if (err != 0) {
1978 		if (!IN_DEV_FORWARD(in_dev))
1979 			err = -EHOSTUNREACH;
1980 		goto no_route;
1981 	}
1982 
1983 	if (res->type == RTN_BROADCAST)
1984 		goto brd_input;
1985 
1986 	if (res->type == RTN_LOCAL) {
1987 		err = fib_validate_source(skb, saddr, daddr, tos,
1988 					  0, dev, in_dev, &itag);
1989 		if (err < 0)
1990 			goto martian_source;
1991 		goto local_input;
1992 	}
1993 
1994 	if (!IN_DEV_FORWARD(in_dev)) {
1995 		err = -EHOSTUNREACH;
1996 		goto no_route;
1997 	}
1998 	if (res->type != RTN_UNICAST)
1999 		goto martian_destination;
2000 
2001 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2002 out:	return err;
2003 
2004 brd_input:
2005 	if (skb->protocol != htons(ETH_P_IP))
2006 		goto e_inval;
2007 
2008 	if (!ipv4_is_zeronet(saddr)) {
2009 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2010 					  in_dev, &itag);
2011 		if (err < 0)
2012 			goto martian_source;
2013 	}
2014 	flags |= RTCF_BROADCAST;
2015 	res->type = RTN_BROADCAST;
2016 	RT_CACHE_STAT_INC(in_brd);
2017 
2018 local_input:
2019 	do_cache = false;
2020 	if (res->fi) {
2021 		if (!itag) {
2022 			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2023 			if (rt_cache_valid(rth)) {
2024 				skb_dst_set_noref(skb, &rth->dst);
2025 				err = 0;
2026 				goto out;
2027 			}
2028 			do_cache = true;
2029 		}
2030 	}
2031 
2032 	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2033 			   flags | RTCF_LOCAL, res->type,
2034 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2035 	if (!rth)
2036 		goto e_nobufs;
2037 
2038 	rth->dst.output = ip_rt_bug;
2039 #ifdef CONFIG_IP_ROUTE_CLASSID
2040 	rth->dst.tclassid = itag;
2041 #endif
2042 	rth->rt_is_input = 1;
2043 	if (res->table)
2044 		rth->rt_table_id = res->table->tb_id;
2045 
2046 	RT_CACHE_STAT_INC(in_slow_tot);
2047 	if (res->type == RTN_UNREACHABLE) {
2048 		rth->dst.input = ip_error;
2049 		rth->dst.error = -err;
2050 		rth->rt_flags &= ~RTCF_LOCAL;
2051 	}
2052 
2053 	if (do_cache) {
2054 		struct fib_nh *nh = &FIB_RES_NH(*res);
2055 
2056 		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2057 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2058 			WARN_ON(rth->dst.input == lwtunnel_input);
2059 			rth->dst.lwtstate->orig_input = rth->dst.input;
2060 			rth->dst.input = lwtunnel_input;
2061 		}
2062 
2063 		if (unlikely(!rt_cache_route(nh, rth)))
2064 			rt_add_uncached_list(rth);
2065 	}
2066 	skb_dst_set(skb, &rth->dst);
2067 	err = 0;
2068 	goto out;
2069 
2070 no_route:
2071 	RT_CACHE_STAT_INC(in_no_route);
2072 	res->type = RTN_UNREACHABLE;
2073 	res->fi = NULL;
2074 	res->table = NULL;
2075 	goto local_input;
2076 
2077 	/*
2078 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2079 	 */
2080 martian_destination:
2081 	RT_CACHE_STAT_INC(in_martian_dst);
2082 #ifdef CONFIG_IP_ROUTE_VERBOSE
2083 	if (IN_DEV_LOG_MARTIANS(in_dev))
2084 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2085 				     &daddr, &saddr, dev->name);
2086 #endif
2087 
2088 e_inval:
2089 	err = -EINVAL;
2090 	goto out;
2091 
2092 e_nobufs:
2093 	err = -ENOBUFS;
2094 	goto out;
2095 
2096 martian_source:
2097 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2098 	goto out;
2099 }
2100 
2101 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2102 			 u8 tos, struct net_device *dev)
2103 {
2104 	struct fib_result res;
2105 	int err;
2106 
2107 	tos &= IPTOS_RT_MASK;
2108 	rcu_read_lock();
2109 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2110 	rcu_read_unlock();
2111 
2112 	return err;
2113 }
2114 EXPORT_SYMBOL(ip_route_input_noref);
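
/* Typical caller sketch (assuming the usual receive path; illustrative only):
 * the IP receive path resolves the input route roughly like
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (err)
 *		goto drop;
 *
 * after which skb_dst(skb) carries the rtable used by ip_forward() or
 * ip_local_deliver().
 */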
2115 
2116 /* called with rcu_read_lock held */
2117 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2118 		       u8 tos, struct net_device *dev, struct fib_result *res)
2119 {
2120 	/* Multicast recognition logic was moved from the route cache to here.
2121 	   The problem was that too many Ethernet cards have broken/missing
2122 	   hardware multicast filters :-( As a result, a host on a multicast
2123 	   network acquires a lot of useless route cache entries, e.g. from
2124 	   SDR messages from all over the world. Now we try to get rid of them.
2125 	   Really, provided the software IP multicast filter is organized
2126 	   reasonably (at least, hashed), this does not result in a slowdown
2127 	   compared with route cache reject entries.
2128 	   Note that multicast routers are not affected, because a
2129 	   route cache entry is created eventually.
2130 	 */
2131 	if (ipv4_is_multicast(daddr)) {
2132 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2133 		int our = 0;
2134 		int err = -EINVAL;
2135 
2136 		if (in_dev)
2137 			our = ip_check_mc_rcu(in_dev, daddr, saddr,
2138 					      ip_hdr(skb)->protocol);
2139 
2140 		/* check l3 master if no match yet */
2141 		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2142 			struct in_device *l3_in_dev;
2143 
2144 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2145 			if (l3_in_dev)
2146 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2147 						      ip_hdr(skb)->protocol);
2148 		}
2149 
2150 		if (our
2151 #ifdef CONFIG_IP_MROUTE
2152 			||
2153 		    (!ipv4_is_local_multicast(daddr) &&
2154 		     IN_DEV_MFORWARD(in_dev))
2155 #endif
2156 		   ) {
2157 			err = ip_route_input_mc(skb, daddr, saddr,
2158 						tos, dev, our);
2159 		}
2160 		return err;
2161 	}
2162 
2163 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2164 }
2165 
2166 /* called with rcu_read_lock() */
2167 static struct rtable *__mkroute_output(const struct fib_result *res,
2168 				       const struct flowi4 *fl4, int orig_oif,
2169 				       struct net_device *dev_out,
2170 				       unsigned int flags)
2171 {
2172 	struct fib_info *fi = res->fi;
2173 	struct fib_nh_exception *fnhe;
2174 	struct in_device *in_dev;
2175 	u16 type = res->type;
2176 	struct rtable *rth;
2177 	bool do_cache;
2178 
2179 	in_dev = __in_dev_get_rcu(dev_out);
2180 	if (!in_dev)
2181 		return ERR_PTR(-EINVAL);
2182 
2183 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2184 		if (ipv4_is_loopback(fl4->saddr) &&
2185 		    !(dev_out->flags & IFF_LOOPBACK) &&
2186 		    !netif_is_l3_master(dev_out))
2187 			return ERR_PTR(-EINVAL);
2188 
2189 	if (ipv4_is_lbcast(fl4->daddr))
2190 		type = RTN_BROADCAST;
2191 	else if (ipv4_is_multicast(fl4->daddr))
2192 		type = RTN_MULTICAST;
2193 	else if (ipv4_is_zeronet(fl4->daddr))
2194 		return ERR_PTR(-EINVAL);
2195 
2196 	if (dev_out->flags & IFF_LOOPBACK)
2197 		flags |= RTCF_LOCAL;
2198 
2199 	do_cache = true;
2200 	if (type == RTN_BROADCAST) {
2201 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2202 		fi = NULL;
2203 	} else if (type == RTN_MULTICAST) {
2204 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2205 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2206 				     fl4->flowi4_proto))
2207 			flags &= ~RTCF_LOCAL;
2208 		else
2209 			do_cache = false;
2210 		/* If a multicast route does not exist, use the
2211 		 * default one, but do not use a gateway in this case.
2212 		 * Yes, it is a hack.
2213 		 */
2214 		if (fi && res->prefixlen < 4)
2215 			fi = NULL;
2216 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2217 		   (orig_oif != dev_out->ifindex)) {
2218 		/* For local routes that require a particular output interface
2219 		 * we do not want to cache the result.  Caching the result
2220 		 * causes incorrect behaviour when there are multiple source
2221 		 * addresses on the interface, the end result being that if the
2222 		 * intended recipient is waiting on that interface for the
2223 		 * packet, it won't be received because it will be delivered on
2224 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2225 		 * be set to the loopback interface as well.
2226 		 */
2227 		fi = NULL;
2228 	}
2229 
2230 	fnhe = NULL;
2231 	do_cache &= fi != NULL;
2232 	if (do_cache) {
2233 		struct rtable __rcu **prth;
2234 		struct fib_nh *nh = &FIB_RES_NH(*res);
2235 
2236 		fnhe = find_exception(nh, fl4->daddr);
2237 		if (fnhe) {
2238 			prth = &fnhe->fnhe_rth_output;
2239 			rth = rcu_dereference(*prth);
2240 			if (rth && rth->dst.expires &&
2241 			    time_after(jiffies, rth->dst.expires)) {
2242 				ip_del_fnhe(nh, fl4->daddr);
2243 				fnhe = NULL;
2244 			} else {
2245 				goto rt_cache;
2246 			}
2247 		}
2248 
2249 		if (unlikely(fl4->flowi4_flags &
2250 			     FLOWI_FLAG_KNOWN_NH &&
2251 			     !(nh->nh_gw &&
2252 			       nh->nh_scope == RT_SCOPE_LINK))) {
2253 			do_cache = false;
2254 			goto add;
2255 		}
2256 		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2257 		rth = rcu_dereference(*prth);
2258 
2259 rt_cache:
2260 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2261 			return rth;
2262 	}
2263 
2264 add:
2265 	rth = rt_dst_alloc(dev_out, flags, type,
2266 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2267 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2268 			   do_cache);
2269 	if (!rth)
2270 		return ERR_PTR(-ENOBUFS);
2271 
2272 	rth->rt_iif = orig_oif;
2273 	if (res->table)
2274 		rth->rt_table_id = res->table->tb_id;
2275 
2276 	RT_CACHE_STAT_INC(out_slow_tot);
2277 
2278 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2279 		if (flags & RTCF_LOCAL &&
2280 		    !(dev_out->flags & IFF_LOOPBACK)) {
2281 			rth->dst.output = ip_mc_output;
2282 			RT_CACHE_STAT_INC(out_slow_mc);
2283 		}
2284 #ifdef CONFIG_IP_MROUTE
2285 		if (type == RTN_MULTICAST) {
2286 			if (IN_DEV_MFORWARD(in_dev) &&
2287 			    !ipv4_is_local_multicast(fl4->daddr)) {
2288 				rth->dst.input = ip_mr_input;
2289 				rth->dst.output = ip_mc_output;
2290 			}
2291 		}
2292 #endif
2293 	}
2294 
2295 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2296 	set_lwt_redirect(rth);
2297 
2298 	return rth;
2299 }
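
/* Summary of the caching above (descriptive only, no new behaviour): when a
 * nexthop exception (e.g. a learned PMTU or redirect) exists for the
 * destination, the output route is cached in fnhe->fnhe_rth_output; otherwise
 * it lands in the per-CPU nh->nh_pcpu_rth_output slot.  A later lookup that
 * finds a still-valid cached rtable takes a reference and returns it directly
 * instead of allocating a new one.
 */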
2300 
2301 /*
2302  * Major route resolver routine.
2303  */
2304 
2305 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2306 					const struct sk_buff *skb)
2307 {
2308 	__u8 tos = RT_FL_TOS(fl4);
2309 	struct fib_result res;
2310 	struct rtable *rth;
2311 
2312 	res.tclassid	= 0;
2313 	res.fi		= NULL;
2314 	res.table	= NULL;
2315 
2316 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2317 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2318 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2319 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2320 
2321 	rcu_read_lock();
2322 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2323 	rcu_read_unlock();
2324 
2325 	return rth;
2326 }
2327 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2328 
2329 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2330 					    struct fib_result *res,
2331 					    const struct sk_buff *skb)
2332 {
2333 	struct net_device *dev_out = NULL;
2334 	int orig_oif = fl4->flowi4_oif;
2335 	unsigned int flags = 0;
2336 	struct rtable *rth;
2337 	int err = -ENETUNREACH;
2338 
2339 	if (fl4->saddr) {
2340 		rth = ERR_PTR(-EINVAL);
2341 		if (ipv4_is_multicast(fl4->saddr) ||
2342 		    ipv4_is_lbcast(fl4->saddr) ||
2343 		    ipv4_is_zeronet(fl4->saddr))
2344 			goto out;
2345 
2346 		/* I removed the check for oif == dev_out->oif here.
2347 		   It was wrong for two reasons:
2348 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2349 		      is assigned to multiple interfaces.
2350 		   2. Moreover, we are allowed to send packets with the saddr
2351 		      of another iface. --ANK
2352 		 */
2353 
2354 		if (fl4->flowi4_oif == 0 &&
2355 		    (ipv4_is_multicast(fl4->daddr) ||
2356 		     ipv4_is_lbcast(fl4->daddr))) {
2357 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2358 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2359 			if (!dev_out)
2360 				goto out;
2361 
2362 			/* Special hack: the user can direct multicasts
2363 			   and limited broadcasts via the necessary interface
2364 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2365 			   This hack is not just for fun, it allows
2366 			   vic, vat and friends to work.
2367 			   They bind a socket to loopback, set the ttl to zero
2368 			   and expect that it will work.
2369 			   From the viewpoint of the routing cache they are broken,
2370 			   because we are not allowed to build a multicast path
2371 			   with a loopback source addr (the routing cache
2372 			   cannot know that the ttl is zero, so that the packet
2373 			   will not leave this host and the route looks valid).
2374 			   Luckily, this hack is a good workaround.
2375 			 */
2376 
2377 			fl4->flowi4_oif = dev_out->ifindex;
2378 			goto make_route;
2379 		}
2380 
2381 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2382 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2383 			if (!__ip_dev_find(net, fl4->saddr, false))
2384 				goto out;
2385 		}
2386 	}
2387 
2388 
2389 	if (fl4->flowi4_oif) {
2390 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2391 		rth = ERR_PTR(-ENODEV);
2392 		if (!dev_out)
2393 			goto out;
2394 
2395 		/* RACE: Check return value of inet_select_addr instead. */
2396 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2397 			rth = ERR_PTR(-ENETUNREACH);
2398 			goto out;
2399 		}
2400 		if (ipv4_is_local_multicast(fl4->daddr) ||
2401 		    ipv4_is_lbcast(fl4->daddr) ||
2402 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2403 			if (!fl4->saddr)
2404 				fl4->saddr = inet_select_addr(dev_out, 0,
2405 							      RT_SCOPE_LINK);
2406 			goto make_route;
2407 		}
2408 		if (!fl4->saddr) {
2409 			if (ipv4_is_multicast(fl4->daddr))
2410 				fl4->saddr = inet_select_addr(dev_out, 0,
2411 							      fl4->flowi4_scope);
2412 			else if (!fl4->daddr)
2413 				fl4->saddr = inet_select_addr(dev_out, 0,
2414 							      RT_SCOPE_HOST);
2415 		}
2416 	}
2417 
2418 	if (!fl4->daddr) {
2419 		fl4->daddr = fl4->saddr;
2420 		if (!fl4->daddr)
2421 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2422 		dev_out = net->loopback_dev;
2423 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2424 		res->type = RTN_LOCAL;
2425 		flags |= RTCF_LOCAL;
2426 		goto make_route;
2427 	}
2428 
2429 	err = fib_lookup(net, fl4, res, 0);
2430 	if (err) {
2431 		res->fi = NULL;
2432 		res->table = NULL;
2433 		if (fl4->flowi4_oif &&
2434 		    (ipv4_is_multicast(fl4->daddr) ||
2435 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2436 			/* Apparently, the routing tables are wrong. Assume
2437 			   that the destination is on-link.
2438 
2439 			   WHY? DW.
2440 			   Because we are allowed to send to an iface
2441 			   even if it has NO routes and NO assigned
2442 			   addresses. When oif is specified, the routing
2443 			   tables are looked up with only one purpose:
2444 			   to check whether the destination is gatewayed rather
2445 			   than direct. Moreover, if MSG_DONTROUTE is set,
2446 			   we send the packet, ignoring both routing tables
2447 			   and ifaddr state. --ANK
2448 
2449 
2450 			   We could do this even if oif is unknown,
2451 			   likely IPv6, but we do not.
2452 			 */
2453 
2454 			if (fl4->saddr == 0)
2455 				fl4->saddr = inet_select_addr(dev_out, 0,
2456 							      RT_SCOPE_LINK);
2457 			res->type = RTN_UNICAST;
2458 			goto make_route;
2459 		}
2460 		rth = ERR_PTR(err);
2461 		goto out;
2462 	}
2463 
2464 	if (res->type == RTN_LOCAL) {
2465 		if (!fl4->saddr) {
2466 			if (res->fi->fib_prefsrc)
2467 				fl4->saddr = res->fi->fib_prefsrc;
2468 			else
2469 				fl4->saddr = fl4->daddr;
2470 		}
2471 
2472 		/* L3 master device is the loopback for that domain */
2473 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2474 			net->loopback_dev;
2475 
2476 		/* make sure orig_oif points to fib result device even
2477 		 * though packet rx/tx happens over loopback or l3mdev
2478 		 */
2479 		orig_oif = FIB_RES_OIF(*res);
2480 
2481 		fl4->flowi4_oif = dev_out->ifindex;
2482 		flags |= RTCF_LOCAL;
2483 		goto make_route;
2484 	}
2485 
2486 	fib_select_path(net, res, fl4, skb);
2487 
2488 	dev_out = FIB_RES_DEV(*res);
2489 	fl4->flowi4_oif = dev_out->ifindex;
2490 
2491 
2492 make_route:
2493 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2494 
2495 out:
2496 	return rth;
2497 }
2498 
2499 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2500 {
2501 	return NULL;
2502 }
2503 
2504 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2505 {
2506 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2507 
2508 	return mtu ? : dst->dev->mtu;
2509 }
2510 
2511 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2512 					  struct sk_buff *skb, u32 mtu)
2513 {
2514 }
2515 
2516 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2517 				       struct sk_buff *skb)
2518 {
2519 }
2520 
2521 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2522 					  unsigned long old)
2523 {
2524 	return NULL;
2525 }
2526 
2527 static struct dst_ops ipv4_dst_blackhole_ops = {
2528 	.family			=	AF_INET,
2529 	.check			=	ipv4_blackhole_dst_check,
2530 	.mtu			=	ipv4_blackhole_mtu,
2531 	.default_advmss		=	ipv4_default_advmss,
2532 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2533 	.redirect		=	ipv4_rt_blackhole_redirect,
2534 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2535 	.neigh_lookup		=	ipv4_neigh_lookup,
2536 };
2537 
2538 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2539 {
2540 	struct rtable *ort = (struct rtable *) dst_orig;
2541 	struct rtable *rt;
2542 
2543 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2544 	if (rt) {
2545 		struct dst_entry *new = &rt->dst;
2546 
2547 		new->__use = 1;
2548 		new->input = dst_discard;
2549 		new->output = dst_discard_out;
2550 
2551 		new->dev = net->loopback_dev;
2552 		if (new->dev)
2553 			dev_hold(new->dev);
2554 
2555 		rt->rt_is_input = ort->rt_is_input;
2556 		rt->rt_iif = ort->rt_iif;
2557 		rt->rt_pmtu = ort->rt_pmtu;
2558 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2559 
2560 		rt->rt_genid = rt_genid_ipv4(net);
2561 		rt->rt_flags = ort->rt_flags;
2562 		rt->rt_type = ort->rt_type;
2563 		rt->rt_gateway = ort->rt_gateway;
2564 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2565 
2566 		INIT_LIST_HEAD(&rt->rt_uncached);
2567 	}
2568 
2569 	dst_release(dst_orig);
2570 
2571 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2572 }
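
/* The blackhole dst built above is handed out by the IPsec code (see
 * xfrm_lookup_route() in ip_route_output_flow() below) when a resolved route
 * cannot yet be used, presumably so that packets are silently discarded
 * rather than sent in the clear while an SA is still being negotiated.
 * Every dst_ops callback on it is a no-op, and input/output simply discard.
 */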
2573 
2574 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2575 				    const struct sock *sk)
2576 {
2577 	struct rtable *rt = __ip_route_output_key(net, flp4);
2578 
2579 	if (IS_ERR(rt))
2580 		return rt;
2581 
2582 	if (flp4->flowi4_proto)
2583 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2584 							flowi4_to_flowi(flp4),
2585 							sk, 0);
2586 
2587 	return rt;
2588 }
2589 EXPORT_SYMBOL_GPL(ip_route_output_flow);
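
/* A minimal caller sketch (illustrative only; the field values are
 * placeholders, not taken from any particular in-tree user):
 *
 *	struct flowi4 fl4;
 *
 *	flowi4_init_output(&fl4, 0, sk->sk_mark, RT_TOS(inet->tos),
 *			   RT_SCOPE_UNIVERSE, IPPROTO_TCP, 0,
 *			   daddr, saddr, dport, sport, sk->sk_uid);
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * When fl4->flowi4_proto is set, the plain routing result is additionally
 * passed through xfrm_lookup_route() so IPsec policy can rewrite the dst.
 */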
2590 
2591 /* called with rcu_read_lock held */
2592 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2593 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2594 			u32 seq)
2595 {
2596 	struct rtable *rt = skb_rtable(skb);
2597 	struct rtmsg *r;
2598 	struct nlmsghdr *nlh;
2599 	unsigned long expires = 0;
2600 	u32 error;
2601 	u32 metrics[RTAX_MAX];
2602 
2603 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2604 	if (!nlh)
2605 		return -EMSGSIZE;
2606 
2607 	r = nlmsg_data(nlh);
2608 	r->rtm_family	 = AF_INET;
2609 	r->rtm_dst_len	= 32;
2610 	r->rtm_src_len	= 0;
2611 	r->rtm_tos	= fl4->flowi4_tos;
2612 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2613 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2614 		goto nla_put_failure;
2615 	r->rtm_type	= rt->rt_type;
2616 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2617 	r->rtm_protocol = RTPROT_UNSPEC;
2618 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2619 	if (rt->rt_flags & RTCF_NOTIFY)
2620 		r->rtm_flags |= RTM_F_NOTIFY;
2621 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2622 		r->rtm_flags |= RTCF_DOREDIRECT;
2623 
2624 	if (nla_put_in_addr(skb, RTA_DST, dst))
2625 		goto nla_put_failure;
2626 	if (src) {
2627 		r->rtm_src_len = 32;
2628 		if (nla_put_in_addr(skb, RTA_SRC, src))
2629 			goto nla_put_failure;
2630 	}
2631 	if (rt->dst.dev &&
2632 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2633 		goto nla_put_failure;
2634 #ifdef CONFIG_IP_ROUTE_CLASSID
2635 	if (rt->dst.tclassid &&
2636 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2637 		goto nla_put_failure;
2638 #endif
2639 	if (!rt_is_input_route(rt) &&
2640 	    fl4->saddr != src) {
2641 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2642 			goto nla_put_failure;
2643 	}
2644 	if (rt->rt_uses_gateway &&
2645 	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2646 		goto nla_put_failure;
2647 
2648 	expires = rt->dst.expires;
2649 	if (expires) {
2650 		unsigned long now = jiffies;
2651 
2652 		if (time_before(now, expires))
2653 			expires -= now;
2654 		else
2655 			expires = 0;
2656 	}
2657 
2658 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2659 	if (rt->rt_pmtu && expires)
2660 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2661 	if (rt->rt_mtu_locked && expires)
2662 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2663 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2664 		goto nla_put_failure;
2665 
2666 	if (fl4->flowi4_mark &&
2667 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2668 		goto nla_put_failure;
2669 
2670 	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2671 	    nla_put_u32(skb, RTA_UID,
2672 			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2673 		goto nla_put_failure;
2674 
2675 	error = rt->dst.error;
2676 
2677 	if (rt_is_input_route(rt)) {
2678 #ifdef CONFIG_IP_MROUTE
2679 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2680 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2681 			int err = ipmr_get_route(net, skb,
2682 						 fl4->saddr, fl4->daddr,
2683 						 r, portid);
2684 
2685 			if (err <= 0) {
2686 				if (err == 0)
2687 					return 0;
2688 				goto nla_put_failure;
2689 			}
2690 		} else
2691 #endif
2692 			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2693 				goto nla_put_failure;
2694 	}
2695 
2696 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2697 		goto nla_put_failure;
2698 
2699 	nlmsg_end(skb, nlh);
2700 	return 0;
2701 
2702 nla_put_failure:
2703 	nlmsg_cancel(skb, nlh);
2704 	return -EMSGSIZE;
2705 }
2706 
2707 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2708 			     struct netlink_ext_ack *extack)
2709 {
2710 	struct net *net = sock_net(in_skb->sk);
2711 	struct rtmsg *rtm;
2712 	struct nlattr *tb[RTA_MAX+1];
2713 	struct fib_result res = {};
2714 	struct rtable *rt = NULL;
2715 	struct flowi4 fl4;
2716 	__be32 dst = 0;
2717 	__be32 src = 0;
2718 	u32 iif;
2719 	int err;
2720 	int mark;
2721 	struct sk_buff *skb;
2722 	u32 table_id = RT_TABLE_MAIN;
2723 	kuid_t uid;
2724 
2725 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2726 			  extack);
2727 	if (err < 0)
2728 		goto errout;
2729 
2730 	rtm = nlmsg_data(nlh);
2731 
2732 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2733 	if (!skb) {
2734 		err = -ENOBUFS;
2735 		goto errout;
2736 	}
2737 
2738 	/* Reserve room for dummy headers; this skb can pass
2739 	   through a good chunk of the routing engine.
2740 	 */
2741 	skb_reset_mac_header(skb);
2742 	skb_reset_network_header(skb);
2743 
2744 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2745 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2746 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2747 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2748 	if (tb[RTA_UID])
2749 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2750 	else
2751 		uid = (iif ? INVALID_UID : current_uid());
2752 
2753 	/* Bugfix: need to give ip_route_input enough of an IP header to
2754 	 * not gag.
2755 	 */
2756 	ip_hdr(skb)->protocol = IPPROTO_UDP;
2757 	ip_hdr(skb)->saddr = src;
2758 	ip_hdr(skb)->daddr = dst;
2759 
2760 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2761 
2762 	memset(&fl4, 0, sizeof(fl4));
2763 	fl4.daddr = dst;
2764 	fl4.saddr = src;
2765 	fl4.flowi4_tos = rtm->rtm_tos;
2766 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2767 	fl4.flowi4_mark = mark;
2768 	fl4.flowi4_uid = uid;
2769 
2770 	rcu_read_lock();
2771 
2772 	if (iif) {
2773 		struct net_device *dev;
2774 
2775 		dev = dev_get_by_index_rcu(net, iif);
2776 		if (!dev) {
2777 			err = -ENODEV;
2778 			goto errout_free;
2779 		}
2780 
2781 		skb->protocol	= htons(ETH_P_IP);
2782 		skb->dev	= dev;
2783 		skb->mark	= mark;
2784 		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2785 					 dev, &res);
2786 
2787 		rt = skb_rtable(skb);
2788 		if (err == 0 && rt->dst.error)
2789 			err = -rt->dst.error;
2790 	} else {
2791 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
2792 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2793 		err = 0;
2794 		if (IS_ERR(rt))
2795 			err = PTR_ERR(rt);
2796 		else
2797 			skb_dst_set(skb, &rt->dst);
2798 	}
2799 
2800 	if (err)
2801 		goto errout_free;
2802 
2803 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2804 		rt->rt_flags |= RTCF_NOTIFY;
2805 
2806 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2807 		table_id = rt->rt_table_id;
2808 
2809 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2810 		if (!res.fi) {
2811 			err = fib_props[res.type].error;
2812 			if (!err)
2813 				err = -EHOSTUNREACH;
2814 			goto errout_free;
2815 		}
2816 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2817 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2818 				    rt->rt_type, res.prefix, res.prefixlen,
2819 				    fl4.flowi4_tos, res.fi, 0);
2820 	} else {
2821 		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2822 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2823 	}
2824 	if (err < 0)
2825 		goto errout_free;
2826 
2827 	rcu_read_unlock();
2828 
2829 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2830 errout:
2831 	return err;
2832 
2833 errout_free:
2834 	rcu_read_unlock();
2835 	kfree_skb(skb);
2836 	goto errout;
2837 }
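
/* This doit handler backs RTM_GETROUTE requests; from userspace it is
 * typically exercised via iproute2, for example (illustrative invocation):
 *
 *	$ ip route get 192.0.2.1 from 198.51.100.1 iif eth0
 *
 * RTA_DST/RTA_SRC/RTA_IIF in the request map onto the dst/src/iif values
 * parsed above, and the reply is built by rt_fill_info() or, when
 * RTM_F_FIB_MATCH is set, by fib_dump_info().
 */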
2838 
2839 void ip_rt_multicast_event(struct in_device *in_dev)
2840 {
2841 	rt_cache_flush(dev_net(in_dev->dev));
2842 }
2843 
2844 #ifdef CONFIG_SYSCTL
2845 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2846 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2847 static int ip_rt_gc_elasticity __read_mostly	= 8;
2848 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
2849 
2850 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2851 					void __user *buffer,
2852 					size_t *lenp, loff_t *ppos)
2853 {
2854 	struct net *net = (struct net *)__ctl->extra1;
2855 
2856 	if (write) {
2857 		rt_cache_flush(net);
2858 		fnhe_genid_bump(net);
2859 		return 0;
2860 	}
2861 
2862 	return -EINVAL;
2863 }
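
/* Writing any value to the (write-only) flush sysctl invalidates the cached
 * routes and nexthop exceptions of this netns, e.g. (illustrative):
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 */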
2864 
2865 static struct ctl_table ipv4_route_table[] = {
2866 	{
2867 		.procname	= "gc_thresh",
2868 		.data		= &ipv4_dst_ops.gc_thresh,
2869 		.maxlen		= sizeof(int),
2870 		.mode		= 0644,
2871 		.proc_handler	= proc_dointvec,
2872 	},
2873 	{
2874 		.procname	= "max_size",
2875 		.data		= &ip_rt_max_size,
2876 		.maxlen		= sizeof(int),
2877 		.mode		= 0644,
2878 		.proc_handler	= proc_dointvec,
2879 	},
2880 	{
2881 		/*  Deprecated. Use gc_min_interval_ms */
2882 
2883 		.procname	= "gc_min_interval",
2884 		.data		= &ip_rt_gc_min_interval,
2885 		.maxlen		= sizeof(int),
2886 		.mode		= 0644,
2887 		.proc_handler	= proc_dointvec_jiffies,
2888 	},
2889 	{
2890 		.procname	= "gc_min_interval_ms",
2891 		.data		= &ip_rt_gc_min_interval,
2892 		.maxlen		= sizeof(int),
2893 		.mode		= 0644,
2894 		.proc_handler	= proc_dointvec_ms_jiffies,
2895 	},
2896 	{
2897 		.procname	= "gc_timeout",
2898 		.data		= &ip_rt_gc_timeout,
2899 		.maxlen		= sizeof(int),
2900 		.mode		= 0644,
2901 		.proc_handler	= proc_dointvec_jiffies,
2902 	},
2903 	{
2904 		.procname	= "gc_interval",
2905 		.data		= &ip_rt_gc_interval,
2906 		.maxlen		= sizeof(int),
2907 		.mode		= 0644,
2908 		.proc_handler	= proc_dointvec_jiffies,
2909 	},
2910 	{
2911 		.procname	= "redirect_load",
2912 		.data		= &ip_rt_redirect_load,
2913 		.maxlen		= sizeof(int),
2914 		.mode		= 0644,
2915 		.proc_handler	= proc_dointvec,
2916 	},
2917 	{
2918 		.procname	= "redirect_number",
2919 		.data		= &ip_rt_redirect_number,
2920 		.maxlen		= sizeof(int),
2921 		.mode		= 0644,
2922 		.proc_handler	= proc_dointvec,
2923 	},
2924 	{
2925 		.procname	= "redirect_silence",
2926 		.data		= &ip_rt_redirect_silence,
2927 		.maxlen		= sizeof(int),
2928 		.mode		= 0644,
2929 		.proc_handler	= proc_dointvec,
2930 	},
2931 	{
2932 		.procname	= "error_cost",
2933 		.data		= &ip_rt_error_cost,
2934 		.maxlen		= sizeof(int),
2935 		.mode		= 0644,
2936 		.proc_handler	= proc_dointvec,
2937 	},
2938 	{
2939 		.procname	= "error_burst",
2940 		.data		= &ip_rt_error_burst,
2941 		.maxlen		= sizeof(int),
2942 		.mode		= 0644,
2943 		.proc_handler	= proc_dointvec,
2944 	},
2945 	{
2946 		.procname	= "gc_elasticity",
2947 		.data		= &ip_rt_gc_elasticity,
2948 		.maxlen		= sizeof(int),
2949 		.mode		= 0644,
2950 		.proc_handler	= proc_dointvec,
2951 	},
2952 	{
2953 		.procname	= "mtu_expires",
2954 		.data		= &ip_rt_mtu_expires,
2955 		.maxlen		= sizeof(int),
2956 		.mode		= 0644,
2957 		.proc_handler	= proc_dointvec_jiffies,
2958 	},
2959 	{
2960 		.procname	= "min_pmtu",
2961 		.data		= &ip_rt_min_pmtu,
2962 		.maxlen		= sizeof(int),
2963 		.mode		= 0644,
2964 		.proc_handler	= proc_dointvec_minmax,
2965 		.extra1		= &ip_min_valid_pmtu,
2966 	},
2967 	{
2968 		.procname	= "min_adv_mss",
2969 		.data		= &ip_rt_min_advmss,
2970 		.maxlen		= sizeof(int),
2971 		.mode		= 0644,
2972 		.proc_handler	= proc_dointvec,
2973 	},
2974 	{ }
2975 };
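
/* These knobs appear under /proc/sys/net/ipv4/route/ (net.ipv4.route.* via
 * sysctl(8)); for example, the floor applied to learned path MTUs could be
 * raised with (illustrative value only):
 *
 *	# sysctl -w net.ipv4.route.min_pmtu=1280
 */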
2976 
2977 static struct ctl_table ipv4_route_flush_table[] = {
2978 	{
2979 		.procname	= "flush",
2980 		.maxlen		= sizeof(int),
2981 		.mode		= 0200,
2982 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2983 	},
2984 	{ },
2985 };
2986 
2987 static __net_init int sysctl_route_net_init(struct net *net)
2988 {
2989 	struct ctl_table *tbl;
2990 
2991 	tbl = ipv4_route_flush_table;
2992 	if (!net_eq(net, &init_net)) {
2993 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2994 		if (!tbl)
2995 			goto err_dup;
2996 
2997 		/* Don't export sysctls to unprivileged users */
2998 		if (net->user_ns != &init_user_ns)
2999 			tbl[0].procname = NULL;
3000 	}
3001 	tbl[0].extra1 = net;
3002 
3003 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3004 	if (!net->ipv4.route_hdr)
3005 		goto err_reg;
3006 	return 0;
3007 
3008 err_reg:
3009 	if (tbl != ipv4_route_flush_table)
3010 		kfree(tbl);
3011 err_dup:
3012 	return -ENOMEM;
3013 }
3014 
3015 static __net_exit void sysctl_route_net_exit(struct net *net)
3016 {
3017 	struct ctl_table *tbl;
3018 
3019 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3020 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3021 	BUG_ON(tbl == ipv4_route_flush_table);
3022 	kfree(tbl);
3023 }
3024 
3025 static __net_initdata struct pernet_operations sysctl_route_ops = {
3026 	.init = sysctl_route_net_init,
3027 	.exit = sysctl_route_net_exit,
3028 };
3029 #endif
3030 
3031 static __net_init int rt_genid_init(struct net *net)
3032 {
3033 	atomic_set(&net->ipv4.rt_genid, 0);
3034 	atomic_set(&net->fnhe_genid, 0);
3035 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3036 	return 0;
3037 }
3038 
3039 static __net_initdata struct pernet_operations rt_genid_ops = {
3040 	.init = rt_genid_init,
3041 };
3042 
3043 static int __net_init ipv4_inetpeer_init(struct net *net)
3044 {
3045 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3046 
3047 	if (!bp)
3048 		return -ENOMEM;
3049 	inet_peer_base_init(bp);
3050 	net->ipv4.peers = bp;
3051 	return 0;
3052 }
3053 
3054 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3055 {
3056 	struct inet_peer_base *bp = net->ipv4.peers;
3057 
3058 	net->ipv4.peers = NULL;
3059 	inetpeer_invalidate_tree(bp);
3060 	kfree(bp);
3061 }
3062 
3063 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3064 	.init	=	ipv4_inetpeer_init,
3065 	.exit	=	ipv4_inetpeer_exit,
3066 };
3067 
3068 #ifdef CONFIG_IP_ROUTE_CLASSID
3069 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3070 #endif /* CONFIG_IP_ROUTE_CLASSID */
3071 
3072 int __init ip_rt_init(void)
3073 {
3074 	int cpu;
3075 
3076 	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3077 	if (!ip_idents)
3078 		panic("IP: failed to allocate ip_idents\n");
3079 
3080 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3081 
3082 	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3083 	if (!ip_tstamps)
3084 		panic("IP: failed to allocate ip_tstamps\n");
3085 
3086 	for_each_possible_cpu(cpu) {
3087 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3088 
3089 		INIT_LIST_HEAD(&ul->head);
3090 		spin_lock_init(&ul->lock);
3091 	}
3092 #ifdef CONFIG_IP_ROUTE_CLASSID
3093 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3094 	if (!ip_rt_acct)
3095 		panic("IP: failed to allocate ip_rt_acct\n");
3096 #endif
3097 
3098 	ipv4_dst_ops.kmem_cachep =
3099 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3100 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3101 
3102 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3103 
3104 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3105 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3106 
3107 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3108 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3109 
3110 	ipv4_dst_ops.gc_thresh = ~0;
3111 	ip_rt_max_size = INT_MAX;
3112 
3113 	devinet_init();
3114 	ip_fib_init();
3115 
3116 	if (ip_rt_proc_init())
3117 		pr_err("Unable to create route proc files\n");
3118 #ifdef CONFIG_XFRM
3119 	xfrm_init();
3120 	xfrm4_init();
3121 #endif
3122 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3123 		      RTNL_FLAG_DOIT_UNLOCKED);
3124 
3125 #ifdef CONFIG_SYSCTL
3126 	register_pernet_subsys(&sysctl_route_ops);
3127 #endif
3128 	register_pernet_subsys(&rt_genid_ops);
3129 	register_pernet_subsys(&ipv4_inetpeer_ops);
3130 	return 0;
3131 }
3132 
3133 #ifdef CONFIG_SYSCTL
3134 /*
3135  * We really need to sanitize the damn ipv4 init order, then all
3136  * this nonsense will go away.
3137  */
3138 void __init ip_static_sysctl_init(void)
3139 {
3140 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3141 }
3142 #endif
3143