xref: /openbmc/linux/net/ipv4/route.c (revision 3769cffb1c48f64640ffab7ce3bffe867342c0f0)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
 34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
 35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 #include <net/atmclip.h>
112 
113 #define RT_FL_TOS(oldflp4) \
114     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
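
/*
 * RT_FL_TOS keeps only the TOS bits that matter for routing
 * (IPTOS_RT_MASK) plus the RTO_ONLINK flag folded into the low bit.
 * A rough sketch, assuming the usual values IPTOS_RT_MASK == 0x1c
 * and RTO_ONLINK == 0x01:
 *
 *	flowi4_tos 0x11 (IPTOS_LOWDELAY | RTO_ONLINK)	-> 0x11
 *	flowi4_tos 0x13 (same, plus ECN bits)		-> 0x11
 */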
115 
116 #define IP_MAX_MTU	0xFFF0
117 
118 #define RT_GC_TIMEOUT (300*HZ)
119 
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
123 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
124 static int ip_rt_redirect_number __read_mostly	= 9;
125 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly	= HZ;
128 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly	= 8;
130 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly	= 256;
133 static int rt_chain_length_max __read_mostly	= 20;
134 
135 /*
136  *	Interface to generic destination cache.
137  */
138 
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
142 static void		 ipv4_dst_destroy(struct dst_entry *dst);
143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144 static void		 ipv4_link_failure(struct sk_buff *skb);
145 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
146 static int rt_garbage_collect(struct dst_ops *ops);
147 
148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
149 			    int how)
150 {
151 }
152 
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155 	struct rtable *rt = (struct rtable *) dst;
156 	struct inet_peer *peer;
157 	u32 *p = NULL;
158 
159 	if (!rt->peer)
160 		rt_bind_peer(rt, rt->rt_dst, 1);
161 
162 	peer = rt->peer;
163 	if (peer) {
164 		u32 *old_p = __DST_METRICS_PTR(old);
165 		unsigned long prev, new;
166 
167 		p = peer->metrics;
168 		if (inet_metrics_new(peer))
169 			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
170 
171 		new = (unsigned long) p;
172 		prev = cmpxchg(&dst->_metrics, old, new);
173 
174 		if (prev != old) {
175 			p = __DST_METRICS_PTR(prev);
176 			if (prev & DST_METRICS_READ_ONLY)
177 				p = NULL;
178 		} else {
179 			if (rt->fi) {
180 				fib_info_put(rt->fi);
181 				rt->fi = NULL;
182 			}
183 		}
184 	}
185 	return p;
186 }
187 
188 static struct dst_ops ipv4_dst_ops = {
189 	.family =		AF_INET,
190 	.protocol =		cpu_to_be16(ETH_P_IP),
191 	.gc =			rt_garbage_collect,
192 	.check =		ipv4_dst_check,
193 	.default_advmss =	ipv4_default_advmss,
194 	.default_mtu =		ipv4_default_mtu,
195 	.cow_metrics =		ipv4_cow_metrics,
196 	.destroy =		ipv4_dst_destroy,
197 	.ifdown =		ipv4_dst_ifdown,
198 	.negative_advice =	ipv4_negative_advice,
199 	.link_failure =		ipv4_link_failure,
200 	.update_pmtu =		ip_rt_update_pmtu,
201 	.local_out =		__ip_local_out,
202 };
203 
204 #define ECN_OR_COST(class)	TC_PRIO_##class
205 
206 const __u8 ip_tos2prio[16] = {
207 	TC_PRIO_BESTEFFORT,
208 	ECN_OR_COST(BESTEFFORT),
209 	TC_PRIO_BESTEFFORT,
210 	ECN_OR_COST(BESTEFFORT),
211 	TC_PRIO_BULK,
212 	ECN_OR_COST(BULK),
213 	TC_PRIO_BULK,
214 	ECN_OR_COST(BULK),
215 	TC_PRIO_INTERACTIVE,
216 	ECN_OR_COST(INTERACTIVE),
217 	TC_PRIO_INTERACTIVE,
218 	ECN_OR_COST(INTERACTIVE),
219 	TC_PRIO_INTERACTIVE_BULK,
220 	ECN_OR_COST(INTERACTIVE_BULK),
221 	TC_PRIO_INTERACTIVE_BULK,
222 	ECN_OR_COST(INTERACTIVE_BULK)
223 };
224 
225 
226 /*
227  * Route cache.
228  */
229 
230 /* The locking scheme is rather straightforward:
231  *
232  * 1) Read-Copy Update protects the buckets of the central route hash.
233  * 2) Only writers remove entries, and they hold the lock
234  *    as they look at rtable reference counts.
235  * 3) Only readers acquire references to rtable entries,
236  *    they do so with atomic increments and with the
237  *    lock held.
238  */
239 
240 struct rt_hash_bucket {
241 	struct rtable __rcu	*chain;
242 };
243 
244 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
245 	defined(CONFIG_PROVE_LOCKING)
246 /*
247  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
248  * The size of this table is a power of two and depends on the number of CPUs.
249  * (with lockdep we have a quite big spinlock_t, so keep the size down there)
250  */
251 #ifdef CONFIG_LOCKDEP
252 # define RT_HASH_LOCK_SZ	256
253 #else
254 # if NR_CPUS >= 32
255 #  define RT_HASH_LOCK_SZ	4096
256 # elif NR_CPUS >= 16
257 #  define RT_HASH_LOCK_SZ	2048
258 # elif NR_CPUS >= 8
259 #  define RT_HASH_LOCK_SZ	1024
260 # elif NR_CPUS >= 4
261 #  define RT_HASH_LOCK_SZ	512
262 # else
263 #  define RT_HASH_LOCK_SZ	256
264 # endif
265 #endif
266 
267 static spinlock_t	*rt_hash_locks;
268 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
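
/*
 * Lock striping illustration: bucket slots are folded onto the lock
 * table with a mask, so with RT_HASH_LOCK_SZ == 256 the buckets 5,
 * 261 (5 + 256) and 517 (5 + 512) all share rt_hash_locks[5].
 * Unrelated buckets can still be modified in parallel whenever they
 * map to different locks.
 */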
269 
270 static __init void rt_hash_lock_init(void)
271 {
272 	int i;
273 
274 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
275 			GFP_KERNEL);
276 	if (!rt_hash_locks)
277 		panic("IP: failed to allocate rt_hash_locks\n");
278 
279 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
280 		spin_lock_init(&rt_hash_locks[i]);
281 }
282 #else
283 # define rt_hash_lock_addr(slot) NULL
284 
285 static inline void rt_hash_lock_init(void)
286 {
287 }
288 #endif
289 
290 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
291 static unsigned			rt_hash_mask __read_mostly;
292 static unsigned int		rt_hash_log  __read_mostly;
293 
294 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
295 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
296 
297 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
298 				   int genid)
299 {
300 	return jhash_3words((__force u32)daddr, (__force u32)saddr,
301 			    idx, genid)
302 		& rt_hash_mask;
303 }
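
/*
 * The bucket index is the Jenkins hash of (daddr, saddr, ifindex),
 * salted with the per-namespace generation id and masked down to the
 * table size.  E.g. with rt_hash_log == 17, rt_hash_mask == 0x1ffff
 * and every flow lands in one of 131072 buckets.  Because genid is a
 * hash input, bumping it moves flows to different buckets, which is
 * what lets rt_cache_invalidate() flush lazily.
 */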
304 
305 static inline int rt_genid(struct net *net)
306 {
307 	return atomic_read(&net->ipv4.rt_genid);
308 }
309 
310 #ifdef CONFIG_PROC_FS
311 struct rt_cache_iter_state {
312 	struct seq_net_private p;
313 	int bucket;
314 	int genid;
315 };
316 
317 static struct rtable *rt_cache_get_first(struct seq_file *seq)
318 {
319 	struct rt_cache_iter_state *st = seq->private;
320 	struct rtable *r = NULL;
321 
322 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
323 		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
324 			continue;
325 		rcu_read_lock_bh();
326 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
327 		while (r) {
328 			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
329 			    r->rt_genid == st->genid)
330 				return r;
331 			r = rcu_dereference_bh(r->dst.rt_next);
332 		}
333 		rcu_read_unlock_bh();
334 	}
335 	return r;
336 }
337 
338 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
339 					  struct rtable *r)
340 {
341 	struct rt_cache_iter_state *st = seq->private;
342 
343 	r = rcu_dereference_bh(r->dst.rt_next);
344 	while (!r) {
345 		rcu_read_unlock_bh();
346 		do {
347 			if (--st->bucket < 0)
348 				return NULL;
349 		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
350 		rcu_read_lock_bh();
351 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
352 	}
353 	return r;
354 }
355 
356 static struct rtable *rt_cache_get_next(struct seq_file *seq,
357 					struct rtable *r)
358 {
359 	struct rt_cache_iter_state *st = seq->private;
360 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
361 		if (dev_net(r->dst.dev) != seq_file_net(seq))
362 			continue;
363 		if (r->rt_genid == st->genid)
364 			break;
365 	}
366 	return r;
367 }
368 
369 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
370 {
371 	struct rtable *r = rt_cache_get_first(seq);
372 
373 	if (r)
374 		while (pos && (r = rt_cache_get_next(seq, r)))
375 			--pos;
376 	return pos ? NULL : r;
377 }
378 
379 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
380 {
381 	struct rt_cache_iter_state *st = seq->private;
382 	if (*pos)
383 		return rt_cache_get_idx(seq, *pos - 1);
384 	st->genid = rt_genid(seq_file_net(seq));
385 	return SEQ_START_TOKEN;
386 }
387 
388 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
389 {
390 	struct rtable *r;
391 
392 	if (v == SEQ_START_TOKEN)
393 		r = rt_cache_get_first(seq);
394 	else
395 		r = rt_cache_get_next(seq, v);
396 	++*pos;
397 	return r;
398 }
399 
400 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
401 {
402 	if (v && v != SEQ_START_TOKEN)
403 		rcu_read_unlock_bh();
404 }
405 
406 static int rt_cache_seq_show(struct seq_file *seq, void *v)
407 {
408 	if (v == SEQ_START_TOKEN)
409 		seq_printf(seq, "%-127s\n",
410 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
411 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
412 			   "HHUptod\tSpecDst");
413 	else {
414 		struct rtable *r = v;
415 		int len;
416 
417 		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
418 			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
419 			r->dst.dev ? r->dst.dev->name : "*",
420 			(__force u32)r->rt_dst,
421 			(__force u32)r->rt_gateway,
422 			r->rt_flags, atomic_read(&r->dst.__refcnt),
423 			r->dst.__use, 0, (__force u32)r->rt_src,
424 			dst_metric_advmss(&r->dst) + 40,
425 			dst_metric(&r->dst, RTAX_WINDOW),
426 			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
427 			      dst_metric(&r->dst, RTAX_RTTVAR)),
428 			r->rt_key_tos,
429 			r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
430 			r->dst.hh ? (r->dst.hh->hh_output ==
431 				       dev_queue_xmit) : 0,
432 			r->rt_spec_dst, &len);
433 
434 		seq_printf(seq, "%*s\n", 127 - len, "");
435 	}
436 	return 0;
437 }
438 
439 static const struct seq_operations rt_cache_seq_ops = {
440 	.start  = rt_cache_seq_start,
441 	.next   = rt_cache_seq_next,
442 	.stop   = rt_cache_seq_stop,
443 	.show   = rt_cache_seq_show,
444 };
445 
446 static int rt_cache_seq_open(struct inode *inode, struct file *file)
447 {
448 	return seq_open_net(inode, file, &rt_cache_seq_ops,
449 			sizeof(struct rt_cache_iter_state));
450 }
451 
452 static const struct file_operations rt_cache_seq_fops = {
453 	.owner	 = THIS_MODULE,
454 	.open	 = rt_cache_seq_open,
455 	.read	 = seq_read,
456 	.llseek	 = seq_lseek,
457 	.release = seq_release_net,
458 };
459 
460 
461 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
462 {
463 	int cpu;
464 
465 	if (*pos == 0)
466 		return SEQ_START_TOKEN;
467 
468 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
469 		if (!cpu_possible(cpu))
470 			continue;
471 		*pos = cpu+1;
472 		return &per_cpu(rt_cache_stat, cpu);
473 	}
474 	return NULL;
475 }
476 
477 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
478 {
479 	int cpu;
480 
481 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
482 		if (!cpu_possible(cpu))
483 			continue;
484 		*pos = cpu+1;
485 		return &per_cpu(rt_cache_stat, cpu);
486 	}
487 	return NULL;
488 
489 }
490 
491 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
492 {
493 
494 }
495 
496 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
497 {
498 	struct rt_cache_stat *st = v;
499 
500 	if (v == SEQ_START_TOKEN) {
501 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
502 		return 0;
503 	}
504 
505 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
506 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
507 		   dst_entries_get_slow(&ipv4_dst_ops),
508 		   st->in_hit,
509 		   st->in_slow_tot,
510 		   st->in_slow_mc,
511 		   st->in_no_route,
512 		   st->in_brd,
513 		   st->in_martian_dst,
514 		   st->in_martian_src,
515 
516 		   st->out_hit,
517 		   st->out_slow_tot,
518 		   st->out_slow_mc,
519 
520 		   st->gc_total,
521 		   st->gc_ignored,
522 		   st->gc_goal_miss,
523 		   st->gc_dst_overflow,
524 		   st->in_hlist_search,
525 		   st->out_hlist_search
526 		);
527 	return 0;
528 }
529 
530 static const struct seq_operations rt_cpu_seq_ops = {
531 	.start  = rt_cpu_seq_start,
532 	.next   = rt_cpu_seq_next,
533 	.stop   = rt_cpu_seq_stop,
534 	.show   = rt_cpu_seq_show,
535 };
536 
537 
538 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
539 {
540 	return seq_open(file, &rt_cpu_seq_ops);
541 }
542 
543 static const struct file_operations rt_cpu_seq_fops = {
544 	.owner	 = THIS_MODULE,
545 	.open	 = rt_cpu_seq_open,
546 	.read	 = seq_read,
547 	.llseek	 = seq_lseek,
548 	.release = seq_release,
549 };
550 
551 #ifdef CONFIG_IP_ROUTE_CLASSID
552 static int rt_acct_proc_show(struct seq_file *m, void *v)
553 {
554 	struct ip_rt_acct *dst, *src;
555 	unsigned int i, j;
556 
557 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
558 	if (!dst)
559 		return -ENOMEM;
560 
561 	for_each_possible_cpu(i) {
562 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
563 		for (j = 0; j < 256; j++) {
564 			dst[j].o_bytes   += src[j].o_bytes;
565 			dst[j].o_packets += src[j].o_packets;
566 			dst[j].i_bytes   += src[j].i_bytes;
567 			dst[j].i_packets += src[j].i_packets;
568 		}
569 	}
570 
571 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
572 	kfree(dst);
573 	return 0;
574 }
575 
576 static int rt_acct_proc_open(struct inode *inode, struct file *file)
577 {
578 	return single_open(file, rt_acct_proc_show, NULL);
579 }
580 
581 static const struct file_operations rt_acct_proc_fops = {
582 	.owner		= THIS_MODULE,
583 	.open		= rt_acct_proc_open,
584 	.read		= seq_read,
585 	.llseek		= seq_lseek,
586 	.release	= single_release,
587 };
588 #endif
589 
590 static int __net_init ip_rt_do_proc_init(struct net *net)
591 {
592 	struct proc_dir_entry *pde;
593 
594 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
595 			&rt_cache_seq_fops);
596 	if (!pde)
597 		goto err1;
598 
599 	pde = proc_create("rt_cache", S_IRUGO,
600 			  net->proc_net_stat, &rt_cpu_seq_fops);
601 	if (!pde)
602 		goto err2;
603 
604 #ifdef CONFIG_IP_ROUTE_CLASSID
605 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
606 	if (!pde)
607 		goto err3;
608 #endif
609 	return 0;
610 
611 #ifdef CONFIG_IP_ROUTE_CLASSID
612 err3:
613 	remove_proc_entry("rt_cache", net->proc_net_stat);
614 #endif
615 err2:
616 	remove_proc_entry("rt_cache", net->proc_net);
617 err1:
618 	return -ENOMEM;
619 }
620 
621 static void __net_exit ip_rt_do_proc_exit(struct net *net)
622 {
623 	remove_proc_entry("rt_cache", net->proc_net_stat);
624 	remove_proc_entry("rt_cache", net->proc_net);
625 #ifdef CONFIG_IP_ROUTE_CLASSID
626 	remove_proc_entry("rt_acct", net->proc_net);
627 #endif
628 }
629 
630 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
631 	.init = ip_rt_do_proc_init,
632 	.exit = ip_rt_do_proc_exit,
633 };
634 
635 static int __init ip_rt_proc_init(void)
636 {
637 	return register_pernet_subsys(&ip_rt_proc_ops);
638 }
639 
640 #else
641 static inline int ip_rt_proc_init(void)
642 {
643 	return 0;
644 }
645 #endif /* CONFIG_PROC_FS */
646 
647 static inline void rt_free(struct rtable *rt)
648 {
649 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
650 }
651 
652 static inline void rt_drop(struct rtable *rt)
653 {
654 	ip_rt_put(rt);
655 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
656 }
657 
658 static inline int rt_fast_clean(struct rtable *rth)
659 {
660 	/* Kill broadcast/multicast entries very aggressively, if they
661 	   collide in the hash table with more useful entries */
662 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
663 		rt_is_input_route(rth) && rth->dst.rt_next;
664 }
665 
666 static inline int rt_valuable(struct rtable *rth)
667 {
668 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
669 		(rth->peer && rth->peer->pmtu_expires);
670 }
671 
672 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
673 {
674 	unsigned long age;
675 	int ret = 0;
676 
677 	if (atomic_read(&rth->dst.__refcnt))
678 		goto out;
679 
680 	age = jiffies - rth->dst.lastuse;
681 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
682 	    (age <= tmo2 && rt_valuable(rth)))
683 		goto out;
684 	ret = 1;
685 out:	return ret;
686 }
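
/*
 * rt_may_expire() illustrated: an entry with a nonzero refcount never
 * expires.  Otherwise, an ordinary entry may be reaped once idle for
 * more than tmo1, a valuable one (see rt_valuable()) once idle for
 * more than tmo2, and broadcast/multicast clutter matched by
 * rt_fast_clean() is reaped at any age unless it is also valuable.
 */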
687 
688 /* Bits of score are:
689  * 31: very valuable
690  * 30: not quite useless
691  * 29..0: usage counter
692  */
693 static inline u32 rt_score(struct rtable *rt)
694 {
695 	u32 score = jiffies - rt->dst.lastuse;
696 
697 	score = ~score & ~(3<<30);
698 
699 	if (rt_valuable(rt))
700 		score |= (1<<31);
701 
702 	if (rt_is_output_route(rt) ||
703 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
704 		score |= (1<<30);
705 
706 	return score;
707 }
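
/*
 * Scoring example: the low 30 bits hold ~(idle time), so a route idle
 * for 10 jiffies outscores one idle for 10000.  An output route (or a
 * unicast input route) adds bit 30; a valuable one (redirected,
 * notify, or with a live PMTU) adds bit 31.  rt_intern_hash() evicts
 * the entry with the lowest score, i.e. the stalest, least valuable
 * one.
 */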
708 
709 static inline bool rt_caching(const struct net *net)
710 {
711 	return net->ipv4.current_rt_cache_rebuild_count <=
712 		net->ipv4.sysctl_rt_cache_rebuild_count;
713 }
714 
715 static inline bool compare_hash_inputs(const struct rtable *rt1,
716 				       const struct rtable *rt2)
717 {
718 	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
719 		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
720 		(rt1->rt_iif ^ rt2->rt_iif)) == 0);
721 }
722 
723 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
724 {
725 	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
726 		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
727 		(rt1->rt_mark ^ rt2->rt_mark) |
728 		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
729 		(rt1->rt_oif ^ rt2->rt_oif) |
730 		(rt1->rt_iif ^ rt2->rt_iif)) == 0;
731 }
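
/*
 * The XOR/OR pattern above is a branchless multi-field compare: each
 * XOR term is zero iff the corresponding fields are equal, so the OR
 * of all terms is zero iff every field matches.  It compiles to
 * straight arithmetic with one final test instead of a chain of
 * conditional branches.
 */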
732 
733 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
734 {
735 	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
736 }
737 
738 static inline int rt_is_expired(struct rtable *rth)
739 {
740 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
741 }
742 
743 /*
744  * Perform a full scan of the hash table and free all entries.
745  * Can be called from a softirq or from process context.
746  * In the latter case, we reschedule if necessary.
747  */
748 static void rt_do_flush(struct net *net, int process_context)
749 {
750 	unsigned int i;
751 	struct rtable *rth, *next;
752 
753 	for (i = 0; i <= rt_hash_mask; i++) {
754 		struct rtable __rcu **pprev;
755 		struct rtable *list;
756 
757 		if (process_context && need_resched())
758 			cond_resched();
759 		rth = rcu_dereference_raw(rt_hash_table[i].chain);
760 		if (!rth)
761 			continue;
762 
763 		spin_lock_bh(rt_hash_lock_addr(i));
764 
765 		list = NULL;
766 		pprev = &rt_hash_table[i].chain;
767 		rth = rcu_dereference_protected(*pprev,
768 			lockdep_is_held(rt_hash_lock_addr(i)));
769 
770 		while (rth) {
771 			next = rcu_dereference_protected(rth->dst.rt_next,
772 				lockdep_is_held(rt_hash_lock_addr(i)));
773 
774 			if (!net ||
775 			    net_eq(dev_net(rth->dst.dev), net)) {
776 				rcu_assign_pointer(*pprev, next);
777 				rcu_assign_pointer(rth->dst.rt_next, list);
778 				list = rth;
779 			} else {
780 				pprev = &rth->dst.rt_next;
781 			}
782 			rth = next;
783 		}
784 
785 		spin_unlock_bh(rt_hash_lock_addr(i));
786 
787 		for (; list; list = next) {
788 			next = rcu_dereference_protected(list->dst.rt_next, 1);
789 			rt_free(list);
790 		}
791 	}
792 }
793 
794 /*
795  * While freeing expired entries, we compute average chain length
796  * and standard deviation, using fixed-point arithmetic.
797  * This gives an estimate of rt_chain_length_max:
798  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
799  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
800  */
801 
802 #define FRACT_BITS 3
803 #define ONE (1UL << FRACT_BITS)
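
/*
 * Fixed-point example: with FRACT_BITS == 3, ONE == 8, so each whole
 * entry is worth 8 units and an average chain length of 2.5 is stored
 * as 20.  Shifting right by FRACT_BITS, as slow_chain_length() does,
 * converts back to whole entries.
 */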
804 
805 /*
806  * Given a hash chain and an item in this hash chain,
807  * check whether a previous entry has the same hash_inputs
808  * (but differs on tos, mark or oif)
809  * Returns 0 if an alias is found.
810  * Returns ONE if rth has no alias before itself.
811  */
812 static int has_noalias(const struct rtable *head, const struct rtable *rth)
813 {
814 	const struct rtable *aux = head;
815 
816 	while (aux != rth) {
817 		if (compare_hash_inputs(aux, rth))
818 			return 0;
819 		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
820 	}
821 	return ONE;
822 }
823 
824 /*
825  * Perturbation of rt_genid by a small quantity [1..256]
826  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
827  * many times (2^24) without reusing a recent rt_genid.
828  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
829  */
830 static void rt_cache_invalidate(struct net *net)
831 {
832 	unsigned char shuffle;
833 
834 	get_random_bytes(&shuffle, sizeof(shuffle));
835 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
836 }
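
/*
 * Note that invalidation is O(1): nothing is walked or freed here.
 * The genid simply jumps forward by 1..256, so every cached rtable
 * whose rt_genid no longer matches is considered dead by
 * rt_is_expired() and is reaped lazily on a later lookup or GC pass.
 */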
837 
838 /*
839  * delay < 0  : invalidate cache (fast : entries will be deleted later)
840  * delay >= 0 : invalidate & flush cache (can be long)
841  */
842 void rt_cache_flush(struct net *net, int delay)
843 {
844 	rt_cache_invalidate(net);
845 	if (delay >= 0)
846 		rt_do_flush(net, !in_softirq());
847 }
848 
849 /* Flush previous cache invalidated entries from the cache */
850 void rt_cache_flush_batch(struct net *net)
851 {
852 	rt_do_flush(net, !in_softirq());
853 }
854 
855 static void rt_emergency_hash_rebuild(struct net *net)
856 {
857 	if (net_ratelimit())
858 		printk(KERN_WARNING "Route hash chain too long!\n");
859 	rt_cache_invalidate(net);
860 }
861 
862 /*
863    Short description of GC goals.
864 
865    We want to build an algorithm that keeps the routing cache
866    at an equilibrium point, where the number of aged-off entries
867    stays approximately equal to the number of newly generated ones.
868 
869    The current expiration strength is the variable "expire".
870    We try to adjust it dynamically, so that when the network
871    is idle, expire is large enough to keep enough warm entries,
872    and when load increases, it shrinks to limit the cache size.
873  */
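
/*
 * Rough numbers with the defaults above: if rt_hash_log == 17 and
 * ip_rt_gc_elasticity == 8, GC has a positive goal only once the
 * cache holds more than 8 << 17 (about one million) entries; below
 * that it works toward the gc_thresh based equilibrium instead.
 */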
874 
875 static int rt_garbage_collect(struct dst_ops *ops)
876 {
877 	static unsigned long expire = RT_GC_TIMEOUT;
878 	static unsigned long last_gc;
879 	static int rover;
880 	static int equilibrium;
881 	struct rtable *rth;
882 	struct rtable __rcu **rthp;
883 	unsigned long now = jiffies;
884 	int goal;
885 	int entries = dst_entries_get_fast(&ipv4_dst_ops);
886 
887 	/*
888 	 * Garbage collection is pretty expensive,
889 	 * do not make it too frequently.
890 	 */
891 
892 	RT_CACHE_STAT_INC(gc_total);
893 
894 	if (now - last_gc < ip_rt_gc_min_interval &&
895 	    entries < ip_rt_max_size) {
896 		RT_CACHE_STAT_INC(gc_ignored);
897 		goto out;
898 	}
899 
900 	entries = dst_entries_get_slow(&ipv4_dst_ops);
901 	/* Calculate the number of entries we want to expire now. */
902 	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
903 	if (goal <= 0) {
904 		if (equilibrium < ipv4_dst_ops.gc_thresh)
905 			equilibrium = ipv4_dst_ops.gc_thresh;
906 		goal = entries - equilibrium;
907 		if (goal > 0) {
908 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
909 			goal = entries - equilibrium;
910 		}
911 	} else {
912 		/* We are in a dangerous area. Try to reduce the cache really
913 		 * aggressively.
914 		 */
915 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
916 		equilibrium = entries - goal;
917 	}
918 
919 	if (now - last_gc >= ip_rt_gc_min_interval)
920 		last_gc = now;
921 
922 	if (goal <= 0) {
923 		equilibrium += goal;
924 		goto work_done;
925 	}
926 
927 	do {
928 		int i, k;
929 
930 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
931 			unsigned long tmo = expire;
932 
933 			k = (k + 1) & rt_hash_mask;
934 			rthp = &rt_hash_table[k].chain;
935 			spin_lock_bh(rt_hash_lock_addr(k));
936 			while ((rth = rcu_dereference_protected(*rthp,
937 					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
938 				if (!rt_is_expired(rth) &&
939 					!rt_may_expire(rth, tmo, expire)) {
940 					tmo >>= 1;
941 					rthp = &rth->dst.rt_next;
942 					continue;
943 				}
944 				*rthp = rth->dst.rt_next;
945 				rt_free(rth);
946 				goal--;
947 			}
948 			spin_unlock_bh(rt_hash_lock_addr(k));
949 			if (goal <= 0)
950 				break;
951 		}
952 		rover = k;
953 
954 		if (goal <= 0)
955 			goto work_done;
956 
957 		/* The goal was not achieved. We stop the process if:
958 
959 		   - expire was reduced to zero; otherwise, expire is halved.
960 		   - the table is not full.
961 		   - we are called from interrupt context.
962 		   - the jiffies check is just a fallback/debug loop breaker.
963 		     We will not spin here for a long time in any case.
964 		 */
965 
966 		RT_CACHE_STAT_INC(gc_goal_miss);
967 
968 		if (expire == 0)
969 			break;
970 
971 		expire >>= 1;
972 
973 		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
974 			goto out;
975 	} while (!in_softirq() && time_before_eq(jiffies, now));
976 
977 	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
978 		goto out;
979 	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
980 		goto out;
981 	if (net_ratelimit())
982 		printk(KERN_WARNING "dst cache overflow\n");
983 	RT_CACHE_STAT_INC(gc_dst_overflow);
984 	return 1;
985 
986 work_done:
987 	expire += ip_rt_gc_min_interval;
988 	if (expire > ip_rt_gc_timeout ||
989 	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
990 	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
991 		expire = ip_rt_gc_timeout;
992 out:	return 0;
993 }
994 
995 /*
996  * Returns the number of entries in a hash chain that have different hash_inputs.
997  */
998 static int slow_chain_length(const struct rtable *head)
999 {
1000 	int length = 0;
1001 	const struct rtable *rth = head;
1002 
1003 	while (rth) {
1004 		length += has_noalias(head, rth);
1005 		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1006 	}
1007 	return length >> FRACT_BITS;
1008 }
1009 
1010 static int rt_bind_neighbour(struct rtable *rt)
1011 {
1012 	static const __be32 inaddr_any = 0;
1013 	struct net_device *dev = rt->dst.dev;
1014 	struct neigh_table *tbl = &arp_tbl;
1015 	const __be32 *nexthop;
1016 	struct neighbour *n;
1017 
1018 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1019 	if (dev->type == ARPHRD_ATM)
1020 		tbl = clip_tbl_hook;
1021 #endif
1022 	nexthop = &rt->rt_gateway;
1023 	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1024 		nexthop = &inaddr_any;
1025 	n = ipv4_neigh_lookup(tbl, dev, nexthop);
1026 	if (IS_ERR(n))
1027 		return PTR_ERR(n);
1028 	rt->dst.neighbour = n;
1029 
1030 	return 0;
1031 }
1032 
1033 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1034 				     struct sk_buff *skb, int ifindex)
1035 {
1036 	struct rtable	*rth, *cand;
1037 	struct rtable __rcu **rthp, **candp;
1038 	unsigned long	now;
1039 	u32 		min_score;
1040 	int		chain_length;
1041 	int attempts = !in_softirq();
1042 
1043 restart:
1044 	chain_length = 0;
1045 	min_score = ~(u32)0;
1046 	cand = NULL;
1047 	candp = NULL;
1048 	now = jiffies;
1049 
1050 	if (!rt_caching(dev_net(rt->dst.dev))) {
1051 		/*
1052 		 * If we're not caching, just tell the caller we
1053 		 * were successful and don't touch the route.  The
1054 		 * caller holds the sole reference to the cache entry, and
1055 		 * it will be released when the caller is done with it.
1056 		 * If we drop it here, the callers have no way to resolve routes
1057 		 * when we're not caching.  Instead, just point *rp at rt, so
1058 		 * the caller gets a single use out of the route.
1059 		 * Note that we do rt_free on this new route entry, so that
1060 		 * once its refcount hits zero, we are still able to reap it
1061 		 * (thanks, Alexey).
1062 		 * Note: To avoid expensive rcu stuff for this uncached dst,
1063 		 * we set DST_NOCACHE so that dst_release() can free dst without
1064 		 * waiting for a grace period.
1065 		 */
1066 
1067 		rt->dst.flags |= DST_NOCACHE;
1068 		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1069 			int err = rt_bind_neighbour(rt);
1070 			if (err) {
1071 				if (net_ratelimit())
1072 					printk(KERN_WARNING
1073 					    "Neighbour table failure & not caching routes.\n");
1074 				ip_rt_put(rt);
1075 				return ERR_PTR(err);
1076 			}
1077 		}
1078 
1079 		goto skip_hashing;
1080 	}
1081 
1082 	rthp = &rt_hash_table[hash].chain;
1083 
1084 	spin_lock_bh(rt_hash_lock_addr(hash));
1085 	while ((rth = rcu_dereference_protected(*rthp,
1086 			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1087 		if (rt_is_expired(rth)) {
1088 			*rthp = rth->dst.rt_next;
1089 			rt_free(rth);
1090 			continue;
1091 		}
1092 		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1093 			/* Put it first */
1094 			*rthp = rth->dst.rt_next;
1095 			/*
1096 			 * Since lookup is lockfree, the deletion
1097 			 * must be visible to another weakly ordered CPU before
1098 			 * the insertion at the start of the hash chain.
1099 			 */
1100 			rcu_assign_pointer(rth->dst.rt_next,
1101 					   rt_hash_table[hash].chain);
1102 			/*
1103 			 * Since lookup is lockfree, the update writes
1104 			 * must be ordered for consistency on SMP.
1105 			 */
1106 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1107 
1108 			dst_use(&rth->dst, now);
1109 			spin_unlock_bh(rt_hash_lock_addr(hash));
1110 
1111 			rt_drop(rt);
1112 			if (skb)
1113 				skb_dst_set(skb, &rth->dst);
1114 			return rth;
1115 		}
1116 
1117 		if (!atomic_read(&rth->dst.__refcnt)) {
1118 			u32 score = rt_score(rth);
1119 
1120 			if (score <= min_score) {
1121 				cand = rth;
1122 				candp = rthp;
1123 				min_score = score;
1124 			}
1125 		}
1126 
1127 		chain_length++;
1128 
1129 		rthp = &rth->dst.rt_next;
1130 	}
1131 
1132 	if (cand) {
1133 		/* ip_rt_gc_elasticity used to be the average chain
1134 		 * length; when exceeded, gc becomes really aggressive.
1135 		 *
1136 		 * The second limit is less certain. At the moment it allows
1137 		 * only 2 entries per bucket. We will see.
1138 		 */
1139 		if (chain_length > ip_rt_gc_elasticity) {
1140 			*candp = cand->dst.rt_next;
1141 			rt_free(cand);
1142 		}
1143 	} else {
1144 		if (chain_length > rt_chain_length_max &&
1145 		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1146 			struct net *net = dev_net(rt->dst.dev);
1147 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1148 			if (!rt_caching(net)) {
1149 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1150 					rt->dst.dev->name, num);
1151 			}
1152 			rt_emergency_hash_rebuild(net);
1153 			spin_unlock_bh(rt_hash_lock_addr(hash));
1154 
1155 			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1156 					ifindex, rt_genid(net));
1157 			goto restart;
1158 		}
1159 	}
1160 
1161 	/* Try to bind the route to arp only if it is an output
1162 	   route or on the unicast forwarding path.
1163 	 */
1164 	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1165 		int err = rt_bind_neighbour(rt);
1166 		if (err) {
1167 			spin_unlock_bh(rt_hash_lock_addr(hash));
1168 
1169 			if (err != -ENOBUFS) {
1170 				rt_drop(rt);
1171 				return ERR_PTR(err);
1172 			}
1173 
1174 			/* Neighbour tables are full and nothing
1175 			   can be released. Try to shrink the route cache;
1176 			   it most likely holds some neighbour records.
1177 			 */
1178 			if (attempts-- > 0) {
1179 				int saved_elasticity = ip_rt_gc_elasticity;
1180 				int saved_int = ip_rt_gc_min_interval;
1181 				ip_rt_gc_elasticity	= 1;
1182 				ip_rt_gc_min_interval	= 0;
1183 				rt_garbage_collect(&ipv4_dst_ops);
1184 				ip_rt_gc_min_interval	= saved_int;
1185 				ip_rt_gc_elasticity	= saved_elasticity;
1186 				goto restart;
1187 			}
1188 
1189 			if (net_ratelimit())
1190 				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1191 			rt_drop(rt);
1192 			return ERR_PTR(-ENOBUFS);
1193 		}
1194 	}
1195 
1196 	rt->dst.rt_next = rt_hash_table[hash].chain;
1197 
1198 	/*
1199 	 * Since lookup is lockfree, we must make sure
1200 	 * previous writes to rt are committed to memory
1201 	 * before making rt visible to other CPUS.
1202 	 */
1203 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1204 
1205 	spin_unlock_bh(rt_hash_lock_addr(hash));
1206 
1207 skip_hashing:
1208 	if (skb)
1209 		skb_dst_set(skb, &rt->dst);
1210 	return rt;
1211 }
1212 
1213 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1214 
1215 static u32 rt_peer_genid(void)
1216 {
1217 	return atomic_read(&__rt_peer_genid);
1218 }
1219 
1220 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1221 {
1222 	struct inet_peer *peer;
1223 
1224 	peer = inet_getpeer_v4(daddr, create);
1225 
1226 	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1227 		inet_putpeer(peer);
1228 	else
1229 		rt->rt_peer_genid = rt_peer_genid();
1230 }
1231 
1232 /*
1233  * Peer allocation may fail only in serious out-of-memory conditions.  However
1234  * we can still generate some output.
1235  * Random ID selection looks a bit dangerous because we have no chance of
1236  * selecting an ID that stays unique for a reasonable period of time.
1237  * But a broken packet identifier may be better than no packet at all.
1238  */
1239 static void ip_select_fb_ident(struct iphdr *iph)
1240 {
1241 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1242 	static u32 ip_fallback_id;
1243 	u32 salt;
1244 
1245 	spin_lock_bh(&ip_fb_id_lock);
1246 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1247 	iph->id = htons(salt & 0xFFFF);
1248 	ip_fallback_id = salt;
1249 	spin_unlock_bh(&ip_fb_id_lock);
1250 }
1251 
1252 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1253 {
1254 	struct rtable *rt = (struct rtable *) dst;
1255 
1256 	if (rt) {
1257 		if (rt->peer == NULL)
1258 			rt_bind_peer(rt, rt->rt_dst, 1);
1259 
1260 		/* If peer is attached to destination, it is never detached,
1261 		   so we do not need to grab a lock to dereference it.
1262 		 */
1263 		if (rt->peer) {
1264 			iph->id = htons(inet_getid(rt->peer, more));
1265 			return;
1266 		}
1267 	} else
1268 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1269 		       __builtin_return_address(0));
1270 
1271 	ip_select_fb_ident(iph);
1272 }
1273 EXPORT_SYMBOL(__ip_select_ident);
1274 
1275 static void rt_del(unsigned hash, struct rtable *rt)
1276 {
1277 	struct rtable __rcu **rthp;
1278 	struct rtable *aux;
1279 
1280 	rthp = &rt_hash_table[hash].chain;
1281 	spin_lock_bh(rt_hash_lock_addr(hash));
1282 	ip_rt_put(rt);
1283 	while ((aux = rcu_dereference_protected(*rthp,
1284 			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1285 		if (aux == rt || rt_is_expired(aux)) {
1286 			*rthp = aux->dst.rt_next;
1287 			rt_free(aux);
1288 			continue;
1289 		}
1290 		rthp = &aux->dst.rt_next;
1291 	}
1292 	spin_unlock_bh(rt_hash_lock_addr(hash));
1293 }
1294 
1295 /* called in rcu_read_lock() section */
1296 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1297 		    __be32 saddr, struct net_device *dev)
1298 {
1299 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1300 	struct inet_peer *peer;
1301 	struct net *net;
1302 
1303 	if (!in_dev)
1304 		return;
1305 
1306 	net = dev_net(dev);
1307 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1308 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1309 	    ipv4_is_zeronet(new_gw))
1310 		goto reject_redirect;
1311 
1312 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1313 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1314 			goto reject_redirect;
1315 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1316 			goto reject_redirect;
1317 	} else {
1318 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1319 			goto reject_redirect;
1320 	}
1321 
1322 	peer = inet_getpeer_v4(daddr, 1);
1323 	if (peer) {
1324 		peer->redirect_learned.a4 = new_gw;
1325 
1326 		inet_putpeer(peer);
1327 
1328 		atomic_inc(&__rt_peer_genid);
1329 	}
1330 	return;
1331 
1332 reject_redirect:
1333 #ifdef CONFIG_IP_ROUTE_VERBOSE
1334 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1335 		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1336 			"  Advised path = %pI4 -> %pI4\n",
1337 		       &old_gw, dev->name, &new_gw,
1338 		       &saddr, &daddr);
1339 #endif
1340 	;
1341 }
1342 
1343 static bool peer_pmtu_expired(struct inet_peer *peer)
1344 {
1345 	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1346 
1347 	return orig &&
1348 	       time_after_eq(jiffies, orig) &&
1349 	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1350 }
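
/*
 * The cmpxchg() makes expiry a single-winner event: of several CPUs
 * racing on the same expired timestamp, exactly one swaps it to zero
 * and sees "true", and only that caller restores the original MTU.
 * peer_pmtu_cleaned() below is the same claim without the time check.
 */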
1351 
1352 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1353 {
1354 	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1355 
1356 	return orig &&
1357 	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1358 }
1359 
1360 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1361 {
1362 	struct rtable *rt = (struct rtable *)dst;
1363 	struct dst_entry *ret = dst;
1364 
1365 	if (rt) {
1366 		if (dst->obsolete > 0) {
1367 			ip_rt_put(rt);
1368 			ret = NULL;
1369 		} else if (rt->rt_flags & RTCF_REDIRECTED) {
1370 			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1371 						rt->rt_oif,
1372 						rt_genid(dev_net(dst->dev)));
1373 			rt_del(hash, rt);
1374 			ret = NULL;
1375 		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1376 			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1377 		}
1378 	}
1379 	return ret;
1380 }
1381 
1382 /*
1383  * Algorithm:
1384  *	1. The first ip_rt_redirect_number redirects are sent
1385  *	   with exponential backoff, then we stop sending them at all,
1386  *	   assuming that the host ignores our redirects.
1387  *	2. If we did not see packets requiring redirects
1388  *	   during ip_rt_redirect_silence, we assume that the host
1389  *	   forgot the redirected route and start sending redirects again.
1390  *
1391  * This algorithm is much cheaper and more intelligent than dumb load limiting
1392  * in icmp.c.
1393  *
1394  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1395  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1396  */
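
/*
 * Backoff example with the defaults: after the k-th redirect the next
 * one is held back for (HZ/50) << k jiffies, i.e. 40ms, 80ms, 160ms...
 * After ip_rt_redirect_number (9) ignored redirects we stop entirely
 * until ip_rt_redirect_silence ((HZ/50) << 10, roughly 20 seconds)
 * passes without redirect-worthy traffic.
 */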
1397 
1398 void ip_rt_send_redirect(struct sk_buff *skb)
1399 {
1400 	struct rtable *rt = skb_rtable(skb);
1401 	struct in_device *in_dev;
1402 	struct inet_peer *peer;
1403 	int log_martians;
1404 
1405 	rcu_read_lock();
1406 	in_dev = __in_dev_get_rcu(rt->dst.dev);
1407 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1408 		rcu_read_unlock();
1409 		return;
1410 	}
1411 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1412 	rcu_read_unlock();
1413 
1414 	if (!rt->peer)
1415 		rt_bind_peer(rt, rt->rt_dst, 1);
1416 	peer = rt->peer;
1417 	if (!peer) {
1418 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1419 		return;
1420 	}
1421 
1422 	/* No redirected packets during ip_rt_redirect_silence;
1423 	 * reset the algorithm.
1424 	 */
1425 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1426 		peer->rate_tokens = 0;
1427 
1428 	/* Too many ignored redirects; do not send anything.
1429 	 * Set dst.rate_last to the last seen redirected packet.
1430 	 */
1431 	if (peer->rate_tokens >= ip_rt_redirect_number) {
1432 		peer->rate_last = jiffies;
1433 		return;
1434 	}
1435 
1436 	/* Check for load limit; set rate_last to the latest sent
1437 	 * redirect.
1438 	 */
1439 	if (peer->rate_tokens == 0 ||
1440 	    time_after(jiffies,
1441 		       (peer->rate_last +
1442 			(ip_rt_redirect_load << peer->rate_tokens)))) {
1443 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1444 		peer->rate_last = jiffies;
1445 		++peer->rate_tokens;
1446 #ifdef CONFIG_IP_ROUTE_VERBOSE
1447 		if (log_martians &&
1448 		    peer->rate_tokens == ip_rt_redirect_number &&
1449 		    net_ratelimit())
1450 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1451 			       &ip_hdr(skb)->saddr, rt->rt_iif,
1452 				&rt->rt_dst, &rt->rt_gateway);
1453 #endif
1454 	}
1455 }
1456 
1457 static int ip_error(struct sk_buff *skb)
1458 {
1459 	struct rtable *rt = skb_rtable(skb);
1460 	struct inet_peer *peer;
1461 	unsigned long now;
1462 	bool send;
1463 	int code;
1464 
1465 	switch (rt->dst.error) {
1466 	case EINVAL:
1467 	default:
1468 		goto out;
1469 	case EHOSTUNREACH:
1470 		code = ICMP_HOST_UNREACH;
1471 		break;
1472 	case ENETUNREACH:
1473 		code = ICMP_NET_UNREACH;
1474 		IP_INC_STATS_BH(dev_net(rt->dst.dev),
1475 				IPSTATS_MIB_INNOROUTES);
1476 		break;
1477 	case EACCES:
1478 		code = ICMP_PKT_FILTERED;
1479 		break;
1480 	}
1481 
1482 	if (!rt->peer)
1483 		rt_bind_peer(rt, rt->rt_dst, 1);
1484 	peer = rt->peer;
1485 
1486 	send = true;
1487 	if (peer) {
1488 		now = jiffies;
1489 		peer->rate_tokens += now - peer->rate_last;
1490 		if (peer->rate_tokens > ip_rt_error_burst)
1491 			peer->rate_tokens = ip_rt_error_burst;
1492 		peer->rate_last = now;
1493 		if (peer->rate_tokens >= ip_rt_error_cost)
1494 			peer->rate_tokens -= ip_rt_error_cost;
1495 		else
1496 			send = false;
1497 	}
1498 	if (send)
1499 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1500 
1501 out:	kfree_skb(skb);
1502 	return 0;
1503 }
1504 
1505 /*
1506  *	The last two values are not from the RFC but
1507  *	are needed for AMPRnet AX.25 paths.
1508  */
1509 
1510 static const unsigned short mtu_plateau[] =
1511 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1512 
1513 static inline unsigned short guess_mtu(unsigned short old_mtu)
1514 {
1515 	int i;
1516 
1517 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1518 		if (old_mtu > mtu_plateau[i])
1519 			return mtu_plateau[i];
1520 	return 68;
1521 }
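
/*
 * Plateau example: a bogus or zero MTU from an ICMP "frag needed" is
 * replaced by the next plateau below the old MTU, so old_mtu 1500
 * guesses 1492, 1006 guesses 576, and anything at or below 128 falls
 * through to 68, the minimum IPv4 MTU from RFC 791.
 */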
1522 
1523 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1524 				 unsigned short new_mtu,
1525 				 struct net_device *dev)
1526 {
1527 	unsigned short old_mtu = ntohs(iph->tot_len);
1528 	unsigned short est_mtu = 0;
1529 	struct inet_peer *peer;
1530 
1531 	peer = inet_getpeer_v4(iph->daddr, 1);
1532 	if (peer) {
1533 		unsigned short mtu = new_mtu;
1534 
1535 		if (new_mtu < 68 || new_mtu >= old_mtu) {
1536 			/* BSD 4.2 derived systems incorrectly adjust
1537 			 * tot_len by the IP header length, and report
1538 			 * a zero MTU in the ICMP message.
1539 			 */
1540 			if (mtu == 0 &&
1541 			    old_mtu >= 68 + (iph->ihl << 2))
1542 				old_mtu -= iph->ihl << 2;
1543 			mtu = guess_mtu(old_mtu);
1544 		}
1545 
1546 		if (mtu < ip_rt_min_pmtu)
1547 			mtu = ip_rt_min_pmtu;
1548 		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1549 			unsigned long pmtu_expires;
1550 
1551 			pmtu_expires = jiffies + ip_rt_mtu_expires;
1552 			if (!pmtu_expires)
1553 				pmtu_expires = 1UL;
1554 
1555 			est_mtu = mtu;
1556 			peer->pmtu_learned = mtu;
1557 			peer->pmtu_expires = pmtu_expires;
1558 		}
1559 
1560 		inet_putpeer(peer);
1561 
1562 		atomic_inc(&__rt_peer_genid);
1563 	}
1564 	return est_mtu ? : new_mtu;
1565 }
1566 
1567 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1568 {
1569 	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1570 
1571 	if (!expires)
1572 		return;
1573 	if (time_before(jiffies, expires)) {
1574 		u32 orig_dst_mtu = dst_mtu(dst);
1575 		if (peer->pmtu_learned < orig_dst_mtu) {
1576 			if (!peer->pmtu_orig)
1577 				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1578 			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1579 		}
1580 	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1581 		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1582 }
1583 
1584 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1585 {
1586 	struct rtable *rt = (struct rtable *) dst;
1587 	struct inet_peer *peer;
1588 
1589 	dst_confirm(dst);
1590 
1591 	if (!rt->peer)
1592 		rt_bind_peer(rt, rt->rt_dst, 1);
1593 	peer = rt->peer;
1594 	if (peer) {
1595 		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1596 
1597 		if (mtu < ip_rt_min_pmtu)
1598 			mtu = ip_rt_min_pmtu;
1599 		if (!pmtu_expires || mtu < peer->pmtu_learned) {
1600 
1601 			pmtu_expires = jiffies + ip_rt_mtu_expires;
1602 			if (!pmtu_expires)
1603 				pmtu_expires = 1UL;
1604 
1605 			peer->pmtu_learned = mtu;
1606 			peer->pmtu_expires = pmtu_expires;
1607 
1608 			atomic_inc(&__rt_peer_genid);
1609 			rt->rt_peer_genid = rt_peer_genid();
1610 		}
1611 		check_peer_pmtu(dst, peer);
1612 	}
1613 }
1614 
1615 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1616 {
1617 	struct rtable *rt = (struct rtable *) dst;
1618 	__be32 orig_gw = rt->rt_gateway;
1619 
1620 	dst_confirm(&rt->dst);
1621 
1622 	neigh_release(rt->dst.neighbour);
1623 	rt->dst.neighbour = NULL;
1624 
1625 	rt->rt_gateway = peer->redirect_learned.a4;
1626 	if (rt_bind_neighbour(rt) ||
1627 	    !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1628 		if (rt->dst.neighbour)
1629 			neigh_event_send(rt->dst.neighbour, NULL);
1630 		rt->rt_gateway = orig_gw;
1631 		return -EAGAIN;
1632 	} else {
1633 		rt->rt_flags |= RTCF_REDIRECTED;
1634 		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1635 					rt->dst.neighbour);
1636 	}
1637 	return 0;
1638 }
1639 
1640 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1641 {
1642 	struct rtable *rt = (struct rtable *) dst;
1643 
1644 	if (rt_is_expired(rt))
1645 		return NULL;
1646 	if (rt->rt_peer_genid != rt_peer_genid()) {
1647 		struct inet_peer *peer;
1648 
1649 		if (!rt->peer)
1650 			rt_bind_peer(rt, rt->rt_dst, 0);
1651 
1652 		peer = rt->peer;
1653 		if (peer) {
1654 			check_peer_pmtu(dst, peer);
1655 
1656 			if (peer->redirect_learned.a4 &&
1657 			    peer->redirect_learned.a4 != rt->rt_gateway) {
1658 				if (check_peer_redir(dst, peer))
1659 					return NULL;
1660 			}
1661 		}
1662 
1663 		rt->rt_peer_genid = rt_peer_genid();
1664 	}
1665 	return dst;
1666 }
1667 
1668 static void ipv4_dst_destroy(struct dst_entry *dst)
1669 {
1670 	struct rtable *rt = (struct rtable *) dst;
1671 	struct inet_peer *peer = rt->peer;
1672 
1673 	if (rt->fi) {
1674 		fib_info_put(rt->fi);
1675 		rt->fi = NULL;
1676 	}
1677 	if (peer) {
1678 		rt->peer = NULL;
1679 		inet_putpeer(peer);
1680 	}
1681 }
1682 
1683 
1684 static void ipv4_link_failure(struct sk_buff *skb)
1685 {
1686 	struct rtable *rt;
1687 
1688 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1689 
1690 	rt = skb_rtable(skb);
1691 	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1692 		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1693 }
1694 
1695 static int ip_rt_bug(struct sk_buff *skb)
1696 {
1697 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1698 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1699 		skb->dev ? skb->dev->name : "?");
1700 	kfree_skb(skb);
1701 	WARN_ON(1);
1702 	return 0;
1703 }
1704 
1705 /*
1706    We do not cache the source address of the outgoing interface,
1707    because it is used only by the IP RR, TS and SRR options,
1708    so it is out of the fast path.
1709 
1710    BTW remember: "addr" is allowed to be unaligned
1711    in IP options!
1712  */
1713 
1714 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1715 {
1716 	__be32 src;
1717 
1718 	if (rt_is_output_route(rt))
1719 		src = ip_hdr(skb)->saddr;
1720 	else {
1721 		struct fib_result res;
1722 		struct flowi4 fl4;
1723 		struct iphdr *iph;
1724 
1725 		iph = ip_hdr(skb);
1726 
1727 		memset(&fl4, 0, sizeof(fl4));
1728 		fl4.daddr = iph->daddr;
1729 		fl4.saddr = iph->saddr;
1730 		fl4.flowi4_tos = iph->tos;
1731 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1732 		fl4.flowi4_iif = skb->dev->ifindex;
1733 		fl4.flowi4_mark = skb->mark;
1734 
1735 		rcu_read_lock();
1736 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1737 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1738 		else
1739 			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1740 					RT_SCOPE_UNIVERSE);
1741 		rcu_read_unlock();
1742 	}
1743 	memcpy(addr, &src, 4);
1744 }
1745 
1746 #ifdef CONFIG_IP_ROUTE_CLASSID
1747 static void set_class_tag(struct rtable *rt, u32 tag)
1748 {
1749 	if (!(rt->dst.tclassid & 0xFFFF))
1750 		rt->dst.tclassid |= tag & 0xFFFF;
1751 	if (!(rt->dst.tclassid & 0xFFFF0000))
1752 		rt->dst.tclassid |= tag & 0xFFFF0000;
1753 }
1754 #endif
1755 
1756 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1757 {
1758 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1759 
1760 	if (advmss == 0) {
1761 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1762 			       ip_rt_min_advmss);
1763 		if (advmss > 65535 - 40)
1764 			advmss = 65535 - 40;
1765 	}
1766 	return advmss;
1767 }
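
/*
 * advmss example: with no ADVMSS metric set and a 1500-byte MTU
 * device, the advertised MSS becomes 1460 (MTU minus 40 bytes of
 * IPv4 + TCP headers), clamped between ip_rt_min_advmss (256 by
 * default) and 65495 (65535 - 40).
 */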
1768 
1769 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1770 {
1771 	unsigned int mtu = dst->dev->mtu;
1772 
1773 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1774 		const struct rtable *rt = (const struct rtable *) dst;
1775 
1776 		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1777 			mtu = 576;
1778 	}
1779 
1780 	if (mtu > IP_MAX_MTU)
1781 		mtu = IP_MAX_MTU;
1782 
1783 	return mtu;
1784 }
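
/*
 * The 576-byte clamp above applies only to locked MTU metrics on
 * gatewayed routes (rt_gateway != rt_dst).  A locked metric disables
 * PMTU discovery on the route, so we fall back to the conservative
 * RFC 879 era default for nonlocal destinations rather than trusting
 * the device MTU; that, at least, appears to be the intent.
 */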
1785 
1786 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1787 			    struct fib_info *fi)
1788 {
1789 	struct inet_peer *peer;
1790 	int create = 0;
1791 
1792 	/* If a peer entry exists for this destination, we must hook
1793 	 * it up in order to get at cached metrics.
1794 	 */
1795 	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1796 		create = 1;
1797 
1798 	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1799 	if (peer) {
1800 		rt->rt_peer_genid = rt_peer_genid();
1801 		if (inet_metrics_new(peer))
1802 			memcpy(peer->metrics, fi->fib_metrics,
1803 			       sizeof(u32) * RTAX_MAX);
1804 		dst_init_metrics(&rt->dst, peer->metrics, false);
1805 
1806 		check_peer_pmtu(&rt->dst, peer);
1807 		if (peer->redirect_learned.a4 &&
1808 		    peer->redirect_learned.a4 != rt->rt_gateway) {
1809 			rt->rt_gateway = peer->redirect_learned.a4;
1810 			rt->rt_flags |= RTCF_REDIRECTED;
1811 		}
1812 	} else {
1813 		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1814 			rt->fi = fi;
1815 			atomic_inc(&fi->fib_clntref);
1816 		}
1817 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1818 	}
1819 }
1820 
1821 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1822 			   const struct fib_result *res,
1823 			   struct fib_info *fi, u16 type, u32 itag)
1824 {
1825 	struct dst_entry *dst = &rt->dst;
1826 
1827 	if (fi) {
1828 		if (FIB_RES_GW(*res) &&
1829 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1830 			rt->rt_gateway = FIB_RES_GW(*res);
1831 		rt_init_metrics(rt, fl4, fi);
1832 #ifdef CONFIG_IP_ROUTE_CLASSID
1833 		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1834 #endif
1835 	}
1836 
1837 	if (dst_mtu(dst) > IP_MAX_MTU)
1838 		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1839 	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1840 		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1841 
1842 #ifdef CONFIG_IP_ROUTE_CLASSID
1843 #ifdef CONFIG_IP_MULTIPLE_TABLES
1844 	set_class_tag(rt, fib_rules_tclass(res));
1845 #endif
1846 	set_class_tag(rt, itag);
1847 #endif
1848 }
1849 
1850 static struct rtable *rt_dst_alloc(struct net_device *dev,
1851 				   bool nopolicy, bool noxfrm)
1852 {
1853 	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1854 			 DST_HOST |
1855 			 (nopolicy ? DST_NOPOLICY : 0) |
1856 			 (noxfrm ? DST_NOXFRM : 0));
1857 }
1858 
1859 /* called in rcu_read_lock() section */
1860 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1861 				u8 tos, struct net_device *dev, int our)
1862 {
1863 	unsigned int hash;
1864 	struct rtable *rth;
1865 	__be32 spec_dst;
1866 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1867 	u32 itag = 0;
1868 	int err;
1869 
1870 	/* Primary sanity checks. */
1871 
1872 	if (in_dev == NULL)
1873 		return -EINVAL;
1874 
1875 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1876 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1877 		goto e_inval;
1878 
1879 	if (ipv4_is_zeronet(saddr)) {
1880 		if (!ipv4_is_local_multicast(daddr))
1881 			goto e_inval;
1882 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1883 	} else {
1884 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1885 					  &itag);
1886 		if (err < 0)
1887 			goto e_err;
1888 	}
1889 	rth = rt_dst_alloc(init_net.loopback_dev,
1890 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1891 	if (!rth)
1892 		goto e_nobufs;
1893 
1894 #ifdef CONFIG_IP_ROUTE_CLASSID
1895 	rth->dst.tclassid = itag;
1896 #endif
1897 	rth->dst.output = ip_rt_bug;
1898 
1899 	rth->rt_key_dst	= daddr;
1900 	rth->rt_key_src	= saddr;
1901 	rth->rt_genid	= rt_genid(dev_net(dev));
1902 	rth->rt_flags	= RTCF_MULTICAST;
1903 	rth->rt_type	= RTN_MULTICAST;
1904 	rth->rt_key_tos	= tos;
1905 	rth->rt_dst	= daddr;
1906 	rth->rt_src	= saddr;
1907 	rth->rt_route_iif = dev->ifindex;
1908 	rth->rt_iif	= dev->ifindex;
1909 	rth->rt_oif	= 0;
1910 	rth->rt_mark    = skb->mark;
1911 	rth->rt_gateway	= daddr;
1912 	rth->rt_spec_dst = spec_dst;
1913 	rth->rt_peer_genid = 0;
1914 	rth->peer = NULL;
1915 	rth->fi = NULL;
1916 	if (our) {
1917 		rth->dst.input = ip_local_deliver;
1918 		rth->rt_flags |= RTCF_LOCAL;
1919 	}
1920 
1921 #ifdef CONFIG_IP_MROUTE
1922 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1923 		rth->dst.input = ip_mr_input;
1924 #endif
1925 	RT_CACHE_STAT_INC(in_slow_mc);
1926 
1927 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1928 	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1929 	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1930 
1931 e_nobufs:
1932 	return -ENOBUFS;
1933 e_inval:
1934 	return -EINVAL;
1935 e_err:
1936 	return err;
1937 }
1938 
1939 
1940 static void ip_handle_martian_source(struct net_device *dev,
1941 				     struct in_device *in_dev,
1942 				     struct sk_buff *skb,
1943 				     __be32 daddr,
1944 				     __be32 saddr)
1945 {
1946 	RT_CACHE_STAT_INC(in_martian_src);
1947 #ifdef CONFIG_IP_ROUTE_VERBOSE
1948 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1949 		/*
1950 		 *	RFC1812 recommendation: if the source is martian,
1951 		 *	the only hint is the MAC header.
1952 		 */
1953 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1954 			&daddr, &saddr, dev->name);
1955 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1956 			int i;
1957 			const unsigned char *p = skb_mac_header(skb);
1958 			printk(KERN_WARNING "ll header: ");
1959 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1960 				printk("%02x", *p);
1961 				if (i < (dev->hard_header_len - 1))
1962 					printk(":");
1963 			}
1964 			printk("\n");
1965 		}
1966 	}
1967 #endif
1968 }
1969 
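/* Core of the input path for forwarded packets: validate the source
 * address against the FIB, decide whether an ICMP redirect is called for
 * (same ingress and egress device on shared media), apply the proxy-arp
 * PVLAN restriction for non-IP frames, then allocate and fill the cache
 * entry with ip_forward()/ip_output() as its handlers.
 */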
1970 /* called in rcu_read_lock() section */
1971 static int __mkroute_input(struct sk_buff *skb,
1972 			   const struct fib_result *res,
1973 			   struct in_device *in_dev,
1974 			   __be32 daddr, __be32 saddr, u32 tos,
1975 			   struct rtable **result)
1976 {
1977 	struct rtable *rth;
1978 	int err;
1979 	struct in_device *out_dev;
1980 	unsigned int flags = 0;
1981 	__be32 spec_dst;
1982 	u32 itag;
1983 
1984 	/* get a working reference to the output device */
1985 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1986 	if (out_dev == NULL) {
1987 		if (net_ratelimit())
1988 			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1989 			       "Please report.\n");
1990 		return -EINVAL;
1991 	}
1992 
1993 
1994 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1995 				  in_dev->dev, &spec_dst, &itag);
1996 	if (err < 0) {
1997 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1998 					 saddr);
1999 
2000 		goto cleanup;
2001 	}
2002 
2003 	if (err)
2004 		flags |= RTCF_DIRECTSRC;
2005 
2006 	if (out_dev == in_dev && err &&
2007 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
2008 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2009 		flags |= RTCF_DOREDIRECT;
2010 
2011 	if (skb->protocol != htons(ETH_P_IP)) {
2012 		/* Not IP (i.e. ARP). Do not create a route if it is
2013 		 * invalid for proxy arp. DNAT routes are always valid.
2014 		 *
2015 		 * The proxy arp feature has been extended to allow ARP
2016 		 * replies back out of the same interface, to support
2017 		 * Private VLAN switch technologies. See arp.c.
2018 		 */
2019 		if (out_dev == in_dev &&
2020 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2021 			err = -EINVAL;
2022 			goto cleanup;
2023 		}
2024 	}
2025 
2026 	rth = rt_dst_alloc(out_dev->dev,
2027 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2028 			   IN_DEV_CONF_GET(out_dev, NOXFRM));
2029 	if (!rth) {
2030 		err = -ENOBUFS;
2031 		goto cleanup;
2032 	}
2033 
2034 	rth->rt_key_dst	= daddr;
2035 	rth->rt_key_src	= saddr;
2036 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2037 	rth->rt_flags = flags;
2038 	rth->rt_type = res->type;
2039 	rth->rt_key_tos	= tos;
2040 	rth->rt_dst	= daddr;
2041 	rth->rt_src	= saddr;
2042 	rth->rt_route_iif = in_dev->dev->ifindex;
2043 	rth->rt_iif 	= in_dev->dev->ifindex;
2044 	rth->rt_oif 	= 0;
2045 	rth->rt_mark    = skb->mark;
2046 	rth->rt_gateway	= daddr;
2047 	rth->rt_spec_dst = spec_dst;
2048 	rth->rt_peer_genid = 0;
2049 	rth->peer = NULL;
2050 	rth->fi = NULL;
2051 
2052 	rth->dst.input = ip_forward;
2053 	rth->dst.output = ip_output;
2054 
2055 	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2056 
2057 	*result = rth;
2058 	err = 0;
2059  cleanup:
2060 	return err;
2061 }
2062 
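/* Wrapper that picks a nexthop (multipath, if configured), creates the
 * cache entry via __mkroute_input() and inserts it into the hash table.
 * The bucket is chosen from (daddr, saddr, iif) plus the per-namespace
 * generation id, so a cache flush only has to bump the genid.
 */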
2063 static int ip_mkroute_input(struct sk_buff *skb,
2064 			    struct fib_result *res,
2065 			    const struct flowi4 *fl4,
2066 			    struct in_device *in_dev,
2067 			    __be32 daddr, __be32 saddr, u32 tos)
2068 {
2069 	struct rtable *rth = NULL;
2070 	int err;
2071 	unsigned hash;
2072 
2073 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2074 	if (res->fi && res->fi->fib_nhs > 1)
2075 		fib_select_multipath(res);
2076 #endif
2077 
2078 	/* create a routing cache entry */
2079 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2080 	if (err)
2081 		return err;
2082 
2083 	/* put it into the cache */
2084 	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2085 		       rt_genid(dev_net(rth->dst.dev)));
2086 	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2087 	if (IS_ERR(rth))
2088 		return PTR_ERR(rth);
2089 	return 0;
2090 }
2091 
2092 /*
2093  *	NOTE. We drop all packets that have a local source
2094  *	address, because every properly looped-back packet
2095  *	must have the correct destination already attached by the output routine.
2096  *
2097  *	This approach solves two big problems:
2098  *	1. Non-simplex devices are handled properly.
2099  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2100  *	Called with rcu_read_lock().
2101  */
2102 
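/* The slow path works in three stages: weed out martian addresses that
 * fib_lookup() cannot catch, look the destination up in the FIB, then
 * branch to broadcast input, local delivery or route forwarding
 * (ip_mkroute_input()) according to the result type.
 */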
2103 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2104 			       u8 tos, struct net_device *dev)
2105 {
2106 	struct fib_result res;
2107 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2108 	struct flowi4	fl4;
2109 	unsigned	flags = 0;
2110 	u32		itag = 0;
2111 	struct rtable	*rth;
2112 	unsigned	hash;
2113 	__be32		spec_dst;
2114 	int		err = -EINVAL;
2115 	struct net	*net = dev_net(dev);
2116 
2117 	/* IP on this device is disabled. */
2118 
2119 	if (!in_dev)
2120 		goto out;
2121 
2122 	/* Check for the most weird martians, which cannot be detected
2123 	   by fib_lookup.
2124 	 */
2125 
2126 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2127 	    ipv4_is_loopback(saddr))
2128 		goto martian_source;
2129 
2130 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2131 		goto brd_input;
2132 
2133 	/* Accept zero addresses only to limited broadcast;
2134 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2135 	 */
2136 	if (ipv4_is_zeronet(saddr))
2137 		goto martian_source;
2138 
2139 	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2140 		goto martian_destination;
2141 
2142 	/*
2143 	 *	Now we are ready to route packet.
2144 	 */
2145 	fl4.flowi4_oif = 0;
2146 	fl4.flowi4_iif = dev->ifindex;
2147 	fl4.flowi4_mark = skb->mark;
2148 	fl4.flowi4_tos = tos;
2149 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2150 	fl4.daddr = daddr;
2151 	fl4.saddr = saddr;
2152 	err = fib_lookup(net, &fl4, &res);
2153 	if (err != 0) {
2154 		if (!IN_DEV_FORWARD(in_dev))
2155 			goto e_hostunreach;
2156 		goto no_route;
2157 	}
2158 
2159 	RT_CACHE_STAT_INC(in_slow_tot);
2160 
2161 	if (res.type == RTN_BROADCAST)
2162 		goto brd_input;
2163 
2164 	if (res.type == RTN_LOCAL) {
2165 		err = fib_validate_source(skb, saddr, daddr, tos,
2166 					  net->loopback_dev->ifindex,
2167 					  dev, &spec_dst, &itag);
2168 		if (err < 0)
2169 			goto martian_source_keep_err;
2170 		if (err)
2171 			flags |= RTCF_DIRECTSRC;
2172 		spec_dst = daddr;
2173 		goto local_input;
2174 	}
2175 
2176 	if (!IN_DEV_FORWARD(in_dev))
2177 		goto e_hostunreach;
2178 	if (res.type != RTN_UNICAST)
2179 		goto martian_destination;
2180 
2181 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2182 out:	return err;
2183 
2184 brd_input:
2185 	if (skb->protocol != htons(ETH_P_IP))
2186 		goto e_inval;
2187 
2188 	if (ipv4_is_zeronet(saddr))
2189 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2190 	else {
2191 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2192 					  &itag);
2193 		if (err < 0)
2194 			goto martian_source_keep_err;
2195 		if (err)
2196 			flags |= RTCF_DIRECTSRC;
2197 	}
2198 	flags |= RTCF_BROADCAST;
2199 	res.type = RTN_BROADCAST;
2200 	RT_CACHE_STAT_INC(in_brd);
2201 
2202 local_input:
2203 	rth = rt_dst_alloc(net->loopback_dev,
2204 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2205 	if (!rth)
2206 		goto e_nobufs;
2207 
2208 	rth->dst.input = ip_local_deliver;
2209 	rth->dst.output = ip_rt_bug;
2210 #ifdef CONFIG_IP_ROUTE_CLASSID
2211 	rth->dst.tclassid = itag;
2212 #endif
2213 
2214 	rth->rt_key_dst	= daddr;
2215 	rth->rt_key_src	= saddr;
2216 	rth->rt_genid = rt_genid(net);
2217 	rth->rt_flags 	= flags|RTCF_LOCAL;
2218 	rth->rt_type	= res.type;
2219 	rth->rt_key_tos	= tos;
2220 	rth->rt_dst	= daddr;
2221 	rth->rt_src	= saddr;
2225 	rth->rt_route_iif = dev->ifindex;
2226 	rth->rt_iif	= dev->ifindex;
2227 	rth->rt_oif	= 0;
2228 	rth->rt_mark    = skb->mark;
2229 	rth->rt_gateway	= daddr;
2230 	rth->rt_spec_dst = spec_dst;
2231 	rth->rt_peer_genid = 0;
2232 	rth->peer = NULL;
2233 	rth->fi = NULL;
2234 	if (res.type == RTN_UNREACHABLE) {
2235 		rth->dst.input = ip_error;
2236 		rth->dst.error = -err;
2237 		rth->rt_flags 	&= ~RTCF_LOCAL;
2238 	}
2239 	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2240 	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2241 	err = 0;
2242 	if (IS_ERR(rth))
2243 		err = PTR_ERR(rth);
2244 	goto out;
2245 
2246 no_route:
2247 	RT_CACHE_STAT_INC(in_no_route);
2248 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2249 	res.type = RTN_UNREACHABLE;
2250 	if (err == -ESRCH)
2251 		err = -ENETUNREACH;
2252 	goto local_input;
2253 
2254 	/*
2255 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2256 	 */
2257 martian_destination:
2258 	RT_CACHE_STAT_INC(in_martian_dst);
2259 #ifdef CONFIG_IP_ROUTE_VERBOSE
2260 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2261 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2262 			&daddr, &saddr, dev->name);
2263 #endif
2264 
2265 e_hostunreach:
2266 	err = -EHOSTUNREACH;
2267 	goto out;
2268 
2269 e_inval:
2270 	err = -EINVAL;
2271 	goto out;
2272 
2273 e_nobufs:
2274 	err = -ENOBUFS;
2275 	goto out;
2276 
2277 martian_source:
2278 	err = -EINVAL;
2279 martian_source_keep_err:
2280 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2281 	goto out;
2282 }
2283 
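/* Fast path for input routing: probe the cache before falling back to
 * ip_route_input_slow().  The lookup folds the XORed key fields together,
 *
 *	(key_dst ^ daddr) | (key_src ^ saddr) | (iif ^ iif') | oif | (tos ^ tos')
 *
 * which is zero only when every field matches and the entry is an input
 * route (rt_oif == 0).  With @noref the dst refcount is left alone, which
 * is safe only while the caller stays inside its RCU read-side section.
 */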
2284 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2285 			   u8 tos, struct net_device *dev, bool noref)
2286 {
2287 	struct rtable	*rth;
2288 	unsigned	hash;
2289 	int iif = dev->ifindex;
2290 	struct net *net;
2291 	int res;
2292 
2293 	net = dev_net(dev);
2294 
2295 	rcu_read_lock();
2296 
2297 	if (!rt_caching(net))
2298 		goto skip_cache;
2299 
2300 	tos &= IPTOS_RT_MASK;
2301 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2302 
2303 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2304 	     rth = rcu_dereference(rth->dst.rt_next)) {
2305 		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2306 		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2307 		     (rth->rt_iif ^ iif) |
2308 		     rth->rt_oif |
2309 		     (rth->rt_key_tos ^ tos)) == 0 &&
2310 		    rth->rt_mark == skb->mark &&
2311 		    net_eq(dev_net(rth->dst.dev), net) &&
2312 		    !rt_is_expired(rth)) {
2313 			if (noref) {
2314 				dst_use_noref(&rth->dst, jiffies);
2315 				skb_dst_set_noref(skb, &rth->dst);
2316 			} else {
2317 				dst_use(&rth->dst, jiffies);
2318 				skb_dst_set(skb, &rth->dst);
2319 			}
2320 			RT_CACHE_STAT_INC(in_hit);
2321 			rcu_read_unlock();
2322 			return 0;
2323 		}
2324 		RT_CACHE_STAT_INC(in_hlist_search);
2325 	}
2326 
2327 skip_cache:
2328 	/* Multicast recognition logic was moved from the route cache to here.
2329 	   The problem was that too many Ethernet cards have broken/missing
2330 	   hardware multicast filters :-( As a result, a host on a multicast
2331 	   network acquires a lot of useless route cache entries, e.g. for
2332 	   SDR messages from all over the world. Now we try to get rid of them.
2333 	   Really, provided the software IP multicast filter is organized
2334 	   reasonably (at least, hashed), this does not result in a slowdown
2335 	   compared with route cache reject entries.
2336 	   Note that multicast routers are not affected, because a
2337 	   route cache entry is created eventually.
2338 	 */
2339 	if (ipv4_is_multicast(daddr)) {
2340 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2341 
2342 		if (in_dev) {
2343 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2344 						  ip_hdr(skb)->protocol);
2345 			if (our
2346 #ifdef CONFIG_IP_MROUTE
2347 				||
2348 			    (!ipv4_is_local_multicast(daddr) &&
2349 			     IN_DEV_MFORWARD(in_dev))
2350 #endif
2351 			   ) {
2352 				int res = ip_route_input_mc(skb, daddr, saddr,
2353 							    tos, dev, our);
2354 				rcu_read_unlock();
2355 				return res;
2356 			}
2357 		}
2358 		rcu_read_unlock();
2359 		return -EINVAL;
2360 	}
2361 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2362 	rcu_read_unlock();
2363 	return res;
2364 }
2365 EXPORT_SYMBOL(ip_route_input_common);
2366 
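/* Output counterpart of __mkroute_input(): classify the destination
 * (broadcast/multicast/unicast), wire up ip_output() or ip_mc_output()
 * and let rt_set_nexthop() attach the gateway and metrics.  Note that the
 * cache key uses the original daddr/saddr/oif as passed by the caller,
 * not the values the resolver may have filled in along the way.
 */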
2367 /* called with rcu_read_lock() */
2368 static struct rtable *__mkroute_output(const struct fib_result *res,
2369 				       const struct flowi4 *fl4,
2370 				       __be32 orig_daddr, __be32 orig_saddr,
2371 				       int orig_oif, struct net_device *dev_out,
2372 				       unsigned int flags)
2373 {
2374 	struct fib_info *fi = res->fi;
2375 	u32 tos = RT_FL_TOS(fl4);
2376 	struct in_device *in_dev;
2377 	u16 type = res->type;
2378 	struct rtable *rth;
2379 
2380 	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2381 		return ERR_PTR(-EINVAL);
2382 
2383 	if (ipv4_is_lbcast(fl4->daddr))
2384 		type = RTN_BROADCAST;
2385 	else if (ipv4_is_multicast(fl4->daddr))
2386 		type = RTN_MULTICAST;
2387 	else if (ipv4_is_zeronet(fl4->daddr))
2388 		return ERR_PTR(-EINVAL);
2389 
2390 	if (dev_out->flags & IFF_LOOPBACK)
2391 		flags |= RTCF_LOCAL;
2392 
2393 	in_dev = __in_dev_get_rcu(dev_out);
2394 	if (!in_dev)
2395 		return ERR_PTR(-EINVAL);
2396 
2397 	if (type == RTN_BROADCAST) {
2398 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2399 		fi = NULL;
2400 	} else if (type == RTN_MULTICAST) {
2401 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2402 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2403 				     fl4->flowi4_proto))
2404 			flags &= ~RTCF_LOCAL;
2405 		/* If a multicast route does not exist, use the
2406 		 * default one, but do not use a gateway in this case.
2407 		 * Yes, it is a hack.
2408 		 */
2409 		if (fi && res->prefixlen < 4)
2410 			fi = NULL;
2411 	}
2412 
2413 	rth = rt_dst_alloc(dev_out,
2414 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2415 			   IN_DEV_CONF_GET(in_dev, NOXFRM));
2416 	if (!rth)
2417 		return ERR_PTR(-ENOBUFS);
2418 
2419 	rth->dst.output = ip_output;
2420 
2421 	rth->rt_key_dst	= orig_daddr;
2422 	rth->rt_key_src	= orig_saddr;
2423 	rth->rt_genid = rt_genid(dev_net(dev_out));
2424 	rth->rt_flags	= flags;
2425 	rth->rt_type	= type;
2426 	rth->rt_key_tos	= tos;
2427 	rth->rt_dst	= fl4->daddr;
2428 	rth->rt_src	= fl4->saddr;
2429 	rth->rt_route_iif = 0;
2430 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
2431 	rth->rt_oif	= orig_oif;
2432 	rth->rt_mark    = fl4->flowi4_mark;
2433 	rth->rt_gateway = fl4->daddr;
2434 	rth->rt_spec_dst = fl4->saddr;
2435 	rth->rt_peer_genid = 0;
2436 	rth->peer = NULL;
2437 	rth->fi = NULL;
2438 
2439 	RT_CACHE_STAT_INC(out_slow_tot);
2440 
2441 	if (flags & RTCF_LOCAL) {
2442 		rth->dst.input = ip_local_deliver;
2443 		rth->rt_spec_dst = fl4->daddr;
2444 	}
2445 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2446 		rth->rt_spec_dst = fl4->saddr;
2447 		if (flags & RTCF_LOCAL &&
2448 		    !(dev_out->flags & IFF_LOOPBACK)) {
2449 			rth->dst.output = ip_mc_output;
2450 			RT_CACHE_STAT_INC(out_slow_mc);
2451 		}
2452 #ifdef CONFIG_IP_MROUTE
2453 		if (type == RTN_MULTICAST) {
2454 			if (IN_DEV_MFORWARD(in_dev) &&
2455 			    !ipv4_is_local_multicast(fl4->daddr)) {
2456 				rth->dst.input = ip_mr_input;
2457 				rth->dst.output = ip_mc_output;
2458 			}
2459 		}
2460 #endif
2461 	}
2462 
2463 	rt_set_nexthop(rth, fl4, res, fi, type, 0);
2464 
2465 	return rth;
2466 }
2467 
2468 /*
2469  * Major route resolver routine.
2470  * called with rcu_read_lock();
2471  */
2472 
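/* Resolution order: sanity-check any caller-supplied source address,
 * resolve a forced output device (flowi4_oif), short-circuit local and
 * loopback destinations, and only then consult the FIB.  A failed lookup
 * with an explicit oif still yields an on-link route, matching the
 * MSG_DONTROUTE semantics described below.
 */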
2473 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2474 {
2475 	struct net_device *dev_out = NULL;
2476 	u32 tos	= RT_FL_TOS(fl4);
2477 	unsigned int flags = 0;
2478 	struct fib_result res;
2479 	struct rtable *rth;
2480 	__be32 orig_daddr;
2481 	__be32 orig_saddr;
2482 	int orig_oif;
2483 
2484 	res.fi		= NULL;
2485 #ifdef CONFIG_IP_MULTIPLE_TABLES
2486 	res.r		= NULL;
2487 #endif
2488 
2489 	orig_daddr = fl4->daddr;
2490 	orig_saddr = fl4->saddr;
2491 	orig_oif = fl4->flowi4_oif;
2492 
2493 	fl4->flowi4_iif = net->loopback_dev->ifindex;
2494 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2495 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2496 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2497 
2498 	rcu_read_lock();
2499 	if (fl4->saddr) {
2500 		rth = ERR_PTR(-EINVAL);
2501 		if (ipv4_is_multicast(fl4->saddr) ||
2502 		    ipv4_is_lbcast(fl4->saddr) ||
2503 		    ipv4_is_zeronet(fl4->saddr))
2504 			goto out;
2505 
2506 		/* I removed check for oif == dev_out->oif here.
2507 		   It was wrong for two reasons:
2508 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2509 		      is assigned to multiple interfaces.
2510 		   2. Moreover, we are allowed to send packets with saddr
2511 		      of another iface. --ANK
2512 		 */
2513 
2514 		if (fl4->flowi4_oif == 0 &&
2515 		    (ipv4_is_multicast(fl4->daddr) ||
2516 		     ipv4_is_lbcast(fl4->daddr))) {
2517 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2518 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2519 			if (dev_out == NULL)
2520 				goto out;
2521 
2522 			/* Special hack: the user can direct multicasts
2523 			   and limited broadcast via the necessary interface
2524 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2525 			   This hack is not just for fun, it allows
2526 			   vic, vat and friends to work.
2527 			   They bind a socket to loopback, set the ttl to zero
2528 			   and expect that it will work.
2529 			   From the viewpoint of the routing cache they are broken,
2530 			   because we are not allowed to build a multicast path
2531 			   with a loopback source addr (look, the routing cache
2532 			   cannot know that the ttl is zero, so that the packet
2533 			   will not leave this host and the route is valid).
2534 			   Luckily, this hack is a good workaround.
2535 			 */
2536 
2537 			fl4->flowi4_oif = dev_out->ifindex;
2538 			goto make_route;
2539 		}
2540 
2541 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2542 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2543 			if (!__ip_dev_find(net, fl4->saddr, false))
2544 				goto out;
2545 		}
2546 	}
2547 
2548 
2549 	if (fl4->flowi4_oif) {
2550 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2551 		rth = ERR_PTR(-ENODEV);
2552 		if (dev_out == NULL)
2553 			goto out;
2554 
2555 		/* RACE: Check return value of inet_select_addr instead. */
2556 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2557 			rth = ERR_PTR(-ENETUNREACH);
2558 			goto out;
2559 		}
2560 		if (ipv4_is_local_multicast(fl4->daddr) ||
2561 		    ipv4_is_lbcast(fl4->daddr)) {
2562 			if (!fl4->saddr)
2563 				fl4->saddr = inet_select_addr(dev_out, 0,
2564 							      RT_SCOPE_LINK);
2565 			goto make_route;
2566 		}
2567 		if (fl4->saddr) {
2568 			if (ipv4_is_multicast(fl4->daddr))
2569 				fl4->saddr = inet_select_addr(dev_out, 0,
2570 							      fl4->flowi4_scope);
2571 			else if (!fl4->daddr)
2572 				fl4->saddr = inet_select_addr(dev_out, 0,
2573 							      RT_SCOPE_HOST);
2574 		}
2575 	}
2576 
2577 	if (!fl4->daddr) {
2578 		fl4->daddr = fl4->saddr;
2579 		if (!fl4->daddr)
2580 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2581 		dev_out = net->loopback_dev;
2582 		fl4->flowi4_oif = net->loopback_dev->ifindex;
2583 		res.type = RTN_LOCAL;
2584 		flags |= RTCF_LOCAL;
2585 		goto make_route;
2586 	}
2587 
2588 	if (fib_lookup(net, fl4, &res)) {
2589 		res.fi = NULL;
2590 		if (fl4->flowi4_oif) {
2591 			/* Apparently, the routing tables are wrong. Assume
2592 			   that the destination is on-link.
2593 
2594 			   WHY? DW.
2595 			   Because we are allowed to send to an iface
2596 			   even if it has NO routes and NO assigned
2597 			   addresses. When oif is specified, the routing
2598 			   tables are looked up with only one purpose:
2599 			   to catch whether the destination is gatewayed,
2600 			   rather than direct. Moreover, if MSG_DONTROUTE is
2601 			   set, we send the packet, ignoring both the routing
2602 			   tables and the ifaddr state. --ANK
2603 
2604 
2605 			   We could do it even if oif is unknown,
2606 			   likely IPv6, but we do not.
2607 			 */
2608 
2609 			if (fl4->saddr == 0)
2610 				fl4->saddr = inet_select_addr(dev_out, 0,
2611 							      RT_SCOPE_LINK);
2612 			res.type = RTN_UNICAST;
2613 			goto make_route;
2614 		}
2615 		rth = ERR_PTR(-ENETUNREACH);
2616 		goto out;
2617 	}
2618 
2619 	if (res.type == RTN_LOCAL) {
2620 		if (!fl4->saddr) {
2621 			if (res.fi->fib_prefsrc)
2622 				fl4->saddr = res.fi->fib_prefsrc;
2623 			else
2624 				fl4->saddr = fl4->daddr;
2625 		}
2626 		dev_out = net->loopback_dev;
2627 		fl4->flowi4_oif = dev_out->ifindex;
2628 		res.fi = NULL;
2629 		flags |= RTCF_LOCAL;
2630 		goto make_route;
2631 	}
2632 
2633 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2634 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2635 		fib_select_multipath(&res);
2636 	else
2637 #endif
2638 	if (!res.prefixlen &&
2639 	    res.table->tb_num_default > 1 &&
2640 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2641 		fib_select_default(&res);
2642 
2643 	if (!fl4->saddr)
2644 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2645 
2646 	dev_out = FIB_RES_DEV(res);
2647 	fl4->flowi4_oif = dev_out->ifindex;
2648 
2649 
2650 make_route:
2651 	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2652 			       dev_out, flags);
2653 	if (!IS_ERR(rth)) {
2654 		unsigned int hash;
2655 
2656 		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2657 			       rt_genid(dev_net(dev_out)));
2658 		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2659 	}
2660 
2661 out:
2662 	rcu_read_unlock();
2663 	return rth;
2664 }
2665 
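/* Cached front end for output route resolution.  The TOS comparison masks
 * with IPTOS_RT_MASK | RTO_ONLINK, so both the routing TOS bits and the
 * on-link flag must agree.  A minimal calling sketch (net, dst_ip and tos
 * are hypothetical caller-supplied values):
 *
 *	struct flowi4 fl4 = {
 *		.daddr = dst_ip,
 *		.flowi4_tos = tos,
 *	};
 *	struct rtable *rt = __ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * On success the flow's saddr/daddr are backfilled from the chosen entry
 * when the caller left them unspecified.
 */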
2666 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2667 {
2668 	struct rtable *rth;
2669 	unsigned int hash;
2670 
2671 	if (!rt_caching(net))
2672 		goto slow_output;
2673 
2674 	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2675 
2676 	rcu_read_lock_bh();
2677 	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2678 		rth = rcu_dereference_bh(rth->dst.rt_next)) {
2679 		if (rth->rt_key_dst == flp4->daddr &&
2680 		    rth->rt_key_src == flp4->saddr &&
2681 		    rt_is_output_route(rth) &&
2682 		    rth->rt_oif == flp4->flowi4_oif &&
2683 		    rth->rt_mark == flp4->flowi4_mark &&
2684 		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2685 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2686 		    net_eq(dev_net(rth->dst.dev), net) &&
2687 		    !rt_is_expired(rth)) {
2688 			dst_use(&rth->dst, jiffies);
2689 			RT_CACHE_STAT_INC(out_hit);
2690 			rcu_read_unlock_bh();
2691 			if (!flp4->saddr)
2692 				flp4->saddr = rth->rt_src;
2693 			if (!flp4->daddr)
2694 				flp4->daddr = rth->rt_dst;
2695 			return rth;
2696 		}
2697 		RT_CACHE_STAT_INC(out_hlist_search);
2698 	}
2699 	rcu_read_unlock_bh();
2700 
2701 slow_output:
2702 	return ip_route_output_slow(net, flp4);
2703 }
2704 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2705 
2706 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2707 {
2708 	return NULL;
2709 }
2710 
2711 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2712 {
2713 	return 0;
2714 }
2715 
2716 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2717 {
2718 }
2719 
2720 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2721 					  unsigned long old)
2722 {
2723 	return NULL;
2724 }
2725 
2726 static struct dst_ops ipv4_dst_blackhole_ops = {
2727 	.family			=	AF_INET,
2728 	.protocol		=	cpu_to_be16(ETH_P_IP),
2729 	.destroy		=	ipv4_dst_destroy,
2730 	.check			=	ipv4_blackhole_dst_check,
2731 	.default_mtu		=	ipv4_blackhole_default_mtu,
2732 	.default_advmss		=	ipv4_default_advmss,
2733 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2734 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2735 };
2736 
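/* Used by the transform (xfrm) layer when a flow must be silently
 * discarded: the original route's keys and metrics are copied into an
 * entry whose input/output handlers are dst_discard, so lookups still
 * match but every packet is dropped.
 */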
2737 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2738 {
2739 	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2740 	struct rtable *ort = (struct rtable *) dst_orig;
2741 
2742 	if (rt) {
2743 		struct dst_entry *new = &rt->dst;
2744 
2745 		new->__use = 1;
2746 		new->input = dst_discard;
2747 		new->output = dst_discard;
2748 		dst_copy_metrics(new, &ort->dst);
2749 
2750 		new->dev = ort->dst.dev;
2751 		if (new->dev)
2752 			dev_hold(new->dev);
2753 
2754 		rt->rt_key_dst = ort->rt_key_dst;
2755 		rt->rt_key_src = ort->rt_key_src;
2756 		rt->rt_key_tos = ort->rt_key_tos;
2757 		rt->rt_route_iif = ort->rt_route_iif;
2758 		rt->rt_iif = ort->rt_iif;
2759 		rt->rt_oif = ort->rt_oif;
2760 		rt->rt_mark = ort->rt_mark;
2761 
2762 		rt->rt_genid = rt_genid(net);
2763 		rt->rt_flags = ort->rt_flags;
2764 		rt->rt_type = ort->rt_type;
2765 		rt->rt_dst = ort->rt_dst;
2766 		rt->rt_src = ort->rt_src;
2767 		rt->rt_gateway = ort->rt_gateway;
2768 		rt->rt_spec_dst = ort->rt_spec_dst;
2769 		rt->peer = ort->peer;
2770 		if (rt->peer)
2771 			atomic_inc(&rt->peer->refcnt);
2772 		rt->fi = ort->fi;
2773 		if (rt->fi)
2774 			atomic_inc(&rt->fi->fib_clntref);
2775 
2776 		dst_free(new);
2777 	}
2778 
2779 	dst_release(dst_orig);
2780 
2781 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2782 }
2783 
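/* Like __ip_route_output_key(), but when the flow carries a protocol the
 * result is additionally passed through xfrm_lookup() so IPsec policy can
 * replace or veto the plain route.
 */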
2784 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2785 				    struct sock *sk)
2786 {
2787 	struct rtable *rt = __ip_route_output_key(net, flp4);
2788 
2789 	if (IS_ERR(rt))
2790 		return rt;
2791 
2792 	if (flp4->flowi4_proto)
2793 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2794 						   flowi4_to_flowi(flp4),
2795 						   sk, 0);
2796 
2797 	return rt;
2798 }
2799 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2800 
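/* Translate a cache entry into an RTM_NEWROUTE netlink message: one
 * struct rtmsg header plus RTA_* attributes for the addresses, device,
 * metrics and (for input routes) the incoming interface.
 */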
2801 static int rt_fill_info(struct net *net,
2802 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2803 			int nowait, unsigned int flags)
2804 {
2805 	struct rtable *rt = skb_rtable(skb);
2806 	struct rtmsg *r;
2807 	struct nlmsghdr *nlh;
2808 	long expires = 0;
2809 	const struct inet_peer *peer = rt->peer;
2810 	u32 id = 0, ts = 0, tsage = 0, error;
2811 
2812 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2813 	if (nlh == NULL)
2814 		return -EMSGSIZE;
2815 
2816 	r = nlmsg_data(nlh);
2817 	r->rtm_family	 = AF_INET;
2818 	r->rtm_dst_len	= 32;
2819 	r->rtm_src_len	= 0;
2820 	r->rtm_tos	= rt->rt_key_tos;
2821 	r->rtm_table	= RT_TABLE_MAIN;
2822 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2823 	r->rtm_type	= rt->rt_type;
2824 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2825 	r->rtm_protocol = RTPROT_UNSPEC;
2826 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2827 	if (rt->rt_flags & RTCF_NOTIFY)
2828 		r->rtm_flags |= RTM_F_NOTIFY;
2829 
2830 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2831 
2832 	if (rt->rt_key_src) {
2833 		r->rtm_src_len = 32;
2834 		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2835 	}
2836 	if (rt->dst.dev)
2837 		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2838 #ifdef CONFIG_IP_ROUTE_CLASSID
2839 	if (rt->dst.tclassid)
2840 		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2841 #endif
2842 	if (rt_is_input_route(rt))
2843 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2844 	else if (rt->rt_src != rt->rt_key_src)
2845 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2846 
2847 	if (rt->rt_dst != rt->rt_gateway)
2848 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2849 
2850 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2851 		goto nla_put_failure;
2852 
2853 	if (rt->rt_mark)
2854 		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2855 
2856 	error = rt->dst.error;
2857 	if (peer) {
2858 		inet_peer_refcheck(rt->peer);
2859 		id = atomic_read(&peer->ip_id_count) & 0xffff;
2860 		if (peer->tcp_ts_stamp) {
2861 			ts = peer->tcp_ts;
2862 			tsage = get_seconds() - peer->tcp_ts_stamp;
2863 		}
2864 		expires = ACCESS_ONCE(peer->pmtu_expires);
2865 		if (expires)
2866 			expires -= jiffies;
2867 	}
2868 
2869 	if (rt_is_input_route(rt)) {
2870 #ifdef CONFIG_IP_MROUTE
2871 		__be32 dst = rt->rt_dst;
2872 
2873 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2874 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2875 			int err = ipmr_get_route(net, skb,
2876 						 rt->rt_src, rt->rt_dst,
2877 						 r, nowait);
2878 			if (err <= 0) {
2879 				if (!nowait) {
2880 					if (err == 0)
2881 						return 0;
2882 					goto nla_put_failure;
2883 				} else {
2884 					if (err == -EMSGSIZE)
2885 						goto nla_put_failure;
2886 					error = err;
2887 				}
2888 			}
2889 		} else
2890 #endif
2891 			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2892 	}
2893 
2894 	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2895 			       expires, error) < 0)
2896 		goto nla_put_failure;
2897 
2898 	return nlmsg_end(skb, nlh);
2899 
2900 nla_put_failure:
2901 	nlmsg_cancel(skb, nlh);
2902 	return -EMSGSIZE;
2903 }
2904 
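/* Handler for RTM_GETROUTE queries (what "ip route get" sends).  With
 * RTA_IIF we fake an incoming packet and run it through ip_route_input();
 * otherwise we resolve an output route, then answer the caller with a
 * single rt_fill_info() message.
 */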
2905 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2906 {
2907 	struct net *net = sock_net(in_skb->sk);
2908 	struct rtmsg *rtm;
2909 	struct nlattr *tb[RTA_MAX+1];
2910 	struct rtable *rt = NULL;
2911 	__be32 dst = 0;
2912 	__be32 src = 0;
2913 	u32 iif;
2914 	int err;
2915 	int mark;
2916 	struct sk_buff *skb;
2917 
2918 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2919 	if (err < 0)
2920 		goto errout;
2921 
2922 	rtm = nlmsg_data(nlh);
2923 
2924 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2925 	if (skb == NULL) {
2926 		err = -ENOBUFS;
2927 		goto errout;
2928 	}
2929 
2930 	/* Reserve room for dummy headers; this skb can pass
2931 	   through a good chunk of the routing engine.
2932 	 */
2933 	skb_reset_mac_header(skb);
2934 	skb_reset_network_header(skb);
2935 
2936 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2937 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2938 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2939 
2940 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2941 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2942 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2943 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2944 
2945 	if (iif) {
2946 		struct net_device *dev;
2947 
2948 		dev = __dev_get_by_index(net, iif);
2949 		if (dev == NULL) {
2950 			err = -ENODEV;
2951 			goto errout_free;
2952 		}
2953 
2954 		skb->protocol	= htons(ETH_P_IP);
2955 		skb->dev	= dev;
2956 		skb->mark	= mark;
2957 		local_bh_disable();
2958 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2959 		local_bh_enable();
2960 
2961 		rt = skb_rtable(skb);
2962 		if (err == 0 && rt->dst.error)
2963 			err = -rt->dst.error;
2964 	} else {
2965 		struct flowi4 fl4 = {
2966 			.daddr = dst,
2967 			.saddr = src,
2968 			.flowi4_tos = rtm->rtm_tos,
2969 			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2970 			.flowi4_mark = mark,
2971 		};
2972 		rt = ip_route_output_key(net, &fl4);
2973 
2974 		err = 0;
2975 		if (IS_ERR(rt))
2976 			err = PTR_ERR(rt);
2977 	}
2978 
2979 	if (err)
2980 		goto errout_free;
2981 
2982 	skb_dst_set(skb, &rt->dst);
2983 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2984 		rt->rt_flags |= RTCF_NOTIFY;
2985 
2986 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2987 			   RTM_NEWROUTE, 0, 0);
2988 	if (err <= 0)
2989 		goto errout_free;
2990 
2991 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2992 errout:
2993 	return err;
2994 
2995 errout_free:
2996 	kfree_skb(skb);
2997 	goto errout;
2998 }
2999 
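/* Dump the whole route cache over netlink.  The walk is resumable:
 * cb->args[0] remembers the hash bucket and cb->args[1] the index within
 * it, so a partially filled skb can be continued on the next call.
 */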
3000 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3001 {
3002 	struct rtable *rt;
3003 	int h, s_h;
3004 	int idx, s_idx;
3005 	struct net *net;
3006 
3007 	net = sock_net(skb->sk);
3008 
3009 	s_h = cb->args[0];
3010 	if (s_h < 0)
3011 		s_h = 0;
3012 	s_idx = idx = cb->args[1];
3013 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3014 		if (!rt_hash_table[h].chain)
3015 			continue;
3016 		rcu_read_lock_bh();
3017 		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3018 		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3019 			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3020 				continue;
3021 			if (rt_is_expired(rt))
3022 				continue;
3023 			skb_dst_set_noref(skb, &rt->dst);
3024 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3025 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3026 					 1, NLM_F_MULTI) <= 0) {
3027 				skb_dst_drop(skb);
3028 				rcu_read_unlock_bh();
3029 				goto done;
3030 			}
3031 			skb_dst_drop(skb);
3032 		}
3033 		rcu_read_unlock_bh();
3034 	}
3035 
3036 done:
3037 	cb->args[0] = h;
3038 	cb->args[1] = idx;
3039 	return skb->len;
3040 }
3041 
3042 void ip_rt_multicast_event(struct in_device *in_dev)
3043 {
3044 	rt_cache_flush(dev_net(in_dev->dev), 0);
3045 }
3046 
3047 #ifdef CONFIG_SYSCTL
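/* Backs /proc/sys/net/ipv4/route/flush.  Writing an integer flush delay
 * drops the cached routes of the owning namespace, e.g.
 * "echo 0 > /proc/sys/net/ipv4/route/flush" from userspace.
 */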
3048 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3049 					void __user *buffer,
3050 					size_t *lenp, loff_t *ppos)
3051 {
3052 	if (write) {
3053 		int flush_delay;
3054 		ctl_table ctl;
3055 		struct net *net;
3056 
3057 		memcpy(&ctl, __ctl, sizeof(ctl));
3058 		ctl.data = &flush_delay;
3059 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3060 
3061 		net = (struct net *)__ctl->extra1;
3062 		rt_cache_flush(net, flush_delay);
3063 		return 0;
3064 	}
3065 
3066 	return -EINVAL;
3067 }
3068 
3069 static ctl_table ipv4_route_table[] = {
3070 	{
3071 		.procname	= "gc_thresh",
3072 		.data		= &ipv4_dst_ops.gc_thresh,
3073 		.maxlen		= sizeof(int),
3074 		.mode		= 0644,
3075 		.proc_handler	= proc_dointvec,
3076 	},
3077 	{
3078 		.procname	= "max_size",
3079 		.data		= &ip_rt_max_size,
3080 		.maxlen		= sizeof(int),
3081 		.mode		= 0644,
3082 		.proc_handler	= proc_dointvec,
3083 	},
3084 	{
3085 		/* Deprecated. Use gc_min_interval_ms */
3086 
3087 		.procname	= "gc_min_interval",
3088 		.data		= &ip_rt_gc_min_interval,
3089 		.maxlen		= sizeof(int),
3090 		.mode		= 0644,
3091 		.proc_handler	= proc_dointvec_jiffies,
3092 	},
3093 	{
3094 		.procname	= "gc_min_interval_ms",
3095 		.data		= &ip_rt_gc_min_interval,
3096 		.maxlen		= sizeof(int),
3097 		.mode		= 0644,
3098 		.proc_handler	= proc_dointvec_ms_jiffies,
3099 	},
3100 	{
3101 		.procname	= "gc_timeout",
3102 		.data		= &ip_rt_gc_timeout,
3103 		.maxlen		= sizeof(int),
3104 		.mode		= 0644,
3105 		.proc_handler	= proc_dointvec_jiffies,
3106 	},
3107 	{
3108 		.procname	= "gc_interval",
3109 		.data		= &ip_rt_gc_interval,
3110 		.maxlen		= sizeof(int),
3111 		.mode		= 0644,
3112 		.proc_handler	= proc_dointvec_jiffies,
3113 	},
3114 	{
3115 		.procname	= "redirect_load",
3116 		.data		= &ip_rt_redirect_load,
3117 		.maxlen		= sizeof(int),
3118 		.mode		= 0644,
3119 		.proc_handler	= proc_dointvec,
3120 	},
3121 	{
3122 		.procname	= "redirect_number",
3123 		.data		= &ip_rt_redirect_number,
3124 		.maxlen		= sizeof(int),
3125 		.mode		= 0644,
3126 		.proc_handler	= proc_dointvec,
3127 	},
3128 	{
3129 		.procname	= "redirect_silence",
3130 		.data		= &ip_rt_redirect_silence,
3131 		.maxlen		= sizeof(int),
3132 		.mode		= 0644,
3133 		.proc_handler	= proc_dointvec,
3134 	},
3135 	{
3136 		.procname	= "error_cost",
3137 		.data		= &ip_rt_error_cost,
3138 		.maxlen		= sizeof(int),
3139 		.mode		= 0644,
3140 		.proc_handler	= proc_dointvec,
3141 	},
3142 	{
3143 		.procname	= "error_burst",
3144 		.data		= &ip_rt_error_burst,
3145 		.maxlen		= sizeof(int),
3146 		.mode		= 0644,
3147 		.proc_handler	= proc_dointvec,
3148 	},
3149 	{
3150 		.procname	= "gc_elasticity",
3151 		.data		= &ip_rt_gc_elasticity,
3152 		.maxlen		= sizeof(int),
3153 		.mode		= 0644,
3154 		.proc_handler	= proc_dointvec,
3155 	},
3156 	{
3157 		.procname	= "mtu_expires",
3158 		.data		= &ip_rt_mtu_expires,
3159 		.maxlen		= sizeof(int),
3160 		.mode		= 0644,
3161 		.proc_handler	= proc_dointvec_jiffies,
3162 	},
3163 	{
3164 		.procname	= "min_pmtu",
3165 		.data		= &ip_rt_min_pmtu,
3166 		.maxlen		= sizeof(int),
3167 		.mode		= 0644,
3168 		.proc_handler	= proc_dointvec,
3169 	},
3170 	{
3171 		.procname	= "min_adv_mss",
3172 		.data		= &ip_rt_min_advmss,
3173 		.maxlen		= sizeof(int),
3174 		.mode		= 0644,
3175 		.proc_handler	= proc_dointvec,
3176 	},
3177 	{ }
3178 };
3179 
3180 static struct ctl_table empty[1];
3181 
3182 static struct ctl_table ipv4_skeleton[] =
3183 {
3184 	{ .procname = "route",
3185 	  .mode = 0555, .child = ipv4_route_table},
3186 	{ .procname = "neigh",
3187 	  .mode = 0555, .child = empty},
3188 	{ }
3189 };
3190 
3191 static __net_initdata struct ctl_path ipv4_path[] = {
3192 	{ .procname = "net", },
3193 	{ .procname = "ipv4", },
3194 	{ },
3195 };
3196 
3197 static struct ctl_table ipv4_route_flush_table[] = {
3198 	{
3199 		.procname	= "flush",
3200 		.maxlen		= sizeof(int),
3201 		.mode		= 0200,
3202 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3203 	},
3204 	{ },
3205 };
3206 
3207 static __net_initdata struct ctl_path ipv4_route_path[] = {
3208 	{ .procname = "net", },
3209 	{ .procname = "ipv4", },
3210 	{ .procname = "route", },
3211 	{ },
3212 };
3213 
3214 static __net_init int sysctl_route_net_init(struct net *net)
3215 {
3216 	struct ctl_table *tbl;
3217 
3218 	tbl = ipv4_route_flush_table;
3219 	if (!net_eq(net, &init_net)) {
3220 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3221 		if (tbl == NULL)
3222 			goto err_dup;
3223 	}
3224 	tbl[0].extra1 = net;
3225 
3226 	net->ipv4.route_hdr =
3227 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3228 	if (net->ipv4.route_hdr == NULL)
3229 		goto err_reg;
3230 	return 0;
3231 
3232 err_reg:
3233 	if (tbl != ipv4_route_flush_table)
3234 		kfree(tbl);
3235 err_dup:
3236 	return -ENOMEM;
3237 }
3238 
3239 static __net_exit void sysctl_route_net_exit(struct net *net)
3240 {
3241 	struct ctl_table *tbl;
3242 
3243 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3244 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3245 	BUG_ON(tbl == ipv4_route_flush_table);
3246 	kfree(tbl);
3247 }
3248 
3249 static __net_initdata struct pernet_operations sysctl_route_ops = {
3250 	.init = sysctl_route_net_init,
3251 	.exit = sysctl_route_net_exit,
3252 };
3253 #endif
3254 
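/* Each namespace gets random generation ids at creation; regenerating
 * rt_genid later is how a cache flush invalidates every entry at once,
 * since each lookup compares an entry's genid against the current one
 * (see rt_is_expired()).
 */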
3255 static __net_init int rt_genid_init(struct net *net)
3256 {
3257 	get_random_bytes(&net->ipv4.rt_genid,
3258 			 sizeof(net->ipv4.rt_genid));
3259 	get_random_bytes(&net->ipv4.dev_addr_genid,
3260 			 sizeof(net->ipv4.dev_addr_genid));
3261 	return 0;
3262 }
3263 
3264 static __net_initdata struct pernet_operations rt_genid_ops = {
3265 	.init = rt_genid_init,
3266 };
3267 
3268 
3269 #ifdef CONFIG_IP_ROUTE_CLASSID
3270 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3271 #endif /* CONFIG_IP_ROUTE_CLASSID */
3272 
3273 static __initdata unsigned long rhash_entries;
3274 static int __init set_rhash_entries(char *str)
3275 {
3276 	if (!str)
3277 		return 0;
3278 	rhash_entries = simple_strtoul(str, &str, 0);
3279 	return 1;
3280 }
3281 __setup("rhash_entries=", set_rhash_entries);
3282 
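/* Boot-time initialization: allocate the per-cpu classid accounting area,
 * the dst slab caches and the route cache hash table.  The hash size is
 * normally computed from available memory but can be pinned with the
 * "rhash_entries=" boot parameter above, e.g. "rhash_entries=262144";
 * gc_thresh and ip_rt_max_size are then derived from the bucket count.
 */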
3283 int __init ip_rt_init(void)
3284 {
3285 	int rc = 0;
3286 
3287 #ifdef CONFIG_IP_ROUTE_CLASSID
3288 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3289 	if (!ip_rt_acct)
3290 		panic("IP: failed to allocate ip_rt_acct\n");
3291 #endif
3292 
3293 	ipv4_dst_ops.kmem_cachep =
3294 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3295 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3296 
3297 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3298 
3299 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3300 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3301 
3302 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3303 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3304 
3305 	rt_hash_table = (struct rt_hash_bucket *)
3306 		alloc_large_system_hash("IP route cache",
3307 					sizeof(struct rt_hash_bucket),
3308 					rhash_entries,
3309 					(totalram_pages >= 128 * 1024) ?
3310 					15 : 17,
3311 					0,
3312 					&rt_hash_log,
3313 					&rt_hash_mask,
3314 					rhash_entries ? 0 : 512 * 1024);
3315 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3316 	rt_hash_lock_init();
3317 
3318 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3319 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3320 
3321 	devinet_init();
3322 	ip_fib_init();
3323 
3324 	if (ip_rt_proc_init())
3325 		printk(KERN_ERR "Unable to create route proc files\n");
3326 #ifdef CONFIG_XFRM
3327 	xfrm_init();
3328 	xfrm4_init(ip_rt_max_size);
3329 #endif
3330 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3331 
3332 #ifdef CONFIG_SYSCTL
3333 	register_pernet_subsys(&sysctl_route_ops);
3334 #endif
3335 	register_pernet_subsys(&rt_genid_ops);
3336 	return rc;
3337 }
3338 
3339 #ifdef CONFIG_SYSCTL
3340 /*
3341  * We really need to sanitize the damn ipv4 init order, then all
3342  * this nonsense will go away.
3343  */
3344 void __init ip_static_sysctl_init(void)
3345 {
3346 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3347 }
3348 #endif
3349