xref: /openbmc/linux/net/ipv4/route.c (revision 384740dc)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 
111 #define RT_FL_TOS(oldflp) \
112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 
114 #define IP_MAX_MTU	0xFFF0
115 
116 #define RT_GC_TIMEOUT (300*HZ)
117 
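/*
 * Route cache tuning knobs.  The defaults below are used unless they are
 * overridden at run time (most of them are exposed as net.ipv4.route.*
 * sysctls elsewhere in this file).
 */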
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly	= 8;
128 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly	= 256;
131 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
132 
133 static void rt_worker_func(struct work_struct *work);
134 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
135 
136 /*
137  *	Interface to generic destination cache.
138  */
139 
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static void		 ipv4_dst_destroy(struct dst_entry *dst);
142 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
143 					 struct net_device *dev, int how);
144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145 static void		 ipv4_link_failure(struct sk_buff *skb);
146 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
147 static int rt_garbage_collect(struct dst_ops *ops);
148 
149 
150 static struct dst_ops ipv4_dst_ops = {
151 	.family =		AF_INET,
152 	.protocol =		__constant_htons(ETH_P_IP),
153 	.gc =			rt_garbage_collect,
154 	.check =		ipv4_dst_check,
155 	.destroy =		ipv4_dst_destroy,
156 	.ifdown =		ipv4_dst_ifdown,
157 	.negative_advice =	ipv4_negative_advice,
158 	.link_failure =		ipv4_link_failure,
159 	.update_pmtu =		ip_rt_update_pmtu,
160 	.local_out =		__ip_local_out,
161 	.entry_size =		sizeof(struct rtable),
162 	.entries =		ATOMIC_INIT(0),
163 };
164 
165 #define ECN_OR_COST(class)	TC_PRIO_##class
166 
167 const __u8 ip_tos2prio[16] = {
168 	TC_PRIO_BESTEFFORT,
169 	ECN_OR_COST(FILLER),
170 	TC_PRIO_BESTEFFORT,
171 	ECN_OR_COST(BESTEFFORT),
172 	TC_PRIO_BULK,
173 	ECN_OR_COST(BULK),
174 	TC_PRIO_BULK,
175 	ECN_OR_COST(BULK),
176 	TC_PRIO_INTERACTIVE,
177 	ECN_OR_COST(INTERACTIVE),
178 	TC_PRIO_INTERACTIVE,
179 	ECN_OR_COST(INTERACTIVE),
180 	TC_PRIO_INTERACTIVE_BULK,
181 	ECN_OR_COST(INTERACTIVE_BULK),
182 	TC_PRIO_INTERACTIVE_BULK,
183 	ECN_OR_COST(INTERACTIVE_BULK)
184 };
185 
186 
187 /*
188  * Route cache.
189  */
190 
191 /* The locking scheme is rather straightforward:
192  *
193  * 1) Read-Copy Update protects the buckets of the central route hash.
194  * 2) Only writers remove entries, and they hold the lock
195  *    as they look at rtable reference counts.
196  * 3) Only readers acquire references to rtable entries;
197  *    they do so with atomic increments and with the
198  *    lock held.
199  */
200 
201 struct rt_hash_bucket {
202 	struct rtable	*chain;
203 };
204 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
205 	defined(CONFIG_PROVE_LOCKING)
206 /*
207  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
208  * The size of this table is a power of two and depends on the number of CPUs.
209  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
210  */
211 #ifdef CONFIG_LOCKDEP
212 # define RT_HASH_LOCK_SZ	256
213 #else
214 # if NR_CPUS >= 32
215 #  define RT_HASH_LOCK_SZ	4096
216 # elif NR_CPUS >= 16
217 #  define RT_HASH_LOCK_SZ	2048
218 # elif NR_CPUS >= 8
219 #  define RT_HASH_LOCK_SZ	1024
220 # elif NR_CPUS >= 4
221 #  define RT_HASH_LOCK_SZ	512
222 # else
223 #  define RT_HASH_LOCK_SZ	256
224 # endif
225 #endif
226 
227 static spinlock_t	*rt_hash_locks;
228 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
229 
230 static __init void rt_hash_lock_init(void)
231 {
232 	int i;
233 
234 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
235 			GFP_KERNEL);
236 	if (!rt_hash_locks)
237 		panic("IP: failed to allocate rt_hash_locks\n");
238 
239 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
240 		spin_lock_init(&rt_hash_locks[i]);
241 }
242 #else
243 # define rt_hash_lock_addr(slot) NULL
244 
245 static inline void rt_hash_lock_init(void)
246 {
247 }
248 #endif
249 
250 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
251 static unsigned			rt_hash_mask __read_mostly;
252 static unsigned int		rt_hash_log  __read_mostly;
253 
254 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
255 #define RT_CACHE_STAT_INC(field) \
256 	(__raw_get_cpu_var(rt_cache_stat).field++)
257 
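/*
 * Bucket selection: hash the destination, source and interface index
 * together with the per-namespace generation id, then mask with
 * rt_hash_mask.  Because the genid is part of the hash, bumping it
 * (rt_cache_invalidate()) effectively invalidates the whole cache;
 * stale entries are later reaped via rt_is_expired().
 */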
258 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
259 		int genid)
260 {
261 	return jhash_3words((__force u32)(__be32)(daddr),
262 			    (__force u32)(__be32)(saddr),
263 			    idx, genid)
264 		& rt_hash_mask;
265 }
266 
267 static inline int rt_genid(struct net *net)
268 {
269 	return atomic_read(&net->ipv4.rt_genid);
270 }
271 
272 #ifdef CONFIG_PROC_FS
273 struct rt_cache_iter_state {
274 	struct seq_net_private p;
275 	int bucket;
276 	int genid;
277 };
278 
279 static struct rtable *rt_cache_get_first(struct seq_file *seq)
280 {
281 	struct rt_cache_iter_state *st = seq->private;
282 	struct rtable *r = NULL;
283 
284 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285 		rcu_read_lock_bh();
286 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
287 		while (r) {
288 			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
289 			    r->rt_genid == st->genid)
290 				return r;
291 			r = rcu_dereference(r->u.dst.rt_next);
292 		}
293 		rcu_read_unlock_bh();
294 	}
295 	return r;
296 }
297 
298 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
299 					  struct rtable *r)
300 {
301 	struct rt_cache_iter_state *st = seq->private;
302 	r = r->u.dst.rt_next;
303 	while (!r) {
304 		rcu_read_unlock_bh();
305 		if (--st->bucket < 0)
306 			break;
307 		rcu_read_lock_bh();
308 		r = rt_hash_table[st->bucket].chain;
309 	}
310 	return rcu_dereference(r);
311 }
312 
313 static struct rtable *rt_cache_get_next(struct seq_file *seq,
314 					struct rtable *r)
315 {
316 	struct rt_cache_iter_state *st = seq->private;
317 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
318 		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
319 			continue;
320 		if (r->rt_genid == st->genid)
321 			break;
322 	}
323 	return r;
324 }
325 
326 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
327 {
328 	struct rtable *r = rt_cache_get_first(seq);
329 
330 	if (r)
331 		while (pos && (r = rt_cache_get_next(seq, r)))
332 			--pos;
333 	return pos ? NULL : r;
334 }
335 
336 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
337 {
338 	struct rt_cache_iter_state *st = seq->private;
339 	if (*pos)
340 		return rt_cache_get_idx(seq, *pos - 1);
341 	st->genid = rt_genid(seq_file_net(seq));
342 	return SEQ_START_TOKEN;
343 }
344 
345 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
346 {
347 	struct rtable *r;
348 
349 	if (v == SEQ_START_TOKEN)
350 		r = rt_cache_get_first(seq);
351 	else
352 		r = rt_cache_get_next(seq, v);
353 	++*pos;
354 	return r;
355 }
356 
357 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
358 {
359 	if (v && v != SEQ_START_TOKEN)
360 		rcu_read_unlock_bh();
361 }
362 
363 static int rt_cache_seq_show(struct seq_file *seq, void *v)
364 {
365 	if (v == SEQ_START_TOKEN)
366 		seq_printf(seq, "%-127s\n",
367 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
368 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
369 			   "HHUptod\tSpecDst");
370 	else {
371 		struct rtable *r = v;
372 		int len;
373 
374 		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
375 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
376 			r->u.dst.dev ? r->u.dst.dev->name : "*",
377 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
378 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
379 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
380 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
381 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
382 			dst_metric(&r->u.dst, RTAX_WINDOW),
383 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
384 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
385 			r->fl.fl4_tos,
386 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
387 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
388 				       dev_queue_xmit) : 0,
389 			r->rt_spec_dst, &len);
390 
391 		seq_printf(seq, "%*s\n", 127 - len, "");
392 	}
393 	return 0;
394 }
395 
396 static const struct seq_operations rt_cache_seq_ops = {
397 	.start  = rt_cache_seq_start,
398 	.next   = rt_cache_seq_next,
399 	.stop   = rt_cache_seq_stop,
400 	.show   = rt_cache_seq_show,
401 };
402 
403 static int rt_cache_seq_open(struct inode *inode, struct file *file)
404 {
405 	return seq_open_net(inode, file, &rt_cache_seq_ops,
406 			sizeof(struct rt_cache_iter_state));
407 }
408 
409 static const struct file_operations rt_cache_seq_fops = {
410 	.owner	 = THIS_MODULE,
411 	.open	 = rt_cache_seq_open,
412 	.read	 = seq_read,
413 	.llseek	 = seq_lseek,
414 	.release = seq_release_net,
415 };
416 
417 
418 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
419 {
420 	int cpu;
421 
422 	if (*pos == 0)
423 		return SEQ_START_TOKEN;
424 
425 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
426 		if (!cpu_possible(cpu))
427 			continue;
428 		*pos = cpu+1;
429 		return &per_cpu(rt_cache_stat, cpu);
430 	}
431 	return NULL;
432 }
433 
434 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
435 {
436 	int cpu;
437 
438 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
439 		if (!cpu_possible(cpu))
440 			continue;
441 		*pos = cpu+1;
442 		return &per_cpu(rt_cache_stat, cpu);
443 	}
444 	return NULL;
445 
446 }
447 
448 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
449 {
450 
451 }
452 
453 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
454 {
455 	struct rt_cache_stat *st = v;
456 
457 	if (v == SEQ_START_TOKEN) {
458 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
459 		return 0;
460 	}
461 
462 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
463 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
464 		   atomic_read(&ipv4_dst_ops.entries),
465 		   st->in_hit,
466 		   st->in_slow_tot,
467 		   st->in_slow_mc,
468 		   st->in_no_route,
469 		   st->in_brd,
470 		   st->in_martian_dst,
471 		   st->in_martian_src,
472 
473 		   st->out_hit,
474 		   st->out_slow_tot,
475 		   st->out_slow_mc,
476 
477 		   st->gc_total,
478 		   st->gc_ignored,
479 		   st->gc_goal_miss,
480 		   st->gc_dst_overflow,
481 		   st->in_hlist_search,
482 		   st->out_hlist_search
483 		);
484 	return 0;
485 }
486 
487 static const struct seq_operations rt_cpu_seq_ops = {
488 	.start  = rt_cpu_seq_start,
489 	.next   = rt_cpu_seq_next,
490 	.stop   = rt_cpu_seq_stop,
491 	.show   = rt_cpu_seq_show,
492 };
493 
494 
495 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
496 {
497 	return seq_open(file, &rt_cpu_seq_ops);
498 }
499 
500 static const struct file_operations rt_cpu_seq_fops = {
501 	.owner	 = THIS_MODULE,
502 	.open	 = rt_cpu_seq_open,
503 	.read	 = seq_read,
504 	.llseek	 = seq_lseek,
505 	.release = seq_release,
506 };
507 
508 #ifdef CONFIG_NET_CLS_ROUTE
509 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
510 			   int length, int *eof, void *data)
511 {
512 	unsigned int i;
513 
514 	if ((offset & 3) || (length & 3))
515 		return -EIO;
516 
517 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
518 		*eof = 1;
519 		return 0;
520 	}
521 
522 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
523 		length = sizeof(struct ip_rt_acct) * 256 - offset;
524 		*eof = 1;
525 	}
526 
527 	offset /= sizeof(u32);
528 
529 	if (length > 0) {
530 		u32 *dst = (u32 *) buffer;
531 
532 		*start = buffer;
533 		memset(dst, 0, length);
534 
535 		for_each_possible_cpu(i) {
536 			unsigned int j;
537 			u32 *src;
538 
539 			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
540 			for (j = 0; j < length/4; j++)
541 				dst[j] += src[j];
542 		}
543 	}
544 	return length;
545 }
546 #endif
547 
548 static int __net_init ip_rt_do_proc_init(struct net *net)
549 {
550 	struct proc_dir_entry *pde;
551 
552 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
553 			&rt_cache_seq_fops);
554 	if (!pde)
555 		goto err1;
556 
557 	pde = proc_create("rt_cache", S_IRUGO,
558 			  net->proc_net_stat, &rt_cpu_seq_fops);
559 	if (!pde)
560 		goto err2;
561 
562 #ifdef CONFIG_NET_CLS_ROUTE
563 	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
564 			ip_rt_acct_read, NULL);
565 	if (!pde)
566 		goto err3;
567 #endif
568 	return 0;
569 
570 #ifdef CONFIG_NET_CLS_ROUTE
571 err3:
572 	remove_proc_entry("rt_cache", net->proc_net_stat);
573 #endif
574 err2:
575 	remove_proc_entry("rt_cache", net->proc_net);
576 err1:
577 	return -ENOMEM;
578 }
579 
580 static void __net_exit ip_rt_do_proc_exit(struct net *net)
581 {
582 	remove_proc_entry("rt_cache", net->proc_net_stat);
583 	remove_proc_entry("rt_cache", net->proc_net);
584 	remove_proc_entry("rt_acct", net->proc_net);
585 }
586 
587 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
588 	.init = ip_rt_do_proc_init,
589 	.exit = ip_rt_do_proc_exit,
590 };
591 
592 static int __init ip_rt_proc_init(void)
593 {
594 	return register_pernet_subsys(&ip_rt_proc_ops);
595 }
596 
597 #else
598 static inline int ip_rt_proc_init(void)
599 {
600 	return 0;
601 }
602 #endif /* CONFIG_PROC_FS */
603 
604 static inline void rt_free(struct rtable *rt)
605 {
606 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
607 }
608 
609 static inline void rt_drop(struct rtable *rt)
610 {
611 	ip_rt_put(rt);
612 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
613 }
614 
615 static inline int rt_fast_clean(struct rtable *rth)
616 {
617 	/* Kill broadcast/multicast entries very aggressively, if they
618 	   collide in the hash table with more useful entries */
619 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
620 		rth->fl.iif && rth->u.dst.rt_next;
621 }
622 
623 static inline int rt_valuable(struct rtable *rth)
624 {
625 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
626 		rth->u.dst.expires;
627 }
628 
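/*
 * Decide whether a cache entry may be removed.  Entries that are still
 * referenced never expire here; entries past their hard ->expires
 * deadline always do.  Otherwise an entry is kept while it is younger
 * than tmo1 (tmo2 for "valuable" entries, i.e. redirected/notify routes
 * or ones with an expiry set); entries that rt_fast_clean() flags as
 * colliding broadcast/multicast input routes do not get the tmo1 grace.
 */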
629 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
630 {
631 	unsigned long age;
632 	int ret = 0;
633 
634 	if (atomic_read(&rth->u.dst.__refcnt))
635 		goto out;
636 
637 	ret = 1;
638 	if (rth->u.dst.expires &&
639 	    time_after_eq(jiffies, rth->u.dst.expires))
640 		goto out;
641 
642 	age = jiffies - rth->u.dst.lastuse;
643 	ret = 0;
644 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
645 	    (age <= tmo2 && rt_valuable(rth)))
646 		goto out;
647 	ret = 1;
648 out:	return ret;
649 }
650 
651 /* Bits of score are:
652  * 31: very valuable
653  * 30: not quite useless
654  * 29..0: usage counter
655  */
656 static inline u32 rt_score(struct rtable *rt)
657 {
658 	u32 score = jiffies - rt->u.dst.lastuse;
659 
660 	score = ~score & ~(3<<30);
661 
662 	if (rt_valuable(rt))
663 		score |= (1<<31);
664 
665 	if (!rt->fl.iif ||
666 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
667 		score |= (1<<30);
668 
669 	return score;
670 }
671 
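/*
 * Compare two flow keys without branches: the XOR of each field pair is
 * non-zero iff that field differs, so OR-ing everything together is zero
 * only for an exact match.  The 16-bit load over the TOS field also
 * covers the adjacent scope byte of the flowi.
 */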
672 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
673 {
674 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
675 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
676 		(fl1->mark ^ fl2->mark) |
677 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
678 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
679 		(fl1->oif ^ fl2->oif) |
680 		(fl1->iif ^ fl2->iif)) == 0;
681 }
682 
683 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
684 {
685 	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
686 }
687 
688 static inline int rt_is_expired(struct rtable *rth)
689 {
690 	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
691 }
692 
693 /*
694  * Perform a full scan of the hash table and free all entries.
695  * Can be called from a softirq or from process context.
696  * In the latter case, we want to reschedule if necessary.
697  */
698 static void rt_do_flush(int process_context)
699 {
700 	unsigned int i;
701 	struct rtable *rth, *next;
702 	struct rtable * tail;
703 
704 	for (i = 0; i <= rt_hash_mask; i++) {
705 		if (process_context && need_resched())
706 			cond_resched();
707 		rth = rt_hash_table[i].chain;
708 		if (!rth)
709 			continue;
710 
711 		spin_lock_bh(rt_hash_lock_addr(i));
712 #ifdef CONFIG_NET_NS
713 		{
714 		struct rtable ** prev, * p;
715 
716 		rth = rt_hash_table[i].chain;
717 
718 		/* defer releasing the head of the list until after spin_unlock */
719 		for (tail = rth; tail; tail = tail->u.dst.rt_next)
720 			if (!rt_is_expired(tail))
721 				break;
722 		if (rth != tail)
723 			rt_hash_table[i].chain = tail;
724 
725 		/* call rt_free on entries after the tail requiring flush */
726 		prev = &rt_hash_table[i].chain;
727 		for (p = *prev; p; p = next) {
728 			next = p->u.dst.rt_next;
729 			if (!rt_is_expired(p)) {
730 				prev = &p->u.dst.rt_next;
731 			} else {
732 				*prev = next;
733 				rt_free(p);
734 			}
735 		}
736 		}
737 #else
738 		rth = rt_hash_table[i].chain;
739 		rt_hash_table[i].chain = NULL;
740 		tail = NULL;
741 #endif
742 		spin_unlock_bh(rt_hash_lock_addr(i));
743 
744 		for (; rth != tail; rth = next) {
745 			next = rth->u.dst.rt_next;
746 			rt_free(rth);
747 		}
748 	}
749 }
750 
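/*
 * Scan a slice of the hash table, resuming where the previous run left
 * off (static "rover"), and reap entries that are stale (wrong genid),
 * past their hard expiry, or aged out per rt_may_expire().  The per-chain
 * timeout "tmo" is halved for every entry that is kept, so long chains
 * are pruned more aggressively.  The slice size is derived from
 * ip_rt_gc_interval, ip_rt_gc_timeout and the table size.
 */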
751 static void rt_check_expire(void)
752 {
753 	static unsigned int rover;
754 	unsigned int i = rover, goal;
755 	struct rtable *rth, **rthp;
756 	u64 mult;
757 
758 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
759 	if (ip_rt_gc_timeout > 1)
760 		do_div(mult, ip_rt_gc_timeout);
761 	goal = (unsigned int)mult;
762 	if (goal > rt_hash_mask)
763 		goal = rt_hash_mask + 1;
764 	for (; goal > 0; goal--) {
765 		unsigned long tmo = ip_rt_gc_timeout;
766 
767 		i = (i + 1) & rt_hash_mask;
768 		rthp = &rt_hash_table[i].chain;
769 
770 		if (need_resched())
771 			cond_resched();
772 
773 		if (*rthp == NULL)
774 			continue;
775 		spin_lock_bh(rt_hash_lock_addr(i));
776 		while ((rth = *rthp) != NULL) {
777 			if (rt_is_expired(rth)) {
778 				*rthp = rth->u.dst.rt_next;
779 				rt_free(rth);
780 				continue;
781 			}
782 			if (rth->u.dst.expires) {
783 				/* Entry is expired even if it is in use */
784 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
785 					tmo >>= 1;
786 					rthp = &rth->u.dst.rt_next;
787 					continue;
788 				}
789 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
790 				tmo >>= 1;
791 				rthp = &rth->u.dst.rt_next;
792 				continue;
793 			}
794 
795 			/* Cleanup aged off entries. */
796 			*rthp = rth->u.dst.rt_next;
797 			rt_free(rth);
798 		}
799 		spin_unlock_bh(rt_hash_lock_addr(i));
800 	}
801 	rover = i;
802 }
803 
804 /*
805  * rt_worker_func() is run in process context.
806  * We call rt_check_expire() to scan part of the hash table.
807  */
808 static void rt_worker_func(struct work_struct *work)
809 {
810 	rt_check_expire();
811 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
812 }
813 
814 /*
815  * Perturbation of rt_genid by a small quantity [1..256].
816  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
817  * many times (2^24) without repeating a recent rt_genid value.
818  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
819  */
820 static void rt_cache_invalidate(struct net *net)
821 {
822 	unsigned char shuffle;
823 
824 	get_random_bytes(&shuffle, sizeof(shuffle));
825 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
826 }
827 
828 /*
829  * delay < 0  : invalidate cache (fast : entries will be deleted later)
830  * delay >= 0 : invalidate & flush cache (can be long)
831  */
832 void rt_cache_flush(struct net *net, int delay)
833 {
834 	rt_cache_invalidate(net);
835 	if (delay >= 0)
836 		rt_do_flush(!in_softirq());
837 }
838 
839 /*
840  * We change rt_genid and let gc do the cleanup
841  */
842 static void rt_secret_rebuild(unsigned long __net)
843 {
844 	struct net *net = (struct net *)__net;
845 	rt_cache_invalidate(net);
846 	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
847 }
848 
849 /*
850    Short description of GC goals.
851 
852    We want to build an algorithm which keeps the routing cache
853    at an equilibrium point, where the number of aged-off entries
854    is approximately equal to the number of newly generated ones.
855 
856    The current expiration strength is the variable "expire".
857    We try to adjust it dynamically, so that when networking
858    is idle "expire" is large enough to keep enough warm entries,
859    and when load increases it shrinks to limit the cache size.
860  */
861 
862 static int rt_garbage_collect(struct dst_ops *ops)
863 {
864 	static unsigned long expire = RT_GC_TIMEOUT;
865 	static unsigned long last_gc;
866 	static int rover;
867 	static int equilibrium;
868 	struct rtable *rth, **rthp;
869 	unsigned long now = jiffies;
870 	int goal;
871 
872 	/*
873 	 * Garbage collection is pretty expensive,
874 	 * so do not run it too frequently.
875 	 */
876 
877 	RT_CACHE_STAT_INC(gc_total);
878 
879 	if (now - last_gc < ip_rt_gc_min_interval &&
880 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
881 		RT_CACHE_STAT_INC(gc_ignored);
882 		goto out;
883 	}
884 
885 	/* Calculate the number of entries we want to expire now. */
886 	goal = atomic_read(&ipv4_dst_ops.entries) -
887 		(ip_rt_gc_elasticity << rt_hash_log);
888 	if (goal <= 0) {
889 		if (equilibrium < ipv4_dst_ops.gc_thresh)
890 			equilibrium = ipv4_dst_ops.gc_thresh;
891 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
892 		if (goal > 0) {
893 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
894 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
895 		}
896 	} else {
897 		/* We are in a dangerous area. Try to reduce the cache really
898 		 * aggressively.
899 		 */
900 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
901 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
902 	}
903 
904 	if (now - last_gc >= ip_rt_gc_min_interval)
905 		last_gc = now;
906 
907 	if (goal <= 0) {
908 		equilibrium += goal;
909 		goto work_done;
910 	}
911 
912 	do {
913 		int i, k;
914 
915 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
916 			unsigned long tmo = expire;
917 
918 			k = (k + 1) & rt_hash_mask;
919 			rthp = &rt_hash_table[k].chain;
920 			spin_lock_bh(rt_hash_lock_addr(k));
921 			while ((rth = *rthp) != NULL) {
922 				if (!rt_is_expired(rth) &&
923 					!rt_may_expire(rth, tmo, expire)) {
924 					tmo >>= 1;
925 					rthp = &rth->u.dst.rt_next;
926 					continue;
927 				}
928 				*rthp = rth->u.dst.rt_next;
929 				rt_free(rth);
930 				goal--;
931 			}
932 			spin_unlock_bh(rt_hash_lock_addr(k));
933 			if (goal <= 0)
934 				break;
935 		}
936 		rover = k;
937 
938 		if (goal <= 0)
939 			goto work_done;
940 
941 		/* The goal was not achieved. We stop the process if:
942 
943 		   - expire was reduced to zero (otherwise, expire is halved),
944 		   - the table is not full,
945 		   - we are called from an interrupt.
946 		   The jiffies check is just a fallback/debug loop breaker;
947 		     we will not spin here for a long time in any case.
948 		 */
949 
950 		RT_CACHE_STAT_INC(gc_goal_miss);
951 
952 		if (expire == 0)
953 			break;
954 
955 		expire >>= 1;
956 #if RT_CACHE_DEBUG >= 2
957 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
958 				atomic_read(&ipv4_dst_ops.entries), goal, i);
959 #endif
960 
961 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
962 			goto out;
963 	} while (!in_softirq() && time_before_eq(jiffies, now));
964 
965 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
966 		goto out;
967 	if (net_ratelimit())
968 		printk(KERN_WARNING "dst cache overflow\n");
969 	RT_CACHE_STAT_INC(gc_dst_overflow);
970 	return 1;
971 
972 work_done:
973 	expire += ip_rt_gc_min_interval;
974 	if (expire > ip_rt_gc_timeout ||
975 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
976 		expire = ip_rt_gc_timeout;
977 #if RT_CACHE_DEBUG >= 2
978 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
979 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
980 #endif
981 out:	return 0;
982 }
983 
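/*
 * Insert "rt" at the head of its hash chain.  If an equivalent entry
 * (same flow key and namespace) is already cached, it is moved to the
 * front and returned instead, and "rt" is dropped.  While walking the
 * chain we remember the lowest-scoring unreferenced entry and evict it
 * when the chain has grown past ip_rt_gc_elasticity.  Output and unicast
 * forwarding routes are bound to an ARP neighbour first; if the
 * neighbour tables are full we force a GC pass and retry once (only
 * outside softirq context).
 */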
984 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
985 {
986 	struct rtable	*rth, **rthp;
987 	unsigned long	now;
988 	struct rtable *cand, **candp;
989 	u32 		min_score;
990 	int		chain_length;
991 	int attempts = !in_softirq();
992 
993 restart:
994 	chain_length = 0;
995 	min_score = ~(u32)0;
996 	cand = NULL;
997 	candp = NULL;
998 	now = jiffies;
999 
1000 	rthp = &rt_hash_table[hash].chain;
1001 
1002 	spin_lock_bh(rt_hash_lock_addr(hash));
1003 	while ((rth = *rthp) != NULL) {
1004 		if (rt_is_expired(rth)) {
1005 			*rthp = rth->u.dst.rt_next;
1006 			rt_free(rth);
1007 			continue;
1008 		}
1009 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1010 			/* Put it first */
1011 			*rthp = rth->u.dst.rt_next;
1012 			/*
1013 			 * Since lookup is lockfree, the deletion
1014 			 * must be visible to another weakly ordered CPU before
1015 			 * the insertion at the start of the hash chain.
1016 			 */
1017 			rcu_assign_pointer(rth->u.dst.rt_next,
1018 					   rt_hash_table[hash].chain);
1019 			/*
1020 			 * Since lookup is lockfree, the update writes
1021 			 * must be ordered for consistency on SMP.
1022 			 */
1023 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1024 
1025 			dst_use(&rth->u.dst, now);
1026 			spin_unlock_bh(rt_hash_lock_addr(hash));
1027 
1028 			rt_drop(rt);
1029 			*rp = rth;
1030 			return 0;
1031 		}
1032 
1033 		if (!atomic_read(&rth->u.dst.__refcnt)) {
1034 			u32 score = rt_score(rth);
1035 
1036 			if (score <= min_score) {
1037 				cand = rth;
1038 				candp = rthp;
1039 				min_score = score;
1040 			}
1041 		}
1042 
1043 		chain_length++;
1044 
1045 		rthp = &rth->u.dst.rt_next;
1046 	}
1047 
1048 	if (cand) {
1049 		/* ip_rt_gc_elasticity used to be the average chain length;
1050 		 * when it is exceeded, gc becomes really aggressive.
1051 		 *
1052 		 * The second limit is less certain. At the moment it allows
1053 		 * only 2 entries per bucket. We will see.
1054 		 */
1055 		if (chain_length > ip_rt_gc_elasticity) {
1056 			*candp = cand->u.dst.rt_next;
1057 			rt_free(cand);
1058 		}
1059 	}
1060 
1061 	/* Try to bind the route to ARP only if it is an output
1062 	   route or on the unicast forwarding path.
1063 	 */
1064 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1065 		int err = arp_bind_neighbour(&rt->u.dst);
1066 		if (err) {
1067 			spin_unlock_bh(rt_hash_lock_addr(hash));
1068 
1069 			if (err != -ENOBUFS) {
1070 				rt_drop(rt);
1071 				return err;
1072 			}
1073 
1074 			/* Neighbour tables are full and nothing
1075 			   can be released. Try to shrink the route cache;
1076 			   it most likely holds some neighbour records.
1077 			 */
1078 			if (attempts-- > 0) {
1079 				int saved_elasticity = ip_rt_gc_elasticity;
1080 				int saved_int = ip_rt_gc_min_interval;
1081 				ip_rt_gc_elasticity	= 1;
1082 				ip_rt_gc_min_interval	= 0;
1083 				rt_garbage_collect(&ipv4_dst_ops);
1084 				ip_rt_gc_min_interval	= saved_int;
1085 				ip_rt_gc_elasticity	= saved_elasticity;
1086 				goto restart;
1087 			}
1088 
1089 			if (net_ratelimit())
1090 				printk(KERN_WARNING "Neighbour table overflow.\n");
1091 			rt_drop(rt);
1092 			return -ENOBUFS;
1093 		}
1094 	}
1095 
1096 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1097 #if RT_CACHE_DEBUG >= 2
1098 	if (rt->u.dst.rt_next) {
1099 		struct rtable *trt;
1100 		printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1101 		       NIPQUAD(rt->rt_dst));
1102 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1103 			printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1104 		printk("\n");
1105 	}
1106 #endif
1107 	rt_hash_table[hash].chain = rt;
1108 	spin_unlock_bh(rt_hash_lock_addr(hash));
1109 	*rp = rt;
1110 	return 0;
1111 }
1112 
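/*
 * Attach the long-lived inet_peer entry for the destination to this
 * route.  The spinlock only serializes concurrent binders; the loser's
 * extra peer reference is dropped again.
 */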
1113 void rt_bind_peer(struct rtable *rt, int create)
1114 {
1115 	static DEFINE_SPINLOCK(rt_peer_lock);
1116 	struct inet_peer *peer;
1117 
1118 	peer = inet_getpeer(rt->rt_dst, create);
1119 
1120 	spin_lock_bh(&rt_peer_lock);
1121 	if (rt->peer == NULL) {
1122 		rt->peer = peer;
1123 		peer = NULL;
1124 	}
1125 	spin_unlock_bh(&rt_peer_lock);
1126 	if (peer)
1127 		inet_putpeer(peer);
1128 }
1129 
1130 /*
1131  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1132  * we can still generate some output.
1133  * Random ID selection looks a bit dangerous because we have no chance of
1134  * selecting an ID that stays unique for a reasonable period of time.
1135  * But a broken packet identifier may be better than no packet at all.
1136  */
1137 static void ip_select_fb_ident(struct iphdr *iph)
1138 {
1139 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1140 	static u32 ip_fallback_id;
1141 	u32 salt;
1142 
1143 	spin_lock_bh(&ip_fb_id_lock);
1144 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1145 	iph->id = htons(salt & 0xFFFF);
1146 	ip_fallback_id = salt;
1147 	spin_unlock_bh(&ip_fb_id_lock);
1148 }
1149 
1150 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1151 {
1152 	struct rtable *rt = (struct rtable *) dst;
1153 
1154 	if (rt) {
1155 		if (rt->peer == NULL)
1156 			rt_bind_peer(rt, 1);
1157 
1158 		/* If a peer is attached to the destination, it is never detached,
1159 		   so we need not grab a lock to dereference it.
1160 		 */
1161 		if (rt->peer) {
1162 			iph->id = htons(inet_getid(rt->peer, more));
1163 			return;
1164 		}
1165 	} else
1166 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1167 		       __builtin_return_address(0));
1168 
1169 	ip_select_fb_ident(iph);
1170 }
1171 
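/*
 * Unlink one route from its hash chain; expired entries found while
 * walking the chain are purged as well.
 */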
1172 static void rt_del(unsigned hash, struct rtable *rt)
1173 {
1174 	struct rtable **rthp, *aux;
1175 
1176 	rthp = &rt_hash_table[hash].chain;
1177 	spin_lock_bh(rt_hash_lock_addr(hash));
1178 	ip_rt_put(rt);
1179 	while ((aux = *rthp) != NULL) {
1180 		if (aux == rt || rt_is_expired(aux)) {
1181 			*rthp = aux->u.dst.rt_next;
1182 			rt_free(aux);
1183 			continue;
1184 		}
1185 		rthp = &aux->u.dst.rt_next;
1186 	}
1187 	spin_unlock_bh(rt_hash_lock_addr(hash));
1188 }
1189 
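/*
 * Handle an ICMP redirect: for every cached route to daddr that goes
 * through old_gw on this device, clone the entry, point the clone at
 * new_gw, mark it RTCF_REDIRECTED and swap it into the hash chain in
 * place of the original.  Redirects to gateways that are not unicast,
 * not on-link, or otherwise implausible are rejected (and optionally
 * logged as martians).
 */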
1190 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1191 		    __be32 saddr, struct net_device *dev)
1192 {
1193 	int i, k;
1194 	struct in_device *in_dev = in_dev_get(dev);
1195 	struct rtable *rth, **rthp;
1196 	__be32  skeys[2] = { saddr, 0 };
1197 	int  ikeys[2] = { dev->ifindex, 0 };
1198 	struct netevent_redirect netevent;
1199 	struct net *net;
1200 
1201 	if (!in_dev)
1202 		return;
1203 
1204 	net = dev_net(dev);
1205 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1206 	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1207 	    || ipv4_is_zeronet(new_gw))
1208 		goto reject_redirect;
1209 
1210 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1211 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1212 			goto reject_redirect;
1213 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1214 			goto reject_redirect;
1215 	} else {
1216 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1217 			goto reject_redirect;
1218 	}
1219 
1220 	for (i = 0; i < 2; i++) {
1221 		for (k = 0; k < 2; k++) {
1222 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1223 						rt_genid(net));
1224 
1225 			rthp=&rt_hash_table[hash].chain;
1226 
1227 			rcu_read_lock();
1228 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1229 				struct rtable *rt;
1230 
1231 				if (rth->fl.fl4_dst != daddr ||
1232 				    rth->fl.fl4_src != skeys[i] ||
1233 				    rth->fl.oif != ikeys[k] ||
1234 				    rth->fl.iif != 0 ||
1235 				    rt_is_expired(rth) ||
1236 				    !net_eq(dev_net(rth->u.dst.dev), net)) {
1237 					rthp = &rth->u.dst.rt_next;
1238 					continue;
1239 				}
1240 
1241 				if (rth->rt_dst != daddr ||
1242 				    rth->rt_src != saddr ||
1243 				    rth->u.dst.error ||
1244 				    rth->rt_gateway != old_gw ||
1245 				    rth->u.dst.dev != dev)
1246 					break;
1247 
1248 				dst_hold(&rth->u.dst);
1249 				rcu_read_unlock();
1250 
1251 				rt = dst_alloc(&ipv4_dst_ops);
1252 				if (rt == NULL) {
1253 					ip_rt_put(rth);
1254 					in_dev_put(in_dev);
1255 					return;
1256 				}
1257 
1258 				/* Copy all the information. */
1259 				*rt = *rth;
1260 				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1261 				rt->u.dst.__use		= 1;
1262 				atomic_set(&rt->u.dst.__refcnt, 1);
1263 				rt->u.dst.child		= NULL;
1264 				if (rt->u.dst.dev)
1265 					dev_hold(rt->u.dst.dev);
1266 				if (rt->idev)
1267 					in_dev_hold(rt->idev);
1268 				rt->u.dst.obsolete	= 0;
1269 				rt->u.dst.lastuse	= jiffies;
1270 				rt->u.dst.path		= &rt->u.dst;
1271 				rt->u.dst.neighbour	= NULL;
1272 				rt->u.dst.hh		= NULL;
1273 				rt->u.dst.xfrm		= NULL;
1274 				rt->rt_genid		= rt_genid(net);
1275 				rt->rt_flags		|= RTCF_REDIRECTED;
1276 
1277 				/* Gateway is different ... */
1278 				rt->rt_gateway		= new_gw;
1279 
1280 				/* Redirect received -> path was valid */
1281 				dst_confirm(&rth->u.dst);
1282 
1283 				if (rt->peer)
1284 					atomic_inc(&rt->peer->refcnt);
1285 
1286 				if (arp_bind_neighbour(&rt->u.dst) ||
1287 				    !(rt->u.dst.neighbour->nud_state &
1288 					    NUD_VALID)) {
1289 					if (rt->u.dst.neighbour)
1290 						neigh_event_send(rt->u.dst.neighbour, NULL);
1291 					ip_rt_put(rth);
1292 					rt_drop(rt);
1293 					goto do_next;
1294 				}
1295 
1296 				netevent.old = &rth->u.dst;
1297 				netevent.new = &rt->u.dst;
1298 				call_netevent_notifiers(NETEVENT_REDIRECT,
1299 							&netevent);
1300 
1301 				rt_del(hash, rth);
1302 				if (!rt_intern_hash(hash, rt, &rt))
1303 					ip_rt_put(rt);
1304 				goto do_next;
1305 			}
1306 			rcu_read_unlock();
1307 		do_next:
1308 			;
1309 		}
1310 	}
1311 	in_dev_put(in_dev);
1312 	return;
1313 
1314 reject_redirect:
1315 #ifdef CONFIG_IP_ROUTE_VERBOSE
1316 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1317 		printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1318 			NIPQUAD_FMT " ignored.\n"
1319 			"  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1320 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1321 		       NIPQUAD(saddr), NIPQUAD(daddr));
1322 #endif
1323 	in_dev_put(in_dev);
1324 }
1325 
1326 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1327 {
1328 	struct rtable *rt = (struct rtable *)dst;
1329 	struct dst_entry *ret = dst;
1330 
1331 	if (rt) {
1332 		if (dst->obsolete) {
1333 			ip_rt_put(rt);
1334 			ret = NULL;
1335 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1336 			   rt->u.dst.expires) {
1337 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1338 						rt->fl.oif,
1339 						rt_genid(dev_net(dst->dev)));
1340 #if RT_CACHE_DEBUG >= 1
1341 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1342 					  NIPQUAD_FMT "/%02x dropped\n",
1343 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1344 #endif
1345 			rt_del(hash, rt);
1346 			ret = NULL;
1347 		}
1348 	}
1349 	return ret;
1350 }
1351 
1352 /*
1353  * Algorithm:
1354  *	1. The first ip_rt_redirect_number redirects are sent
1355  *	   with exponential backoff, then we stop sending them at all,
1356  *	   assuming that the host ignores our redirects.
1357  *	2. If we did not see packets requiring redirects
1358  *	   during ip_rt_redirect_silence, we assume that the host
1359  *	   forgot the redirected route and start sending redirects again.
1360  *
1361  * This algorithm is much cheaper and more intelligent than dumb load limiting
1362  * in icmp.c.
1363  *
1364  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1365  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1366  */
1367 
1368 void ip_rt_send_redirect(struct sk_buff *skb)
1369 {
1370 	struct rtable *rt = skb->rtable;
1371 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1372 
1373 	if (!in_dev)
1374 		return;
1375 
1376 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1377 		goto out;
1378 
1379 	/* No redirected packets during ip_rt_redirect_silence;
1380 	 * reset the algorithm.
1381 	 */
1382 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1383 		rt->u.dst.rate_tokens = 0;
1384 
1385 	/* Too many ignored redirects; do not send anything.
1386 	 * Set u.dst.rate_last to the last seen redirected packet.
1387 	 */
1388 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1389 		rt->u.dst.rate_last = jiffies;
1390 		goto out;
1391 	}
1392 
1393 	/* Check for load limit; set rate_last to the latest sent
1394 	 * redirect.
1395 	 */
1396 	if (rt->u.dst.rate_tokens == 0 ||
1397 	    time_after(jiffies,
1398 		       (rt->u.dst.rate_last +
1399 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1400 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1401 		rt->u.dst.rate_last = jiffies;
1402 		++rt->u.dst.rate_tokens;
1403 #ifdef CONFIG_IP_ROUTE_VERBOSE
1404 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1405 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1406 		    net_ratelimit())
1407 			printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1408 				"redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1409 				NIPQUAD(rt->rt_src), rt->rt_iif,
1410 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1411 #endif
1412 	}
1413 out:
1414 	in_dev_put(in_dev);
1415 }
1416 
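/*
 * Turn a cached route's error code into an ICMP destination-unreachable
 * reply, rate-limited with a token bucket (ip_rt_error_cost /
 * ip_rt_error_burst), then drop the packet.
 */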
1417 static int ip_error(struct sk_buff *skb)
1418 {
1419 	struct rtable *rt = skb->rtable;
1420 	unsigned long now;
1421 	int code;
1422 
1423 	switch (rt->u.dst.error) {
1424 		case EINVAL:
1425 		default:
1426 			goto out;
1427 		case EHOSTUNREACH:
1428 			code = ICMP_HOST_UNREACH;
1429 			break;
1430 		case ENETUNREACH:
1431 			code = ICMP_NET_UNREACH;
1432 			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1433 					IPSTATS_MIB_INNOROUTES);
1434 			break;
1435 		case EACCES:
1436 			code = ICMP_PKT_FILTERED;
1437 			break;
1438 	}
1439 
1440 	now = jiffies;
1441 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1442 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1443 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1444 	rt->u.dst.rate_last = now;
1445 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1446 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1447 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1448 	}
1449 
1450 out:	kfree_skb(skb);
1451 	return 0;
1452 }
1453 
1454 /*
1455  *	The last two values are not from the RFC but
1456  *	are needed for AMPRnet AX.25 paths.
1457  */
1458 
1459 static const unsigned short mtu_plateau[] =
1460 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1461 
1462 static inline unsigned short guess_mtu(unsigned short old_mtu)
1463 {
1464 	int i;
1465 
1466 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1467 		if (old_mtu > mtu_plateau[i])
1468 			return mtu_plateau[i];
1469 	return 68;
1470 }
1471 
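/*
 * Process an incoming ICMP "fragmentation needed" message: find every
 * matching cached route and lower its path MTU, clamping to
 * ip_rt_min_pmtu and arming an expiry (ip_rt_mtu_expires) so the path
 * is probed again later.  A missing or implausible next-hop MTU (old
 * BSD stacks report zero) is estimated from the plateau table above.
 */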
1472 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1473 				 unsigned short new_mtu,
1474 				 struct net_device *dev)
1475 {
1476 	int i, k;
1477 	unsigned short old_mtu = ntohs(iph->tot_len);
1478 	struct rtable *rth;
1479 	int  ikeys[2] = { dev->ifindex, 0 };
1480 	__be32  skeys[2] = { iph->saddr, 0, };
1481 	__be32  daddr = iph->daddr;
1482 	unsigned short est_mtu = 0;
1483 
1484 	if (ipv4_config.no_pmtu_disc)
1485 		return 0;
1486 
1487 	for (k = 0; k < 2; k++) {
1488 		for (i = 0; i < 2; i++) {
1489 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1490 						rt_genid(net));
1491 
1492 			rcu_read_lock();
1493 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1494 			     rth = rcu_dereference(rth->u.dst.rt_next)) {
1495 				unsigned short mtu = new_mtu;
1496 
1497 				if (rth->fl.fl4_dst != daddr ||
1498 				    rth->fl.fl4_src != skeys[i] ||
1499 				    rth->rt_dst != daddr ||
1500 				    rth->rt_src != iph->saddr ||
1501 				    rth->fl.oif != ikeys[k] ||
1502 				    rth->fl.iif != 0 ||
1503 				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1504 				    !net_eq(dev_net(rth->u.dst.dev), net) ||
1505 				    rt_is_expired(rth))
1506 					continue;
1507 
1508 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1509 
1510 					/* BSD 4.2 compatibility hack :-( */
1511 					if (mtu == 0 &&
1512 					    old_mtu >= dst_mtu(&rth->u.dst) &&
1513 					    old_mtu >= 68 + (iph->ihl << 2))
1514 						old_mtu -= iph->ihl << 2;
1515 
1516 					mtu = guess_mtu(old_mtu);
1517 				}
1518 				if (mtu <= dst_mtu(&rth->u.dst)) {
1519 					if (mtu < dst_mtu(&rth->u.dst)) {
1520 						dst_confirm(&rth->u.dst);
1521 						if (mtu < ip_rt_min_pmtu) {
1522 							mtu = ip_rt_min_pmtu;
1523 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1524 								(1 << RTAX_MTU);
1525 						}
1526 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1527 						dst_set_expires(&rth->u.dst,
1528 							ip_rt_mtu_expires);
1529 					}
1530 					est_mtu = mtu;
1531 				}
1532 			}
1533 			rcu_read_unlock();
1534 		}
1535 	}
1536 	return est_mtu ? : new_mtu;
1537 }
1538 
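/*
 * dst_ops->update_pmtu hook: lower the cached MTU of a single dst (never
 * below ip_rt_min_pmtu, and never if the metric is locked), arm the
 * ip_rt_mtu_expires timeout and notify netevent listeners.
 */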
1539 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1540 {
1541 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
1542 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1543 		if (mtu < ip_rt_min_pmtu) {
1544 			mtu = ip_rt_min_pmtu;
1545 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1546 		}
1547 		dst->metrics[RTAX_MTU-1] = mtu;
1548 		dst_set_expires(dst, ip_rt_mtu_expires);
1549 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1550 	}
1551 }
1552 
1553 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1554 {
1555 	return NULL;
1556 }
1557 
1558 static void ipv4_dst_destroy(struct dst_entry *dst)
1559 {
1560 	struct rtable *rt = (struct rtable *) dst;
1561 	struct inet_peer *peer = rt->peer;
1562 	struct in_device *idev = rt->idev;
1563 
1564 	if (peer) {
1565 		rt->peer = NULL;
1566 		inet_putpeer(peer);
1567 	}
1568 
1569 	if (idev) {
1570 		rt->idev = NULL;
1571 		in_dev_put(idev);
1572 	}
1573 }
1574 
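/*
 * When the device behind a cached route goes away, re-point the cached
 * in_device reference at the namespace's loopback device so the dst
 * itself can outlive the device.
 */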
1575 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1576 			    int how)
1577 {
1578 	struct rtable *rt = (struct rtable *) dst;
1579 	struct in_device *idev = rt->idev;
1580 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1581 		struct in_device *loopback_idev =
1582 			in_dev_get(dev_net(dev)->loopback_dev);
1583 		if (loopback_idev) {
1584 			rt->idev = loopback_idev;
1585 			in_dev_put(idev);
1586 		}
1587 	}
1588 }
1589 
1590 static void ipv4_link_failure(struct sk_buff *skb)
1591 {
1592 	struct rtable *rt;
1593 
1594 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1595 
1596 	rt = skb->rtable;
1597 	if (rt)
1598 		dst_set_expires(&rt->u.dst, 0);
1599 }
1600 
1601 static int ip_rt_bug(struct sk_buff *skb)
1602 {
1603 	printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1604 		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1605 		skb->dev ? skb->dev->name : "?");
1606 	kfree_skb(skb);
1607 	return 0;
1608 }
1609 
1610 /*
1611    We do not cache the source address of the outgoing interface,
1612    because it is used only by the IP RR, TS and SRR options,
1613    so it is out of the fast path.
1614 
1615    BTW remember: "addr" is allowed to be unaligned
1616    in IP options!
1617  */
1618 
1619 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1620 {
1621 	__be32 src;
1622 	struct fib_result res;
1623 
1624 	if (rt->fl.iif == 0)
1625 		src = rt->rt_src;
1626 	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1627 		src = FIB_RES_PREFSRC(res);
1628 		fib_res_put(&res);
1629 	} else
1630 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1631 					RT_SCOPE_UNIVERSE);
1632 	memcpy(addr, &src, 4);
1633 }
1634 
1635 #ifdef CONFIG_NET_CLS_ROUTE
1636 static void set_class_tag(struct rtable *rt, u32 tag)
1637 {
1638 	if (!(rt->u.dst.tclassid & 0xFFFF))
1639 		rt->u.dst.tclassid |= tag & 0xFFFF;
1640 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1641 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1642 }
1643 #endif
1644 
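/*
 * Fill in the gateway and metrics of a new cache entry from the FIB
 * result, falling back to device defaults (MTU, hoplimit, advmss) where
 * the FIB does not provide values, and clamping everything to sane
 * bounds (IP_MAX_MTU, ip_rt_min_advmss, 65535 - 40).
 */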
1645 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1646 {
1647 	struct fib_info *fi = res->fi;
1648 
1649 	if (fi) {
1650 		if (FIB_RES_GW(*res) &&
1651 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1652 			rt->rt_gateway = FIB_RES_GW(*res);
1653 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1654 		       sizeof(rt->u.dst.metrics));
1655 		if (fi->fib_mtu == 0) {
1656 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1657 			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1658 			    rt->rt_gateway != rt->rt_dst &&
1659 			    rt->u.dst.dev->mtu > 576)
1660 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1661 		}
1662 #ifdef CONFIG_NET_CLS_ROUTE
1663 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1664 #endif
1665 	} else
1666 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1667 
1668 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1669 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1670 	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1671 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1672 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1673 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1674 				       ip_rt_min_advmss);
1675 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1676 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1677 
1678 #ifdef CONFIG_NET_CLS_ROUTE
1679 #ifdef CONFIG_IP_MULTIPLE_TABLES
1680 	set_class_tag(rt, fib_rules_tclass(res));
1681 #endif
1682 	set_class_tag(rt, itag);
1683 #endif
1684 	rt->rt_type = res->type;
1685 }
1686 
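/*
 * Build and cache an input route for a multicast destination: validate
 * the source address, allocate a dst whose input handler is
 * ip_local_deliver when we are a member of the group, or ip_mr_input
 * when multicast forwarding is enabled, and insert it into the hash.
 */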
1687 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1688 				u8 tos, struct net_device *dev, int our)
1689 {
1690 	unsigned hash;
1691 	struct rtable *rth;
1692 	__be32 spec_dst;
1693 	struct in_device *in_dev = in_dev_get(dev);
1694 	u32 itag = 0;
1695 
1696 	/* Primary sanity checks. */
1697 
1698 	if (in_dev == NULL)
1699 		return -EINVAL;
1700 
1701 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1702 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1703 		goto e_inval;
1704 
1705 	if (ipv4_is_zeronet(saddr)) {
1706 		if (!ipv4_is_local_multicast(daddr))
1707 			goto e_inval;
1708 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1709 	} else if (fib_validate_source(saddr, 0, tos, 0,
1710 					dev, &spec_dst, &itag) < 0)
1711 		goto e_inval;
1712 
1713 	rth = dst_alloc(&ipv4_dst_ops);
1714 	if (!rth)
1715 		goto e_nobufs;
1716 
1717 	rth->u.dst.output= ip_rt_bug;
1718 
1719 	atomic_set(&rth->u.dst.__refcnt, 1);
1720 	rth->u.dst.flags= DST_HOST;
1721 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1722 		rth->u.dst.flags |= DST_NOPOLICY;
1723 	rth->fl.fl4_dst	= daddr;
1724 	rth->rt_dst	= daddr;
1725 	rth->fl.fl4_tos	= tos;
1726 	rth->fl.mark    = skb->mark;
1727 	rth->fl.fl4_src	= saddr;
1728 	rth->rt_src	= saddr;
1729 #ifdef CONFIG_NET_CLS_ROUTE
1730 	rth->u.dst.tclassid = itag;
1731 #endif
1732 	rth->rt_iif	=
1733 	rth->fl.iif	= dev->ifindex;
1734 	rth->u.dst.dev	= init_net.loopback_dev;
1735 	dev_hold(rth->u.dst.dev);
1736 	rth->idev	= in_dev_get(rth->u.dst.dev);
1737 	rth->fl.oif	= 0;
1738 	rth->rt_gateway	= daddr;
1739 	rth->rt_spec_dst= spec_dst;
1740 	rth->rt_genid	= rt_genid(dev_net(dev));
1741 	rth->rt_flags	= RTCF_MULTICAST;
1742 	rth->rt_type	= RTN_MULTICAST;
1743 	if (our) {
1744 		rth->u.dst.input= ip_local_deliver;
1745 		rth->rt_flags |= RTCF_LOCAL;
1746 	}
1747 
1748 #ifdef CONFIG_IP_MROUTE
1749 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1750 		rth->u.dst.input = ip_mr_input;
1751 #endif
1752 	RT_CACHE_STAT_INC(in_slow_mc);
1753 
1754 	in_dev_put(in_dev);
1755 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1756 	return rt_intern_hash(hash, rth, &skb->rtable);
1757 
1758 e_nobufs:
1759 	in_dev_put(in_dev);
1760 	return -ENOBUFS;
1761 
1762 e_inval:
1763 	in_dev_put(in_dev);
1764 	return -EINVAL;
1765 }
1766 
1767 
1768 static void ip_handle_martian_source(struct net_device *dev,
1769 				     struct in_device *in_dev,
1770 				     struct sk_buff *skb,
1771 				     __be32 daddr,
1772 				     __be32 saddr)
1773 {
1774 	RT_CACHE_STAT_INC(in_martian_src);
1775 #ifdef CONFIG_IP_ROUTE_VERBOSE
1776 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1777 		/*
1778 		 *	RFC1812 recommendation: if the source is martian,
1779 		 *	the only hint is the MAC header.
1780 		 */
1781 		printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1782 			NIPQUAD_FMT", on dev %s\n",
1783 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1784 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1785 			int i;
1786 			const unsigned char *p = skb_mac_header(skb);
1787 			printk(KERN_WARNING "ll header: ");
1788 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1789 				printk("%02x", *p);
1790 				if (i < (dev->hard_header_len - 1))
1791 					printk(":");
1792 			}
1793 			printk("\n");
1794 		}
1795 	}
1796 #endif
1797 }
1798 
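/*
 * Create the cache entry for a forwarded packet: validate the source
 * against the FIB, decide whether an ICMP redirect should be advised
 * (RTCF_DOREDIRECT), and wire the dst up to ip_forward / ip_output.
 * The caller inserts the result into the hash.
 */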
1799 static int __mkroute_input(struct sk_buff *skb,
1800 			   struct fib_result *res,
1801 			   struct in_device *in_dev,
1802 			   __be32 daddr, __be32 saddr, u32 tos,
1803 			   struct rtable **result)
1804 {
1805 
1806 	struct rtable *rth;
1807 	int err;
1808 	struct in_device *out_dev;
1809 	unsigned flags = 0;
1810 	__be32 spec_dst;
1811 	u32 itag;
1812 
1813 	/* get a working reference to the output device */
1814 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1815 	if (out_dev == NULL) {
1816 		if (net_ratelimit())
1817 			printk(KERN_CRIT "Bug in ip_route_input" \
1818 			       "_slow(). Please, report\n");
1819 		return -EINVAL;
1820 	}
1821 
1822 
1823 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1824 				  in_dev->dev, &spec_dst, &itag);
1825 	if (err < 0) {
1826 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1827 					 saddr);
1828 
1829 		err = -EINVAL;
1830 		goto cleanup;
1831 	}
1832 
1833 	if (err)
1834 		flags |= RTCF_DIRECTSRC;
1835 
1836 	if (out_dev == in_dev && err &&
1837 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1838 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1839 		flags |= RTCF_DOREDIRECT;
1840 
1841 	if (skb->protocol != htons(ETH_P_IP)) {
1842 		/* Not IP (i.e. ARP). Do not create a route if it is
1843 		 * invalid for proxy ARP. DNAT routes are always valid.
1844 		 */
1845 		if (out_dev == in_dev) {
1846 			err = -EINVAL;
1847 			goto cleanup;
1848 		}
1849 	}
1850 
1851 
1852 	rth = dst_alloc(&ipv4_dst_ops);
1853 	if (!rth) {
1854 		err = -ENOBUFS;
1855 		goto cleanup;
1856 	}
1857 
1858 	atomic_set(&rth->u.dst.__refcnt, 1);
1859 	rth->u.dst.flags= DST_HOST;
1860 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1861 		rth->u.dst.flags |= DST_NOPOLICY;
1862 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1863 		rth->u.dst.flags |= DST_NOXFRM;
1864 	rth->fl.fl4_dst	= daddr;
1865 	rth->rt_dst	= daddr;
1866 	rth->fl.fl4_tos	= tos;
1867 	rth->fl.mark    = skb->mark;
1868 	rth->fl.fl4_src	= saddr;
1869 	rth->rt_src	= saddr;
1870 	rth->rt_gateway	= daddr;
1871 	rth->rt_iif 	=
1872 		rth->fl.iif	= in_dev->dev->ifindex;
1873 	rth->u.dst.dev	= (out_dev)->dev;
1874 	dev_hold(rth->u.dst.dev);
1875 	rth->idev	= in_dev_get(rth->u.dst.dev);
1876 	rth->fl.oif 	= 0;
1877 	rth->rt_spec_dst= spec_dst;
1878 
1879 	rth->u.dst.input = ip_forward;
1880 	rth->u.dst.output = ip_output;
1881 	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
1882 
1883 	rt_set_nexthop(rth, res, itag);
1884 
1885 	rth->rt_flags = flags;
1886 
1887 	*result = rth;
1888 	err = 0;
1889  cleanup:
1890 	/* release the working reference to the output device */
1891 	in_dev_put(out_dev);
1892 	return err;
1893 }
1894 
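/*
 * Wrapper around __mkroute_input(): pick a multipath next hop when the
 * FIB entry has several, then insert the new entry into the route cache
 * and attach it to the skb.
 */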
1895 static int ip_mkroute_input(struct sk_buff *skb,
1896 			    struct fib_result *res,
1897 			    const struct flowi *fl,
1898 			    struct in_device *in_dev,
1899 			    __be32 daddr, __be32 saddr, u32 tos)
1900 {
1901 	struct rtable* rth = NULL;
1902 	int err;
1903 	unsigned hash;
1904 
1905 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1906 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1907 		fib_select_multipath(fl, res);
1908 #endif
1909 
1910 	/* create a routing cache entry */
1911 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1912 	if (err)
1913 		return err;
1914 
1915 	/* put it into the cache */
1916 	hash = rt_hash(daddr, saddr, fl->iif,
1917 		       rt_genid(dev_net(rth->u.dst.dev)));
1918 	return rt_intern_hash(hash, rth, &skb->rtable);
1919 }
1920 
1921 /*
1922  *	NOTE. We drop all packets that have local source
1923  *	addresses, because every properly looped-back packet
1924  *	must already have the correct destination attached by the output routine.
1925  *
1926  *	This approach solves two big problems:
1927  *	1. Non-simplex devices are handled properly.
1928  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1929  */
1930 
1931 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1932 			       u8 tos, struct net_device *dev)
1933 {
1934 	struct fib_result res;
1935 	struct in_device *in_dev = in_dev_get(dev);
1936 	struct flowi fl = { .nl_u = { .ip4_u =
1937 				      { .daddr = daddr,
1938 					.saddr = saddr,
1939 					.tos = tos,
1940 					.scope = RT_SCOPE_UNIVERSE,
1941 				      } },
1942 			    .mark = skb->mark,
1943 			    .iif = dev->ifindex };
1944 	unsigned	flags = 0;
1945 	u32		itag = 0;
1946 	struct rtable * rth;
1947 	unsigned	hash;
1948 	__be32		spec_dst;
1949 	int		err = -EINVAL;
1950 	int		free_res = 0;
1951 	struct net    * net = dev_net(dev);
1952 
1953 	/* IP on this device is disabled. */
1954 
1955 	if (!in_dev)
1956 		goto out;
1957 
1958 	/* Check for the most unusual martian addresses, which cannot be
1959 	   detected by fib_lookup.
1960 	 */
1961 
1962 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1963 	    ipv4_is_loopback(saddr))
1964 		goto martian_source;
1965 
1966 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1967 		goto brd_input;
1968 
1969 	/* Accept zero source addresses only for limited broadcast;
1970 	 * it is not clear whether this should be changed. Waiting for complaints :-)
1971 	 */
1972 	if (ipv4_is_zeronet(saddr))
1973 		goto martian_source;
1974 
1975 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1976 	    ipv4_is_loopback(daddr))
1977 		goto martian_destination;
1978 
1979 	/*
1980 	 *	Now we are ready to route packet.
1981 	 */
1982 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
1983 		if (!IN_DEV_FORWARD(in_dev))
1984 			goto e_hostunreach;
1985 		goto no_route;
1986 	}
1987 	free_res = 1;
1988 
1989 	RT_CACHE_STAT_INC(in_slow_tot);
1990 
1991 	if (res.type == RTN_BROADCAST)
1992 		goto brd_input;
1993 
1994 	if (res.type == RTN_LOCAL) {
1995 		int result;
1996 		result = fib_validate_source(saddr, daddr, tos,
1997 					     net->loopback_dev->ifindex,
1998 					     dev, &spec_dst, &itag);
1999 		if (result < 0)
2000 			goto martian_source;
2001 		if (result)
2002 			flags |= RTCF_DIRECTSRC;
2003 		spec_dst = daddr;
2004 		goto local_input;
2005 	}
2006 
2007 	if (!IN_DEV_FORWARD(in_dev))
2008 		goto e_hostunreach;
2009 	if (res.type != RTN_UNICAST)
2010 		goto martian_destination;
2011 
2012 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2013 done:
2014 	in_dev_put(in_dev);
2015 	if (free_res)
2016 		fib_res_put(&res);
2017 out:	return err;
2018 
2019 brd_input:
2020 	if (skb->protocol != htons(ETH_P_IP))
2021 		goto e_inval;
2022 
2023 	if (ipv4_is_zeronet(saddr))
2024 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2025 	else {
2026 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2027 					  &itag);
2028 		if (err < 0)
2029 			goto martian_source;
2030 		if (err)
2031 			flags |= RTCF_DIRECTSRC;
2032 	}
2033 	flags |= RTCF_BROADCAST;
2034 	res.type = RTN_BROADCAST;
2035 	RT_CACHE_STAT_INC(in_brd);
2036 
2037 local_input:
2038 	rth = dst_alloc(&ipv4_dst_ops);
2039 	if (!rth)
2040 		goto e_nobufs;
2041 
2042 	rth->u.dst.output = ip_rt_bug;
2043 	rth->rt_genid = rt_genid(net);
2044 
2045 	atomic_set(&rth->u.dst.__refcnt, 1);
2046 	rth->u.dst.flags = DST_HOST;
2047 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2048 		rth->u.dst.flags |= DST_NOPOLICY;
2049 	rth->fl.fl4_dst	= daddr;
2050 	rth->rt_dst	= daddr;
2051 	rth->fl.fl4_tos	= tos;
2052 	rth->fl.mark    = skb->mark;
2053 	rth->fl.fl4_src	= saddr;
2054 	rth->rt_src	= saddr;
2055 #ifdef CONFIG_NET_CLS_ROUTE
2056 	rth->u.dst.tclassid = itag;
2057 #endif
2058 	rth->rt_iif	=
2059 	rth->fl.iif	= dev->ifindex;
2060 	rth->u.dst.dev	= net->loopback_dev;
2061 	dev_hold(rth->u.dst.dev);
2062 	rth->idev	= in_dev_get(rth->u.dst.dev);
2063 	rth->rt_gateway	= daddr;
2064 	rth->rt_spec_dst = spec_dst;
2065 	rth->u.dst.input = ip_local_deliver;
2066 	rth->rt_flags	= flags | RTCF_LOCAL;
2067 	if (res.type == RTN_UNREACHABLE) {
2068 		rth->u.dst.input = ip_error;
2069 		rth->u.dst.error = -err;
2070 		rth->rt_flags	&= ~RTCF_LOCAL;
2071 	}
2072 	rth->rt_type	= res.type;
2073 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2074 	err = rt_intern_hash(hash, rth, &skb->rtable);
2075 	goto done;
2076 
2077 no_route:
2078 	RT_CACHE_STAT_INC(in_no_route);
2079 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2080 	res.type = RTN_UNREACHABLE;
2081 	if (err == -ESRCH)
2082 		err = -ENETUNREACH;
2083 	goto local_input;
2084 
2085 	/*
2086 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2087 	 */
2088 martian_destination:
2089 	RT_CACHE_STAT_INC(in_martian_dst);
2090 #ifdef CONFIG_IP_ROUTE_VERBOSE
2091 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2092 		printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2093 			NIPQUAD_FMT ", dev %s\n",
2094 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2095 #endif
2096 
2097 e_hostunreach:
2098 	err = -EHOSTUNREACH;
2099 	goto done;
2100 
2101 e_inval:
2102 	err = -EINVAL;
2103 	goto done;
2104 
2105 e_nobufs:
2106 	err = -ENOBUFS;
2107 	goto done;
2108 
2109 martian_source:
2110 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2111 	goto e_inval;
2112 }
2113 
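/*
 * ip_route_input - fast path of the input route lookup.  Probe the route
 * cache hash chain under RCU; on a hit, take a reference and attach the
 * entry to the skb.  On a miss, multicast destinations are handled here
 * (see the comment below) and everything else falls back to
 * ip_route_input_slow().
 */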
2114 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2115 		   u8 tos, struct net_device *dev)
2116 {
2117 	struct rtable * rth;
2118 	unsigned	hash;
2119 	int iif = dev->ifindex;
2120 	struct net *net;
2121 
2122 	net = dev_net(dev);
2123 	tos &= IPTOS_RT_MASK;
2124 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2125 
2126 	rcu_read_lock();
2127 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2128 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2129 		if (((rth->fl.fl4_dst ^ daddr) |
2130 		     (rth->fl.fl4_src ^ saddr) |
2131 		     (rth->fl.iif ^ iif) |
2132 		     rth->fl.oif |
2133 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2134 		    rth->fl.mark == skb->mark &&
2135 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2136 		    !rt_is_expired(rth)) {
2137 			dst_use(&rth->u.dst, jiffies);
2138 			RT_CACHE_STAT_INC(in_hit);
2139 			rcu_read_unlock();
2140 			skb->rtable = rth;
2141 			return 0;
2142 		}
2143 		RT_CACHE_STAT_INC(in_hlist_search);
2144 	}
2145 	rcu_read_unlock();
2146 
2147 	/* Multicast recognition logic is moved from the route cache to here.
2148 	   The problem was that too many Ethernet cards have broken/missing
2149 	   hardware multicast filters :-( As a result, a host on a multicast
2150 	   network acquires a lot of useless route cache entries, e.g. for
2151 	   SDR messages from all over the world. Now we try to get rid of them.
2152 	   Provided the software IP multicast filter is organized reasonably
2153 	   (at least, hashed), this does not cause a slowdown compared with
2154 	   route cache reject entries.
2155 	   Note that multicast routers are not affected, because a route
2156 	   cache entry is created for them eventually.
2157 	 */
2158 	if (ipv4_is_multicast(daddr)) {
2159 		struct in_device *in_dev;
2160 
2161 		rcu_read_lock();
2162 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2163 			int our = ip_check_mc(in_dev, daddr, saddr,
2164 				ip_hdr(skb)->protocol);
2165 			if (our
2166 #ifdef CONFIG_IP_MROUTE
2167 			    || (!ipv4_is_local_multicast(daddr) &&
2168 				IN_DEV_MFORWARD(in_dev))
2169 #endif
2170 			    ) {
2171 				rcu_read_unlock();
2172 				return ip_route_input_mc(skb, daddr, saddr,
2173 							 tos, dev, our);
2174 			}
2175 		}
2176 		rcu_read_unlock();
2177 		return -EINVAL;
2178 	}
2179 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2180 }
2181 
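/*
 * __mkroute_output - build a dst cache entry for locally generated
 * (output) traffic.  Classifies the destination (broadcast/multicast),
 * allocates the rtable, fills in the flow keys and device references and
 * selects the input/output handlers (ip_output, ip_mc_output, ip_mr_input)
 * according to the route type and flags.
 */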
2182 static int __mkroute_output(struct rtable **result,
2183 			    struct fib_result *res,
2184 			    const struct flowi *fl,
2185 			    const struct flowi *oldflp,
2186 			    struct net_device *dev_out,
2187 			    unsigned flags)
2188 {
2189 	struct rtable *rth;
2190 	struct in_device *in_dev;
2191 	u32 tos = RT_FL_TOS(oldflp);
2192 	int err = 0;
2193 
2194 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2195 		return -EINVAL;
2196 
2197 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2198 		res->type = RTN_BROADCAST;
2199 	else if (ipv4_is_multicast(fl->fl4_dst))
2200 		res->type = RTN_MULTICAST;
2201 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2202 		return -EINVAL;
2203 
2204 	if (dev_out->flags & IFF_LOOPBACK)
2205 		flags |= RTCF_LOCAL;
2206 
2207 	/* get a working reference to the inet device */
2208 	in_dev = in_dev_get(dev_out);
2209 	if (!in_dev)
2210 		return -EINVAL;
2211 
2212 	if (res->type == RTN_BROADCAST) {
2213 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2214 		if (res->fi) {
2215 			fib_info_put(res->fi);
2216 			res->fi = NULL;
2217 		}
2218 	} else if (res->type == RTN_MULTICAST) {
2219 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2220 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2221 				 oldflp->proto))
2222 			flags &= ~RTCF_LOCAL;
2223 		/* If a multicast route does not exist, use the
2224 		   default one, but do not use a gateway in this case.
2225 		   Yes, it is a hack.
2226 		 */
2227 		if (res->fi && res->prefixlen < 4) {
2228 			fib_info_put(res->fi);
2229 			res->fi = NULL;
2230 		}
2231 	}
2232 
2233 
2234 	rth = dst_alloc(&ipv4_dst_ops);
2235 	if (!rth) {
2236 		err = -ENOBUFS;
2237 		goto cleanup;
2238 	}
2239 
2240 	atomic_set(&rth->u.dst.__refcnt, 1);
2241 	rth->u.dst.flags = DST_HOST;
2242 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2243 		rth->u.dst.flags |= DST_NOXFRM;
2244 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2245 		rth->u.dst.flags |= DST_NOPOLICY;
2246 
2247 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2248 	rth->fl.fl4_tos	= tos;
2249 	rth->fl.fl4_src	= oldflp->fl4_src;
2250 	rth->fl.oif	= oldflp->oif;
2251 	rth->fl.mark    = oldflp->mark;
2252 	rth->rt_dst	= fl->fl4_dst;
2253 	rth->rt_src	= fl->fl4_src;
2254 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2255 	/* get references to the devices that are to be held by the routing
2256 	   cache entry */
2257 	rth->u.dst.dev	= dev_out;
2258 	dev_hold(dev_out);
2259 	rth->idev	= in_dev_get(dev_out);
2260 	rth->rt_gateway = fl->fl4_dst;
2261 	rth->rt_spec_dst = fl->fl4_src;
2262 
2263 	rth->u.dst.output = ip_output;
2264 	rth->rt_genid = rt_genid(dev_net(dev_out));
2265 
2266 	RT_CACHE_STAT_INC(out_slow_tot);
2267 
2268 	if (flags & RTCF_LOCAL) {
2269 		rth->u.dst.input = ip_local_deliver;
2270 		rth->rt_spec_dst = fl->fl4_dst;
2271 	}
2272 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2273 		rth->rt_spec_dst = fl->fl4_src;
2274 		if (flags & RTCF_LOCAL &&
2275 		    !(dev_out->flags & IFF_LOOPBACK)) {
2276 			rth->u.dst.output = ip_mc_output;
2277 			RT_CACHE_STAT_INC(out_slow_mc);
2278 		}
2279 #ifdef CONFIG_IP_MROUTE
2280 		if (res->type == RTN_MULTICAST) {
2281 			if (IN_DEV_MFORWARD(in_dev) &&
2282 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2283 				rth->u.dst.input = ip_mr_input;
2284 				rth->u.dst.output = ip_mc_output;
2285 			}
2286 		}
2287 #endif
2288 	}
2289 
2290 	rt_set_nexthop(rth, res, 0);
2291 
2292 	rth->rt_flags = flags;
2293 
2294 	*result = rth;
2295  cleanup:
2296 	/* release the working reference to the inet device */
2297 	in_dev_put(in_dev);
2298 
2299 	return err;
2300 }
2301 
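/*
 * ip_mkroute_output - create an output route with __mkroute_output() and,
 * on success, insert it into the route cache hash table, returning the
 * cached entry through *rp.
 */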
2302 static int ip_mkroute_output(struct rtable **rp,
2303 			     struct fib_result *res,
2304 			     const struct flowi *fl,
2305 			     const struct flowi *oldflp,
2306 			     struct net_device *dev_out,
2307 			     unsigned flags)
2308 {
2309 	struct rtable *rth = NULL;
2310 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2311 	unsigned hash;
2312 	if (err == 0) {
2313 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2314 			       rt_genid(dev_net(dev_out)));
2315 		err = rt_intern_hash(hash, rth, rp);
2316 	}
2317 
2318 	return err;
2319 }
2320 
2321 /*
2322  * Major route resolver routine.
2323  */
2324 
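/*
 * ip_route_output_slow - resolve the output device and source address for
 * the flow described by oldflp, consulting the FIB where necessary, and
 * then build and cache the route via ip_mkroute_output().  Handles the
 * special cases of an explicitly given source address, an explicitly
 * given output interface, local/loopback destinations and multipath or
 * default route selection.
 */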
2325 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2326 				const struct flowi *oldflp)
2327 {
2328 	u32 tos	= RT_FL_TOS(oldflp);
2329 	struct flowi fl = { .nl_u = { .ip4_u =
2330 				      { .daddr = oldflp->fl4_dst,
2331 					.saddr = oldflp->fl4_src,
2332 					.tos = tos & IPTOS_RT_MASK,
2333 					.scope = ((tos & RTO_ONLINK) ?
2334 						  RT_SCOPE_LINK :
2335 						  RT_SCOPE_UNIVERSE),
2336 				      } },
2337 			    .mark = oldflp->mark,
2338 			    .iif = net->loopback_dev->ifindex,
2339 			    .oif = oldflp->oif };
2340 	struct fib_result res;
2341 	unsigned flags = 0;
2342 	struct net_device *dev_out = NULL;
2343 	int free_res = 0;
2344 	int err;
2345 
2346 
2347 	res.fi		= NULL;
2348 #ifdef CONFIG_IP_MULTIPLE_TABLES
2349 	res.r		= NULL;
2350 #endif
2351 
2352 	if (oldflp->fl4_src) {
2353 		err = -EINVAL;
2354 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2355 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2356 		    ipv4_is_zeronet(oldflp->fl4_src))
2357 			goto out;
2358 
2359 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2360 		dev_out = ip_dev_find(net, oldflp->fl4_src);
2361 		if (dev_out == NULL)
2362 			goto out;
2363 
2364 		/* I removed the check for oif == dev_out->oif here.
2365 		   It was wrong for two reasons:
2366 		   1. ip_dev_find(net, saddr) can return the wrong interface if
2367 		      saddr is assigned to multiple interfaces.
2368 		   2. Moreover, we are allowed to send packets with the saddr
2369 		      of another interface. --ANK
2370 		 */
2371 
2372 		if (oldflp->oif == 0
2373 		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2374 			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2375 			/* Special hack: the user can direct multicasts
2376 			   and limited broadcast via the necessary interface
2377 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2378 			   This hack is not just for fun, it allows
2379 			   vic, vat and friends to work.
2380 			   They bind a socket to loopback, set ttl to zero
2381 			   and expect that it will work.
2382 			   From the viewpoint of the routing cache they are
2383 			   broken, because we are not allowed to build a multicast
2384 			   path with a loopback source addr (the routing cache
2385 			   cannot know that ttl is zero, so the packet will not
2386 			   leave this host and the route is valid).
2387 			   Luckily, this hack is a good workaround.
2388 			 */
2389 
2390 			fl.oif = dev_out->ifindex;
2391 			goto make_route;
2392 		}
2393 		if (dev_out)
2394 			dev_put(dev_out);
2395 		dev_out = NULL;
2396 	}
2397 
2398 
2399 	if (oldflp->oif) {
2400 		dev_out = dev_get_by_index(net, oldflp->oif);
2401 		err = -ENODEV;
2402 		if (dev_out == NULL)
2403 			goto out;
2404 
2405 		/* RACE: Check return value of inet_select_addr instead. */
2406 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2407 			dev_put(dev_out);
2408 			goto out;	/* Wrong error code */
2409 		}
2410 
2411 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2412 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2413 			if (!fl.fl4_src)
2414 				fl.fl4_src = inet_select_addr(dev_out, 0,
2415 							      RT_SCOPE_LINK);
2416 			goto make_route;
2417 		}
2418 		if (!fl.fl4_src) {
2419 			if (ipv4_is_multicast(oldflp->fl4_dst))
2420 				fl.fl4_src = inet_select_addr(dev_out, 0,
2421 							      fl.fl4_scope);
2422 			else if (!oldflp->fl4_dst)
2423 				fl.fl4_src = inet_select_addr(dev_out, 0,
2424 							      RT_SCOPE_HOST);
2425 		}
2426 	}
2427 
2428 	if (!fl.fl4_dst) {
2429 		fl.fl4_dst = fl.fl4_src;
2430 		if (!fl.fl4_dst)
2431 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2432 		if (dev_out)
2433 			dev_put(dev_out);
2434 		dev_out = net->loopback_dev;
2435 		dev_hold(dev_out);
2436 		fl.oif = net->loopback_dev->ifindex;
2437 		res.type = RTN_LOCAL;
2438 		flags |= RTCF_LOCAL;
2439 		goto make_route;
2440 	}
2441 
2442 	if (fib_lookup(net, &fl, &res)) {
2443 		res.fi = NULL;
2444 		if (oldflp->oif) {
2445 			/* Apparently, the routing tables are wrong. Assume
2446 			   that the destination is on-link.
2447 
2448 			   WHY? DW.
2449 			   Because we are allowed to send to an iface
2450 			   even if it has NO routes and NO assigned
2451 			   addresses. When oif is specified, the routing
2452 			   tables are looked up with only one purpose:
2453 			   to catch whether the destination is gatewayed,
2454 			   rather than direct. Moreover, if MSG_DONTROUTE is
2455 			   set, we send the packet, ignoring both routing
2456 			   tables and ifaddr state. --ANK
2457 
2458 
2459 			   We could do this even when oif is unknown,
2460 			   as IPv6 likely does, but we do not.
2461 			 */
2462 
2463 			if (fl.fl4_src == 0)
2464 				fl.fl4_src = inet_select_addr(dev_out, 0,
2465 							      RT_SCOPE_LINK);
2466 			res.type = RTN_UNICAST;
2467 			goto make_route;
2468 		}
2469 		if (dev_out)
2470 			dev_put(dev_out);
2471 		err = -ENETUNREACH;
2472 		goto out;
2473 	}
2474 	free_res = 1;
2475 
2476 	if (res.type == RTN_LOCAL) {
2477 		if (!fl.fl4_src)
2478 			fl.fl4_src = fl.fl4_dst;
2479 		if (dev_out)
2480 			dev_put(dev_out);
2481 		dev_out = net->loopback_dev;
2482 		dev_hold(dev_out);
2483 		fl.oif = dev_out->ifindex;
2484 		if (res.fi)
2485 			fib_info_put(res.fi);
2486 		res.fi = NULL;
2487 		flags |= RTCF_LOCAL;
2488 		goto make_route;
2489 	}
2490 
2491 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2492 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2493 		fib_select_multipath(&fl, &res);
2494 	else
2495 #endif
2496 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2497 		fib_select_default(net, &fl, &res);
2498 
2499 	if (!fl.fl4_src)
2500 		fl.fl4_src = FIB_RES_PREFSRC(res);
2501 
2502 	if (dev_out)
2503 		dev_put(dev_out);
2504 	dev_out = FIB_RES_DEV(res);
2505 	dev_hold(dev_out);
2506 	fl.oif = dev_out->ifindex;
2507 
2508 
2509 make_route:
2510 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2511 
2512 
2513 	if (free_res)
2514 		fib_res_put(&res);
2515 	if (dev_out)
2516 		dev_put(dev_out);
2517 out:	return err;
2518 }
2519 
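/*
 * __ip_route_output_key - output route lookup by flow key.  Probe the
 * route cache under rcu_read_lock_bh(); only entries created for output
 * (fl.iif == 0) in the same namespace and generation can match.  On a
 * miss, fall back to ip_route_output_slow().
 */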
2520 int __ip_route_output_key(struct net *net, struct rtable **rp,
2521 			  const struct flowi *flp)
2522 {
2523 	unsigned hash;
2524 	struct rtable *rth;
2525 
2526 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2527 
2528 	rcu_read_lock_bh();
2529 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2530 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2531 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2532 		    rth->fl.fl4_src == flp->fl4_src &&
2533 		    rth->fl.iif == 0 &&
2534 		    rth->fl.oif == flp->oif &&
2535 		    rth->fl.mark == flp->mark &&
2536 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2537 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2538 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2539 		    !rt_is_expired(rth)) {
2540 			dst_use(&rth->u.dst, jiffies);
2541 			RT_CACHE_STAT_INC(out_hit);
2542 			rcu_read_unlock_bh();
2543 			*rp = rth;
2544 			return 0;
2545 		}
2546 		RT_CACHE_STAT_INC(out_hlist_search);
2547 	}
2548 	rcu_read_unlock_bh();
2549 
2550 	return ip_route_output_slow(net, rp, flp);
2551 }
2552 
2553 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2554 
2555 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2556 {
2557 }
2558 
2559 static struct dst_ops ipv4_dst_blackhole_ops = {
2560 	.family			=	AF_INET,
2561 	.protocol		=	__constant_htons(ETH_P_IP),
2562 	.destroy		=	ipv4_dst_destroy,
2563 	.check			=	ipv4_dst_check,
2564 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2565 	.entry_size		=	sizeof(struct rtable),
2566 	.entries		=	ATOMIC_INIT(0),
2567 };
2568 
2569 
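/*
 * ipv4_dst_blackhole - replace *rp with a copy whose input and output
 * handlers discard all traffic (dst_discard) while preserving the
 * original flow, metrics and device references.  Used below when the
 * XFRM lookup returns -EREMOTE, so that the caller still gets a valid
 * but harmless dst.
 */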
2570 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2571 {
2572 	struct rtable *ort = *rp;
2573 	struct rtable *rt = (struct rtable *)
2574 		dst_alloc(&ipv4_dst_blackhole_ops);
2575 
2576 	if (rt) {
2577 		struct dst_entry *new = &rt->u.dst;
2578 
2579 		atomic_set(&new->__refcnt, 1);
2580 		new->__use = 1;
2581 		new->input = dst_discard;
2582 		new->output = dst_discard;
2583 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2584 
2585 		new->dev = ort->u.dst.dev;
2586 		if (new->dev)
2587 			dev_hold(new->dev);
2588 
2589 		rt->fl = ort->fl;
2590 
2591 		rt->idev = ort->idev;
2592 		if (rt->idev)
2593 			in_dev_hold(rt->idev);
2594 		rt->rt_genid = rt_genid(net);
2595 		rt->rt_flags = ort->rt_flags;
2596 		rt->rt_type = ort->rt_type;
2597 		rt->rt_dst = ort->rt_dst;
2598 		rt->rt_src = ort->rt_src;
2599 		rt->rt_iif = ort->rt_iif;
2600 		rt->rt_gateway = ort->rt_gateway;
2601 		rt->rt_spec_dst = ort->rt_spec_dst;
2602 		rt->peer = ort->peer;
2603 		if (rt->peer)
2604 			atomic_inc(&rt->peer->refcnt);
2605 
2606 		dst_free(new);
2607 	}
2608 
2609 	dst_release(&(*rp)->u.dst);
2610 	*rp = rt;
2611 	return (rt ? 0 : -ENOMEM);
2612 }
2613 
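/*
 * ip_route_output_flow - resolve an output route and, when a transport
 * protocol is specified in the flow, complete the source/destination
 * addresses and run the result through the XFRM (IPsec) lookup.
 */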
2614 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2615 			 struct sock *sk, int flags)
2616 {
2617 	int err;
2618 
2619 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2620 		return err;
2621 
2622 	if (flp->proto) {
2623 		if (!flp->fl4_src)
2624 			flp->fl4_src = (*rp)->rt_src;
2625 		if (!flp->fl4_dst)
2626 			flp->fl4_dst = (*rp)->rt_dst;
2627 		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2628 				    flags ? XFRM_LOOKUP_WAIT : 0);
2629 		if (err == -EREMOTE)
2630 			err = ipv4_dst_blackhole(net, rp, flp);
2631 
2632 		return err;
2633 	}
2634 
2635 	return 0;
2636 }
2637 
2638 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2639 
2640 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2641 {
2642 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2643 }
2644 
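/*
 * rt_fill_info - translate the rtable attached to the skb into an
 * RTM_NEWROUTE netlink message: route type, flow addresses, interfaces,
 * metrics, multicast forwarding data and cache information.
 */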
2645 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2646 			int nowait, unsigned int flags)
2647 {
2648 	struct rtable *rt = skb->rtable;
2649 	struct rtmsg *r;
2650 	struct nlmsghdr *nlh;
2651 	long expires;
2652 	u32 id = 0, ts = 0, tsage = 0, error;
2653 
2654 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2655 	if (nlh == NULL)
2656 		return -EMSGSIZE;
2657 
2658 	r = nlmsg_data(nlh);
2659 	r->rtm_family	 = AF_INET;
2660 	r->rtm_dst_len	= 32;
2661 	r->rtm_src_len	= 0;
2662 	r->rtm_tos	= rt->fl.fl4_tos;
2663 	r->rtm_table	= RT_TABLE_MAIN;
2664 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2665 	r->rtm_type	= rt->rt_type;
2666 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2667 	r->rtm_protocol = RTPROT_UNSPEC;
2668 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2669 	if (rt->rt_flags & RTCF_NOTIFY)
2670 		r->rtm_flags |= RTM_F_NOTIFY;
2671 
2672 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2673 
2674 	if (rt->fl.fl4_src) {
2675 		r->rtm_src_len = 32;
2676 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2677 	}
2678 	if (rt->u.dst.dev)
2679 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2680 #ifdef CONFIG_NET_CLS_ROUTE
2681 	if (rt->u.dst.tclassid)
2682 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2683 #endif
2684 	if (rt->fl.iif)
2685 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2686 	else if (rt->rt_src != rt->fl.fl4_src)
2687 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2688 
2689 	if (rt->rt_dst != rt->rt_gateway)
2690 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2691 
2692 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2693 		goto nla_put_failure;
2694 
2695 	error = rt->u.dst.error;
2696 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2697 	if (rt->peer) {
2698 		id = rt->peer->ip_id_count;
2699 		if (rt->peer->tcp_ts_stamp) {
2700 			ts = rt->peer->tcp_ts;
2701 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2702 		}
2703 	}
2704 
2705 	if (rt->fl.iif) {
2706 #ifdef CONFIG_IP_MROUTE
2707 		__be32 dst = rt->rt_dst;
2708 
2709 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2710 		    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2711 			int err = ipmr_get_route(skb, r, nowait);
2712 			if (err <= 0) {
2713 				if (!nowait) {
2714 					if (err == 0)
2715 						return 0;
2716 					goto nla_put_failure;
2717 				} else {
2718 					if (err == -EMSGSIZE)
2719 						goto nla_put_failure;
2720 					error = err;
2721 				}
2722 			}
2723 		} else
2724 #endif
2725 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2726 	}
2727 
2728 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2729 			       expires, error) < 0)
2730 		goto nla_put_failure;
2731 
2732 	return nlmsg_end(skb, nlh);
2733 
2734 nla_put_failure:
2735 	nlmsg_cancel(skb, nlh);
2736 	return -EMSGSIZE;
2737 }
2738 
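/*
 * inet_rtm_getroute - handle RTM_GETROUTE requests.  Build a minimal
 * dummy skb, resolve the route either through ip_route_input() (when an
 * input interface is supplied) or ip_route_output_key(), and return the
 * result to the requester via rt_fill_info() and rtnl_unicast().
 */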
2739 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2740 {
2741 	struct net *net = sock_net(in_skb->sk);
2742 	struct rtmsg *rtm;
2743 	struct nlattr *tb[RTA_MAX+1];
2744 	struct rtable *rt = NULL;
2745 	__be32 dst = 0;
2746 	__be32 src = 0;
2747 	u32 iif;
2748 	int err;
2749 	struct sk_buff *skb;
2750 
2751 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2752 	if (err < 0)
2753 		goto errout;
2754 
2755 	rtm = nlmsg_data(nlh);
2756 
2757 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2758 	if (skb == NULL) {
2759 		err = -ENOBUFS;
2760 		goto errout;
2761 	}
2762 
2763 	/* Reserve room for dummy headers; this skb can pass
2764 	   through a good chunk of the routing engine.
2765 	 */
2766 	skb_reset_mac_header(skb);
2767 	skb_reset_network_header(skb);
2768 
2769 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2770 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2771 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2772 
2773 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2774 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2775 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2776 
2777 	if (iif) {
2778 		struct net_device *dev;
2779 
2780 		dev = __dev_get_by_index(net, iif);
2781 		if (dev == NULL) {
2782 			err = -ENODEV;
2783 			goto errout_free;
2784 		}
2785 
2786 		skb->protocol	= htons(ETH_P_IP);
2787 		skb->dev	= dev;
2788 		local_bh_disable();
2789 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2790 		local_bh_enable();
2791 
2792 		rt = skb->rtable;
2793 		if (err == 0 && rt->u.dst.error)
2794 			err = -rt->u.dst.error;
2795 	} else {
2796 		struct flowi fl = {
2797 			.nl_u = {
2798 				.ip4_u = {
2799 					.daddr = dst,
2800 					.saddr = src,
2801 					.tos = rtm->rtm_tos,
2802 				},
2803 			},
2804 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2805 		};
2806 		err = ip_route_output_key(net, &rt, &fl);
2807 	}
2808 
2809 	if (err)
2810 		goto errout_free;
2811 
2812 	skb->rtable = rt;
2813 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2814 		rt->rt_flags |= RTCF_NOTIFY;
2815 
2816 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2817 			   RTM_NEWROUTE, 0, 0);
2818 	if (err <= 0)
2819 		goto errout_free;
2820 
2821 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2822 errout:
2823 	return err;
2824 
2825 errout_free:
2826 	kfree_skb(skb);
2827 	goto errout;
2828 }
2829 
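/*
 * ip_rt_dump - netlink dump callback that walks every route cache hash
 * chain under RCU and emits one RTM_NEWROUTE message per live entry,
 * resuming from the bucket and index stored in cb->args.
 */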
2830 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2831 {
2832 	struct rtable *rt;
2833 	int h, s_h;
2834 	int idx, s_idx;
2835 	struct net *net;
2836 
2837 	net = sock_net(skb->sk);
2838 
2839 	s_h = cb->args[0];
2840 	if (s_h < 0)
2841 		s_h = 0;
2842 	s_idx = idx = cb->args[1];
2843 	for (h = s_h; h <= rt_hash_mask; h++) {
2844 		rcu_read_lock_bh();
2845 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2846 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2847 			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2848 				continue;
2849 			if (rt_is_expired(rt))
2850 				continue;
2851 			skb->dst = dst_clone(&rt->u.dst);
2852 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2853 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2854 					 1, NLM_F_MULTI) <= 0) {
2855 				dst_release(xchg(&skb->dst, NULL));
2856 				rcu_read_unlock_bh();
2857 				goto done;
2858 			}
2859 			dst_release(xchg(&skb->dst, NULL));
2860 		}
2861 		rcu_read_unlock_bh();
2862 		s_idx = 0;
2863 	}
2864 
2865 done:
2866 	cb->args[0] = h;
2867 	cb->args[1] = idx;
2868 	return skb->len;
2869 }
2870 
2871 void ip_rt_multicast_event(struct in_device *in_dev)
2872 {
2873 	rt_cache_flush(dev_net(in_dev->dev), 0);
2874 }
2875 
2876 #ifdef CONFIG_SYSCTL
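/*
 * Writing a flush delay to the per-namespace "flush" sysctl forces a
 * route cache flush via rt_cache_flush(); reading it is not supported.
 */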
2877 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2878 					struct file *filp, void __user *buffer,
2879 					size_t *lenp, loff_t *ppos)
2880 {
2881 	if (write) {
2882 		int flush_delay;
2883 		ctl_table ctl;
2884 		struct net *net;
2885 
2886 		memcpy(&ctl, __ctl, sizeof(ctl));
2887 		ctl.data = &flush_delay;
2888 		proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
2889 
2890 		net = (struct net *)__ctl->extra1;
2891 		rt_cache_flush(net, flush_delay);
2892 		return 0;
2893 	}
2894 
2895 	return -EINVAL;
2896 }
2897 
2898 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2899 						int __user *name,
2900 						int nlen,
2901 						void __user *oldval,
2902 						size_t __user *oldlenp,
2903 						void __user *newval,
2904 						size_t newlen)
2905 {
2906 	int delay;
2907 	struct net *net;
2908 	if (newlen != sizeof(int))
2909 		return -EINVAL;
2910 	if (get_user(delay, (int __user *)newval))
2911 		return -EFAULT;
2912 	net = (struct net *)table->extra1;
2913 	rt_cache_flush(net, delay);
2914 	return 0;
2915 }
2916 
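/*
 * rt_secret_reschedule - called when ip_rt_secret_interval changes: stop
 * each namespace's secret rebuild timer and, if the new interval is
 * non-zero, restart it with the expiry shifted by the difference between
 * the new and old intervals.
 */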
2917 static void rt_secret_reschedule(int old)
2918 {
2919 	struct net *net;
2920 	int new = ip_rt_secret_interval;
2921 	int diff = new - old;
2922 
2923 	if (!diff)
2924 		return;
2925 
2926 	rtnl_lock();
2927 	for_each_net(net) {
2928 		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
2929 
2930 		if (!new)
2931 			continue;
2932 
2933 		if (deleted) {
2934 			long time = net->ipv4.rt_secret_timer.expires - jiffies;
2935 
2936 			if (time <= 0 || (time += diff) <= 0)
2937 				time = 0;
2938 
2939 			net->ipv4.rt_secret_timer.expires = time;
2940 		} else
2941 			net->ipv4.rt_secret_timer.expires = new;
2942 
2943 		net->ipv4.rt_secret_timer.expires += jiffies;
2944 		add_timer(&net->ipv4.rt_secret_timer);
2945 	}
2946 	rtnl_unlock();
2947 }
2948 
2949 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
2950 					  struct file *filp,
2951 					  void __user *buffer, size_t *lenp,
2952 					  loff_t *ppos)
2953 {
2954 	int old = ip_rt_secret_interval;
2955 	int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
2956 
2957 	rt_secret_reschedule(old);
2958 
2959 	return ret;
2960 }
2961 
2962 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
2963 						   int __user *name,
2964 						   int nlen,
2965 						   void __user *oldval,
2966 						   size_t __user *oldlenp,
2967 						   void __user *newval,
2968 						   size_t newlen)
2969 {
2970 	int old = ip_rt_secret_interval;
2971 	int ret = sysctl_jiffies(table, name, nlen, oldval, oldlenp, newval,
2972 				 newlen);
2973 
2974 	rt_secret_reschedule(old);
2975 
2976 	return ret;
2977 }
2978 
2979 static ctl_table ipv4_route_table[] = {
2980 	{
2981 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2982 		.procname	= "gc_thresh",
2983 		.data		= &ipv4_dst_ops.gc_thresh,
2984 		.maxlen		= sizeof(int),
2985 		.mode		= 0644,
2986 		.proc_handler	= &proc_dointvec,
2987 	},
2988 	{
2989 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2990 		.procname	= "max_size",
2991 		.data		= &ip_rt_max_size,
2992 		.maxlen		= sizeof(int),
2993 		.mode		= 0644,
2994 		.proc_handler	= &proc_dointvec,
2995 	},
2996 	{
2997 		/*  Deprecated. Use gc_min_interval_ms */
2998 
2999 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3000 		.procname	= "gc_min_interval",
3001 		.data		= &ip_rt_gc_min_interval,
3002 		.maxlen		= sizeof(int),
3003 		.mode		= 0644,
3004 		.proc_handler	= &proc_dointvec_jiffies,
3005 		.strategy	= &sysctl_jiffies,
3006 	},
3007 	{
3008 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3009 		.procname	= "gc_min_interval_ms",
3010 		.data		= &ip_rt_gc_min_interval,
3011 		.maxlen		= sizeof(int),
3012 		.mode		= 0644,
3013 		.proc_handler	= &proc_dointvec_ms_jiffies,
3014 		.strategy	= &sysctl_ms_jiffies,
3015 	},
3016 	{
3017 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
3018 		.procname	= "gc_timeout",
3019 		.data		= &ip_rt_gc_timeout,
3020 		.maxlen		= sizeof(int),
3021 		.mode		= 0644,
3022 		.proc_handler	= &proc_dointvec_jiffies,
3023 		.strategy	= &sysctl_jiffies,
3024 	},
3025 	{
3026 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
3027 		.procname	= "gc_interval",
3028 		.data		= &ip_rt_gc_interval,
3029 		.maxlen		= sizeof(int),
3030 		.mode		= 0644,
3031 		.proc_handler	= &proc_dointvec_jiffies,
3032 		.strategy	= &sysctl_jiffies,
3033 	},
3034 	{
3035 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
3036 		.procname	= "redirect_load",
3037 		.data		= &ip_rt_redirect_load,
3038 		.maxlen		= sizeof(int),
3039 		.mode		= 0644,
3040 		.proc_handler	= &proc_dointvec,
3041 	},
3042 	{
3043 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
3044 		.procname	= "redirect_number",
3045 		.data		= &ip_rt_redirect_number,
3046 		.maxlen		= sizeof(int),
3047 		.mode		= 0644,
3048 		.proc_handler	= &proc_dointvec,
3049 	},
3050 	{
3051 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
3052 		.procname	= "redirect_silence",
3053 		.data		= &ip_rt_redirect_silence,
3054 		.maxlen		= sizeof(int),
3055 		.mode		= 0644,
3056 		.proc_handler	= &proc_dointvec,
3057 	},
3058 	{
3059 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
3060 		.procname	= "error_cost",
3061 		.data		= &ip_rt_error_cost,
3062 		.maxlen		= sizeof(int),
3063 		.mode		= 0644,
3064 		.proc_handler	= &proc_dointvec,
3065 	},
3066 	{
3067 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3068 		.procname	= "error_burst",
3069 		.data		= &ip_rt_error_burst,
3070 		.maxlen		= sizeof(int),
3071 		.mode		= 0644,
3072 		.proc_handler	= &proc_dointvec,
3073 	},
3074 	{
3075 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3076 		.procname	= "gc_elasticity",
3077 		.data		= &ip_rt_gc_elasticity,
3078 		.maxlen		= sizeof(int),
3079 		.mode		= 0644,
3080 		.proc_handler	= &proc_dointvec,
3081 	},
3082 	{
3083 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3084 		.procname	= "mtu_expires",
3085 		.data		= &ip_rt_mtu_expires,
3086 		.maxlen		= sizeof(int),
3087 		.mode		= 0644,
3088 		.proc_handler	= &proc_dointvec_jiffies,
3089 		.strategy	= &sysctl_jiffies,
3090 	},
3091 	{
3092 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3093 		.procname	= "min_pmtu",
3094 		.data		= &ip_rt_min_pmtu,
3095 		.maxlen		= sizeof(int),
3096 		.mode		= 0644,
3097 		.proc_handler	= &proc_dointvec,
3098 	},
3099 	{
3100 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3101 		.procname	= "min_adv_mss",
3102 		.data		= &ip_rt_min_advmss,
3103 		.maxlen		= sizeof(int),
3104 		.mode		= 0644,
3105 		.proc_handler	= &proc_dointvec,
3106 	},
3107 	{
3108 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3109 		.procname	= "secret_interval",
3110 		.data		= &ip_rt_secret_interval,
3111 		.maxlen		= sizeof(int),
3112 		.mode		= 0644,
3113 		.proc_handler	= &ipv4_sysctl_rt_secret_interval,
3114 		.strategy	= &ipv4_sysctl_rt_secret_interval_strategy,
3115 	},
3116 	{ .ctl_name = 0 }
3117 };
3118 
3119 static struct ctl_table empty[1];
3120 
3121 static struct ctl_table ipv4_skeleton[] =
3122 {
3123 	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3124 	  .mode = 0555, .child = ipv4_route_table},
3125 	{ .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3126 	  .mode = 0555, .child = empty},
3127 	{ }
3128 };
3129 
3130 static __net_initdata struct ctl_path ipv4_path[] = {
3131 	{ .procname = "net", .ctl_name = CTL_NET, },
3132 	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
3133 	{ },
3134 };
3135 
3136 static struct ctl_table ipv4_route_flush_table[] = {
3137 	{
3138 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
3139 		.procname	= "flush",
3140 		.maxlen		= sizeof(int),
3141 		.mode		= 0200,
3142 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
3143 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
3144 	},
3145 	{ .ctl_name = 0 },
3146 };
3147 
3148 static __net_initdata struct ctl_path ipv4_route_path[] = {
3149 	{ .procname = "net", .ctl_name = CTL_NET, },
3150 	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
3151 	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3152 	{ },
3153 };
3154 
3155 static __net_init int sysctl_route_net_init(struct net *net)
3156 {
3157 	struct ctl_table *tbl;
3158 
3159 	tbl = ipv4_route_flush_table;
3160 	if (net != &init_net) {
3161 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3162 		if (tbl == NULL)
3163 			goto err_dup;
3164 	}
3165 	tbl[0].extra1 = net;
3166 
3167 	net->ipv4.route_hdr =
3168 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3169 	if (net->ipv4.route_hdr == NULL)
3170 		goto err_reg;
3171 	return 0;
3172 
3173 err_reg:
3174 	if (tbl != ipv4_route_flush_table)
3175 		kfree(tbl);
3176 err_dup:
3177 	return -ENOMEM;
3178 }
3179 
3180 static __net_exit void sysctl_route_net_exit(struct net *net)
3181 {
3182 	struct ctl_table *tbl;
3183 
3184 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3185 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3186 	BUG_ON(tbl == ipv4_route_flush_table);
3187 	kfree(tbl);
3188 }
3189 
3190 static __net_initdata struct pernet_operations sysctl_route_ops = {
3191 	.init = sysctl_route_net_init,
3192 	.exit = sysctl_route_net_exit,
3193 };
3194 #endif
3195 
3196 
3197 static __net_init int rt_secret_timer_init(struct net *net)
3198 {
3199 	atomic_set(&net->ipv4.rt_genid,
3200 			(int) ((num_physpages ^ (num_physpages>>8)) ^
3201 			(jiffies ^ (jiffies >> 7))));
3202 
3203 	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3204 	net->ipv4.rt_secret_timer.data = (unsigned long)net;
3205 	init_timer_deferrable(&net->ipv4.rt_secret_timer);
3206 
3207 	if (ip_rt_secret_interval) {
3208 		net->ipv4.rt_secret_timer.expires =
3209 			jiffies + net_random() % ip_rt_secret_interval +
3210 			ip_rt_secret_interval;
3211 		add_timer(&net->ipv4.rt_secret_timer);
3212 	}
3213 	return 0;
3214 }
3215 
3216 static __net_exit void rt_secret_timer_exit(struct net *net)
3217 {
3218 	del_timer_sync(&net->ipv4.rt_secret_timer);
3219 }
3220 
3221 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3222 	.init = rt_secret_timer_init,
3223 	.exit = rt_secret_timer_exit,
3224 };
3225 
3226 
3227 #ifdef CONFIG_NET_CLS_ROUTE
3228 struct ip_rt_acct *ip_rt_acct __read_mostly;
3229 #endif /* CONFIG_NET_CLS_ROUTE */
3230 
3231 static __initdata unsigned long rhash_entries;
3232 static int __init set_rhash_entries(char *str)
3233 {
3234 	if (!str)
3235 		return 0;
3236 	rhash_entries = simple_strtoul(str, &str, 0);
3237 	return 1;
3238 }
3239 __setup("rhash_entries=", set_rhash_entries);
3240 
3241 int __init ip_rt_init(void)
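/*
 * ip_rt_init - boot-time initialization of the IPv4 routing layer:
 * allocate the dst slab cache and the route cache hash table (sized from
 * available memory or the rhash_entries= boot parameter), derive
 * gc_thresh and ip_rt_max_size from the table size, initialize devinet
 * and the FIB, start the periodic GC work and the secret rebuild timer,
 * and register the RTM_GETROUTE handler and the routing sysctls.
 */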
3242 {
3243 	int rc = 0;
3244 
3245 #ifdef CONFIG_NET_CLS_ROUTE
3246 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3247 	if (!ip_rt_acct)
3248 		panic("IP: failed to allocate ip_rt_acct\n");
3249 #endif
3250 
3251 	ipv4_dst_ops.kmem_cachep =
3252 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3253 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3254 
3255 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3256 
3257 	rt_hash_table = (struct rt_hash_bucket *)
3258 		alloc_large_system_hash("IP route cache",
3259 					sizeof(struct rt_hash_bucket),
3260 					rhash_entries,
3261 					(num_physpages >= 128 * 1024) ?
3262 					15 : 17,
3263 					0,
3264 					&rt_hash_log,
3265 					&rt_hash_mask,
3266 					0);
3267 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3268 	rt_hash_lock_init();
3269 
3270 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3271 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3272 
3273 	devinet_init();
3274 	ip_fib_init();
3275 
3276 	/* All the timers started at system startup tend
3277 	   to synchronize. Perturb them a bit.
3278 	 */
3279 	schedule_delayed_work(&expires_work,
3280 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3281 
3282 	if (register_pernet_subsys(&rt_secret_timer_ops))
3283 		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3284 
3285 	if (ip_rt_proc_init())
3286 		printk(KERN_ERR "Unable to create route proc files\n");
3287 #ifdef CONFIG_XFRM
3288 	xfrm_init();
3289 	xfrm4_init();
3290 #endif
3291 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3292 
3293 #ifdef CONFIG_SYSCTL
3294 	register_pernet_subsys(&sysctl_route_ops);
3295 #endif
3296 	return rc;
3297 }
3298 
3299 #ifdef CONFIG_SYSCTL
3300 /*
3301  * We really need to sanitize the damn ipv4 init order, then all
3302  * this nonsense will go away.
3303  */
3304 void __init ip_static_sysctl_init(void)
3305 {
3306 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3307 }
3308 #endif
3309 
3310 EXPORT_SYMBOL(__ip_select_ident);
3311 EXPORT_SYMBOL(ip_route_input);
3312 EXPORT_SYMBOL(ip_route_output_key);
3313