xref: /openbmc/linux/net/ipv4/route.c (revision 22246614)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
59  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
60  *
61  *		This program is free software; you can redistribute it and/or
62  *		modify it under the terms of the GNU General Public License
63  *		as published by the Free Software Foundation; either version
64  *		2 of the License, or (at your option) any later version.
65  */
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115 
116 #define IP_MAX_MTU	0xFFF0
117 
118 #define RT_GC_TIMEOUT (300*HZ)
119 
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
123 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
124 static int ip_rt_redirect_number __read_mostly	= 9;
125 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly	= HZ;
128 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly	= 8;
130 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly	= 256;
133 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
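/*
 * Note: most of the knobs above (gc_timeout, gc_interval, min_pmtu,
 * redirect_load, secret_interval, ...) are also exposed as sysctls under
 * /proc/sys/net/ipv4/route/, so the values here are only boot-time
 * defaults; the HZ-scaled ones are expressed in jiffies.
 */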
134 
135 static void rt_worker_func(struct work_struct *work);
136 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
137 static struct timer_list rt_secret_timer;
138 
139 /*
140  *	Interface to generic destination cache.
141  */
142 
143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144 static void		 ipv4_dst_destroy(struct dst_entry *dst);
145 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
146 					 struct net_device *dev, int how);
147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148 static void		 ipv4_link_failure(struct sk_buff *skb);
149 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150 static int rt_garbage_collect(struct dst_ops *ops);
151 
152 
153 static struct dst_ops ipv4_dst_ops = {
154 	.family =		AF_INET,
155 	.protocol =		__constant_htons(ETH_P_IP),
156 	.gc =			rt_garbage_collect,
157 	.check =		ipv4_dst_check,
158 	.destroy =		ipv4_dst_destroy,
159 	.ifdown =		ipv4_dst_ifdown,
160 	.negative_advice =	ipv4_negative_advice,
161 	.link_failure =		ipv4_link_failure,
162 	.update_pmtu =		ip_rt_update_pmtu,
163 	.local_out =		ip_local_out,
164 	.entry_size =		sizeof(struct rtable),
165 	.entries =		ATOMIC_INIT(0),
166 };
167 
168 #define ECN_OR_COST(class)	TC_PRIO_##class
169 
170 const __u8 ip_tos2prio[16] = {
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(FILLER),
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(BESTEFFORT),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK)
187 };
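/*
 * For reference: rt_tos2priority() in include/net/route.h indexes this
 * table as ip_tos2prio[IPTOS_TOS(tos) >> 1], i.e. with the four RFC 1349
 * TOS bits; the odd (ECN_OR_COST) slots correspond to the old "minimize
 * monetary cost" bit, whose position is nowadays shared with ECN.
 */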
188 
189 
190 /*
191  * Route cache.
192  */
193 
194 /* The locking scheme is rather straightforward:
195  *
196  * 1) Read-Copy Update protects the buckets of the central route hash.
197  * 2) Only writers remove entries, and they hold the lock
198  *    as they look at rtable reference counts.
199  * 3) Only readers acquire references to rtable entries,
200  *    they do so with atomic increments and with the
201  *    lock held.
202  */
203 
204 struct rt_hash_bucket {
205 	struct rtable	*chain;
206 };
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
208 	defined(CONFIG_PROVE_LOCKING)
209 /*
210  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
211  * The size of this table is a power of two and depends on the number of CPUs.
212  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
213  */
214 #ifdef CONFIG_LOCKDEP
215 # define RT_HASH_LOCK_SZ	256
216 #else
217 # if NR_CPUS >= 32
218 #  define RT_HASH_LOCK_SZ	4096
219 # elif NR_CPUS >= 16
220 #  define RT_HASH_LOCK_SZ	2048
221 # elif NR_CPUS >= 8
222 #  define RT_HASH_LOCK_SZ	1024
223 # elif NR_CPUS >= 4
224 #  define RT_HASH_LOCK_SZ	512
225 # else
226 #  define RT_HASH_LOCK_SZ	256
227 # endif
228 #endif
229 
230 static spinlock_t	*rt_hash_locks;
231 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
232 
233 static __init void rt_hash_lock_init(void)
234 {
235 	int i;
236 
237 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
238 			GFP_KERNEL);
239 	if (!rt_hash_locks)
240 		panic("IP: failed to allocate rt_hash_locks\n");
241 
242 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
243 		spin_lock_init(&rt_hash_locks[i]);
244 }
245 #else
246 # define rt_hash_lock_addr(slot) NULL
247 
248 static inline void rt_hash_lock_init(void)
249 {
250 }
251 #endif
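/*
 * For illustration: on SMP, bucket i is protected by
 * rt_hash_locks[i & (RT_HASH_LOCK_SZ - 1)], so several buckets share one
 * spinlock (with RT_HASH_LOCK_SZ == 256, buckets 0, 256, 512, ... all map
 * to lock 0).  On UP builds rt_hash_lock_addr() is NULL, and the
 * spin_lock_bh() calls on it merely disable bottom halves.
 */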
252 
253 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
254 static unsigned			rt_hash_mask __read_mostly;
255 static unsigned int		rt_hash_log  __read_mostly;
256 static atomic_t			rt_genid __read_mostly;
257 
258 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
259 #define RT_CACHE_STAT_INC(field) \
260 	(__raw_get_cpu_var(rt_cache_stat).field++)
261 
262 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
263 {
264 	return jhash_3words((__force u32)(__be32)(daddr),
265 			    (__force u32)(__be32)(saddr),
266 			    idx, atomic_read(&rt_genid))
267 		& rt_hash_mask;
268 }
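/*
 * The current rt_genid is mixed in as the jhash initval, so a lookup such
 * as rt_hash(daddr, saddr, dev->ifindex) lands in a different bucket once
 * rt_cache_invalidate() bumps the generation; entries carrying a stale
 * rt_genid simply stop being found and are reaped later by
 * rt_check_expire() and the garbage collector.
 */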
269 
270 #ifdef CONFIG_PROC_FS
271 struct rt_cache_iter_state {
272 	struct seq_net_private p;
273 	int bucket;
274 	int genid;
275 };
276 
277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
278 {
279 	struct rt_cache_iter_state *st = seq->private;
280 	struct rtable *r = NULL;
281 
282 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283 		rcu_read_lock_bh();
284 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
285 		while (r) {
286 			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
287 			    r->rt_genid == st->genid)
288 				return r;
289 			r = rcu_dereference(r->u.dst.rt_next);
290 		}
291 		rcu_read_unlock_bh();
292 	}
293 	return r;
294 }
295 
296 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
297 					  struct rtable *r)
298 {
299 	struct rt_cache_iter_state *st = seq->private;
300 	r = r->u.dst.rt_next;
301 	while (!r) {
302 		rcu_read_unlock_bh();
303 		if (--st->bucket < 0)
304 			break;
305 		rcu_read_lock_bh();
306 		r = rt_hash_table[st->bucket].chain;
307 	}
308 	return rcu_dereference(r);
309 }
310 
311 static struct rtable *rt_cache_get_next(struct seq_file *seq,
312 					struct rtable *r)
313 {
314 	struct rt_cache_iter_state *st = seq->private;
315 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
316 		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
317 			continue;
318 		if (r->rt_genid == st->genid)
319 			break;
320 	}
321 	return r;
322 }
323 
324 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
325 {
326 	struct rtable *r = rt_cache_get_first(seq);
327 
328 	if (r)
329 		while (pos && (r = rt_cache_get_next(seq, r)))
330 			--pos;
331 	return pos ? NULL : r;
332 }
333 
334 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
335 {
336 	struct rt_cache_iter_state *st = seq->private;
337 	if (*pos)
338 		return rt_cache_get_idx(seq, *pos - 1);
339 	st->genid = atomic_read(&rt_genid);
340 	return SEQ_START_TOKEN;
341 }
342 
343 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
344 {
345 	struct rtable *r;
346 
347 	if (v == SEQ_START_TOKEN)
348 		r = rt_cache_get_first(seq);
349 	else
350 		r = rt_cache_get_next(seq, v);
351 	++*pos;
352 	return r;
353 }
354 
355 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
356 {
357 	if (v && v != SEQ_START_TOKEN)
358 		rcu_read_unlock_bh();
359 }
360 
361 static int rt_cache_seq_show(struct seq_file *seq, void *v)
362 {
363 	if (v == SEQ_START_TOKEN)
364 		seq_printf(seq, "%-127s\n",
365 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
366 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
367 			   "HHUptod\tSpecDst");
368 	else {
369 		struct rtable *r = v;
370 		int len;
371 
372 		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
373 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
374 			r->u.dst.dev ? r->u.dst.dev->name : "*",
375 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
376 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
377 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
378 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
379 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
380 			dst_metric(&r->u.dst, RTAX_WINDOW),
381 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
382 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
383 			r->fl.fl4_tos,
384 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
385 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
386 				       dev_queue_xmit) : 0,
387 			r->rt_spec_dst, &len);
388 
389 		seq_printf(seq, "%*s\n", 127 - len, "");
390 	}
391 	return 0;
392 }
393 
394 static const struct seq_operations rt_cache_seq_ops = {
395 	.start  = rt_cache_seq_start,
396 	.next   = rt_cache_seq_next,
397 	.stop   = rt_cache_seq_stop,
398 	.show   = rt_cache_seq_show,
399 };
400 
401 static int rt_cache_seq_open(struct inode *inode, struct file *file)
402 {
403 	return seq_open_net(inode, file, &rt_cache_seq_ops,
404 			sizeof(struct rt_cache_iter_state));
405 }
406 
407 static const struct file_operations rt_cache_seq_fops = {
408 	.owner	 = THIS_MODULE,
409 	.open	 = rt_cache_seq_open,
410 	.read	 = seq_read,
411 	.llseek	 = seq_lseek,
412 	.release = seq_release_net,
413 };
414 
415 
416 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
417 {
418 	int cpu;
419 
420 	if (*pos == 0)
421 		return SEQ_START_TOKEN;
422 
423 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
424 		if (!cpu_possible(cpu))
425 			continue;
426 		*pos = cpu+1;
427 		return &per_cpu(rt_cache_stat, cpu);
428 	}
429 	return NULL;
430 }
431 
432 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
433 {
434 	int cpu;
435 
436 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
437 		if (!cpu_possible(cpu))
438 			continue;
439 		*pos = cpu+1;
440 		return &per_cpu(rt_cache_stat, cpu);
441 	}
442 	return NULL;
443 
444 }
445 
446 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
447 {
448 
449 }
450 
451 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
452 {
453 	struct rt_cache_stat *st = v;
454 
455 	if (v == SEQ_START_TOKEN) {
456 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
457 		return 0;
458 	}
459 
460 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
461 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
462 		   atomic_read(&ipv4_dst_ops.entries),
463 		   st->in_hit,
464 		   st->in_slow_tot,
465 		   st->in_slow_mc,
466 		   st->in_no_route,
467 		   st->in_brd,
468 		   st->in_martian_dst,
469 		   st->in_martian_src,
470 
471 		   st->out_hit,
472 		   st->out_slow_tot,
473 		   st->out_slow_mc,
474 
475 		   st->gc_total,
476 		   st->gc_ignored,
477 		   st->gc_goal_miss,
478 		   st->gc_dst_overflow,
479 		   st->in_hlist_search,
480 		   st->out_hlist_search
481 		);
482 	return 0;
483 }
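/*
 * This backs /proc/net/stat/rt_cache: a header line followed by one line
 * per possible CPU.  The first column ("entries") is the global
 * ipv4_dst_ops.entries counter repeated on every row; the rest are the
 * per-CPU rt_cache_stat counters.
 */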
484 
485 static const struct seq_operations rt_cpu_seq_ops = {
486 	.start  = rt_cpu_seq_start,
487 	.next   = rt_cpu_seq_next,
488 	.stop   = rt_cpu_seq_stop,
489 	.show   = rt_cpu_seq_show,
490 };
491 
492 
493 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
494 {
495 	return seq_open(file, &rt_cpu_seq_ops);
496 }
497 
498 static const struct file_operations rt_cpu_seq_fops = {
499 	.owner	 = THIS_MODULE,
500 	.open	 = rt_cpu_seq_open,
501 	.read	 = seq_read,
502 	.llseek	 = seq_lseek,
503 	.release = seq_release,
504 };
505 
506 #ifdef CONFIG_NET_CLS_ROUTE
507 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
508 			   int length, int *eof, void *data)
509 {
510 	unsigned int i;
511 
512 	if ((offset & 3) || (length & 3))
513 		return -EIO;
514 
515 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
516 		*eof = 1;
517 		return 0;
518 	}
519 
520 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
521 		length = sizeof(struct ip_rt_acct) * 256 - offset;
522 		*eof = 1;
523 	}
524 
525 	offset /= sizeof(u32);
526 
527 	if (length > 0) {
528 		u32 *dst = (u32 *) buffer;
529 
530 		*start = buffer;
531 		memset(dst, 0, length);
532 
533 		for_each_possible_cpu(i) {
534 			unsigned int j;
535 			u32 *src;
536 
537 			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
538 			for (j = 0; j < length/4; j++)
539 				dst[j] += src[j];
540 		}
541 	}
542 	return length;
543 }
544 #endif
545 
546 static int __net_init ip_rt_do_proc_init(struct net *net)
547 {
548 	struct proc_dir_entry *pde;
549 
550 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
551 			&rt_cache_seq_fops);
552 	if (!pde)
553 		goto err1;
554 
555 	pde = proc_create("rt_cache", S_IRUGO,
556 			  net->proc_net_stat, &rt_cpu_seq_fops);
557 	if (!pde)
558 		goto err2;
559 
560 #ifdef CONFIG_NET_CLS_ROUTE
561 	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
562 			ip_rt_acct_read, NULL);
563 	if (!pde)
564 		goto err3;
565 #endif
566 	return 0;
567 
568 #ifdef CONFIG_NET_CLS_ROUTE
569 err3:
570 	remove_proc_entry("rt_cache", net->proc_net_stat);
571 #endif
572 err2:
573 	remove_proc_entry("rt_cache", net->proc_net);
574 err1:
575 	return -ENOMEM;
576 }
577 
578 static void __net_exit ip_rt_do_proc_exit(struct net *net)
579 {
580 	remove_proc_entry("rt_cache", net->proc_net_stat);
581 	remove_proc_entry("rt_cache", net->proc_net);
582 	remove_proc_entry("rt_acct", net->proc_net);
583 }
584 
585 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
586 	.init = ip_rt_do_proc_init,
587 	.exit = ip_rt_do_proc_exit,
588 };
589 
590 static int __init ip_rt_proc_init(void)
591 {
592 	return register_pernet_subsys(&ip_rt_proc_ops);
593 }
594 
595 #else
596 static inline int ip_rt_proc_init(void)
597 {
598 	return 0;
599 }
600 #endif /* CONFIG_PROC_FS */
601 
602 static inline void rt_free(struct rtable *rt)
603 {
604 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
605 }
606 
607 static inline void rt_drop(struct rtable *rt)
608 {
609 	ip_rt_put(rt);
610 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
611 }
612 
613 static inline int rt_fast_clean(struct rtable *rth)
614 {
615 	/* Kill broadcast/multicast entries very aggressively, if they
616 	   collide in the hash table with more useful entries */
617 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
618 		rth->fl.iif && rth->u.dst.rt_next;
619 }
620 
621 static inline int rt_valuable(struct rtable *rth)
622 {
623 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
624 		rth->u.dst.expires;
625 }
626 
627 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
628 {
629 	unsigned long age;
630 	int ret = 0;
631 
632 	if (atomic_read(&rth->u.dst.__refcnt))
633 		goto out;
634 
635 	ret = 1;
636 	if (rth->u.dst.expires &&
637 	    time_after_eq(jiffies, rth->u.dst.expires))
638 		goto out;
639 
640 	age = jiffies - rth->u.dst.lastuse;
641 	ret = 0;
642 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
643 	    (age <= tmo2 && rt_valuable(rth)))
644 		goto out;
645 	ret = 1;
646 out:	return ret;
647 }
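/*
 * In other words: an entry that is still referenced never expires here; an
 * entry whose hard expiry (dst.expires) has passed always may; otherwise it
 * is kept while it is younger than tmo1 and not a fast-clean candidate, or
 * younger than tmo2 and "valuable".  Callers halve the timeout as they walk
 * down a chain, so long chains are pruned more aggressively towards the tail.
 */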
648 
649 /* Bits of score are:
650  * 31: very valuable
651  * 30: not quite useless
652  * 29..0: usage counter
653  */
654 static inline u32 rt_score(struct rtable *rt)
655 {
656 	u32 score = jiffies - rt->u.dst.lastuse;
657 
658 	score = ~score & ~(3<<30);
659 
660 	if (rt_valuable(rt))
661 		score |= (1<<31);
662 
663 	if (!rt->fl.iif ||
664 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
665 		score |= (1<<30);
666 
667 	return score;
668 }
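/*
 * rt_intern_hash() uses this score to pick an eviction candidate: when a
 * chain grows past ip_rt_gc_elasticity, the unreferenced entry with the
 * lowest score is dropped.  Recently used entries score higher in bits
 * 0..29, output or plain unicast forwarding routes get bit 30, and
 * "valuable" entries get bit 31, so those are evicted last.
 */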
669 
670 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
671 {
672 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
673 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
674 		(fl1->mark ^ fl2->mark) |
675 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
676 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
677 		(fl1->oif ^ fl2->oif) |
678 		(fl1->iif ^ fl2->iif)) == 0;
679 }
680 
681 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
682 {
683 	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
684 }
685 
686 /*
687  * Perform a full scan of the hash table and free all entries.
688  * Can be called from a softirq or from process context.
689  * In the latter case, we want to reschedule if necessary.
690  */
691 static void rt_do_flush(int process_context)
692 {
693 	unsigned int i;
694 	struct rtable *rth, *next;
695 
696 	for (i = 0; i <= rt_hash_mask; i++) {
697 		if (process_context && need_resched())
698 			cond_resched();
699 		rth = rt_hash_table[i].chain;
700 		if (!rth)
701 			continue;
702 
703 		spin_lock_bh(rt_hash_lock_addr(i));
704 		rth = rt_hash_table[i].chain;
705 		rt_hash_table[i].chain = NULL;
706 		spin_unlock_bh(rt_hash_lock_addr(i));
707 
708 		for (; rth; rth = next) {
709 			next = rth->u.dst.rt_next;
710 			rt_free(rth);
711 		}
712 	}
713 }
714 
715 static void rt_check_expire(void)
716 {
717 	static unsigned int rover;
718 	unsigned int i = rover, goal;
719 	struct rtable *rth, **rthp;
720 	u64 mult;
721 
722 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
723 	if (ip_rt_gc_timeout > 1)
724 		do_div(mult, ip_rt_gc_timeout);
725 	goal = (unsigned int)mult;
726 	if (goal > rt_hash_mask)
727 		goal = rt_hash_mask + 1;
728 	for (; goal > 0; goal--) {
729 		unsigned long tmo = ip_rt_gc_timeout;
730 
731 		i = (i + 1) & rt_hash_mask;
732 		rthp = &rt_hash_table[i].chain;
733 
734 		if (need_resched())
735 			cond_resched();
736 
737 		if (*rthp == NULL)
738 			continue;
739 		spin_lock_bh(rt_hash_lock_addr(i));
740 		while ((rth = *rthp) != NULL) {
741 			if (rth->rt_genid != atomic_read(&rt_genid)) {
742 				*rthp = rth->u.dst.rt_next;
743 				rt_free(rth);
744 				continue;
745 			}
746 			if (rth->u.dst.expires) {
747 				/* Entry is expired even if it is in use */
748 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
749 					tmo >>= 1;
750 					rthp = &rth->u.dst.rt_next;
751 					continue;
752 				}
753 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
754 				tmo >>= 1;
755 				rthp = &rth->u.dst.rt_next;
756 				continue;
757 			}
758 
759 			/* Cleanup aged off entries. */
760 			*rthp = rth->u.dst.rt_next;
761 			rt_free(rth);
762 		}
763 		spin_unlock_bh(rt_hash_lock_addr(i));
764 	}
765 	rover = i;
766 }
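/*
 * Rough arithmetic: goal = (ip_rt_gc_interval << rt_hash_log) /
 * ip_rt_gc_timeout buckets per run, so with the defaults (60s interval,
 * 300s timeout) each run scans about a fifth of the table; since
 * rt_worker_func() reschedules itself every ip_rt_gc_interval, the whole
 * table is covered roughly once per ip_rt_gc_timeout.
 */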
767 
768 /*
769  * rt_worker_func() is run in process context.
770  * We call rt_check_expire() to scan part of the hash table.
771  */
772 static void rt_worker_func(struct work_struct *work)
773 {
774 	rt_check_expire();
775 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
776 }
777 
778 /*
779  * Perturbation of rt_genid by a small quantity [1..256].
780  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
781  * many times (2^24) without reusing a recent rt_genid.
782  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
783  */
784 static void rt_cache_invalidate(void)
785 {
786 	unsigned char shuffle;
787 
788 	get_random_bytes(&shuffle, sizeof(shuffle));
789 	atomic_add(shuffle + 1U, &rt_genid);
790 }
791 
792 /*
793  * delay < 0  : invalidate cache (fast : entries will be deleted later)
794  * delay >= 0 : invalidate & flush cache (can be long)
795  */
796 void rt_cache_flush(int delay)
797 {
798 	rt_cache_invalidate();
799 	if (delay >= 0)
800 		rt_do_flush(!in_softirq());
801 }
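/*
 * Note that only the sign of "delay" matters here: a negative value just
 * bumps rt_genid and lets stale entries be reaped lazily, while any
 * non-negative value additionally walks and frees the whole hash table via
 * rt_do_flush(), rescheduling as needed when called from process context.
 */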
802 
803 /*
804  * We change rt_genid and let gc do the cleanup
805  */
806 static void rt_secret_rebuild(unsigned long dummy)
807 {
808 	rt_cache_invalidate();
809 	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
810 }
811 
812 /*
813    Short description of GC goals.
814 
815    We want to build an algorithm which keeps the routing cache
816    at some equilibrium point, where the number of aged-off entries
817    is approximately equal to the number of newly generated ones.
818 
819    The current expiration strength is the variable "expire".
820    We try to adjust it dynamically, so that when the network is idle
821    "expire" is large enough to keep plenty of warm entries, and when
822    load increases it shrinks to limit the cache size.
823  */
824 
825 static int rt_garbage_collect(struct dst_ops *ops)
826 {
827 	static unsigned long expire = RT_GC_TIMEOUT;
828 	static unsigned long last_gc;
829 	static int rover;
830 	static int equilibrium;
831 	struct rtable *rth, **rthp;
832 	unsigned long now = jiffies;
833 	int goal;
834 
835 	/*
836 	 * Garbage collection is pretty expensive,
837 	 * so do not run it too frequently.
838 	 */
839 
840 	RT_CACHE_STAT_INC(gc_total);
841 
842 	if (now - last_gc < ip_rt_gc_min_interval &&
843 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
844 		RT_CACHE_STAT_INC(gc_ignored);
845 		goto out;
846 	}
847 
848 	/* Calculate the number of entries we want to expire now. */
849 	goal = atomic_read(&ipv4_dst_ops.entries) -
850 		(ip_rt_gc_elasticity << rt_hash_log);
851 	if (goal <= 0) {
852 		if (equilibrium < ipv4_dst_ops.gc_thresh)
853 			equilibrium = ipv4_dst_ops.gc_thresh;
854 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
855 		if (goal > 0) {
856 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
857 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
858 		}
859 	} else {
860 		/* We are in a dangerous area. Try to reduce the cache really
861 		 * aggressively.
862 		 */
863 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
864 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
865 	}
866 
867 	if (now - last_gc >= ip_rt_gc_min_interval)
868 		last_gc = now;
869 
870 	if (goal <= 0) {
871 		equilibrium += goal;
872 		goto work_done;
873 	}
874 
875 	do {
876 		int i, k;
877 
878 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
879 			unsigned long tmo = expire;
880 
881 			k = (k + 1) & rt_hash_mask;
882 			rthp = &rt_hash_table[k].chain;
883 			spin_lock_bh(rt_hash_lock_addr(k));
884 			while ((rth = *rthp) != NULL) {
885 				if (rth->rt_genid == atomic_read(&rt_genid) &&
886 					!rt_may_expire(rth, tmo, expire)) {
887 					tmo >>= 1;
888 					rthp = &rth->u.dst.rt_next;
889 					continue;
890 				}
891 				*rthp = rth->u.dst.rt_next;
892 				rt_free(rth);
893 				goal--;
894 			}
895 			spin_unlock_bh(rt_hash_lock_addr(k));
896 			if (goal <= 0)
897 				break;
898 		}
899 		rover = k;
900 
901 		if (goal <= 0)
902 			goto work_done;
903 
904 		/* The goal was not achieved. We stop the process if:
905 
906 		   - expire has been reduced to zero (otherwise expire is halved);
907 		   - the table is not full;
908 		   - we are called from softirq context;
909 		   - the jiffies check is just a fallback/debug loop breaker,
910 		     we will not spin here for a long time in any case.
911 		 */
912 
913 		RT_CACHE_STAT_INC(gc_goal_miss);
914 
915 		if (expire == 0)
916 			break;
917 
918 		expire >>= 1;
919 #if RT_CACHE_DEBUG >= 2
920 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
921 				atomic_read(&ipv4_dst_ops.entries), goal, i);
922 #endif
923 
924 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
925 			goto out;
926 	} while (!in_softirq() && time_before_eq(jiffies, now));
927 
928 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
929 		goto out;
930 	if (net_ratelimit())
931 		printk(KERN_WARNING "dst cache overflow\n");
932 	RT_CACHE_STAT_INC(gc_dst_overflow);
933 	return 1;
934 
935 work_done:
936 	expire += ip_rt_gc_min_interval;
937 	if (expire > ip_rt_gc_timeout ||
938 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
939 		expire = ip_rt_gc_timeout;
940 #if RT_CACHE_DEBUG >= 2
941 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
942 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
943 #endif
944 out:	return 0;
945 }
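/*
 * A rough reading of the goal computation above: with the default
 * ip_rt_gc_elasticity of 8 the primary target is to stay at or below about
 * 8 entries per hash bucket (8 << rt_hash_log); below that, the fallback
 * target is the gc_thresh based equilibrium.  "expire" is halved on every
 * pass that misses its goal, so pruning gets progressively more aggressive,
 * and is relaxed again once the goal is met.
 */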
946 
947 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
948 {
949 	struct rtable	*rth, **rthp;
950 	unsigned long	now;
951 	struct rtable *cand, **candp;
952 	u32 		min_score;
953 	int		chain_length;
954 	int attempts = !in_softirq();
955 
956 restart:
957 	chain_length = 0;
958 	min_score = ~(u32)0;
959 	cand = NULL;
960 	candp = NULL;
961 	now = jiffies;
962 
963 	rthp = &rt_hash_table[hash].chain;
964 
965 	spin_lock_bh(rt_hash_lock_addr(hash));
966 	while ((rth = *rthp) != NULL) {
967 		if (rth->rt_genid != atomic_read(&rt_genid)) {
968 			*rthp = rth->u.dst.rt_next;
969 			rt_free(rth);
970 			continue;
971 		}
972 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
973 			/* Put it first */
974 			*rthp = rth->u.dst.rt_next;
975 			/*
976 			 * Since lookup is lockfree, the deletion
977 			 * must be visible to another weakly ordered CPU before
978 			 * the insertion at the start of the hash chain.
979 			 */
980 			rcu_assign_pointer(rth->u.dst.rt_next,
981 					   rt_hash_table[hash].chain);
982 			/*
983 			 * Since lookup is lockfree, the update writes
984 			 * must be ordered for consistency on SMP.
985 			 */
986 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
987 
988 			dst_use(&rth->u.dst, now);
989 			spin_unlock_bh(rt_hash_lock_addr(hash));
990 
991 			rt_drop(rt);
992 			*rp = rth;
993 			return 0;
994 		}
995 
996 		if (!atomic_read(&rth->u.dst.__refcnt)) {
997 			u32 score = rt_score(rth);
998 
999 			if (score <= min_score) {
1000 				cand = rth;
1001 				candp = rthp;
1002 				min_score = score;
1003 			}
1004 		}
1005 
1006 		chain_length++;
1007 
1008 		rthp = &rth->u.dst.rt_next;
1009 	}
1010 
1011 	if (cand) {
1012 		/* ip_rt_gc_elasticity used to be the average chain length;
1013 		 * when it is exceeded, gc becomes really aggressive.
1014 		 *
1015 		 * The second limit is less certain. At the moment it allows
1016 		 * only 2 entries per bucket. We will see.
1017 		 */
1018 		if (chain_length > ip_rt_gc_elasticity) {
1019 			*candp = cand->u.dst.rt_next;
1020 			rt_free(cand);
1021 		}
1022 	}
1023 
1024 	/* Try to bind the route to an arp neighbour only if it is an output
1025 	   route or on the unicast forwarding path.
1026 	 */
1027 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1028 		int err = arp_bind_neighbour(&rt->u.dst);
1029 		if (err) {
1030 			spin_unlock_bh(rt_hash_lock_addr(hash));
1031 
1032 			if (err != -ENOBUFS) {
1033 				rt_drop(rt);
1034 				return err;
1035 			}
1036 
1037 			/* The neighbour tables are full and nothing
1038 			   can be released. Try to shrink the route cache;
1039 			   it most likely holds some neighbour records.
1040 			 */
1041 			if (attempts-- > 0) {
1042 				int saved_elasticity = ip_rt_gc_elasticity;
1043 				int saved_int = ip_rt_gc_min_interval;
1044 				ip_rt_gc_elasticity	= 1;
1045 				ip_rt_gc_min_interval	= 0;
1046 				rt_garbage_collect(&ipv4_dst_ops);
1047 				ip_rt_gc_min_interval	= saved_int;
1048 				ip_rt_gc_elasticity	= saved_elasticity;
1049 				goto restart;
1050 			}
1051 
1052 			if (net_ratelimit())
1053 				printk(KERN_WARNING "Neighbour table overflow.\n");
1054 			rt_drop(rt);
1055 			return -ENOBUFS;
1056 		}
1057 	}
1058 
1059 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1060 #if RT_CACHE_DEBUG >= 2
1061 	if (rt->u.dst.rt_next) {
1062 		struct rtable *trt;
1063 		printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1064 		       NIPQUAD(rt->rt_dst));
1065 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1066 			printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1067 		printk("\n");
1068 	}
1069 #endif
1070 	rt_hash_table[hash].chain = rt;
1071 	spin_unlock_bh(rt_hash_lock_addr(hash));
1072 	*rp = rt;
1073 	return 0;
1074 }
1075 
1076 void rt_bind_peer(struct rtable *rt, int create)
1077 {
1078 	static DEFINE_SPINLOCK(rt_peer_lock);
1079 	struct inet_peer *peer;
1080 
1081 	peer = inet_getpeer(rt->rt_dst, create);
1082 
1083 	spin_lock_bh(&rt_peer_lock);
1084 	if (rt->peer == NULL) {
1085 		rt->peer = peer;
1086 		peer = NULL;
1087 	}
1088 	spin_unlock_bh(&rt_peer_lock);
1089 	if (peer)
1090 		inet_putpeer(peer);
1091 }
1092 
1093 /*
1094  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1095  * we can still generate some output.
1096  * Random ID selection looks a bit dangerous because we have no chance of
1097  * selecting an ID that stays unique for a reasonable period of time.
1098  * But a broken packet identifier may be better than no packet at all.
1099  */
1100 static void ip_select_fb_ident(struct iphdr *iph)
1101 {
1102 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1103 	static u32 ip_fallback_id;
1104 	u32 salt;
1105 
1106 	spin_lock_bh(&ip_fb_id_lock);
1107 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1108 	iph->id = htons(salt & 0xFFFF);
1109 	ip_fallback_id = salt;
1110 	spin_unlock_bh(&ip_fb_id_lock);
1111 }
1112 
1113 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1114 {
1115 	struct rtable *rt = (struct rtable *) dst;
1116 
1117 	if (rt) {
1118 		if (rt->peer == NULL)
1119 			rt_bind_peer(rt, 1);
1120 
1121 		/* Once a peer is attached to a destination, it is never detached,
1122 		   so we need not grab a lock to dereference it.
1123 		 */
1124 		if (rt->peer) {
1125 			iph->id = htons(inet_getid(rt->peer, more));
1126 			return;
1127 		}
1128 	} else
1129 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1130 		       __builtin_return_address(0));
1131 
1132 	ip_select_fb_ident(iph);
1133 }
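/*
 * Summary of the ID selection above: when the route has (or can bind) an
 * inet_peer, IP IDs come from that peer's per-destination counter via
 * inet_getid(); only when no peer can be allocated do we fall back to the
 * salted ip_select_fb_ident() generator.
 */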
1134 
1135 static void rt_del(unsigned hash, struct rtable *rt)
1136 {
1137 	struct rtable **rthp, *aux;
1138 
1139 	rthp = &rt_hash_table[hash].chain;
1140 	spin_lock_bh(rt_hash_lock_addr(hash));
1141 	ip_rt_put(rt);
1142 	while ((aux = *rthp) != NULL) {
1143 		if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1144 			*rthp = aux->u.dst.rt_next;
1145 			rt_free(aux);
1146 			continue;
1147 		}
1148 		rthp = &aux->u.dst.rt_next;
1149 	}
1150 	spin_unlock_bh(rt_hash_lock_addr(hash));
1151 }
1152 
1153 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1154 		    __be32 saddr, struct net_device *dev)
1155 {
1156 	int i, k;
1157 	struct in_device *in_dev = in_dev_get(dev);
1158 	struct rtable *rth, **rthp;
1159 	__be32  skeys[2] = { saddr, 0 };
1160 	int  ikeys[2] = { dev->ifindex, 0 };
1161 	struct netevent_redirect netevent;
1162 	struct net *net;
1163 
1164 	if (!in_dev)
1165 		return;
1166 
1167 	net = dev_net(dev);
1168 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1169 	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1170 	    || ipv4_is_zeronet(new_gw))
1171 		goto reject_redirect;
1172 
1173 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1174 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1175 			goto reject_redirect;
1176 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1177 			goto reject_redirect;
1178 	} else {
1179 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1180 			goto reject_redirect;
1181 	}
1182 
1183 	for (i = 0; i < 2; i++) {
1184 		for (k = 0; k < 2; k++) {
1185 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1186 
1187 			rthp=&rt_hash_table[hash].chain;
1188 
1189 			rcu_read_lock();
1190 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1191 				struct rtable *rt;
1192 
1193 				if (rth->fl.fl4_dst != daddr ||
1194 				    rth->fl.fl4_src != skeys[i] ||
1195 				    rth->fl.oif != ikeys[k] ||
1196 				    rth->fl.iif != 0 ||
1197 				    rth->rt_genid != atomic_read(&rt_genid) ||
1198 				    !net_eq(dev_net(rth->u.dst.dev), net)) {
1199 					rthp = &rth->u.dst.rt_next;
1200 					continue;
1201 				}
1202 
1203 				if (rth->rt_dst != daddr ||
1204 				    rth->rt_src != saddr ||
1205 				    rth->u.dst.error ||
1206 				    rth->rt_gateway != old_gw ||
1207 				    rth->u.dst.dev != dev)
1208 					break;
1209 
1210 				dst_hold(&rth->u.dst);
1211 				rcu_read_unlock();
1212 
1213 				rt = dst_alloc(&ipv4_dst_ops);
1214 				if (rt == NULL) {
1215 					ip_rt_put(rth);
1216 					in_dev_put(in_dev);
1217 					return;
1218 				}
1219 
1220 				/* Copy all the information. */
1221 				*rt = *rth;
1222 				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1223 				rt->u.dst.__use		= 1;
1224 				atomic_set(&rt->u.dst.__refcnt, 1);
1225 				rt->u.dst.child		= NULL;
1226 				if (rt->u.dst.dev)
1227 					dev_hold(rt->u.dst.dev);
1228 				if (rt->idev)
1229 					in_dev_hold(rt->idev);
1230 				rt->u.dst.obsolete	= 0;
1231 				rt->u.dst.lastuse	= jiffies;
1232 				rt->u.dst.path		= &rt->u.dst;
1233 				rt->u.dst.neighbour	= NULL;
1234 				rt->u.dst.hh		= NULL;
1235 				rt->u.dst.xfrm		= NULL;
1236 				rt->rt_genid		= atomic_read(&rt_genid);
1237 				rt->rt_flags		|= RTCF_REDIRECTED;
1238 
1239 				/* Gateway is different ... */
1240 				rt->rt_gateway		= new_gw;
1241 
1242 				/* Redirect received -> path was valid */
1243 				dst_confirm(&rth->u.dst);
1244 
1245 				if (rt->peer)
1246 					atomic_inc(&rt->peer->refcnt);
1247 
1248 				if (arp_bind_neighbour(&rt->u.dst) ||
1249 				    !(rt->u.dst.neighbour->nud_state &
1250 					    NUD_VALID)) {
1251 					if (rt->u.dst.neighbour)
1252 						neigh_event_send(rt->u.dst.neighbour, NULL);
1253 					ip_rt_put(rth);
1254 					rt_drop(rt);
1255 					goto do_next;
1256 				}
1257 
1258 				netevent.old = &rth->u.dst;
1259 				netevent.new = &rt->u.dst;
1260 				call_netevent_notifiers(NETEVENT_REDIRECT,
1261 							&netevent);
1262 
1263 				rt_del(hash, rth);
1264 				if (!rt_intern_hash(hash, rt, &rt))
1265 					ip_rt_put(rt);
1266 				goto do_next;
1267 			}
1268 			rcu_read_unlock();
1269 		do_next:
1270 			;
1271 		}
1272 	}
1273 	in_dev_put(in_dev);
1274 	return;
1275 
1276 reject_redirect:
1277 #ifdef CONFIG_IP_ROUTE_VERBOSE
1278 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1279 		printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1280 			NIPQUAD_FMT " ignored.\n"
1281 			"  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1282 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1283 		       NIPQUAD(saddr), NIPQUAD(daddr));
1284 #endif
1285 	in_dev_put(in_dev);
1286 }
1287 
1288 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1289 {
1290 	struct rtable *rt = (struct rtable *)dst;
1291 	struct dst_entry *ret = dst;
1292 
1293 	if (rt) {
1294 		if (dst->obsolete) {
1295 			ip_rt_put(rt);
1296 			ret = NULL;
1297 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1298 			   rt->u.dst.expires) {
1299 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1300 						rt->fl.oif);
1301 #if RT_CACHE_DEBUG >= 1
1302 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1303 					  NIPQUAD_FMT "/%02x dropped\n",
1304 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1305 #endif
1306 			rt_del(hash, rt);
1307 			ret = NULL;
1308 		}
1309 	}
1310 	return ret;
1311 }
1312 
1313 /*
1314  * Algorithm:
1315  *	1. The first ip_rt_redirect_number redirects are sent
1316  *	   with exponential backoff, then we stop sending them at all,
1317  *	   assuming that the host ignores our redirects.
1318  *	2. If we did not see packets requiring redirects
1319  *	   during ip_rt_redirect_silence, we assume that the host
1320  *	   forgot the redirected route and start sending redirects again.
1321  *
1322  * This algorithm is much cheaper and more intelligent than dumb load limiting
1323  * in icmp.c.
1324  *
1325  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1326  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1327  */
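/*
 * With the default tunables this works out roughly as follows: successive
 * redirects towards one host are spaced out exponentially
 * (ip_rt_redirect_load << rate_tokens, i.e. about 20ms, 40ms, 80ms, ...),
 * at most ip_rt_redirect_number (9) are sent, and ip_rt_redirect_silence
 * (about 20 seconds with the (HZ / 50) << 10 default) without offending
 * packets resets the backoff.
 */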
1328 
1329 void ip_rt_send_redirect(struct sk_buff *skb)
1330 {
1331 	struct rtable *rt = skb->rtable;
1332 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1333 
1334 	if (!in_dev)
1335 		return;
1336 
1337 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1338 		goto out;
1339 
1340 	/* No redirected packets during ip_rt_redirect_silence;
1341 	 * reset the algorithm.
1342 	 */
1343 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1344 		rt->u.dst.rate_tokens = 0;
1345 
1346 	/* Too many ignored redirects; do not send anything.
1347 	 * Set u.dst.rate_last to the last seen redirected packet.
1348 	 */
1349 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1350 		rt->u.dst.rate_last = jiffies;
1351 		goto out;
1352 	}
1353 
1354 	/* Check for load limit; set rate_last to the latest sent
1355 	 * redirect.
1356 	 */
1357 	if (rt->u.dst.rate_tokens == 0 ||
1358 	    time_after(jiffies,
1359 		       (rt->u.dst.rate_last +
1360 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1361 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1362 		rt->u.dst.rate_last = jiffies;
1363 		++rt->u.dst.rate_tokens;
1364 #ifdef CONFIG_IP_ROUTE_VERBOSE
1365 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1366 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1367 		    net_ratelimit())
1368 			printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1369 				"redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1370 				NIPQUAD(rt->rt_src), rt->rt_iif,
1371 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1372 #endif
1373 	}
1374 out:
1375 	in_dev_put(in_dev);
1376 }
1377 
1378 static int ip_error(struct sk_buff *skb)
1379 {
1380 	struct rtable *rt = skb->rtable;
1381 	unsigned long now;
1382 	int code;
1383 
1384 	switch (rt->u.dst.error) {
1385 		case EINVAL:
1386 		default:
1387 			goto out;
1388 		case EHOSTUNREACH:
1389 			code = ICMP_HOST_UNREACH;
1390 			break;
1391 		case ENETUNREACH:
1392 			code = ICMP_NET_UNREACH;
1393 			IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1394 			break;
1395 		case EACCES:
1396 			code = ICMP_PKT_FILTERED;
1397 			break;
1398 	}
1399 
1400 	now = jiffies;
1401 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1402 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1403 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1404 	rt->u.dst.rate_last = now;
1405 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1406 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1407 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1408 	}
1409 
1410 out:	kfree_skb(skb);
1411 	return 0;
1412 }
1413 
1414 /*
1415  *	The last two values are not from the RFC but
1416  *	are needed for AMPRnet AX.25 paths.
1417  */
1418 
1419 static const unsigned short mtu_plateau[] =
1420 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1421 
1422 static inline unsigned short guess_mtu(unsigned short old_mtu)
1423 {
1424 	int i;
1425 
1426 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1427 		if (old_mtu > mtu_plateau[i])
1428 			return mtu_plateau[i];
1429 	return 68;
1430 }
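/*
 * Example: an offending datagram of 1500 bytes guesses a plateau of 1492,
 * one of 1006 bytes guesses 576, and anything at or below 128 falls
 * through to 68, the minimum MTU every IPv4 link must support (RFC 791).
 */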
1431 
1432 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1433 				 unsigned short new_mtu,
1434 				 struct net_device *dev)
1435 {
1436 	int i, k;
1437 	unsigned short old_mtu = ntohs(iph->tot_len);
1438 	struct rtable *rth;
1439 	int  ikeys[2] = { dev->ifindex, 0 };
1440 	__be32  skeys[2] = { iph->saddr, 0, };
1441 	__be32  daddr = iph->daddr;
1442 	unsigned short est_mtu = 0;
1443 
1444 	if (ipv4_config.no_pmtu_disc)
1445 		return 0;
1446 
1447 	for (k = 0; k < 2; k++) {
1448 		for (i = 0; i < 2; i++) {
1449 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1450 
1451 			rcu_read_lock();
1452 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1453 			     rth = rcu_dereference(rth->u.dst.rt_next)) {
1454 				unsigned short mtu = new_mtu;
1455 
1456 				if (rth->fl.fl4_dst != daddr ||
1457 				    rth->fl.fl4_src != skeys[i] ||
1458 				    rth->rt_dst != daddr ||
1459 				    rth->rt_src != iph->saddr ||
1460 				    rth->fl.oif != ikeys[k] ||
1461 				    rth->fl.iif != 0 ||
1462 				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1463 				    !net_eq(dev_net(rth->u.dst.dev), net) ||
1464 				    rth->rt_genid != atomic_read(&rt_genid))
1465 					continue;
1466 
1467 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1468 
1469 					/* BSD 4.2 compatibility hack :-( */
1470 					if (mtu == 0 &&
1471 					    old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
1472 					    old_mtu >= 68 + (iph->ihl << 2))
1473 						old_mtu -= iph->ihl << 2;
1474 
1475 					mtu = guess_mtu(old_mtu);
1476 				}
1477 				if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
1478 					if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
1479 						dst_confirm(&rth->u.dst);
1480 						if (mtu < ip_rt_min_pmtu) {
1481 							mtu = ip_rt_min_pmtu;
1482 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1483 								(1 << RTAX_MTU);
1484 						}
1485 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1486 						dst_set_expires(&rth->u.dst,
1487 							ip_rt_mtu_expires);
1488 					}
1489 					est_mtu = mtu;
1490 				}
1491 			}
1492 			rcu_read_unlock();
1493 		}
1494 	}
1495 	return est_mtu ? : new_mtu;
1496 }
1497 
1498 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1499 {
1500 	if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
1501 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1502 		if (mtu < ip_rt_min_pmtu) {
1503 			mtu = ip_rt_min_pmtu;
1504 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1505 		}
1506 		dst->metrics[RTAX_MTU-1] = mtu;
1507 		dst_set_expires(dst, ip_rt_mtu_expires);
1508 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1509 	}
1510 }
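/*
 * For example, a learned PMTU of 300 is raised to ip_rt_min_pmtu
 * (512 + 20 + 20 = 552 by default) and the MTU metric is locked so that
 * later PMTU reports do not change it; in either case the new value is
 * given a lifetime of ip_rt_mtu_expires (10 minutes by default), after
 * which the cached route expires and the path MTU reverts to the
 * interface MTU.
 */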
1511 
1512 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1513 {
1514 	return NULL;
1515 }
1516 
1517 static void ipv4_dst_destroy(struct dst_entry *dst)
1518 {
1519 	struct rtable *rt = (struct rtable *) dst;
1520 	struct inet_peer *peer = rt->peer;
1521 	struct in_device *idev = rt->idev;
1522 
1523 	if (peer) {
1524 		rt->peer = NULL;
1525 		inet_putpeer(peer);
1526 	}
1527 
1528 	if (idev) {
1529 		rt->idev = NULL;
1530 		in_dev_put(idev);
1531 	}
1532 }
1533 
1534 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1535 			    int how)
1536 {
1537 	struct rtable *rt = (struct rtable *) dst;
1538 	struct in_device *idev = rt->idev;
1539 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1540 		struct in_device *loopback_idev =
1541 			in_dev_get(dev_net(dev)->loopback_dev);
1542 		if (loopback_idev) {
1543 			rt->idev = loopback_idev;
1544 			in_dev_put(idev);
1545 		}
1546 	}
1547 }
1548 
1549 static void ipv4_link_failure(struct sk_buff *skb)
1550 {
1551 	struct rtable *rt;
1552 
1553 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1554 
1555 	rt = skb->rtable;
1556 	if (rt)
1557 		dst_set_expires(&rt->u.dst, 0);
1558 }
1559 
1560 static int ip_rt_bug(struct sk_buff *skb)
1561 {
1562 	printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1563 		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1564 		skb->dev ? skb->dev->name : "?");
1565 	kfree_skb(skb);
1566 	return 0;
1567 }
1568 
1569 /*
1570    We do not cache the source address of the outgoing interface,
1571    because it is used only by IP RR, TS and SRR options,
1572    so it is out of the fast path.
1573 
1574    BTW remember: "addr" is allowed to be unaligned
1575    in IP options!
1576  */
1577 
1578 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1579 {
1580 	__be32 src;
1581 	struct fib_result res;
1582 
1583 	if (rt->fl.iif == 0)
1584 		src = rt->rt_src;
1585 	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1586 		src = FIB_RES_PREFSRC(res);
1587 		fib_res_put(&res);
1588 	} else
1589 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1590 					RT_SCOPE_UNIVERSE);
1591 	memcpy(addr, &src, 4);
1592 }
1593 
1594 #ifdef CONFIG_NET_CLS_ROUTE
1595 static void set_class_tag(struct rtable *rt, u32 tag)
1596 {
1597 	if (!(rt->u.dst.tclassid & 0xFFFF))
1598 		rt->u.dst.tclassid |= tag & 0xFFFF;
1599 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1600 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1601 }
1602 #endif
1603 
1604 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1605 {
1606 	struct fib_info *fi = res->fi;
1607 
1608 	if (fi) {
1609 		if (FIB_RES_GW(*res) &&
1610 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1611 			rt->rt_gateway = FIB_RES_GW(*res);
1612 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1613 		       sizeof(rt->u.dst.metrics));
1614 		if (fi->fib_mtu == 0) {
1615 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1616 			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1617 			    rt->rt_gateway != rt->rt_dst &&
1618 			    rt->u.dst.dev->mtu > 576)
1619 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1620 		}
1621 #ifdef CONFIG_NET_CLS_ROUTE
1622 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1623 #endif
1624 	} else
1625 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1626 
1627 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1628 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1629 	if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
1630 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1631 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1632 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1633 				       ip_rt_min_advmss);
1634 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1635 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1636 
1637 #ifdef CONFIG_NET_CLS_ROUTE
1638 #ifdef CONFIG_IP_MULTIPLE_TABLES
1639 	set_class_tag(rt, fib_rules_tclass(res));
1640 #endif
1641 	set_class_tag(rt, itag);
1642 #endif
1643 	rt->rt_type = res->type;
1644 }
1645 
1646 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1647 				u8 tos, struct net_device *dev, int our)
1648 {
1649 	unsigned hash;
1650 	struct rtable *rth;
1651 	__be32 spec_dst;
1652 	struct in_device *in_dev = in_dev_get(dev);
1653 	u32 itag = 0;
1654 
1655 	/* Primary sanity checks. */
1656 
1657 	if (in_dev == NULL)
1658 		return -EINVAL;
1659 
1660 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1661 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1662 		goto e_inval;
1663 
1664 	if (ipv4_is_zeronet(saddr)) {
1665 		if (!ipv4_is_local_multicast(daddr))
1666 			goto e_inval;
1667 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1668 	} else if (fib_validate_source(saddr, 0, tos, 0,
1669 					dev, &spec_dst, &itag) < 0)
1670 		goto e_inval;
1671 
1672 	rth = dst_alloc(&ipv4_dst_ops);
1673 	if (!rth)
1674 		goto e_nobufs;
1675 
1676 	rth->u.dst.output= ip_rt_bug;
1677 
1678 	atomic_set(&rth->u.dst.__refcnt, 1);
1679 	rth->u.dst.flags= DST_HOST;
1680 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1681 		rth->u.dst.flags |= DST_NOPOLICY;
1682 	rth->fl.fl4_dst	= daddr;
1683 	rth->rt_dst	= daddr;
1684 	rth->fl.fl4_tos	= tos;
1685 	rth->fl.mark    = skb->mark;
1686 	rth->fl.fl4_src	= saddr;
1687 	rth->rt_src	= saddr;
1688 #ifdef CONFIG_NET_CLS_ROUTE
1689 	rth->u.dst.tclassid = itag;
1690 #endif
1691 	rth->rt_iif	=
1692 	rth->fl.iif	= dev->ifindex;
1693 	rth->u.dst.dev	= init_net.loopback_dev;
1694 	dev_hold(rth->u.dst.dev);
1695 	rth->idev	= in_dev_get(rth->u.dst.dev);
1696 	rth->fl.oif	= 0;
1697 	rth->rt_gateway	= daddr;
1698 	rth->rt_spec_dst= spec_dst;
1699 	rth->rt_genid	= atomic_read(&rt_genid);
1700 	rth->rt_flags	= RTCF_MULTICAST;
1701 	rth->rt_type	= RTN_MULTICAST;
1702 	if (our) {
1703 		rth->u.dst.input= ip_local_deliver;
1704 		rth->rt_flags |= RTCF_LOCAL;
1705 	}
1706 
1707 #ifdef CONFIG_IP_MROUTE
1708 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1709 		rth->u.dst.input = ip_mr_input;
1710 #endif
1711 	RT_CACHE_STAT_INC(in_slow_mc);
1712 
1713 	in_dev_put(in_dev);
1714 	hash = rt_hash(daddr, saddr, dev->ifindex);
1715 	return rt_intern_hash(hash, rth, &skb->rtable);
1716 
1717 e_nobufs:
1718 	in_dev_put(in_dev);
1719 	return -ENOBUFS;
1720 
1721 e_inval:
1722 	in_dev_put(in_dev);
1723 	return -EINVAL;
1724 }
1725 
1726 
1727 static void ip_handle_martian_source(struct net_device *dev,
1728 				     struct in_device *in_dev,
1729 				     struct sk_buff *skb,
1730 				     __be32 daddr,
1731 				     __be32 saddr)
1732 {
1733 	RT_CACHE_STAT_INC(in_martian_src);
1734 #ifdef CONFIG_IP_ROUTE_VERBOSE
1735 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1736 		/*
1737 		 *	RFC 1812 recommendation: if the source is martian,
1738 		 *	the only hint is the MAC header.
1739 		 */
1740 		printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1741 			NIPQUAD_FMT", on dev %s\n",
1742 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1743 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1744 			int i;
1745 			const unsigned char *p = skb_mac_header(skb);
1746 			printk(KERN_WARNING "ll header: ");
1747 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1748 				printk("%02x", *p);
1749 				if (i < (dev->hard_header_len - 1))
1750 					printk(":");
1751 			}
1752 			printk("\n");
1753 		}
1754 	}
1755 #endif
1756 }
1757 
1758 static int __mkroute_input(struct sk_buff *skb,
1759 			   struct fib_result *res,
1760 			   struct in_device *in_dev,
1761 			   __be32 daddr, __be32 saddr, u32 tos,
1762 			   struct rtable **result)
1763 {
1764 
1765 	struct rtable *rth;
1766 	int err;
1767 	struct in_device *out_dev;
1768 	unsigned flags = 0;
1769 	__be32 spec_dst;
1770 	u32 itag;
1771 
1772 	/* get a working reference to the output device */
1773 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1774 	if (out_dev == NULL) {
1775 		if (net_ratelimit())
1776 			printk(KERN_CRIT "Bug in ip_route_input" \
1777 			       "_slow(). Please, report\n");
1778 		return -EINVAL;
1779 	}
1780 
1781 
1782 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1783 				  in_dev->dev, &spec_dst, &itag);
1784 	if (err < 0) {
1785 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1786 					 saddr);
1787 
1788 		err = -EINVAL;
1789 		goto cleanup;
1790 	}
1791 
1792 	if (err)
1793 		flags |= RTCF_DIRECTSRC;
1794 
1795 	if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1796 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1797 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1798 		flags |= RTCF_DOREDIRECT;
1799 
1800 	if (skb->protocol != htons(ETH_P_IP)) {
1801 		/* Not IP (i.e. ARP). Do not create a route if it is
1802 		 * invalid for proxy arp. DNAT routes are always valid.
1803 		 */
1804 		if (out_dev == in_dev) {
1805 			err = -EINVAL;
1806 			goto cleanup;
1807 		}
1808 	}
1809 
1810 
1811 	rth = dst_alloc(&ipv4_dst_ops);
1812 	if (!rth) {
1813 		err = -ENOBUFS;
1814 		goto cleanup;
1815 	}
1816 
1817 	atomic_set(&rth->u.dst.__refcnt, 1);
1818 	rth->u.dst.flags= DST_HOST;
1819 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1820 		rth->u.dst.flags |= DST_NOPOLICY;
1821 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1822 		rth->u.dst.flags |= DST_NOXFRM;
1823 	rth->fl.fl4_dst	= daddr;
1824 	rth->rt_dst	= daddr;
1825 	rth->fl.fl4_tos	= tos;
1826 	rth->fl.mark    = skb->mark;
1827 	rth->fl.fl4_src	= saddr;
1828 	rth->rt_src	= saddr;
1829 	rth->rt_gateway	= daddr;
1830 	rth->rt_iif 	=
1831 		rth->fl.iif	= in_dev->dev->ifindex;
1832 	rth->u.dst.dev	= (out_dev)->dev;
1833 	dev_hold(rth->u.dst.dev);
1834 	rth->idev	= in_dev_get(rth->u.dst.dev);
1835 	rth->fl.oif 	= 0;
1836 	rth->rt_spec_dst= spec_dst;
1837 
1838 	rth->u.dst.input = ip_forward;
1839 	rth->u.dst.output = ip_output;
1840 	rth->rt_genid = atomic_read(&rt_genid);
1841 
1842 	rt_set_nexthop(rth, res, itag);
1843 
1844 	rth->rt_flags = flags;
1845 
1846 	*result = rth;
1847 	err = 0;
1848  cleanup:
1849 	/* release the working reference to the output device */
1850 	in_dev_put(out_dev);
1851 	return err;
1852 }
1853 
1854 static int ip_mkroute_input(struct sk_buff *skb,
1855 			    struct fib_result *res,
1856 			    const struct flowi *fl,
1857 			    struct in_device *in_dev,
1858 			    __be32 daddr, __be32 saddr, u32 tos)
1859 {
1860 	struct rtable* rth = NULL;
1861 	int err;
1862 	unsigned hash;
1863 
1864 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1865 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1866 		fib_select_multipath(fl, res);
1867 #endif
1868 
1869 	/* create a routing cache entry */
1870 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1871 	if (err)
1872 		return err;
1873 
1874 	/* put it into the cache */
1875 	hash = rt_hash(daddr, saddr, fl->iif);
1876 	return rt_intern_hash(hash, rth, &skb->rtable);
1877 }
1878 
1879 /*
1880  *	NOTE. We drop all packets that have a local source
1881  *	address, because every properly looped-back packet
1882  *	must already have the correct destination attached by the output routine.
1883  *
1884  *	This approach solves two big problems:
1885  *	1. Non-simplex devices are handled properly.
1886  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1887  */
1888 
1889 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1890 			       u8 tos, struct net_device *dev)
1891 {
1892 	struct fib_result res;
1893 	struct in_device *in_dev = in_dev_get(dev);
1894 	struct flowi fl = { .nl_u = { .ip4_u =
1895 				      { .daddr = daddr,
1896 					.saddr = saddr,
1897 					.tos = tos,
1898 					.scope = RT_SCOPE_UNIVERSE,
1899 				      } },
1900 			    .mark = skb->mark,
1901 			    .iif = dev->ifindex };
1902 	unsigned	flags = 0;
1903 	u32		itag = 0;
1904 	struct rtable * rth;
1905 	unsigned	hash;
1906 	__be32		spec_dst;
1907 	int		err = -EINVAL;
1908 	int		free_res = 0;
1909 	struct net    * net = dev_net(dev);
1910 
1911 	/* IP on this device is disabled. */
1912 
1913 	if (!in_dev)
1914 		goto out;
1915 
1916 	/* Check for the strangest martian addresses, which cannot be
1917 	   detected by fib_lookup.
1918 	 */
1919 
1920 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1921 	    ipv4_is_loopback(saddr))
1922 		goto martian_source;
1923 
1924 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1925 		goto brd_input;
1926 
1927 	/* Accept zero addresses only for limited broadcast;
1928 	 * I am not even sure whether this should be fixed. Waiting for complaints :-)
1929 	 */
1930 	if (ipv4_is_zeronet(saddr))
1931 		goto martian_source;
1932 
1933 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1934 	    ipv4_is_loopback(daddr))
1935 		goto martian_destination;
1936 
1937 	/*
1938 	 *	Now we are ready to route packet.
1939 	 */
1940 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
1941 		if (!IN_DEV_FORWARD(in_dev))
1942 			goto e_hostunreach;
1943 		goto no_route;
1944 	}
1945 	free_res = 1;
1946 
1947 	RT_CACHE_STAT_INC(in_slow_tot);
1948 
1949 	if (res.type == RTN_BROADCAST)
1950 		goto brd_input;
1951 
1952 	if (res.type == RTN_LOCAL) {
1953 		int result;
1954 		result = fib_validate_source(saddr, daddr, tos,
1955 					     net->loopback_dev->ifindex,
1956 					     dev, &spec_dst, &itag);
1957 		if (result < 0)
1958 			goto martian_source;
1959 		if (result)
1960 			flags |= RTCF_DIRECTSRC;
1961 		spec_dst = daddr;
1962 		goto local_input;
1963 	}
1964 
1965 	if (!IN_DEV_FORWARD(in_dev))
1966 		goto e_hostunreach;
1967 	if (res.type != RTN_UNICAST)
1968 		goto martian_destination;
1969 
1970 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1971 done:
1972 	in_dev_put(in_dev);
1973 	if (free_res)
1974 		fib_res_put(&res);
1975 out:	return err;
1976 
1977 brd_input:
1978 	if (skb->protocol != htons(ETH_P_IP))
1979 		goto e_inval;
1980 
1981 	if (ipv4_is_zeronet(saddr))
1982 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1983 	else {
1984 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1985 					  &itag);
1986 		if (err < 0)
1987 			goto martian_source;
1988 		if (err)
1989 			flags |= RTCF_DIRECTSRC;
1990 	}
1991 	flags |= RTCF_BROADCAST;
1992 	res.type = RTN_BROADCAST;
1993 	RT_CACHE_STAT_INC(in_brd);
1994 
1995 local_input:
1996 	rth = dst_alloc(&ipv4_dst_ops);
1997 	if (!rth)
1998 		goto e_nobufs;
1999 
2000 	rth->u.dst.output= ip_rt_bug;
2001 	rth->rt_genid = atomic_read(&rt_genid);
2002 
2003 	atomic_set(&rth->u.dst.__refcnt, 1);
2004 	rth->u.dst.flags= DST_HOST;
2005 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2006 		rth->u.dst.flags |= DST_NOPOLICY;
2007 	rth->fl.fl4_dst	= daddr;
2008 	rth->rt_dst	= daddr;
2009 	rth->fl.fl4_tos	= tos;
2010 	rth->fl.mark    = skb->mark;
2011 	rth->fl.fl4_src	= saddr;
2012 	rth->rt_src	= saddr;
2013 #ifdef CONFIG_NET_CLS_ROUTE
2014 	rth->u.dst.tclassid = itag;
2015 #endif
2016 	rth->rt_iif	=
2017 	rth->fl.iif	= dev->ifindex;
2018 	rth->u.dst.dev	= net->loopback_dev;
2019 	dev_hold(rth->u.dst.dev);
2020 	rth->idev	= in_dev_get(rth->u.dst.dev);
2021 	rth->rt_gateway	= daddr;
2022 	rth->rt_spec_dst= spec_dst;
2023 	rth->u.dst.input= ip_local_deliver;
2024 	rth->rt_flags 	= flags|RTCF_LOCAL;
2025 	if (res.type == RTN_UNREACHABLE) {
2026 		rth->u.dst.input= ip_error;
2027 		rth->u.dst.error= -err;
2028 		rth->rt_flags 	&= ~RTCF_LOCAL;
2029 	}
2030 	rth->rt_type	= res.type;
2031 	hash = rt_hash(daddr, saddr, fl.iif);
2032 	err = rt_intern_hash(hash, rth, &skb->rtable);
2033 	goto done;
2034 
2035 no_route:
2036 	RT_CACHE_STAT_INC(in_no_route);
2037 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2038 	res.type = RTN_UNREACHABLE;
2039 	if (err == -ESRCH)
2040 		err = -ENETUNREACH;
2041 	goto local_input;
2042 
2043 	/*
2044 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2045 	 */
2046 martian_destination:
2047 	RT_CACHE_STAT_INC(in_martian_dst);
2048 #ifdef CONFIG_IP_ROUTE_VERBOSE
2049 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2050 		printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2051 			NIPQUAD_FMT ", dev %s\n",
2052 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2053 #endif
2054 
2055 e_hostunreach:
2056 	err = -EHOSTUNREACH;
2057 	goto done;
2058 
2059 e_inval:
2060 	err = -EINVAL;
2061 	goto done;
2062 
2063 e_nobufs:
2064 	err = -ENOBUFS;
2065 	goto done;
2066 
2067 martian_source:
2068 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2069 	goto e_inval;
2070 }
2071 
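/*
 * Route an incoming packet: first probe the route cache under RCU, keyed by
 * (daddr, saddr, iif, tos, mark); on a miss, hand multicast destinations to
 * ip_route_input_mc() when appropriate and everything else to
 * ip_route_input_slow().
 */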
2072 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2073 		   u8 tos, struct net_device *dev)
2074 {
2075 	struct rtable * rth;
2076 	unsigned	hash;
2077 	int iif = dev->ifindex;
2078 	struct net *net;
2079 
2080 	net = dev_net(dev);
2081 	tos &= IPTOS_RT_MASK;
2082 	hash = rt_hash(daddr, saddr, iif);
2083 
2084 	rcu_read_lock();
2085 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2086 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2087 		if (((rth->fl.fl4_dst ^ daddr) |
2088 		     (rth->fl.fl4_src ^ saddr) |
2089 		     (rth->fl.iif ^ iif) |
2090 		     rth->fl.oif |
2091 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2092 		    rth->fl.mark == skb->mark &&
2093 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2094 		    rth->rt_genid == atomic_read(&rt_genid)) {
2095 			dst_use(&rth->u.dst, jiffies);
2096 			RT_CACHE_STAT_INC(in_hit);
2097 			rcu_read_unlock();
2098 			skb->rtable = rth;
2099 			return 0;
2100 		}
2101 		RT_CACHE_STAT_INC(in_hlist_search);
2102 	}
2103 	rcu_read_unlock();
2104 
2105 	/* Multicast recognition logic was moved from the route cache to here.
2106 	   The problem was that too many Ethernet cards have broken/missing
2107 	   hardware multicast filters :-( As a result, a host on a multicast
2108 	   network acquires a lot of useless route cache entries, e.g. from
2109 	   SDR messages from all over the world. Now we try to get rid of them.
2110 	   Really, provided the software IP multicast filter is organized
2111 	   reasonably (at least, hashed), this does not result in a slowdown
2112 	   compared with route cache reject entries.
2113 	   Note that multicast routers are not affected, because a
2114 	   route cache entry is created eventually.
2115 	 */
2116 	if (ipv4_is_multicast(daddr)) {
2117 		struct in_device *in_dev;
2118 
2119 		rcu_read_lock();
2120 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2121 			int our = ip_check_mc(in_dev, daddr, saddr,
2122 				ip_hdr(skb)->protocol);
2123 			if (our
2124 #ifdef CONFIG_IP_MROUTE
2125 			    || (!ipv4_is_local_multicast(daddr) &&
2126 				IN_DEV_MFORWARD(in_dev))
2127 #endif
2128 			    ) {
2129 				rcu_read_unlock();
2130 				return ip_route_input_mc(skb, daddr, saddr,
2131 							 tos, dev, our);
2132 			}
2133 		}
2134 		rcu_read_unlock();
2135 		return -EINVAL;
2136 	}
2137 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2138 }
2139 
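/*
 * Allocate and fill a route cache entry for locally generated output,
 * classifying the destination (broadcast, multicast or local) and wiring up
 * the matching output handler; the caller is responsible for hashing the
 * result into the cache.
 */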
2140 static int __mkroute_output(struct rtable **result,
2141 			    struct fib_result *res,
2142 			    const struct flowi *fl,
2143 			    const struct flowi *oldflp,
2144 			    struct net_device *dev_out,
2145 			    unsigned flags)
2146 {
2147 	struct rtable *rth;
2148 	struct in_device *in_dev;
2149 	u32 tos = RT_FL_TOS(oldflp);
2150 	int err = 0;
2151 
2152 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2153 		return -EINVAL;
2154 
2155 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2156 		res->type = RTN_BROADCAST;
2157 	else if (ipv4_is_multicast(fl->fl4_dst))
2158 		res->type = RTN_MULTICAST;
2159 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2160 		return -EINVAL;
2161 
2162 	if (dev_out->flags & IFF_LOOPBACK)
2163 		flags |= RTCF_LOCAL;
2164 
2165 	/* get a working reference to the inet device */
2166 	in_dev = in_dev_get(dev_out);
2167 	if (!in_dev)
2168 		return -EINVAL;
2169 
2170 	if (res->type == RTN_BROADCAST) {
2171 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2172 		if (res->fi) {
2173 			fib_info_put(res->fi);
2174 			res->fi = NULL;
2175 		}
2176 	} else if (res->type == RTN_MULTICAST) {
2177 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2178 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2179 				 oldflp->proto))
2180 			flags &= ~RTCF_LOCAL;
2181 		/* If a multicast route does not exist, use the
2182 		   default one, but do not use a gateway in this case.
2183 		   Yes, it is a hack.
2184 		 */
2185 		if (res->fi && res->prefixlen < 4) {
2186 			fib_info_put(res->fi);
2187 			res->fi = NULL;
2188 		}
2189 	}
2190 
2191 
2192 	rth = dst_alloc(&ipv4_dst_ops);
2193 	if (!rth) {
2194 		err = -ENOBUFS;
2195 		goto cleanup;
2196 	}
2197 
2198 	atomic_set(&rth->u.dst.__refcnt, 1);
2199 	rth->u.dst.flags= DST_HOST;
2200 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2201 		rth->u.dst.flags |= DST_NOXFRM;
2202 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2203 		rth->u.dst.flags |= DST_NOPOLICY;
2204 
2205 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2206 	rth->fl.fl4_tos	= tos;
2207 	rth->fl.fl4_src	= oldflp->fl4_src;
2208 	rth->fl.oif	= oldflp->oif;
2209 	rth->fl.mark    = oldflp->mark;
2210 	rth->rt_dst	= fl->fl4_dst;
2211 	rth->rt_src	= fl->fl4_src;
2212 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2213 	/* get references to the devices that are to be held by the routing
2214 	   cache entry */
2215 	rth->u.dst.dev	= dev_out;
2216 	dev_hold(dev_out);
2217 	rth->idev	= in_dev_get(dev_out);
2218 	rth->rt_gateway = fl->fl4_dst;
2219 	rth->rt_spec_dst= fl->fl4_src;
2220 
2221 	rth->u.dst.output=ip_output;
2222 	rth->rt_genid = atomic_read(&rt_genid);
2223 
2224 	RT_CACHE_STAT_INC(out_slow_tot);
2225 
2226 	if (flags & RTCF_LOCAL) {
2227 		rth->u.dst.input = ip_local_deliver;
2228 		rth->rt_spec_dst = fl->fl4_dst;
2229 	}
2230 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2231 		rth->rt_spec_dst = fl->fl4_src;
2232 		if (flags & RTCF_LOCAL &&
2233 		    !(dev_out->flags & IFF_LOOPBACK)) {
2234 			rth->u.dst.output = ip_mc_output;
2235 			RT_CACHE_STAT_INC(out_slow_mc);
2236 		}
2237 #ifdef CONFIG_IP_MROUTE
2238 		if (res->type == RTN_MULTICAST) {
2239 			if (IN_DEV_MFORWARD(in_dev) &&
2240 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2241 				rth->u.dst.input = ip_mr_input;
2242 				rth->u.dst.output = ip_mc_output;
2243 			}
2244 		}
2245 #endif
2246 	}
2247 
2248 	rt_set_nexthop(rth, res, 0);
2249 
2250 	rth->rt_flags = flags;
2251 
2252 	*result = rth;
2253  cleanup:
2254 	/* release the working reference to the inet device */
2255 	in_dev_put(in_dev);
2256 
2257 	return err;
2258 }
2259 
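/*
 * Build an output route cache entry with __mkroute_output() and, on success,
 * insert it into the cache hash chain for the original flow's (dst, src, oif).
 */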
2260 static int ip_mkroute_output(struct rtable **rp,
2261 			     struct fib_result *res,
2262 			     const struct flowi *fl,
2263 			     const struct flowi *oldflp,
2264 			     struct net_device *dev_out,
2265 			     unsigned flags)
2266 {
2267 	struct rtable *rth = NULL;
2268 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2269 	unsigned hash;
2270 	if (err == 0) {
2271 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2272 		err = rt_intern_hash(hash, rth, rp);
2273 	}
2274 
2275 	return err;
2276 }
2277 
2278 /*
2279  * Major route resolver routine.
2280  */
2281 
2282 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2283 				const struct flowi *oldflp)
2284 {
2285 	u32 tos	= RT_FL_TOS(oldflp);
2286 	struct flowi fl = { .nl_u = { .ip4_u =
2287 				      { .daddr = oldflp->fl4_dst,
2288 					.saddr = oldflp->fl4_src,
2289 					.tos = tos & IPTOS_RT_MASK,
2290 					.scope = ((tos & RTO_ONLINK) ?
2291 						  RT_SCOPE_LINK :
2292 						  RT_SCOPE_UNIVERSE),
2293 				      } },
2294 			    .mark = oldflp->mark,
2295 			    .iif = net->loopback_dev->ifindex,
2296 			    .oif = oldflp->oif };
2297 	struct fib_result res;
2298 	unsigned flags = 0;
2299 	struct net_device *dev_out = NULL;
2300 	int free_res = 0;
2301 	int err;
2302 
2303 
2304 	res.fi		= NULL;
2305 #ifdef CONFIG_IP_MULTIPLE_TABLES
2306 	res.r		= NULL;
2307 #endif
2308 
2309 	if (oldflp->fl4_src) {
2310 		err = -EINVAL;
2311 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2312 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2313 		    ipv4_is_zeronet(oldflp->fl4_src))
2314 			goto out;
2315 
2316 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2317 		dev_out = ip_dev_find(net, oldflp->fl4_src);
2318 		if (dev_out == NULL)
2319 			goto out;
2320 
2321 		/* I removed the check for oif == dev_out->oif here.
2322 		   It was wrong for two reasons:
2323 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2324 		      is assigned to multiple interfaces.
2325 		   2. Moreover, we are allowed to send packets with a saddr
2326 		      belonging to another iface. --ANK
2327 		 */
2328 
2329 		if (oldflp->oif == 0
2330 		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2331 			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2332 			/* Special hack: the user can direct multicasts
2333 			   and limited broadcast via the desired interface
2334 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2335 			   This hack is not just for fun, it allows
2336 			   vic, vat and friends to work.
2337 			   They bind the socket to loopback, set ttl to zero
2338 			   and expect that it will work.
2339 			   From the viewpoint of the routing cache they are broken,
2340 			   because we are not allowed to build a multicast path
2341 			   with a loopback source addr (the routing cache
2342 			   cannot know that the ttl is zero, so the packet
2343 			   will never leave this host and the route is valid).
2344 			   Luckily, this hack is a good workaround.
2345 			 */
2346 
2347 			fl.oif = dev_out->ifindex;
2348 			goto make_route;
2349 		}
2350 		if (dev_out)
2351 			dev_put(dev_out);
2352 		dev_out = NULL;
2353 	}
2354 
2355 
2356 	if (oldflp->oif) {
2357 		dev_out = dev_get_by_index(net, oldflp->oif);
2358 		err = -ENODEV;
2359 		if (dev_out == NULL)
2360 			goto out;
2361 
2362 		/* RACE: Check return value of inet_select_addr instead. */
2363 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2364 			dev_put(dev_out);
2365 			goto out;	/* Wrong error code */
2366 		}
2367 
2368 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2369 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2370 			if (!fl.fl4_src)
2371 				fl.fl4_src = inet_select_addr(dev_out, 0,
2372 							      RT_SCOPE_LINK);
2373 			goto make_route;
2374 		}
2375 		if (!fl.fl4_src) {
2376 			if (ipv4_is_multicast(oldflp->fl4_dst))
2377 				fl.fl4_src = inet_select_addr(dev_out, 0,
2378 							      fl.fl4_scope);
2379 			else if (!oldflp->fl4_dst)
2380 				fl.fl4_src = inet_select_addr(dev_out, 0,
2381 							      RT_SCOPE_HOST);
2382 		}
2383 	}
2384 
2385 	if (!fl.fl4_dst) {
2386 		fl.fl4_dst = fl.fl4_src;
2387 		if (!fl.fl4_dst)
2388 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2389 		if (dev_out)
2390 			dev_put(dev_out);
2391 		dev_out = net->loopback_dev;
2392 		dev_hold(dev_out);
2393 		fl.oif = net->loopback_dev->ifindex;
2394 		res.type = RTN_LOCAL;
2395 		flags |= RTCF_LOCAL;
2396 		goto make_route;
2397 	}
2398 
2399 	if (fib_lookup(net, &fl, &res)) {
2400 		res.fi = NULL;
2401 		if (oldflp->oif) {
2402 			/* Apparently, the routing tables are wrong. Assume
2403 			   that the destination is on-link.
2404 
2405 			   WHY? DW.
2406 			   Because we are allowed to send to an iface
2407 			   even if it has NO routes and NO assigned
2408 			   addresses. When oif is specified, the routing
2409 			   tables are looked up with only one purpose:
2410 			   to determine whether the destination is gatewayed,
2411 			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2412 			   we send the packet, ignoring both the routing tables
2413 			   and the ifaddr state. --ANK
2414 
2415 
2416 			   We could do this even when oif is unknown,
2417 			   likely as IPv6 does, but we do not.
2418 			 */
2419 
2420 			if (fl.fl4_src == 0)
2421 				fl.fl4_src = inet_select_addr(dev_out, 0,
2422 							      RT_SCOPE_LINK);
2423 			res.type = RTN_UNICAST;
2424 			goto make_route;
2425 		}
2426 		if (dev_out)
2427 			dev_put(dev_out);
2428 		err = -ENETUNREACH;
2429 		goto out;
2430 	}
2431 	free_res = 1;
2432 
2433 	if (res.type == RTN_LOCAL) {
2434 		if (!fl.fl4_src)
2435 			fl.fl4_src = fl.fl4_dst;
2436 		if (dev_out)
2437 			dev_put(dev_out);
2438 		dev_out = net->loopback_dev;
2439 		dev_hold(dev_out);
2440 		fl.oif = dev_out->ifindex;
2441 		if (res.fi)
2442 			fib_info_put(res.fi);
2443 		res.fi = NULL;
2444 		flags |= RTCF_LOCAL;
2445 		goto make_route;
2446 	}
2447 
2448 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2449 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2450 		fib_select_multipath(&fl, &res);
2451 	else
2452 #endif
2453 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2454 		fib_select_default(net, &fl, &res);
2455 
2456 	if (!fl.fl4_src)
2457 		fl.fl4_src = FIB_RES_PREFSRC(res);
2458 
2459 	if (dev_out)
2460 		dev_put(dev_out);
2461 	dev_out = FIB_RES_DEV(res);
2462 	dev_hold(dev_out);
2463 	fl.oif = dev_out->ifindex;
2464 
2465 
2466 make_route:
2467 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2468 
2469 
2470 	if (free_res)
2471 		fib_res_put(&res);
2472 	if (dev_out)
2473 		dev_put(dev_out);
2474 out:	return err;
2475 }
2476 
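/*
 * Output route lookup by flow key: scan the cached hash chain for an exact
 * match on (dst, src, oif, mark, tos) in this namespace and generation, and
 * fall back to ip_route_output_slow() when nothing suitable is cached.
 */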
2477 int __ip_route_output_key(struct net *net, struct rtable **rp,
2478 			  const struct flowi *flp)
2479 {
2480 	unsigned hash;
2481 	struct rtable *rth;
2482 
2483 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2484 
2485 	rcu_read_lock_bh();
2486 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2487 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2488 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2489 		    rth->fl.fl4_src == flp->fl4_src &&
2490 		    rth->fl.iif == 0 &&
2491 		    rth->fl.oif == flp->oif &&
2492 		    rth->fl.mark == flp->mark &&
2493 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2494 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2495 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2496 		    rth->rt_genid == atomic_read(&rt_genid)) {
2497 			dst_use(&rth->u.dst, jiffies);
2498 			RT_CACHE_STAT_INC(out_hit);
2499 			rcu_read_unlock_bh();
2500 			*rp = rth;
2501 			return 0;
2502 		}
2503 		RT_CACHE_STAT_INC(out_hlist_search);
2504 	}
2505 	rcu_read_unlock_bh();
2506 
2507 	return ip_route_output_slow(net, rp, flp);
2508 }
2509 
2510 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2511 
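/* Blackhole routes ignore PMTU updates. */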
2512 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2513 {
2514 }
2515 
2516 static struct dst_ops ipv4_dst_blackhole_ops = {
2517 	.family			=	AF_INET,
2518 	.protocol		=	__constant_htons(ETH_P_IP),
2519 	.destroy		=	ipv4_dst_destroy,
2520 	.check			=	ipv4_dst_check,
2521 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2522 	.entry_size		=	sizeof(struct rtable),
2523 	.entries		=	ATOMIC_INIT(0),
2524 };
2525 
2526 
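/*
 * Replace *rp with a "blackhole" copy of the route: it keeps the original
 * flow, device and metrics for reporting purposes, but both its input and
 * output handlers simply discard packets.
 */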
2527 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
2528 {
2529 	struct rtable *ort = *rp;
2530 	struct rtable *rt = (struct rtable *)
2531 		dst_alloc(&ipv4_dst_blackhole_ops);
2532 
2533 	if (rt) {
2534 		struct dst_entry *new = &rt->u.dst;
2535 
2536 		atomic_set(&new->__refcnt, 1);
2537 		new->__use = 1;
2538 		new->input = dst_discard;
2539 		new->output = dst_discard;
2540 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2541 
2542 		new->dev = ort->u.dst.dev;
2543 		if (new->dev)
2544 			dev_hold(new->dev);
2545 
2546 		rt->fl = ort->fl;
2547 
2548 		rt->idev = ort->idev;
2549 		if (rt->idev)
2550 			in_dev_hold(rt->idev);
2551 		rt->rt_genid = atomic_read(&rt_genid);
2552 		rt->rt_flags = ort->rt_flags;
2553 		rt->rt_type = ort->rt_type;
2554 		rt->rt_dst = ort->rt_dst;
2555 		rt->rt_src = ort->rt_src;
2556 		rt->rt_iif = ort->rt_iif;
2557 		rt->rt_gateway = ort->rt_gateway;
2558 		rt->rt_spec_dst = ort->rt_spec_dst;
2559 		rt->peer = ort->peer;
2560 		if (rt->peer)
2561 			atomic_inc(&rt->peer->refcnt);
2562 
2563 		dst_free(new);
2564 	}
2565 
2566 	dst_release(&(*rp)->u.dst);
2567 	*rp = rt;
2568 	return (rt ? 0 : -ENOMEM);
2569 }
2570 
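/*
 * Resolve an output route and, when a transport protocol is specified, run
 * the result through the xfrm policy lookup; -EREMOTE from xfrm is turned
 * into a packet-discarding blackhole route.
 */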
2571 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2572 			 struct sock *sk, int flags)
2573 {
2574 	int err;
2575 
2576 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2577 		return err;
2578 
2579 	if (flp->proto) {
2580 		if (!flp->fl4_src)
2581 			flp->fl4_src = (*rp)->rt_src;
2582 		if (!flp->fl4_dst)
2583 			flp->fl4_dst = (*rp)->rt_dst;
2584 		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2585 				    flags ? XFRM_LOOKUP_WAIT : 0);
2586 		if (err == -EREMOTE)
2587 			err = ipv4_dst_blackhole(rp, flp);
2588 
2589 		return err;
2590 	}
2591 
2592 	return 0;
2593 }
2594 
2595 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2596 
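/* Convenience wrapper: output route lookup with no socket and no flags. */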
2597 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2598 {
2599 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2600 }
2601 
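/*
 * Fill a netlink route message (struct rtmsg plus attributes) describing the
 * cached route attached to this skb, including gateway, preferred source,
 * metrics and cache timing information.
 */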
2602 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2603 			int nowait, unsigned int flags)
2604 {
2605 	struct rtable *rt = skb->rtable;
2606 	struct rtmsg *r;
2607 	struct nlmsghdr *nlh;
2608 	long expires;
2609 	u32 id = 0, ts = 0, tsage = 0, error;
2610 
2611 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2612 	if (nlh == NULL)
2613 		return -EMSGSIZE;
2614 
2615 	r = nlmsg_data(nlh);
2616 	r->rtm_family	 = AF_INET;
2617 	r->rtm_dst_len	= 32;
2618 	r->rtm_src_len	= 0;
2619 	r->rtm_tos	= rt->fl.fl4_tos;
2620 	r->rtm_table	= RT_TABLE_MAIN;
2621 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2622 	r->rtm_type	= rt->rt_type;
2623 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2624 	r->rtm_protocol = RTPROT_UNSPEC;
2625 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2626 	if (rt->rt_flags & RTCF_NOTIFY)
2627 		r->rtm_flags |= RTM_F_NOTIFY;
2628 
2629 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2630 
2631 	if (rt->fl.fl4_src) {
2632 		r->rtm_src_len = 32;
2633 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2634 	}
2635 	if (rt->u.dst.dev)
2636 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2637 #ifdef CONFIG_NET_CLS_ROUTE
2638 	if (rt->u.dst.tclassid)
2639 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2640 #endif
2641 	if (rt->fl.iif)
2642 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2643 	else if (rt->rt_src != rt->fl.fl4_src)
2644 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2645 
2646 	if (rt->rt_dst != rt->rt_gateway)
2647 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2648 
2649 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2650 		goto nla_put_failure;
2651 
2652 	error = rt->u.dst.error;
2653 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2654 	if (rt->peer) {
2655 		id = rt->peer->ip_id_count;
2656 		if (rt->peer->tcp_ts_stamp) {
2657 			ts = rt->peer->tcp_ts;
2658 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2659 		}
2660 	}
2661 
2662 	if (rt->fl.iif) {
2663 #ifdef CONFIG_IP_MROUTE
2664 		__be32 dst = rt->rt_dst;
2665 
2666 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2667 		    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2668 			int err = ipmr_get_route(skb, r, nowait);
2669 			if (err <= 0) {
2670 				if (!nowait) {
2671 					if (err == 0)
2672 						return 0;
2673 					goto nla_put_failure;
2674 				} else {
2675 					if (err == -EMSGSIZE)
2676 						goto nla_put_failure;
2677 					error = err;
2678 				}
2679 			}
2680 		} else
2681 #endif
2682 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2683 	}
2684 
2685 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2686 			       expires, error) < 0)
2687 		goto nla_put_failure;
2688 
2689 	return nlmsg_end(skb, nlh);
2690 
2691 nla_put_failure:
2692 	nlmsg_cancel(skb, nlh);
2693 	return -EMSGSIZE;
2694 }
2695 
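/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the requested route via
 * ip_route_input() (when an input interface is given) or ip_route_output_key(),
 * and unicast the formatted answer back to the requesting socket.
 */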
2696 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2697 {
2698 	struct net *net = sock_net(in_skb->sk);
2699 	struct rtmsg *rtm;
2700 	struct nlattr *tb[RTA_MAX+1];
2701 	struct rtable *rt = NULL;
2702 	__be32 dst = 0;
2703 	__be32 src = 0;
2704 	u32 iif;
2705 	int err;
2706 	struct sk_buff *skb;
2707 
2708 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2709 	if (err < 0)
2710 		goto errout;
2711 
2712 	rtm = nlmsg_data(nlh);
2713 
2714 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2715 	if (skb == NULL) {
2716 		err = -ENOBUFS;
2717 		goto errout;
2718 	}
2719 
2720 	/* Reserve room for dummy headers; this skb can pass
2721 	   through a good chunk of the routing engine.
2722 	 */
2723 	skb_reset_mac_header(skb);
2724 	skb_reset_network_header(skb);
2725 
2726 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2727 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2728 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2729 
2730 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2731 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2732 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2733 
2734 	if (iif) {
2735 		struct net_device *dev;
2736 
2737 		dev = __dev_get_by_index(net, iif);
2738 		if (dev == NULL) {
2739 			err = -ENODEV;
2740 			goto errout_free;
2741 		}
2742 
2743 		skb->protocol	= htons(ETH_P_IP);
2744 		skb->dev	= dev;
2745 		local_bh_disable();
2746 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2747 		local_bh_enable();
2748 
2749 		rt = skb->rtable;
2750 		if (err == 0 && rt->u.dst.error)
2751 			err = -rt->u.dst.error;
2752 	} else {
2753 		struct flowi fl = {
2754 			.nl_u = {
2755 				.ip4_u = {
2756 					.daddr = dst,
2757 					.saddr = src,
2758 					.tos = rtm->rtm_tos,
2759 				},
2760 			},
2761 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2762 		};
2763 		err = ip_route_output_key(net, &rt, &fl);
2764 	}
2765 
2766 	if (err)
2767 		goto errout_free;
2768 
2769 	skb->rtable = rt;
2770 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2771 		rt->rt_flags |= RTCF_NOTIFY;
2772 
2773 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2774 			   RTM_NEWROUTE, 0, 0);
2775 	if (err <= 0)
2776 		goto errout_free;
2777 
2778 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2779 errout:
2780 	return err;
2781 
2782 errout_free:
2783 	kfree_skb(skb);
2784 	goto errout;
2785 }
2786 
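/*
 * Dump the route cache to netlink: walk every hash bucket under RCU, skip
 * entries belonging to other namespaces or stale generations, and resume
 * from the position saved in cb->args on the next call.
 */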
2787 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2788 {
2789 	struct rtable *rt;
2790 	int h, s_h;
2791 	int idx, s_idx;
2792 	struct net *net;
2793 
2794 	net = sock_net(skb->sk);
2795 
2796 	s_h = cb->args[0];
2797 	if (s_h < 0)
2798 		s_h = 0;
2799 	s_idx = idx = cb->args[1];
2800 	for (h = s_h; h <= rt_hash_mask; h++) {
2801 		rcu_read_lock_bh();
2802 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2803 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2804 			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2805 				continue;
2806 			if (rt->rt_genid != atomic_read(&rt_genid))
2807 				continue;
2808 			skb->dst = dst_clone(&rt->u.dst);
2809 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2810 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2811 					 1, NLM_F_MULTI) <= 0) {
2812 				dst_release(xchg(&skb->dst, NULL));
2813 				rcu_read_unlock_bh();
2814 				goto done;
2815 			}
2816 			dst_release(xchg(&skb->dst, NULL));
2817 		}
2818 		rcu_read_unlock_bh();
2819 		s_idx = 0;
2820 	}
2821 
2822 done:
2823 	cb->args[0] = h;
2824 	cb->args[1] = idx;
2825 	return skb->len;
2826 }
2827 
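/* A device's multicast configuration changed: flush the route cache. */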
2828 void ip_rt_multicast_event(struct in_device *in_dev)
2829 {
2830 	rt_cache_flush(0);
2831 }
2832 
2833 #ifdef CONFIG_SYSCTL
2834 static int flush_delay;
2835 
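/*
 * Handler for the route cache "flush" sysctl: write-only; writing a delay
 * value flushes the cache after that delay, while reads return -EINVAL.
 */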
2836 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2837 					struct file *filp, void __user *buffer,
2838 					size_t *lenp, loff_t *ppos)
2839 {
2840 	if (write) {
2841 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2842 		rt_cache_flush(flush_delay);
2843 		return 0;
2844 	}
2845 
2846 	return -EINVAL;
2847 }
2848 
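/*
 * Binary sysctl (ctl_name) counterpart of the flush handler: read the delay
 * directly from the new value and flush the route cache immediately.
 */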
2849 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2850 						int __user *name,
2851 						int nlen,
2852 						void __user *oldval,
2853 						size_t __user *oldlenp,
2854 						void __user *newval,
2855 						size_t newlen)
2856 {
2857 	int delay;
2858 	if (newlen != sizeof(int))
2859 		return -EINVAL;
2860 	if (get_user(delay, (int __user *)newval))
2861 		return -EFAULT;
2862 	rt_cache_flush(delay);
2863 	return 0;
2864 }
2865 
2866 ctl_table ipv4_route_table[] = {
2867 	{
2868 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2869 		.procname	= "flush",
2870 		.data		= &flush_delay,
2871 		.maxlen		= sizeof(int),
2872 		.mode		= 0200,
2873 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2874 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2875 	},
2876 	{
2877 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2878 		.procname	= "gc_thresh",
2879 		.data		= &ipv4_dst_ops.gc_thresh,
2880 		.maxlen		= sizeof(int),
2881 		.mode		= 0644,
2882 		.proc_handler	= &proc_dointvec,
2883 	},
2884 	{
2885 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2886 		.procname	= "max_size",
2887 		.data		= &ip_rt_max_size,
2888 		.maxlen		= sizeof(int),
2889 		.mode		= 0644,
2890 		.proc_handler	= &proc_dointvec,
2891 	},
2892 	{
2893 		/*  Deprecated. Use gc_min_interval_ms */
2894 
2895 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2896 		.procname	= "gc_min_interval",
2897 		.data		= &ip_rt_gc_min_interval,
2898 		.maxlen		= sizeof(int),
2899 		.mode		= 0644,
2900 		.proc_handler	= &proc_dointvec_jiffies,
2901 		.strategy	= &sysctl_jiffies,
2902 	},
2903 	{
2904 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2905 		.procname	= "gc_min_interval_ms",
2906 		.data		= &ip_rt_gc_min_interval,
2907 		.maxlen		= sizeof(int),
2908 		.mode		= 0644,
2909 		.proc_handler	= &proc_dointvec_ms_jiffies,
2910 		.strategy	= &sysctl_ms_jiffies,
2911 	},
2912 	{
2913 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2914 		.procname	= "gc_timeout",
2915 		.data		= &ip_rt_gc_timeout,
2916 		.maxlen		= sizeof(int),
2917 		.mode		= 0644,
2918 		.proc_handler	= &proc_dointvec_jiffies,
2919 		.strategy	= &sysctl_jiffies,
2920 	},
2921 	{
2922 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2923 		.procname	= "gc_interval",
2924 		.data		= &ip_rt_gc_interval,
2925 		.maxlen		= sizeof(int),
2926 		.mode		= 0644,
2927 		.proc_handler	= &proc_dointvec_jiffies,
2928 		.strategy	= &sysctl_jiffies,
2929 	},
2930 	{
2931 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2932 		.procname	= "redirect_load",
2933 		.data		= &ip_rt_redirect_load,
2934 		.maxlen		= sizeof(int),
2935 		.mode		= 0644,
2936 		.proc_handler	= &proc_dointvec,
2937 	},
2938 	{
2939 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2940 		.procname	= "redirect_number",
2941 		.data		= &ip_rt_redirect_number,
2942 		.maxlen		= sizeof(int),
2943 		.mode		= 0644,
2944 		.proc_handler	= &proc_dointvec,
2945 	},
2946 	{
2947 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2948 		.procname	= "redirect_silence",
2949 		.data		= &ip_rt_redirect_silence,
2950 		.maxlen		= sizeof(int),
2951 		.mode		= 0644,
2952 		.proc_handler	= &proc_dointvec,
2953 	},
2954 	{
2955 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2956 		.procname	= "error_cost",
2957 		.data		= &ip_rt_error_cost,
2958 		.maxlen		= sizeof(int),
2959 		.mode		= 0644,
2960 		.proc_handler	= &proc_dointvec,
2961 	},
2962 	{
2963 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
2964 		.procname	= "error_burst",
2965 		.data		= &ip_rt_error_burst,
2966 		.maxlen		= sizeof(int),
2967 		.mode		= 0644,
2968 		.proc_handler	= &proc_dointvec,
2969 	},
2970 	{
2971 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
2972 		.procname	= "gc_elasticity",
2973 		.data		= &ip_rt_gc_elasticity,
2974 		.maxlen		= sizeof(int),
2975 		.mode		= 0644,
2976 		.proc_handler	= &proc_dointvec,
2977 	},
2978 	{
2979 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
2980 		.procname	= "mtu_expires",
2981 		.data		= &ip_rt_mtu_expires,
2982 		.maxlen		= sizeof(int),
2983 		.mode		= 0644,
2984 		.proc_handler	= &proc_dointvec_jiffies,
2985 		.strategy	= &sysctl_jiffies,
2986 	},
2987 	{
2988 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
2989 		.procname	= "min_pmtu",
2990 		.data		= &ip_rt_min_pmtu,
2991 		.maxlen		= sizeof(int),
2992 		.mode		= 0644,
2993 		.proc_handler	= &proc_dointvec,
2994 	},
2995 	{
2996 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
2997 		.procname	= "min_adv_mss",
2998 		.data		= &ip_rt_min_advmss,
2999 		.maxlen		= sizeof(int),
3000 		.mode		= 0644,
3001 		.proc_handler	= &proc_dointvec,
3002 	},
3003 	{
3004 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3005 		.procname	= "secret_interval",
3006 		.data		= &ip_rt_secret_interval,
3007 		.maxlen		= sizeof(int),
3008 		.mode		= 0644,
3009 		.proc_handler	= &proc_dointvec_jiffies,
3010 		.strategy	= &sysctl_jiffies,
3011 	},
3012 	{ .ctl_name = 0 }
3013 };
3014 #endif
3015 
3016 #ifdef CONFIG_NET_CLS_ROUTE
3017 struct ip_rt_acct *ip_rt_acct __read_mostly;
3018 #endif /* CONFIG_NET_CLS_ROUTE */
3019 
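/*
 * "rhash_entries=" boot parameter: override the number of route cache hash
 * table entries allocated in ip_rt_init().
 */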
3020 static __initdata unsigned long rhash_entries;
3021 static int __init set_rhash_entries(char *str)
3022 {
3023 	if (!str)
3024 		return 0;
3025 	rhash_entries = simple_strtoul(str, &str, 0);
3026 	return 1;
3027 }
3028 __setup("rhash_entries=", set_rhash_entries);
3029 
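/*
 * Subsystem initialization: seed the cache generation counter, create the
 * dst slab cache, allocate the route cache hash table, initialize devinet
 * and the FIB, arm the garbage-collection work and the secret rebuild timer,
 * and register the /proc files and the RTM_GETROUTE netlink handler.
 */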
3030 int __init ip_rt_init(void)
3031 {
3032 	int rc = 0;
3033 
3034 	atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3035 			     (jiffies ^ (jiffies >> 7))));
3036 
3037 #ifdef CONFIG_NET_CLS_ROUTE
3038 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3039 	if (!ip_rt_acct)
3040 		panic("IP: failed to allocate ip_rt_acct\n");
3041 #endif
3042 
3043 	ipv4_dst_ops.kmem_cachep =
3044 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3045 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3046 
3047 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3048 
3049 	rt_hash_table = (struct rt_hash_bucket *)
3050 		alloc_large_system_hash("IP route cache",
3051 					sizeof(struct rt_hash_bucket),
3052 					rhash_entries,
3053 					(num_physpages >= 128 * 1024) ?
3054 					15 : 17,
3055 					0,
3056 					&rt_hash_log,
3057 					&rt_hash_mask,
3058 					0);
3059 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3060 	rt_hash_lock_init();
3061 
3062 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3063 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3064 
3065 	devinet_init();
3066 	ip_fib_init();
3067 
3068 	rt_secret_timer.function = rt_secret_rebuild;
3069 	rt_secret_timer.data = 0;
3070 	init_timer_deferrable(&rt_secret_timer);
3071 
3072 	/* All the timers started at system startup tend
3073 	   to synchronize. Perturb them a bit.
3074 	 */
3075 	schedule_delayed_work(&expires_work,
3076 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3077 
3078 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3079 		ip_rt_secret_interval;
3080 	add_timer(&rt_secret_timer);
3081 
3082 	if (ip_rt_proc_init())
3083 		printk(KERN_ERR "Unable to create route proc files\n");
3084 #ifdef CONFIG_XFRM
3085 	xfrm_init();
3086 	xfrm4_init();
3087 #endif
3088 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3089 
3090 	return rc;
3091 }
3092 
3093 EXPORT_SYMBOL(__ip_select_ident);
3094 EXPORT_SYMBOL(ip_route_input);
3095 EXPORT_SYMBOL(ip_route_output_key);
3096