xref: /openbmc/linux/net/ipv4/route.c (revision a1e58bbd)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD
37  *					although our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
59  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
60  *
61  *		This program is free software; you can redistribute it and/or
62  *		modify it under the terms of the GNU General Public License
63  *		as published by the Free Software Foundation; either version
64  *		2 of the License, or (at your option) any later version.
65  */
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115 
116 #define IP_MAX_MTU	0xFFF0
117 
118 #define RT_GC_TIMEOUT (300*HZ)
119 
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval		= 60 * HZ;
123 static int ip_rt_gc_min_interval	= HZ / 2;
124 static int ip_rt_redirect_number	= 9;
125 static int ip_rt_redirect_load		= HZ / 50;
126 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost		= HZ;
128 static int ip_rt_error_burst		= 5 * HZ;
129 static int ip_rt_gc_elasticity		= 8;
130 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
131 static int ip_rt_min_pmtu		= 512 + 20 + 20;
132 static int ip_rt_min_advmss		= 256;
133 static int ip_rt_secret_interval	= 10 * 60 * HZ;
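/*
 * For illustration (derived from the defaults above, for common HZ values):
 * ip_rt_redirect_load = HZ/50 is roughly 20 ms, ip_rt_redirect_silence =
 * (HZ/50) << 10 is roughly 20.5 s, ip_rt_gc_min_interval = HZ/2 is half a
 * second, and ip_rt_min_pmtu = 512 + 20 + 20 = 552 bytes, i.e. a 512-byte
 * payload plus minimal IP and TCP headers.
 */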
134 
135 #define RTprint(a...)	printk(KERN_DEBUG a)
136 
137 static void rt_worker_func(struct work_struct *work);
138 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
139 static struct timer_list rt_secret_timer;
140 
141 /*
142  *	Interface to generic destination cache.
143  */
144 
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void		 ipv4_dst_destroy(struct dst_entry *dst);
147 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
148 					 struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void		 ipv4_link_failure(struct sk_buff *skb);
151 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153 
154 
155 static struct dst_ops ipv4_dst_ops = {
156 	.family =		AF_INET,
157 	.protocol =		__constant_htons(ETH_P_IP),
158 	.gc =			rt_garbage_collect,
159 	.check =		ipv4_dst_check,
160 	.destroy =		ipv4_dst_destroy,
161 	.ifdown =		ipv4_dst_ifdown,
162 	.negative_advice =	ipv4_negative_advice,
163 	.link_failure =		ipv4_link_failure,
164 	.update_pmtu =		ip_rt_update_pmtu,
165 	.local_out =		ip_local_out,
166 	.entry_size =		sizeof(struct rtable),
167 	.entries =		ATOMIC_INIT(0),
168 };
169 
170 #define ECN_OR_COST(class)	TC_PRIO_##class
171 
172 const __u8 ip_tos2prio[16] = {
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(FILLER),
175 	TC_PRIO_BESTEFFORT,
176 	ECN_OR_COST(BESTEFFORT),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_BULK,
180 	ECN_OR_COST(BULK),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE,
184 	ECN_OR_COST(INTERACTIVE),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK),
187 	TC_PRIO_INTERACTIVE_BULK,
188 	ECN_OR_COST(INTERACTIVE_BULK)
189 };
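/*
 * Worked example (a sketch, assuming the rt_tos2priority() helper in
 * include/net/route.h, which indexes this table with IPTOS_TOS(tos) >> 1):
 * a packet sent with TOS 0x10 (IPTOS_LOWDELAY) maps to index 8 and therefore
 * to TC_PRIO_INTERACTIVE, while TOS 0 maps to index 0, TC_PRIO_BESTEFFORT.
 */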
190 
191 
192 /*
193  * Route cache.
194  */
195 
196 /* The locking scheme is rather straightforward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries,
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
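/*
 * A minimal sketch of how that scheme is used below (illustrative only;
 * "slot" stands for a hypothetical bucket index):
 *
 *	reader:
 *		rcu_read_lock_bh();
 *		for (r = rcu_dereference(rt_hash_table[slot].chain); r;
 *		     r = rcu_dereference(r->u.dst.rt_next))
 *			... inspect r, take a reference if needed ...
 *		rcu_read_unlock_bh();
 *
 *	writer:
 *		spin_lock_bh(rt_hash_lock_addr(slot));
 *		... unlink an entry from the chain, then rt_free() it ...
 *		spin_unlock_bh(rt_hash_lock_addr(slot));
 */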
205 
206 struct rt_hash_bucket {
207 	struct rtable	*chain;
208 };
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210 	defined(CONFIG_PROVE_LOCKING)
211 /*
212  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
213  * The size of this table is a power of two and depends on the number of CPUs.
214  * (with lockdep we have a quite big spinlock_t, so keep the size down there)
215  */
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ	256
218 #else
219 # if NR_CPUS >= 32
220 #  define RT_HASH_LOCK_SZ	4096
221 # elif NR_CPUS >= 16
222 #  define RT_HASH_LOCK_SZ	2048
223 # elif NR_CPUS >= 8
224 #  define RT_HASH_LOCK_SZ	1024
225 # elif NR_CPUS >= 4
226 #  define RT_HASH_LOCK_SZ	512
227 # else
228 #  define RT_HASH_LOCK_SZ	256
229 # endif
230 #endif
231 
232 static spinlock_t	*rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
234 
235 static __init void rt_hash_lock_init(void)
236 {
237 	int i;
238 
239 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
240 			GFP_KERNEL);
241 	if (!rt_hash_locks)
242 		panic("IP: failed to allocate rt_hash_locks\n");
243 
244 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
245 		spin_lock_init(&rt_hash_locks[i]);
246 }
247 #else
248 # define rt_hash_lock_addr(slot) NULL
249 
250 static inline void rt_hash_lock_init(void)
251 {
252 }
253 #endif
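/*
 * Worked example (SMP case, illustrative numbers): with RT_HASH_LOCK_SZ ==
 * 256, rt_hash_lock_addr() masks the bucket index with 255, so buckets 7,
 * 263 and 519 all share lock 7; many buckets map to one lock, which keeps
 * the lock table small while still spreading writer contention.
 */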
254 
255 static struct rt_hash_bucket 	*rt_hash_table;
256 static unsigned			rt_hash_mask;
257 static unsigned int		rt_hash_log;
258 static atomic_t			rt_genid;
259 
260 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
261 #define RT_CACHE_STAT_INC(field) \
262 	(__raw_get_cpu_var(rt_cache_stat).field++)
263 
264 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
265 {
266 	return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
267 		& rt_hash_mask;
268 }
269 
270 #define rt_hash(daddr, saddr, idx) \
271 	rt_hash_code((__force u32)(__be32)(daddr),\
272 		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
273 
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 	int bucket;
277 	int genid;
278 };
279 
280 static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
281 {
282 	struct rtable *r = NULL;
283 
284 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285 		rcu_read_lock_bh();
286 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
287 		while (r) {
288 			if (r->rt_genid == st->genid)
289 				return r;
290 			r = rcu_dereference(r->u.dst.rt_next);
291 		}
292 		rcu_read_unlock_bh();
293 	}
294 	return r;
295 }
296 
297 static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st, struct rtable *r)
298 {
299 	r = r->u.dst.rt_next;
300 	while (!r) {
301 		rcu_read_unlock_bh();
302 		if (--st->bucket < 0)
303 			break;
304 		rcu_read_lock_bh();
305 		r = rt_hash_table[st->bucket].chain;
306 	}
307 	return rcu_dereference(r);
308 }
309 
310 static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
311 {
312 	struct rtable *r = rt_cache_get_first(st);
313 
314 	if (r)
315 		while (pos && (r = rt_cache_get_next(st, r))) {
316 			if (r->rt_genid != st->genid)
317 				continue;
318 			--pos;
319 		}
320 	return pos ? NULL : r;
321 }
322 
323 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
324 {
325 	struct rt_cache_iter_state *st = seq->private;
326 
327 	if (*pos)
328 		return rt_cache_get_idx(st, *pos - 1);
329 	st->genid = atomic_read(&rt_genid);
330 	return SEQ_START_TOKEN;
331 }
332 
333 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
334 {
335 	struct rtable *r;
336 	struct rt_cache_iter_state *st = seq->private;
337 
338 	if (v == SEQ_START_TOKEN)
339 		r = rt_cache_get_first(st);
340 	else
341 		r = rt_cache_get_next(st, v);
342 	++*pos;
343 	return r;
344 }
345 
346 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
347 {
348 	if (v && v != SEQ_START_TOKEN)
349 		rcu_read_unlock_bh();
350 }
351 
352 static int rt_cache_seq_show(struct seq_file *seq, void *v)
353 {
354 	if (v == SEQ_START_TOKEN)
355 		seq_printf(seq, "%-127s\n",
356 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
357 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
358 			   "HHUptod\tSpecDst");
359 	else {
360 		struct rtable *r = v;
361 		char temp[256];
362 
363 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
364 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
365 			r->u.dst.dev ? r->u.dst.dev->name : "*",
366 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
367 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
368 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
369 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
370 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
371 			dst_metric(&r->u.dst, RTAX_WINDOW),
372 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
373 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
374 			r->fl.fl4_tos,
375 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
376 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
377 				       dev_queue_xmit) : 0,
378 			r->rt_spec_dst);
379 		seq_printf(seq, "%-127s\n", temp);
380 	}
381 	return 0;
382 }
383 
384 static const struct seq_operations rt_cache_seq_ops = {
385 	.start  = rt_cache_seq_start,
386 	.next   = rt_cache_seq_next,
387 	.stop   = rt_cache_seq_stop,
388 	.show   = rt_cache_seq_show,
389 };
390 
391 static int rt_cache_seq_open(struct inode *inode, struct file *file)
392 {
393 	return seq_open_private(file, &rt_cache_seq_ops,
394 			sizeof(struct rt_cache_iter_state));
395 }
396 
397 static const struct file_operations rt_cache_seq_fops = {
398 	.owner	 = THIS_MODULE,
399 	.open	 = rt_cache_seq_open,
400 	.read	 = seq_read,
401 	.llseek	 = seq_lseek,
402 	.release = seq_release_private,
403 };
404 
405 
406 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
407 {
408 	int cpu;
409 
410 	if (*pos == 0)
411 		return SEQ_START_TOKEN;
412 
413 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
414 		if (!cpu_possible(cpu))
415 			continue;
416 		*pos = cpu+1;
417 		return &per_cpu(rt_cache_stat, cpu);
418 	}
419 	return NULL;
420 }
421 
422 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
423 {
424 	int cpu;
425 
426 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
427 		if (!cpu_possible(cpu))
428 			continue;
429 		*pos = cpu+1;
430 		return &per_cpu(rt_cache_stat, cpu);
431 	}
432 	return NULL;
433 
434 }
435 
436 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
437 {
438 
439 }
440 
441 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
442 {
443 	struct rt_cache_stat *st = v;
444 
445 	if (v == SEQ_START_TOKEN) {
446 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
447 		return 0;
448 	}
449 
450 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
451 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
452 		   atomic_read(&ipv4_dst_ops.entries),
453 		   st->in_hit,
454 		   st->in_slow_tot,
455 		   st->in_slow_mc,
456 		   st->in_no_route,
457 		   st->in_brd,
458 		   st->in_martian_dst,
459 		   st->in_martian_src,
460 
461 		   st->out_hit,
462 		   st->out_slow_tot,
463 		   st->out_slow_mc,
464 
465 		   st->gc_total,
466 		   st->gc_ignored,
467 		   st->gc_goal_miss,
468 		   st->gc_dst_overflow,
469 		   st->in_hlist_search,
470 		   st->out_hlist_search
471 		);
472 	return 0;
473 }
474 
475 static const struct seq_operations rt_cpu_seq_ops = {
476 	.start  = rt_cpu_seq_start,
477 	.next   = rt_cpu_seq_next,
478 	.stop   = rt_cpu_seq_stop,
479 	.show   = rt_cpu_seq_show,
480 };
481 
482 
483 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
484 {
485 	return seq_open(file, &rt_cpu_seq_ops);
486 }
487 
488 static const struct file_operations rt_cpu_seq_fops = {
489 	.owner	 = THIS_MODULE,
490 	.open	 = rt_cpu_seq_open,
491 	.read	 = seq_read,
492 	.llseek	 = seq_lseek,
493 	.release = seq_release,
494 };
495 
496 #ifdef CONFIG_NET_CLS_ROUTE
497 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
498 			   int length, int *eof, void *data)
499 {
500 	unsigned int i;
501 
502 	if ((offset & 3) || (length & 3))
503 		return -EIO;
504 
505 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
506 		*eof = 1;
507 		return 0;
508 	}
509 
510 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
511 		length = sizeof(struct ip_rt_acct) * 256 - offset;
512 		*eof = 1;
513 	}
514 
515 	offset /= sizeof(u32);
516 
517 	if (length > 0) {
518 		u32 *dst = (u32 *) buffer;
519 
520 		*start = buffer;
521 		memset(dst, 0, length);
522 
523 		for_each_possible_cpu(i) {
524 			unsigned int j;
525 			u32 *src;
526 
527 			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
528 			for (j = 0; j < length/4; j++)
529 				dst[j] += src[j];
530 		}
531 	}
532 	return length;
533 }
534 #endif
535 
536 static __init int ip_rt_proc_init(struct net *net)
537 {
538 	struct proc_dir_entry *pde;
539 
540 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
541 			&rt_cache_seq_fops);
542 	if (!pde)
543 		goto err1;
544 
545 	pde = proc_create("rt_cache", S_IRUGO,
546 			  net->proc_net_stat, &rt_cpu_seq_fops);
547 	if (!pde)
548 		goto err2;
549 
550 #ifdef CONFIG_NET_CLS_ROUTE
551 	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
552 			ip_rt_acct_read, NULL);
553 	if (!pde)
554 		goto err3;
555 #endif
556 	return 0;
557 
558 #ifdef CONFIG_NET_CLS_ROUTE
559 err3:
560 	remove_proc_entry("rt_cache", net->proc_net_stat);
561 #endif
562 err2:
563 	remove_proc_entry("rt_cache", net->proc_net);
564 err1:
565 	return -ENOMEM;
566 }
567 #else
568 static inline int ip_rt_proc_init(struct net *net)
569 {
570 	return 0;
571 }
572 #endif /* CONFIG_PROC_FS */
573 
574 static __inline__ void rt_free(struct rtable *rt)
575 {
576 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
577 }
578 
579 static __inline__ void rt_drop(struct rtable *rt)
580 {
581 	ip_rt_put(rt);
582 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
583 }
584 
585 static __inline__ int rt_fast_clean(struct rtable *rth)
586 {
587 	/* Kill broadcast/multicast entries very aggressively if they
588 	   collide in the hash table with more useful entries */
589 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
590 		rth->fl.iif && rth->u.dst.rt_next;
591 }
592 
593 static __inline__ int rt_valuable(struct rtable *rth)
594 {
595 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
596 		rth->u.dst.expires;
597 }
598 
599 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
600 {
601 	unsigned long age;
602 	int ret = 0;
603 
604 	if (atomic_read(&rth->u.dst.__refcnt))
605 		goto out;
606 
607 	ret = 1;
608 	if (rth->u.dst.expires &&
609 	    time_after_eq(jiffies, rth->u.dst.expires))
610 		goto out;
611 
612 	age = jiffies - rth->u.dst.lastuse;
613 	ret = 0;
614 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
615 	    (age <= tmo2 && rt_valuable(rth)))
616 		goto out;
617 	ret = 1;
618 out:	return ret;
619 }
620 
621 /* Bits of score are:
622  * 31: very valuable
623  * 30: not quite useless
624  * 29..0: usage counter
625  */
626 static inline u32 rt_score(struct rtable *rt)
627 {
628 	u32 score = jiffies - rt->u.dst.lastuse;
629 
630 	score = ~score & ~(3<<30);
631 
632 	if (rt_valuable(rt))
633 		score |= (1<<31);
634 
635 	if (!rt->fl.iif ||
636 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
637 		score |= (1<<30);
638 
639 	return score;
640 }
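/*
 * Worked example (illustrative): in rt_intern_hash() below, the unreferenced
 * entry with the lowest score is the eviction candidate.  A long-idle input
 * broadcast route with no expiry and no REDIRECTED/NOTIFY flag keeps bits 30
 * and 31 clear and has a small low-order value, so it scores lowest; a
 * recently used output route with a pending expiry sets both high bits and
 * is kept.
 */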
641 
642 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
643 {
644 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
645 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
646 		(fl1->mark ^ fl2->mark) |
647 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
648 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
649 		(fl1->oif ^ fl2->oif) |
650 		(fl1->iif ^ fl2->iif)) == 0;
651 }
652 
653 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
654 {
655 	return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
656 }
657 
658 /*
659  * Perform a full scan of the hash table and free all entries.
660  * Can be called by a softirq or a process.
661  * In the latter case, we want to reschedule if necessary.
662  */
663 static void rt_do_flush(int process_context)
664 {
665 	unsigned int i;
666 	struct rtable *rth, *next;
667 
668 	for (i = 0; i <= rt_hash_mask; i++) {
669 		if (process_context && need_resched())
670 			cond_resched();
671 		rth = rt_hash_table[i].chain;
672 		if (!rth)
673 			continue;
674 
675 		spin_lock_bh(rt_hash_lock_addr(i));
676 		rth = rt_hash_table[i].chain;
677 		rt_hash_table[i].chain = NULL;
678 		spin_unlock_bh(rt_hash_lock_addr(i));
679 
680 		for (; rth; rth = next) {
681 			next = rth->u.dst.rt_next;
682 			rt_free(rth);
683 		}
684 	}
685 }
686 
687 static void rt_check_expire(void)
688 {
689 	static unsigned int rover;
690 	unsigned int i = rover, goal;
691 	struct rtable *rth, **rthp;
692 	u64 mult;
693 
694 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
695 	if (ip_rt_gc_timeout > 1)
696 		do_div(mult, ip_rt_gc_timeout);
697 	goal = (unsigned int)mult;
698 	if (goal > rt_hash_mask)
699 		goal = rt_hash_mask + 1;
700 	for (; goal > 0; goal--) {
701 		unsigned long tmo = ip_rt_gc_timeout;
702 
703 		i = (i + 1) & rt_hash_mask;
704 		rthp = &rt_hash_table[i].chain;
705 
706 		if (need_resched())
707 			cond_resched();
708 
709 		if (*rthp == NULL)
710 			continue;
711 		spin_lock_bh(rt_hash_lock_addr(i));
712 		while ((rth = *rthp) != NULL) {
713 			if (rth->rt_genid != atomic_read(&rt_genid)) {
714 				*rthp = rth->u.dst.rt_next;
715 				rt_free(rth);
716 				continue;
717 			}
718 			if (rth->u.dst.expires) {
719 				/* Entry is expired even if it is in use */
720 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
721 					tmo >>= 1;
722 					rthp = &rth->u.dst.rt_next;
723 					continue;
724 				}
725 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
726 				tmo >>= 1;
727 				rthp = &rth->u.dst.rt_next;
728 				continue;
729 			}
730 
731 			/* Cleanup aged off entries. */
732 			*rthp = rth->u.dst.rt_next;
733 			rt_free(rth);
734 		}
735 		spin_unlock_bh(rt_hash_lock_addr(i));
736 	}
737 	rover = i;
738 }
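/*
 * For illustration, with the defaults above (ip_rt_gc_interval = 60*HZ and
 * ip_rt_gc_timeout = 300*HZ) the goal computed here is one fifth of the hash
 * buckets per run; since rt_worker_func() reschedules itself every
 * ip_rt_gc_interval, the whole table is walked roughly once per
 * ip_rt_gc_timeout.
 */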
739 
740 /*
741  * rt_worker_func() is run in process context.
742  * We call rt_check_expire() to scan part of the hash table.
743  */
744 static void rt_worker_func(struct work_struct *work)
745 {
746 	rt_check_expire();
747 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
748 }
749 
750 /*
751  * Perturbation of rt_genid by a small quantity [1..256].
752  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
753  * many times (2^24) without reusing a recent rt_genid.
754  * Jenkins hash is strong enough that little changes of rt_genid are OK.
755  */
756 static void rt_cache_invalidate(void)
757 {
758 	unsigned char shuffle;
759 
760 	get_random_bytes(&shuffle, sizeof(shuffle));
761 	atomic_add(shuffle + 1U, &rt_genid);
762 }
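/*
 * Arithmetic behind the comment above: "shuffle" is an unsigned char, so
 * rt_genid advances by a value in [1..256] each time; wrapping the 32-bit
 * counter back onto a recently used generation therefore needs on the order
 * of 2^32 / 2^8 = 2^24 invalidations.
 */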
763 
764 /*
765  * delay < 0  : invalidate cache (fast : entries will be deleted later)
766  * delay >= 0 : invalidate & flush cache (can be long)
767  */
768 void rt_cache_flush(int delay)
769 {
770 	rt_cache_invalidate();
771 	if (delay >= 0)
772 		rt_do_flush(!in_softirq());
773 }
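/*
 * Usage sketch (illustrative): rt_cache_flush(-1) only bumps rt_genid, so
 * stale entries are reaped lazily by lookups and the garbage collector;
 * rt_cache_flush(0) additionally walks every hash chain via rt_do_flush(),
 * which can take a while on a large cache.
 */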
774 
775 /*
776  * We change rt_genid and let gc do the cleanup
777  */
778 static void rt_secret_rebuild(unsigned long dummy)
779 {
780 	rt_cache_invalidate();
781 	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
782 }
783 
784 /*
785    Short description of GC goals.
786 
787    We want to build an algorithm which keeps the routing cache
788    at some equilibrium point, where the number of aged-off entries
789    is kept approximately equal to the number of newly generated ones.
790 
791    The current expiration strength is the variable "expire".
792    We try to adjust it dynamically, so that when the network
793    is idle "expire" is large enough to keep enough warm entries,
794    and when load increases it shrinks to limit the cache size.
795  */
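/*
 * Worked example (illustrative numbers): with ip_rt_gc_elasticity = 8 and a
 * hypothetical hash table of 2^17 buckets (rt_hash_log = 17), aggressive
 * trimming only starts once the cache holds more than 8 << 17 = 1,048,576
 * entries; below that, gc merely nudges the size toward the gc_thresh-based
 * equilibrium.
 */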
796 
797 static int rt_garbage_collect(struct dst_ops *ops)
798 {
799 	static unsigned long expire = RT_GC_TIMEOUT;
800 	static unsigned long last_gc;
801 	static int rover;
802 	static int equilibrium;
803 	struct rtable *rth, **rthp;
804 	unsigned long now = jiffies;
805 	int goal;
806 
807 	/*
808 	 * Garbage collection is pretty expensive,
809 	 * do not run it too frequently.
810 	 */
811 
812 	RT_CACHE_STAT_INC(gc_total);
813 
814 	if (now - last_gc < ip_rt_gc_min_interval &&
815 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
816 		RT_CACHE_STAT_INC(gc_ignored);
817 		goto out;
818 	}
819 
820 	/* Calculate the number of entries which we want to expire now. */
821 	goal = atomic_read(&ipv4_dst_ops.entries) -
822 		(ip_rt_gc_elasticity << rt_hash_log);
823 	if (goal <= 0) {
824 		if (equilibrium < ipv4_dst_ops.gc_thresh)
825 			equilibrium = ipv4_dst_ops.gc_thresh;
826 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
827 		if (goal > 0) {
828 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
829 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
830 		}
831 	} else {
832 		/* We are in a dangerous area. Try to reduce the cache really
833 		 * aggressively.
834 		 */
835 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
836 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
837 	}
838 
839 	if (now - last_gc >= ip_rt_gc_min_interval)
840 		last_gc = now;
841 
842 	if (goal <= 0) {
843 		equilibrium += goal;
844 		goto work_done;
845 	}
846 
847 	do {
848 		int i, k;
849 
850 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
851 			unsigned long tmo = expire;
852 
853 			k = (k + 1) & rt_hash_mask;
854 			rthp = &rt_hash_table[k].chain;
855 			spin_lock_bh(rt_hash_lock_addr(k));
856 			while ((rth = *rthp) != NULL) {
857 				if (rth->rt_genid == atomic_read(&rt_genid) &&
858 					!rt_may_expire(rth, tmo, expire)) {
859 					tmo >>= 1;
860 					rthp = &rth->u.dst.rt_next;
861 					continue;
862 				}
863 				*rthp = rth->u.dst.rt_next;
864 				rt_free(rth);
865 				goal--;
866 			}
867 			spin_unlock_bh(rt_hash_lock_addr(k));
868 			if (goal <= 0)
869 				break;
870 		}
871 		rover = k;
872 
873 		if (goal <= 0)
874 			goto work_done;
875 
876 		/* Goal is not achieved. We stop the process if:
877 
878 		   - expire has been reduced to zero (otherwise, expire is halved).
879 		   - the table is not full.
880 		   - we are called from interrupt context.
881 		   - the jiffies check is just a fallback/debug loop breaker;
882 		     we will not spin here for a long time in any case.
883 		 */
884 
885 		RT_CACHE_STAT_INC(gc_goal_miss);
886 
887 		if (expire == 0)
888 			break;
889 
890 		expire >>= 1;
891 #if RT_CACHE_DEBUG >= 2
892 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
893 				atomic_read(&ipv4_dst_ops.entries), goal, i);
894 #endif
895 
896 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
897 			goto out;
898 	} while (!in_softirq() && time_before_eq(jiffies, now));
899 
900 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
901 		goto out;
902 	if (net_ratelimit())
903 		printk(KERN_WARNING "dst cache overflow\n");
904 	RT_CACHE_STAT_INC(gc_dst_overflow);
905 	return 1;
906 
907 work_done:
908 	expire += ip_rt_gc_min_interval;
909 	if (expire > ip_rt_gc_timeout ||
910 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
911 		expire = ip_rt_gc_timeout;
912 #if RT_CACHE_DEBUG >= 2
913 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
914 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
915 #endif
916 out:	return 0;
917 }
918 
919 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
920 {
921 	struct rtable	*rth, **rthp;
922 	unsigned long	now;
923 	struct rtable *cand, **candp;
924 	u32 		min_score;
925 	int		chain_length;
926 	int attempts = !in_softirq();
927 
928 restart:
929 	chain_length = 0;
930 	min_score = ~(u32)0;
931 	cand = NULL;
932 	candp = NULL;
933 	now = jiffies;
934 
935 	rthp = &rt_hash_table[hash].chain;
936 
937 	spin_lock_bh(rt_hash_lock_addr(hash));
938 	while ((rth = *rthp) != NULL) {
939 		if (rth->rt_genid != atomic_read(&rt_genid)) {
940 			*rthp = rth->u.dst.rt_next;
941 			rt_free(rth);
942 			continue;
943 		}
944 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
945 			/* Put it first */
946 			*rthp = rth->u.dst.rt_next;
947 			/*
948 			 * Since lookup is lockfree, the deletion
949 			 * must be visible to another weakly ordered CPU before
950 			 * the insertion at the start of the hash chain.
951 			 */
952 			rcu_assign_pointer(rth->u.dst.rt_next,
953 					   rt_hash_table[hash].chain);
954 			/*
955 			 * Since lookup is lockfree, the update writes
956 			 * must be ordered for consistency on SMP.
957 			 */
958 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
959 
960 			dst_use(&rth->u.dst, now);
961 			spin_unlock_bh(rt_hash_lock_addr(hash));
962 
963 			rt_drop(rt);
964 			*rp = rth;
965 			return 0;
966 		}
967 
968 		if (!atomic_read(&rth->u.dst.__refcnt)) {
969 			u32 score = rt_score(rth);
970 
971 			if (score <= min_score) {
972 				cand = rth;
973 				candp = rthp;
974 				min_score = score;
975 			}
976 		}
977 
978 		chain_length++;
979 
980 		rthp = &rth->u.dst.rt_next;
981 	}
982 
983 	if (cand) {
984 		/* ip_rt_gc_elasticity used to be the average chain
985 		 * length; when it is exceeded, gc becomes really aggressive.
986 		 *
987 		 * The second limit is less certain. At the moment it allows
988 		 * only 2 entries per bucket. We will see.
989 		 */
990 		if (chain_length > ip_rt_gc_elasticity) {
991 			*candp = cand->u.dst.rt_next;
992 			rt_free(cand);
993 		}
994 	}
995 
996 	/* Try to bind the route to an ARP neighbour only if it is an output
997 	   route or on the unicast forwarding path.
998 	 */
999 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1000 		int err = arp_bind_neighbour(&rt->u.dst);
1001 		if (err) {
1002 			spin_unlock_bh(rt_hash_lock_addr(hash));
1003 
1004 			if (err != -ENOBUFS) {
1005 				rt_drop(rt);
1006 				return err;
1007 			}
1008 
1009 			/* Neighbour tables are full and nothing
1010 			   can be released. Try to shrink the route cache;
1011 			   it most likely holds some neighbour records.
1012 			 */
1013 			if (attempts-- > 0) {
1014 				int saved_elasticity = ip_rt_gc_elasticity;
1015 				int saved_int = ip_rt_gc_min_interval;
1016 				ip_rt_gc_elasticity	= 1;
1017 				ip_rt_gc_min_interval	= 0;
1018 				rt_garbage_collect(&ipv4_dst_ops);
1019 				ip_rt_gc_min_interval	= saved_int;
1020 				ip_rt_gc_elasticity	= saved_elasticity;
1021 				goto restart;
1022 			}
1023 
1024 			if (net_ratelimit())
1025 				printk(KERN_WARNING "Neighbour table overflow.\n");
1026 			rt_drop(rt);
1027 			return -ENOBUFS;
1028 		}
1029 	}
1030 
1031 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1032 #if RT_CACHE_DEBUG >= 2
1033 	if (rt->u.dst.rt_next) {
1034 		struct rtable *trt;
1035 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1036 		       NIPQUAD(rt->rt_dst));
1037 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1038 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1039 		printk("\n");
1040 	}
1041 #endif
1042 	rt_hash_table[hash].chain = rt;
1043 	spin_unlock_bh(rt_hash_lock_addr(hash));
1044 	*rp = rt;
1045 	return 0;
1046 }
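/*
 * Typical caller pattern, as used elsewhere in this file (sketch only):
 *
 *	hash = rt_hash(daddr, saddr, iif_or_oif);
 *	err = rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);
 *
 * On success the result pointer is set either to the freshly inserted entry
 * or to an equivalent entry that was already cached, in which case the new
 * one is dropped.
 */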
1047 
1048 void rt_bind_peer(struct rtable *rt, int create)
1049 {
1050 	static DEFINE_SPINLOCK(rt_peer_lock);
1051 	struct inet_peer *peer;
1052 
1053 	peer = inet_getpeer(rt->rt_dst, create);
1054 
1055 	spin_lock_bh(&rt_peer_lock);
1056 	if (rt->peer == NULL) {
1057 		rt->peer = peer;
1058 		peer = NULL;
1059 	}
1060 	spin_unlock_bh(&rt_peer_lock);
1061 	if (peer)
1062 		inet_putpeer(peer);
1063 }
1064 
1065 /*
1066  * Peer allocation may fail only in serious out-of-memory conditions.  However
1067  * we can still generate some output.
1068  * Random ID selection looks a bit dangerous because we have no chance of
1069  * selecting an ID that is unique within a reasonable period of time.
1070  * But a broken packet identifier may be better than no packet at all.
1071  */
1072 static void ip_select_fb_ident(struct iphdr *iph)
1073 {
1074 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1075 	static u32 ip_fallback_id;
1076 	u32 salt;
1077 
1078 	spin_lock_bh(&ip_fb_id_lock);
1079 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1080 	iph->id = htons(salt & 0xFFFF);
1081 	ip_fallback_id = salt;
1082 	spin_unlock_bh(&ip_fb_id_lock);
1083 }
1084 
1085 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1086 {
1087 	struct rtable *rt = (struct rtable *) dst;
1088 
1089 	if (rt) {
1090 		if (rt->peer == NULL)
1091 			rt_bind_peer(rt, 1);
1092 
1093 		/* If peer is attached to destination, it is never detached,
1094 		   so we need not grab a lock to dereference it.
1095 		 */
1096 		if (rt->peer) {
1097 			iph->id = htons(inet_getid(rt->peer, more));
1098 			return;
1099 		}
1100 	} else
1101 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1102 		       __builtin_return_address(0));
1103 
1104 	ip_select_fb_ident(iph);
1105 }
1106 
1107 static void rt_del(unsigned hash, struct rtable *rt)
1108 {
1109 	struct rtable **rthp, *aux;
1110 
1111 	rthp = &rt_hash_table[hash].chain;
1112 	spin_lock_bh(rt_hash_lock_addr(hash));
1113 	ip_rt_put(rt);
1114 	while ((aux = *rthp) != NULL) {
1115 		if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1116 			*rthp = aux->u.dst.rt_next;
1117 			rt_free(aux);
1118 			continue;
1119 		}
1120 		rthp = &aux->u.dst.rt_next;
1121 	}
1122 	spin_unlock_bh(rt_hash_lock_addr(hash));
1123 }
1124 
1125 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1126 		    __be32 saddr, struct net_device *dev)
1127 {
1128 	int i, k;
1129 	struct in_device *in_dev = in_dev_get(dev);
1130 	struct rtable *rth, **rthp;
1131 	__be32  skeys[2] = { saddr, 0 };
1132 	int  ikeys[2] = { dev->ifindex, 0 };
1133 	struct netevent_redirect netevent;
1134 
1135 	if (!in_dev)
1136 		return;
1137 
1138 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1139 	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1140 	    || ipv4_is_zeronet(new_gw))
1141 		goto reject_redirect;
1142 
1143 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1144 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1145 			goto reject_redirect;
1146 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1147 			goto reject_redirect;
1148 	} else {
1149 		if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
1150 			goto reject_redirect;
1151 	}
1152 
1153 	for (i = 0; i < 2; i++) {
1154 		for (k = 0; k < 2; k++) {
1155 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1156 
1157 			rthp=&rt_hash_table[hash].chain;
1158 
1159 			rcu_read_lock();
1160 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1161 				struct rtable *rt;
1162 
1163 				if (rth->fl.fl4_dst != daddr ||
1164 				    rth->fl.fl4_src != skeys[i] ||
1165 				    rth->fl.oif != ikeys[k] ||
1166 				    rth->fl.iif != 0 ||
1167 				    rth->rt_genid != atomic_read(&rt_genid)) {
1168 					rthp = &rth->u.dst.rt_next;
1169 					continue;
1170 				}
1171 
1172 				if (rth->rt_dst != daddr ||
1173 				    rth->rt_src != saddr ||
1174 				    rth->u.dst.error ||
1175 				    rth->rt_gateway != old_gw ||
1176 				    rth->u.dst.dev != dev)
1177 					break;
1178 
1179 				dst_hold(&rth->u.dst);
1180 				rcu_read_unlock();
1181 
1182 				rt = dst_alloc(&ipv4_dst_ops);
1183 				if (rt == NULL) {
1184 					ip_rt_put(rth);
1185 					in_dev_put(in_dev);
1186 					return;
1187 				}
1188 
1189 				/* Copy all the information. */
1190 				*rt = *rth;
1191 				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1192 				rt->u.dst.__use		= 1;
1193 				atomic_set(&rt->u.dst.__refcnt, 1);
1194 				rt->u.dst.child		= NULL;
1195 				if (rt->u.dst.dev)
1196 					dev_hold(rt->u.dst.dev);
1197 				if (rt->idev)
1198 					in_dev_hold(rt->idev);
1199 				rt->u.dst.obsolete	= 0;
1200 				rt->u.dst.lastuse	= jiffies;
1201 				rt->u.dst.path		= &rt->u.dst;
1202 				rt->u.dst.neighbour	= NULL;
1203 				rt->u.dst.hh		= NULL;
1204 				rt->u.dst.xfrm		= NULL;
1205 				rt->rt_genid		= atomic_read(&rt_genid);
1206 				rt->rt_flags		|= RTCF_REDIRECTED;
1207 
1208 				/* Gateway is different ... */
1209 				rt->rt_gateway		= new_gw;
1210 
1211 				/* Redirect received -> path was valid */
1212 				dst_confirm(&rth->u.dst);
1213 
1214 				if (rt->peer)
1215 					atomic_inc(&rt->peer->refcnt);
1216 
1217 				if (arp_bind_neighbour(&rt->u.dst) ||
1218 				    !(rt->u.dst.neighbour->nud_state &
1219 					    NUD_VALID)) {
1220 					if (rt->u.dst.neighbour)
1221 						neigh_event_send(rt->u.dst.neighbour, NULL);
1222 					ip_rt_put(rth);
1223 					rt_drop(rt);
1224 					goto do_next;
1225 				}
1226 
1227 				netevent.old = &rth->u.dst;
1228 				netevent.new = &rt->u.dst;
1229 				call_netevent_notifiers(NETEVENT_REDIRECT,
1230 							&netevent);
1231 
1232 				rt_del(hash, rth);
1233 				if (!rt_intern_hash(hash, rt, &rt))
1234 					ip_rt_put(rt);
1235 				goto do_next;
1236 			}
1237 			rcu_read_unlock();
1238 		do_next:
1239 			;
1240 		}
1241 	}
1242 	in_dev_put(in_dev);
1243 	return;
1244 
1245 reject_redirect:
1246 #ifdef CONFIG_IP_ROUTE_VERBOSE
1247 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1248 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1249 			"%u.%u.%u.%u ignored.\n"
1250 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1251 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1252 		       NIPQUAD(saddr), NIPQUAD(daddr));
1253 #endif
1254 	in_dev_put(in_dev);
1255 }
1256 
1257 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1258 {
1259 	struct rtable *rt = (struct rtable*)dst;
1260 	struct dst_entry *ret = dst;
1261 
1262 	if (rt) {
1263 		if (dst->obsolete) {
1264 			ip_rt_put(rt);
1265 			ret = NULL;
1266 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1267 			   rt->u.dst.expires) {
1268 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1269 						rt->fl.oif);
1270 #if RT_CACHE_DEBUG >= 1
1271 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1272 					  "%u.%u.%u.%u/%02x dropped\n",
1273 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1274 #endif
1275 			rt_del(hash, rt);
1276 			ret = NULL;
1277 		}
1278 	}
1279 	return ret;
1280 }
1281 
1282 /*
1283  * Algorithm:
1284  *	1. The first ip_rt_redirect_number redirects are sent
1285  *	   with exponential backoff, then we stop sending them at all,
1286  *	   assuming that the host ignores our redirects.
1287  *	2. If we did not see packets requiring redirects
1288  *	   during ip_rt_redirect_silence, we assume that the host
1289  *	   forgot redirected route and start to send redirects again.
1290  *
1291  * This algorithm is much cheaper and more intelligent than dumb load limiting
1292  * in icmp.c.
1293  *
1294  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1295  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1296  */
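/*
 * Worked example with the defaults above (illustrative): the first redirect
 * for a destination is sent immediately; subsequent ones are spaced at least
 * ip_rt_redirect_load << rate_tokens apart, i.e. roughly 40 ms, 80 ms, ...,
 * up to about 5 s before the 9th.  Once ip_rt_redirect_number redirects have
 * been ignored we stay silent until ip_rt_redirect_silence (about 20 s) has
 * passed without redirect-worthy traffic.
 */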
1297 
1298 void ip_rt_send_redirect(struct sk_buff *skb)
1299 {
1300 	struct rtable *rt = (struct rtable*)skb->dst;
1301 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1302 
1303 	if (!in_dev)
1304 		return;
1305 
1306 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1307 		goto out;
1308 
1309 	/* No redirected packets during ip_rt_redirect_silence;
1310 	 * reset the algorithm.
1311 	 */
1312 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1313 		rt->u.dst.rate_tokens = 0;
1314 
1315 	/* Too many ignored redirects; do not send anything.
1316 	 * Set u.dst.rate_last to the last seen redirected packet.
1317 	 */
1318 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1319 		rt->u.dst.rate_last = jiffies;
1320 		goto out;
1321 	}
1322 
1323 	/* Check for load limit; set rate_last to the latest sent
1324 	 * redirect.
1325 	 */
1326 	if (rt->u.dst.rate_tokens == 0 ||
1327 	    time_after(jiffies,
1328 		       (rt->u.dst.rate_last +
1329 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1330 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1331 		rt->u.dst.rate_last = jiffies;
1332 		++rt->u.dst.rate_tokens;
1333 #ifdef CONFIG_IP_ROUTE_VERBOSE
1334 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1335 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1336 		    net_ratelimit())
1337 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1338 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1339 				NIPQUAD(rt->rt_src), rt->rt_iif,
1340 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1341 #endif
1342 	}
1343 out:
1344 	in_dev_put(in_dev);
1345 }
1346 
1347 static int ip_error(struct sk_buff *skb)
1348 {
1349 	struct rtable *rt = (struct rtable*)skb->dst;
1350 	unsigned long now;
1351 	int code;
1352 
1353 	switch (rt->u.dst.error) {
1354 		case EINVAL:
1355 		default:
1356 			goto out;
1357 		case EHOSTUNREACH:
1358 			code = ICMP_HOST_UNREACH;
1359 			break;
1360 		case ENETUNREACH:
1361 			code = ICMP_NET_UNREACH;
1362 			IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1363 			break;
1364 		case EACCES:
1365 			code = ICMP_PKT_FILTERED;
1366 			break;
1367 	}
1368 
1369 	now = jiffies;
1370 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1371 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1372 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1373 	rt->u.dst.rate_last = now;
1374 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1375 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1376 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1377 	}
1378 
1379 out:	kfree_skb(skb);
1380 	return 0;
1381 }
1382 
1383 /*
1384  *	The last two values are not from the RFC but
1385  *	are needed for AMPRnet AX.25 paths.
1386  */
1387 
1388 static const unsigned short mtu_plateau[] =
1389 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1390 
1391 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1392 {
1393 	int i;
1394 
1395 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1396 		if (old_mtu > mtu_plateau[i])
1397 			return mtu_plateau[i];
1398 	return 68;
1399 }
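/*
 * Worked examples (illustrative): guess_mtu(1500) returns 1492 (the first
 * plateau strictly below the old MTU), guess_mtu(1492) returns 576, and
 * anything at or below 128 falls through to the minimum of 68.
 */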
1400 
1401 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1402 				 unsigned short new_mtu)
1403 {
1404 	int i;
1405 	unsigned short old_mtu = ntohs(iph->tot_len);
1406 	struct rtable *rth;
1407 	__be32  skeys[2] = { iph->saddr, 0, };
1408 	__be32  daddr = iph->daddr;
1409 	unsigned short est_mtu = 0;
1410 
1411 	if (ipv4_config.no_pmtu_disc)
1412 		return 0;
1413 
1414 	for (i = 0; i < 2; i++) {
1415 		unsigned hash = rt_hash(daddr, skeys[i], 0);
1416 
1417 		rcu_read_lock();
1418 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1419 		     rth = rcu_dereference(rth->u.dst.rt_next)) {
1420 			if (rth->fl.fl4_dst == daddr &&
1421 			    rth->fl.fl4_src == skeys[i] &&
1422 			    rth->rt_dst  == daddr &&
1423 			    rth->rt_src  == iph->saddr &&
1424 			    rth->fl.iif == 0 &&
1425 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1426 			    rth->u.dst.dev->nd_net == net &&
1427 			    rth->rt_genid == atomic_read(&rt_genid)) {
1428 				unsigned short mtu = new_mtu;
1429 
1430 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1431 
1432 					/* BSD 4.2 compatibility hack :-( */
1433 					if (mtu == 0 &&
1434 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1435 					    old_mtu >= 68 + (iph->ihl << 2))
1436 						old_mtu -= iph->ihl << 2;
1437 
1438 					mtu = guess_mtu(old_mtu);
1439 				}
1440 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1441 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1442 						dst_confirm(&rth->u.dst);
1443 						if (mtu < ip_rt_min_pmtu) {
1444 							mtu = ip_rt_min_pmtu;
1445 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1446 								(1 << RTAX_MTU);
1447 						}
1448 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1449 						dst_set_expires(&rth->u.dst,
1450 							ip_rt_mtu_expires);
1451 					}
1452 					est_mtu = mtu;
1453 				}
1454 			}
1455 		}
1456 		rcu_read_unlock();
1457 	}
1458 	return est_mtu ? : new_mtu;
1459 }
1460 
1461 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1462 {
1463 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1464 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1465 		if (mtu < ip_rt_min_pmtu) {
1466 			mtu = ip_rt_min_pmtu;
1467 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1468 		}
1469 		dst->metrics[RTAX_MTU-1] = mtu;
1470 		dst_set_expires(dst, ip_rt_mtu_expires);
1471 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1472 	}
1473 }
1474 
1475 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1476 {
1477 	return NULL;
1478 }
1479 
1480 static void ipv4_dst_destroy(struct dst_entry *dst)
1481 {
1482 	struct rtable *rt = (struct rtable *) dst;
1483 	struct inet_peer *peer = rt->peer;
1484 	struct in_device *idev = rt->idev;
1485 
1486 	if (peer) {
1487 		rt->peer = NULL;
1488 		inet_putpeer(peer);
1489 	}
1490 
1491 	if (idev) {
1492 		rt->idev = NULL;
1493 		in_dev_put(idev);
1494 	}
1495 }
1496 
1497 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1498 			    int how)
1499 {
1500 	struct rtable *rt = (struct rtable *) dst;
1501 	struct in_device *idev = rt->idev;
1502 	if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1503 		struct in_device *loopback_idev =
1504 			in_dev_get(dev->nd_net->loopback_dev);
1505 		if (loopback_idev) {
1506 			rt->idev = loopback_idev;
1507 			in_dev_put(idev);
1508 		}
1509 	}
1510 }
1511 
1512 static void ipv4_link_failure(struct sk_buff *skb)
1513 {
1514 	struct rtable *rt;
1515 
1516 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1517 
1518 	rt = (struct rtable *) skb->dst;
1519 	if (rt)
1520 		dst_set_expires(&rt->u.dst, 0);
1521 }
1522 
1523 static int ip_rt_bug(struct sk_buff *skb)
1524 {
1525 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1526 		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1527 		skb->dev ? skb->dev->name : "?");
1528 	kfree_skb(skb);
1529 	return 0;
1530 }
1531 
1532 /*
1533    We do not cache the source address of the outgoing interface,
1534    because it is used only by the IP RR, TS and SRR options,
1535    so it is out of the fast path.
1536 
1537    BTW remember: "addr" is allowed to be unaligned
1538    in IP options!
1539  */
1540 
1541 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1542 {
1543 	__be32 src;
1544 	struct fib_result res;
1545 
1546 	if (rt->fl.iif == 0)
1547 		src = rt->rt_src;
1548 	else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1549 		src = FIB_RES_PREFSRC(res);
1550 		fib_res_put(&res);
1551 	} else
1552 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1553 					RT_SCOPE_UNIVERSE);
1554 	memcpy(addr, &src, 4);
1555 }
1556 
1557 #ifdef CONFIG_NET_CLS_ROUTE
1558 static void set_class_tag(struct rtable *rt, u32 tag)
1559 {
1560 	if (!(rt->u.dst.tclassid & 0xFFFF))
1561 		rt->u.dst.tclassid |= tag & 0xFFFF;
1562 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1563 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1564 }
1565 #endif
1566 
1567 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1568 {
1569 	struct fib_info *fi = res->fi;
1570 
1571 	if (fi) {
1572 		if (FIB_RES_GW(*res) &&
1573 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1574 			rt->rt_gateway = FIB_RES_GW(*res);
1575 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1576 		       sizeof(rt->u.dst.metrics));
1577 		if (fi->fib_mtu == 0) {
1578 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1579 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1580 			    rt->rt_gateway != rt->rt_dst &&
1581 			    rt->u.dst.dev->mtu > 576)
1582 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1583 		}
1584 #ifdef CONFIG_NET_CLS_ROUTE
1585 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1586 #endif
1587 	} else
1588 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1589 
1590 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1591 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1592 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1593 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1594 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1595 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1596 				       ip_rt_min_advmss);
1597 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1598 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1599 
1600 #ifdef CONFIG_NET_CLS_ROUTE
1601 #ifdef CONFIG_IP_MULTIPLE_TABLES
1602 	set_class_tag(rt, fib_rules_tclass(res));
1603 #endif
1604 	set_class_tag(rt, itag);
1605 #endif
1606 	rt->rt_type = res->type;
1607 }
1608 
1609 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1610 				u8 tos, struct net_device *dev, int our)
1611 {
1612 	unsigned hash;
1613 	struct rtable *rth;
1614 	__be32 spec_dst;
1615 	struct in_device *in_dev = in_dev_get(dev);
1616 	u32 itag = 0;
1617 
1618 	/* Primary sanity checks. */
1619 
1620 	if (in_dev == NULL)
1621 		return -EINVAL;
1622 
1623 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1624 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1625 		goto e_inval;
1626 
1627 	if (ipv4_is_zeronet(saddr)) {
1628 		if (!ipv4_is_local_multicast(daddr))
1629 			goto e_inval;
1630 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1631 	} else if (fib_validate_source(saddr, 0, tos, 0,
1632 					dev, &spec_dst, &itag) < 0)
1633 		goto e_inval;
1634 
1635 	rth = dst_alloc(&ipv4_dst_ops);
1636 	if (!rth)
1637 		goto e_nobufs;
1638 
1639 	rth->u.dst.output= ip_rt_bug;
1640 
1641 	atomic_set(&rth->u.dst.__refcnt, 1);
1642 	rth->u.dst.flags= DST_HOST;
1643 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1644 		rth->u.dst.flags |= DST_NOPOLICY;
1645 	rth->fl.fl4_dst	= daddr;
1646 	rth->rt_dst	= daddr;
1647 	rth->fl.fl4_tos	= tos;
1648 	rth->fl.mark    = skb->mark;
1649 	rth->fl.fl4_src	= saddr;
1650 	rth->rt_src	= saddr;
1651 #ifdef CONFIG_NET_CLS_ROUTE
1652 	rth->u.dst.tclassid = itag;
1653 #endif
1654 	rth->rt_iif	=
1655 	rth->fl.iif	= dev->ifindex;
1656 	rth->u.dst.dev	= init_net.loopback_dev;
1657 	dev_hold(rth->u.dst.dev);
1658 	rth->idev	= in_dev_get(rth->u.dst.dev);
1659 	rth->fl.oif	= 0;
1660 	rth->rt_gateway	= daddr;
1661 	rth->rt_spec_dst= spec_dst;
1662 	rth->rt_genid	= atomic_read(&rt_genid);
1663 	rth->rt_flags	= RTCF_MULTICAST;
1664 	rth->rt_type	= RTN_MULTICAST;
1665 	if (our) {
1666 		rth->u.dst.input= ip_local_deliver;
1667 		rth->rt_flags |= RTCF_LOCAL;
1668 	}
1669 
1670 #ifdef CONFIG_IP_MROUTE
1671 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1672 		rth->u.dst.input = ip_mr_input;
1673 #endif
1674 	RT_CACHE_STAT_INC(in_slow_mc);
1675 
1676 	in_dev_put(in_dev);
1677 	hash = rt_hash(daddr, saddr, dev->ifindex);
1678 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1679 
1680 e_nobufs:
1681 	in_dev_put(in_dev);
1682 	return -ENOBUFS;
1683 
1684 e_inval:
1685 	in_dev_put(in_dev);
1686 	return -EINVAL;
1687 }
1688 
1689 
1690 static void ip_handle_martian_source(struct net_device *dev,
1691 				     struct in_device *in_dev,
1692 				     struct sk_buff *skb,
1693 				     __be32 daddr,
1694 				     __be32 saddr)
1695 {
1696 	RT_CACHE_STAT_INC(in_martian_src);
1697 #ifdef CONFIG_IP_ROUTE_VERBOSE
1698 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1699 		/*
1700 		 *	RFC1812 recommendation: if the source is martian,
1701 		 *	the only hint is the MAC header.
1702 		 */
1703 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1704 			"%u.%u.%u.%u, on dev %s\n",
1705 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1706 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1707 			int i;
1708 			const unsigned char *p = skb_mac_header(skb);
1709 			printk(KERN_WARNING "ll header: ");
1710 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1711 				printk("%02x", *p);
1712 				if (i < (dev->hard_header_len - 1))
1713 					printk(":");
1714 			}
1715 			printk("\n");
1716 		}
1717 	}
1718 #endif
1719 }
1720 
1721 static inline int __mkroute_input(struct sk_buff *skb,
1722 				  struct fib_result* res,
1723 				  struct in_device *in_dev,
1724 				  __be32 daddr, __be32 saddr, u32 tos,
1725 				  struct rtable **result)
1726 {
1727 
1728 	struct rtable *rth;
1729 	int err;
1730 	struct in_device *out_dev;
1731 	unsigned flags = 0;
1732 	__be32 spec_dst;
1733 	u32 itag;
1734 
1735 	/* get a working reference to the output device */
1736 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1737 	if (out_dev == NULL) {
1738 		if (net_ratelimit())
1739 			printk(KERN_CRIT "Bug in ip_route_input" \
1740 			       "_slow(). Please, report\n");
1741 		return -EINVAL;
1742 	}
1743 
1744 
1745 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1746 				  in_dev->dev, &spec_dst, &itag);
1747 	if (err < 0) {
1748 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1749 					 saddr);
1750 
1751 		err = -EINVAL;
1752 		goto cleanup;
1753 	}
1754 
1755 	if (err)
1756 		flags |= RTCF_DIRECTSRC;
1757 
1758 	if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1759 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1760 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1761 		flags |= RTCF_DOREDIRECT;
1762 
1763 	if (skb->protocol != htons(ETH_P_IP)) {
1764 		/* Not IP (i.e. ARP). Do not create a route if it is
1765 		 * invalid for proxy ARP. DNAT routes are always valid.
1766 		 */
1767 		if (out_dev == in_dev) {
1768 			err = -EINVAL;
1769 			goto cleanup;
1770 		}
1771 	}
1772 
1773 
1774 	rth = dst_alloc(&ipv4_dst_ops);
1775 	if (!rth) {
1776 		err = -ENOBUFS;
1777 		goto cleanup;
1778 	}
1779 
1780 	atomic_set(&rth->u.dst.__refcnt, 1);
1781 	rth->u.dst.flags= DST_HOST;
1782 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1783 		rth->u.dst.flags |= DST_NOPOLICY;
1784 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1785 		rth->u.dst.flags |= DST_NOXFRM;
1786 	rth->fl.fl4_dst	= daddr;
1787 	rth->rt_dst	= daddr;
1788 	rth->fl.fl4_tos	= tos;
1789 	rth->fl.mark    = skb->mark;
1790 	rth->fl.fl4_src	= saddr;
1791 	rth->rt_src	= saddr;
1792 	rth->rt_gateway	= daddr;
1793 	rth->rt_iif 	=
1794 		rth->fl.iif	= in_dev->dev->ifindex;
1795 	rth->u.dst.dev	= (out_dev)->dev;
1796 	dev_hold(rth->u.dst.dev);
1797 	rth->idev	= in_dev_get(rth->u.dst.dev);
1798 	rth->fl.oif 	= 0;
1799 	rth->rt_spec_dst= spec_dst;
1800 
1801 	rth->u.dst.input = ip_forward;
1802 	rth->u.dst.output = ip_output;
1803 	rth->rt_genid = atomic_read(&rt_genid);
1804 
1805 	rt_set_nexthop(rth, res, itag);
1806 
1807 	rth->rt_flags = flags;
1808 
1809 	*result = rth;
1810 	err = 0;
1811  cleanup:
1812 	/* release the working reference to the output device */
1813 	in_dev_put(out_dev);
1814 	return err;
1815 }
1816 
1817 static inline int ip_mkroute_input(struct sk_buff *skb,
1818 				   struct fib_result* res,
1819 				   const struct flowi *fl,
1820 				   struct in_device *in_dev,
1821 				   __be32 daddr, __be32 saddr, u32 tos)
1822 {
1823 	struct rtable* rth = NULL;
1824 	int err;
1825 	unsigned hash;
1826 
1827 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1828 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1829 		fib_select_multipath(fl, res);
1830 #endif
1831 
1832 	/* create a routing cache entry */
1833 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1834 	if (err)
1835 		return err;
1836 
1837 	/* put it into the cache */
1838 	hash = rt_hash(daddr, saddr, fl->iif);
1839 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1840 }
1841 
1842 /*
1843  *	NOTE. We drop all packets that have local source
1844  *	addresses, because every properly looped-back packet
1845  *	must already have the correct destination attached by the output routine.
1846  *
1847  *	Such an approach solves two big problems:
1848  *	1. Non-simplex devices are handled properly.
1849  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1850  */
1851 
1852 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1853 			       u8 tos, struct net_device *dev)
1854 {
1855 	struct fib_result res;
1856 	struct in_device *in_dev = in_dev_get(dev);
1857 	struct flowi fl = { .nl_u = { .ip4_u =
1858 				      { .daddr = daddr,
1859 					.saddr = saddr,
1860 					.tos = tos,
1861 					.scope = RT_SCOPE_UNIVERSE,
1862 				      } },
1863 			    .mark = skb->mark,
1864 			    .iif = dev->ifindex };
1865 	unsigned	flags = 0;
1866 	u32		itag = 0;
1867 	struct rtable * rth;
1868 	unsigned	hash;
1869 	__be32		spec_dst;
1870 	int		err = -EINVAL;
1871 	int		free_res = 0;
1872 	struct net    * net = dev->nd_net;
1873 
1874 	/* IP on this device is disabled. */
1875 
1876 	if (!in_dev)
1877 		goto out;
1878 
1879 	/* Check for the most weird martians, which cannot be detected
1880 	   by fib_lookup.
1881 	 */
1882 
1883 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1884 	    ipv4_is_loopback(saddr))
1885 		goto martian_source;
1886 
1887 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1888 		goto brd_input;
1889 
1890 	/* Accept zero addresses only to limited broadcast;
1891 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1892 	 */
1893 	if (ipv4_is_zeronet(saddr))
1894 		goto martian_source;
1895 
1896 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1897 	    ipv4_is_loopback(daddr))
1898 		goto martian_destination;
1899 
1900 	/*
1901 	 *	Now we are ready to route packet.
1902 	 */
1903 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
1904 		if (!IN_DEV_FORWARD(in_dev))
1905 			goto e_hostunreach;
1906 		goto no_route;
1907 	}
1908 	free_res = 1;
1909 
1910 	RT_CACHE_STAT_INC(in_slow_tot);
1911 
1912 	if (res.type == RTN_BROADCAST)
1913 		goto brd_input;
1914 
1915 	if (res.type == RTN_LOCAL) {
1916 		int result;
1917 		result = fib_validate_source(saddr, daddr, tos,
1918 					     net->loopback_dev->ifindex,
1919 					     dev, &spec_dst, &itag);
1920 		if (result < 0)
1921 			goto martian_source;
1922 		if (result)
1923 			flags |= RTCF_DIRECTSRC;
1924 		spec_dst = daddr;
1925 		goto local_input;
1926 	}
1927 
1928 	if (!IN_DEV_FORWARD(in_dev))
1929 		goto e_hostunreach;
1930 	if (res.type != RTN_UNICAST)
1931 		goto martian_destination;
1932 
1933 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1934 done:
1935 	in_dev_put(in_dev);
1936 	if (free_res)
1937 		fib_res_put(&res);
1938 out:	return err;
1939 
1940 brd_input:
1941 	if (skb->protocol != htons(ETH_P_IP))
1942 		goto e_inval;
1943 
1944 	if (ipv4_is_zeronet(saddr))
1945 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1946 	else {
1947 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1948 					  &itag);
1949 		if (err < 0)
1950 			goto martian_source;
1951 		if (err)
1952 			flags |= RTCF_DIRECTSRC;
1953 	}
1954 	flags |= RTCF_BROADCAST;
1955 	res.type = RTN_BROADCAST;
1956 	RT_CACHE_STAT_INC(in_brd);
1957 
1958 local_input:
1959 	rth = dst_alloc(&ipv4_dst_ops);
1960 	if (!rth)
1961 		goto e_nobufs;
1962 
1963 	rth->u.dst.output= ip_rt_bug;
1964 	rth->rt_genid = atomic_read(&rt_genid);
1965 
1966 	atomic_set(&rth->u.dst.__refcnt, 1);
1967 	rth->u.dst.flags= DST_HOST;
1968 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1969 		rth->u.dst.flags |= DST_NOPOLICY;
1970 	rth->fl.fl4_dst	= daddr;
1971 	rth->rt_dst	= daddr;
1972 	rth->fl.fl4_tos	= tos;
1973 	rth->fl.mark    = skb->mark;
1974 	rth->fl.fl4_src	= saddr;
1975 	rth->rt_src	= saddr;
1976 #ifdef CONFIG_NET_CLS_ROUTE
1977 	rth->u.dst.tclassid = itag;
1978 #endif
1979 	rth->rt_iif	=
1980 	rth->fl.iif	= dev->ifindex;
1981 	rth->u.dst.dev	= net->loopback_dev;
1982 	dev_hold(rth->u.dst.dev);
1983 	rth->idev	= in_dev_get(rth->u.dst.dev);
1984 	rth->rt_gateway	= daddr;
1985 	rth->rt_spec_dst= spec_dst;
1986 	rth->u.dst.input= ip_local_deliver;
1987 	rth->rt_flags 	= flags|RTCF_LOCAL;
1988 	if (res.type == RTN_UNREACHABLE) {
1989 		rth->u.dst.input= ip_error;
1990 		rth->u.dst.error= -err;
1991 		rth->rt_flags 	&= ~RTCF_LOCAL;
1992 	}
1993 	rth->rt_type	= res.type;
1994 	hash = rt_hash(daddr, saddr, fl.iif);
1995 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1996 	goto done;
1997 
1998 no_route:
1999 	RT_CACHE_STAT_INC(in_no_route);
2000 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2001 	res.type = RTN_UNREACHABLE;
2002 	if (err == -ESRCH)
2003 		err = -ENETUNREACH;
2004 	goto local_input;
2005 
2006 	/*
2007 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2008 	 */
2009 martian_destination:
2010 	RT_CACHE_STAT_INC(in_martian_dst);
2011 #ifdef CONFIG_IP_ROUTE_VERBOSE
2012 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2013 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2014 			"%u.%u.%u.%u, dev %s\n",
2015 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2016 #endif
2017 
2018 e_hostunreach:
2019 	err = -EHOSTUNREACH;
2020 	goto done;
2021 
2022 e_inval:
2023 	err = -EINVAL;
2024 	goto done;
2025 
2026 e_nobufs:
2027 	err = -ENOBUFS;
2028 	goto done;
2029 
2030 martian_source:
2031 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2032 	goto e_inval;
2033 }
2034 
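/*
 *	ip_route_input() resolves the input route for @skb: it first probes
 *	the route cache under rcu_read_lock() and, on a hit, attaches the
 *	cached dst to skb->dst.  Multicast destinations bypass the cache,
 *	are checked against the device's multicast list and handed to
 *	ip_route_input_mc(); everything else falls back to
 *	ip_route_input_slow().
 *
 *	Usage sketch (hypothetical caller on the receive path; locals are
 *	illustrative only, error handling elided):
 *
 *		const struct iphdr *iph = ip_hdr(skb);
 *
 *		if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
 *				   skb->dev) == 0)
 *			return dst_input(skb);
 *
 *	On success skb->dst holds a reference to the (cached) route.
 */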
2035 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2036 		   u8 tos, struct net_device *dev)
2037 {
2038 	struct rtable * rth;
2039 	unsigned	hash;
2040 	int iif = dev->ifindex;
2041 	struct net *net;
2042 
2043 	net = dev->nd_net;
2044 	tos &= IPTOS_RT_MASK;
2045 	hash = rt_hash(daddr, saddr, iif);
2046 
2047 	rcu_read_lock();
2048 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2049 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2050 		if (rth->fl.fl4_dst == daddr &&
2051 		    rth->fl.fl4_src == saddr &&
2052 		    rth->fl.iif == iif &&
2053 		    rth->fl.oif == 0 &&
2054 		    rth->fl.mark == skb->mark &&
2055 		    rth->fl.fl4_tos == tos &&
2056 		    rth->u.dst.dev->nd_net == net &&
2057 		    rth->rt_genid == atomic_read(&rt_genid)) {
2058 			dst_use(&rth->u.dst, jiffies);
2059 			RT_CACHE_STAT_INC(in_hit);
2060 			rcu_read_unlock();
2061 			skb->dst = (struct dst_entry*)rth;
2062 			return 0;
2063 		}
2064 		RT_CACHE_STAT_INC(in_hlist_search);
2065 	}
2066 	rcu_read_unlock();
2067 
2068 	/* Multicast recognition logic is moved from the route cache to here.
2069 	   The problem was that too many Ethernet cards have broken/missing
2070 	   hardware multicast filters :-( As a result, a host on a multicast
2071 	   network acquires a lot of useless route cache entries, sort of
2072 	   SDR messages from all over the world. Now we try to get rid of them.
2073 	   Really, provided the software IP multicast filter is organized
2074 	   reasonably (at least, hashed), it does not result in a slowdown
2075 	   compared with route cache reject entries.
2076 	   Note that multicast routers are not affected, because a
2077 	   route cache entry is created for them eventually.
2078 	 */
2079 	if (ipv4_is_multicast(daddr)) {
2080 		struct in_device *in_dev;
2081 
2082 		rcu_read_lock();
2083 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2084 			int our = ip_check_mc(in_dev, daddr, saddr,
2085 				ip_hdr(skb)->protocol);
2086 			if (our
2087 #ifdef CONFIG_IP_MROUTE
2088 			    || (!ipv4_is_local_multicast(daddr) &&
2089 				IN_DEV_MFORWARD(in_dev))
2090 #endif
2091 			    ) {
2092 				rcu_read_unlock();
2093 				return ip_route_input_mc(skb, daddr, saddr,
2094 							 tos, dev, our);
2095 			}
2096 		}
2097 		rcu_read_unlock();
2098 		return -EINVAL;
2099 	}
2100 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2101 }
2102 
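/*
 *	__mkroute_output() builds a new output routing cache entry for the
 *	flow described by @fl/@oldflp on @dev_out: it classifies the
 *	destination as broadcast, multicast or unicast, wires up the
 *	matching input/output handlers and fills in the nexthop via
 *	rt_set_nexthop().  On success *result holds the new, not yet
 *	hashed, rtable.
 */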
2103 static inline int __mkroute_output(struct rtable **result,
2104 				   struct fib_result* res,
2105 				   const struct flowi *fl,
2106 				   const struct flowi *oldflp,
2107 				   struct net_device *dev_out,
2108 				   unsigned flags)
2109 {
2110 	struct rtable *rth;
2111 	struct in_device *in_dev;
2112 	u32 tos = RT_FL_TOS(oldflp);
2113 	int err = 0;
2114 
2115 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2116 		return -EINVAL;
2117 
2118 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2119 		res->type = RTN_BROADCAST;
2120 	else if (ipv4_is_multicast(fl->fl4_dst))
2121 		res->type = RTN_MULTICAST;
2122 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2123 		return -EINVAL;
2124 
2125 	if (dev_out->flags & IFF_LOOPBACK)
2126 		flags |= RTCF_LOCAL;
2127 
2128 	/* get a working reference to the inet device */
2129 	in_dev = in_dev_get(dev_out);
2130 	if (!in_dev)
2131 		return -EINVAL;
2132 
2133 	if (res->type == RTN_BROADCAST) {
2134 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2135 		if (res->fi) {
2136 			fib_info_put(res->fi);
2137 			res->fi = NULL;
2138 		}
2139 	} else if (res->type == RTN_MULTICAST) {
2140 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2141 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2142 				 oldflp->proto))
2143 			flags &= ~RTCF_LOCAL;
2144 		/* If a multicast route does not exist, use the
2145 		   default one, but do not gateway in this case.
2146 		   Yes, it is a hack.
2147 		 */
2148 		if (res->fi && res->prefixlen < 4) {
2149 			fib_info_put(res->fi);
2150 			res->fi = NULL;
2151 		}
2152 	}
2153 
2154 
2155 	rth = dst_alloc(&ipv4_dst_ops);
2156 	if (!rth) {
2157 		err = -ENOBUFS;
2158 		goto cleanup;
2159 	}
2160 
2161 	atomic_set(&rth->u.dst.__refcnt, 1);
2162 	rth->u.dst.flags= DST_HOST;
2163 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2164 		rth->u.dst.flags |= DST_NOXFRM;
2165 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2166 		rth->u.dst.flags |= DST_NOPOLICY;
2167 
2168 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2169 	rth->fl.fl4_tos	= tos;
2170 	rth->fl.fl4_src	= oldflp->fl4_src;
2171 	rth->fl.oif	= oldflp->oif;
2172 	rth->fl.mark    = oldflp->mark;
2173 	rth->rt_dst	= fl->fl4_dst;
2174 	rth->rt_src	= fl->fl4_src;
2175 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2176 	/* get references to the devices that are to be held by the routing
2177 	   cache entry */
2178 	rth->u.dst.dev	= dev_out;
2179 	dev_hold(dev_out);
2180 	rth->idev	= in_dev_get(dev_out);
2181 	rth->rt_gateway = fl->fl4_dst;
2182 	rth->rt_spec_dst= fl->fl4_src;
2183 
2184 	rth->u.dst.output=ip_output;
2185 	rth->rt_genid = atomic_read(&rt_genid);
2186 
2187 	RT_CACHE_STAT_INC(out_slow_tot);
2188 
2189 	if (flags & RTCF_LOCAL) {
2190 		rth->u.dst.input = ip_local_deliver;
2191 		rth->rt_spec_dst = fl->fl4_dst;
2192 	}
2193 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2194 		rth->rt_spec_dst = fl->fl4_src;
2195 		if (flags & RTCF_LOCAL &&
2196 		    !(dev_out->flags & IFF_LOOPBACK)) {
2197 			rth->u.dst.output = ip_mc_output;
2198 			RT_CACHE_STAT_INC(out_slow_mc);
2199 		}
2200 #ifdef CONFIG_IP_MROUTE
2201 		if (res->type == RTN_MULTICAST) {
2202 			if (IN_DEV_MFORWARD(in_dev) &&
2203 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2204 				rth->u.dst.input = ip_mr_input;
2205 				rth->u.dst.output = ip_mc_output;
2206 			}
2207 		}
2208 #endif
2209 	}
2210 
2211 	rt_set_nexthop(rth, res, 0);
2212 
2213 	rth->rt_flags = flags;
2214 
2215 	*result = rth;
2216  cleanup:
2217 	/* release the working reference to the inet device */
2218 	in_dev_put(in_dev);
2219 
2220 	return err;
2221 }
2222 
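/*
 *	ip_mkroute_output() is a thin wrapper: build the entry with
 *	__mkroute_output() and, on success, insert it into the hash chain
 *	selected by the original flow key.
 */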
2223 static inline int ip_mkroute_output(struct rtable **rp,
2224 				    struct fib_result* res,
2225 				    const struct flowi *fl,
2226 				    const struct flowi *oldflp,
2227 				    struct net_device *dev_out,
2228 				    unsigned flags)
2229 {
2230 	struct rtable *rth = NULL;
2231 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2232 	unsigned hash;
2233 	if (err == 0) {
2234 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2235 		err = rt_intern_hash(hash, rth, rp);
2236 	}
2237 
2238 	return err;
2239 }
2240 
2241 /*
2242  * Major route resolver routine.
2243  */
2244 
2245 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2246 				const struct flowi *oldflp)
2247 {
2248 	u32 tos	= RT_FL_TOS(oldflp);
2249 	struct flowi fl = { .nl_u = { .ip4_u =
2250 				      { .daddr = oldflp->fl4_dst,
2251 					.saddr = oldflp->fl4_src,
2252 					.tos = tos & IPTOS_RT_MASK,
2253 					.scope = ((tos & RTO_ONLINK) ?
2254 						  RT_SCOPE_LINK :
2255 						  RT_SCOPE_UNIVERSE),
2256 				      } },
2257 			    .mark = oldflp->mark,
2258 			    .iif = net->loopback_dev->ifindex,
2259 			    .oif = oldflp->oif };
2260 	struct fib_result res;
2261 	unsigned flags = 0;
2262 	struct net_device *dev_out = NULL;
2263 	int free_res = 0;
2264 	int err;
2265 
2266 
2267 	res.fi		= NULL;
2268 #ifdef CONFIG_IP_MULTIPLE_TABLES
2269 	res.r		= NULL;
2270 #endif
2271 
2272 	if (oldflp->fl4_src) {
2273 		err = -EINVAL;
2274 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2275 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2276 		    ipv4_is_zeronet(oldflp->fl4_src))
2277 			goto out;
2278 
2279 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2280 		dev_out = ip_dev_find(net, oldflp->fl4_src);
2281 		if (dev_out == NULL)
2282 			goto out;
2283 
2284 		/* I removed the check for oif == dev_out->oif here.
2285 		   It was wrong for two reasons:
2286 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2287 		      is assigned to multiple interfaces.
2288 		   2. Moreover, we are allowed to send packets with the saddr
2289 		      of another iface. --ANK
2290 		 */
2291 
2292 		if (oldflp->oif == 0
2293 		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2294 			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2295 			/* Special hack: the user can direct multicasts
2296 			   and limited broadcast via the necessary interface
2297 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2298 			   This hack is not just for fun; it allows
2299 			   vic, vat and friends to work.
2300 			   They bind a socket to loopback, set the ttl to zero
2301 			   and expect that it will work.
2302 			   From the viewpoint of the routing cache they are broken,
2303 			   because we are not allowed to build a multicast path
2304 			   with a loopback source addr (the routing cache
2305 			   cannot know that the ttl is zero, so that the packet
2306 			   will not leave this host and the route is valid).
2307 			   Luckily, this hack is a good workaround.
2308 			 */
2309 
2310 			fl.oif = dev_out->ifindex;
2311 			goto make_route;
2312 		}
2313 		if (dev_out)
2314 			dev_put(dev_out);
2315 		dev_out = NULL;
2316 	}
2317 
2318 
2319 	if (oldflp->oif) {
2320 		dev_out = dev_get_by_index(net, oldflp->oif);
2321 		err = -ENODEV;
2322 		if (dev_out == NULL)
2323 			goto out;
2324 
2325 		/* RACE: Check return value of inet_select_addr instead. */
2326 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2327 			dev_put(dev_out);
2328 			goto out;	/* Wrong error code */
2329 		}
2330 
2331 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2332 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2333 			if (!fl.fl4_src)
2334 				fl.fl4_src = inet_select_addr(dev_out, 0,
2335 							      RT_SCOPE_LINK);
2336 			goto make_route;
2337 		}
2338 		if (!fl.fl4_src) {
2339 			if (ipv4_is_multicast(oldflp->fl4_dst))
2340 				fl.fl4_src = inet_select_addr(dev_out, 0,
2341 							      fl.fl4_scope);
2342 			else if (!oldflp->fl4_dst)
2343 				fl.fl4_src = inet_select_addr(dev_out, 0,
2344 							      RT_SCOPE_HOST);
2345 		}
2346 	}
2347 
2348 	if (!fl.fl4_dst) {
2349 		fl.fl4_dst = fl.fl4_src;
2350 		if (!fl.fl4_dst)
2351 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2352 		if (dev_out)
2353 			dev_put(dev_out);
2354 		dev_out = net->loopback_dev;
2355 		dev_hold(dev_out);
2356 		fl.oif = net->loopback_dev->ifindex;
2357 		res.type = RTN_LOCAL;
2358 		flags |= RTCF_LOCAL;
2359 		goto make_route;
2360 	}
2361 
2362 	if (fib_lookup(net, &fl, &res)) {
2363 		res.fi = NULL;
2364 		if (oldflp->oif) {
2365 			/* Apparently, the routing tables are wrong. Assume
2366 			   that the destination is on-link.
2367 
2368 			   WHY? DW.
2369 			   Because we are allowed to send to an iface
2370 			   even if it has NO routes and NO assigned
2371 			   addresses. When oif is specified, the routing
2372 			   tables are looked up with only one purpose:
2373 			   to catch whether the destination is gatewayed,
2374 			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2375 			   we send the packet, ignoring both the routing tables
2376 			   and the ifaddr state. --ANK
2377 
2378 
2379 			   We could do this even if oif is unknown,
2380 			   as IPv6 likely does, but we do not.
2381 			 */
2382 
2383 			if (fl.fl4_src == 0)
2384 				fl.fl4_src = inet_select_addr(dev_out, 0,
2385 							      RT_SCOPE_LINK);
2386 			res.type = RTN_UNICAST;
2387 			goto make_route;
2388 		}
2389 		if (dev_out)
2390 			dev_put(dev_out);
2391 		err = -ENETUNREACH;
2392 		goto out;
2393 	}
2394 	free_res = 1;
2395 
2396 	if (res.type == RTN_LOCAL) {
2397 		if (!fl.fl4_src)
2398 			fl.fl4_src = fl.fl4_dst;
2399 		if (dev_out)
2400 			dev_put(dev_out);
2401 		dev_out = net->loopback_dev;
2402 		dev_hold(dev_out);
2403 		fl.oif = dev_out->ifindex;
2404 		if (res.fi)
2405 			fib_info_put(res.fi);
2406 		res.fi = NULL;
2407 		flags |= RTCF_LOCAL;
2408 		goto make_route;
2409 	}
2410 
2411 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2412 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2413 		fib_select_multipath(&fl, &res);
2414 	else
2415 #endif
2416 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2417 		fib_select_default(net, &fl, &res);
2418 
2419 	if (!fl.fl4_src)
2420 		fl.fl4_src = FIB_RES_PREFSRC(res);
2421 
2422 	if (dev_out)
2423 		dev_put(dev_out);
2424 	dev_out = FIB_RES_DEV(res);
2425 	dev_hold(dev_out);
2426 	fl.oif = dev_out->ifindex;
2427 
2428 
2429 make_route:
2430 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2431 
2432 
2433 	if (free_res)
2434 		fib_res_put(&res);
2435 	if (dev_out)
2436 		dev_put(dev_out);
2437 out:	return err;
2438 }
2439 
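/*
 *	__ip_route_output_key() is the cache front end for output route
 *	lookups: it scans the rt_hash chain for an entry matching the flow
 *	key (daddr, saddr, oif, mark, the TOS/RTO_ONLINK bits and the
 *	namespace) and falls back to ip_route_output_slow() on a miss.
 *	Unlike ip_route_output_flow() it does not pass the result through
 *	the xfrm lookup.
 */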
2440 int __ip_route_output_key(struct net *net, struct rtable **rp,
2441 			  const struct flowi *flp)
2442 {
2443 	unsigned hash;
2444 	struct rtable *rth;
2445 
2446 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2447 
2448 	rcu_read_lock_bh();
2449 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2450 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2451 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2452 		    rth->fl.fl4_src == flp->fl4_src &&
2453 		    rth->fl.iif == 0 &&
2454 		    rth->fl.oif == flp->oif &&
2455 		    rth->fl.mark == flp->mark &&
2456 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2457 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2458 		    rth->u.dst.dev->nd_net == net &&
2459 		    rth->rt_genid == atomic_read(&rt_genid)) {
2460 			dst_use(&rth->u.dst, jiffies);
2461 			RT_CACHE_STAT_INC(out_hit);
2462 			rcu_read_unlock_bh();
2463 			*rp = rth;
2464 			return 0;
2465 		}
2466 		RT_CACHE_STAT_INC(out_hlist_search);
2467 	}
2468 	rcu_read_unlock_bh();
2469 
2470 	return ip_route_output_slow(net, rp, flp);
2471 }
2472 
2473 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2474 
2475 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2476 {
2477 }
2478 
2479 static struct dst_ops ipv4_dst_blackhole_ops = {
2480 	.family			=	AF_INET,
2481 	.protocol		=	__constant_htons(ETH_P_IP),
2482 	.destroy		=	ipv4_dst_destroy,
2483 	.check			=	ipv4_dst_check,
2484 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2485 	.entry_size		=	sizeof(struct rtable),
2486 	.entries		=	ATOMIC_INIT(0),
2487 };
2488 
2489 
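/*
 *	ipv4_dst_blackhole() clones the already resolved route *rp into a
 *	standalone dst whose input and output handlers simply discard
 *	packets (dst_discard).  It is used by ip_route_output_flow() below
 *	when __xfrm_lookup() returns -EREMOTE, so that the caller still
 *	gets a usable dst to hold.
 */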
2490 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2491 {
2492 	struct rtable *ort = *rp;
2493 	struct rtable *rt = (struct rtable *)
2494 		dst_alloc(&ipv4_dst_blackhole_ops);
2495 
2496 	if (rt) {
2497 		struct dst_entry *new = &rt->u.dst;
2498 
2499 		atomic_set(&new->__refcnt, 1);
2500 		new->__use = 1;
2501 		new->input = dst_discard;
2502 		new->output = dst_discard;
2503 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2504 
2505 		new->dev = ort->u.dst.dev;
2506 		if (new->dev)
2507 			dev_hold(new->dev);
2508 
2509 		rt->fl = ort->fl;
2510 
2511 		rt->idev = ort->idev;
2512 		if (rt->idev)
2513 			in_dev_hold(rt->idev);
2514 		rt->rt_genid = atomic_read(&rt_genid);
2515 		rt->rt_flags = ort->rt_flags;
2516 		rt->rt_type = ort->rt_type;
2517 		rt->rt_dst = ort->rt_dst;
2518 		rt->rt_src = ort->rt_src;
2519 		rt->rt_iif = ort->rt_iif;
2520 		rt->rt_gateway = ort->rt_gateway;
2521 		rt->rt_spec_dst = ort->rt_spec_dst;
2522 		rt->peer = ort->peer;
2523 		if (rt->peer)
2524 			atomic_inc(&rt->peer->refcnt);
2525 
2526 		dst_free(new);
2527 	}
2528 
2529 	dst_release(&(*rp)->u.dst);
2530 	*rp = rt;
2531 	return (rt ? 0 : -ENOMEM);
2532 }
2533 
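/*
 *	ip_route_output_flow() performs the plain output route lookup and
 *	then, if the flow carries a protocol, resolves any xfrm (IPsec)
 *	transformation on top of it; a non-zero @flags argument selects
 *	XFRM_LOOKUP_WAIT.  An -EREMOTE result from the xfrm lookup is
 *	converted into a blackhole route via ipv4_dst_blackhole().
 */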
2534 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2535 			 struct sock *sk, int flags)
2536 {
2537 	int err;
2538 
2539 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2540 		return err;
2541 
2542 	if (flp->proto) {
2543 		if (!flp->fl4_src)
2544 			flp->fl4_src = (*rp)->rt_src;
2545 		if (!flp->fl4_dst)
2546 			flp->fl4_dst = (*rp)->rt_dst;
2547 		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2548 				    flags ? XFRM_LOOKUP_WAIT : 0);
2549 		if (err == -EREMOTE)
2550 			err = ipv4_dst_blackhole(rp, flp, sk);
2551 
2552 		return err;
2553 	}
2554 
2555 	return 0;
2556 }
2557 
2558 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2559 
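/*
 *	ip_route_output_key() is the convenience wrapper most in-kernel
 *	callers use: ip_route_output_flow() with no socket and no flags.
 *
 *	Usage sketch (hypothetical caller; @dst and @tos are illustrative
 *	assumptions, error handling elided):
 *
 *		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
 *							 .tos = RT_TOS(tos) } } };
 *		struct rtable *rt;
 *
 *		if (ip_route_output_key(&init_net, &rt, &fl) == 0) {
 *			... use rt->rt_gateway, rt->u.dst.dev ...
 *			ip_rt_put(rt);
 *		}
 */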
2560 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2561 {
2562 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2563 }
2564 
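/*
 *	rt_fill_info() translates the routing cache entry attached to @skb
 *	into an RTM_NEWROUTE netlink message: the rtmsg header, the
 *	RTA_DST/RTA_SRC/RTA_OIF/RTA_GATEWAY/RTA_PREFSRC attributes, the
 *	metrics and the cache info.  It returns the nlmsg_end() value on
 *	success or -EMSGSIZE if @skb ran out of room.
 */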
2565 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2566 			int nowait, unsigned int flags)
2567 {
2568 	struct rtable *rt = (struct rtable*)skb->dst;
2569 	struct rtmsg *r;
2570 	struct nlmsghdr *nlh;
2571 	long expires;
2572 	u32 id = 0, ts = 0, tsage = 0, error;
2573 
2574 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2575 	if (nlh == NULL)
2576 		return -EMSGSIZE;
2577 
2578 	r = nlmsg_data(nlh);
2579 	r->rtm_family	 = AF_INET;
2580 	r->rtm_dst_len	= 32;
2581 	r->rtm_src_len	= 0;
2582 	r->rtm_tos	= rt->fl.fl4_tos;
2583 	r->rtm_table	= RT_TABLE_MAIN;
2584 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2585 	r->rtm_type	= rt->rt_type;
2586 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2587 	r->rtm_protocol = RTPROT_UNSPEC;
2588 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2589 	if (rt->rt_flags & RTCF_NOTIFY)
2590 		r->rtm_flags |= RTM_F_NOTIFY;
2591 
2592 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2593 
2594 	if (rt->fl.fl4_src) {
2595 		r->rtm_src_len = 32;
2596 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2597 	}
2598 	if (rt->u.dst.dev)
2599 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2600 #ifdef CONFIG_NET_CLS_ROUTE
2601 	if (rt->u.dst.tclassid)
2602 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2603 #endif
2604 	if (rt->fl.iif)
2605 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2606 	else if (rt->rt_src != rt->fl.fl4_src)
2607 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2608 
2609 	if (rt->rt_dst != rt->rt_gateway)
2610 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2611 
2612 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2613 		goto nla_put_failure;
2614 
2615 	error = rt->u.dst.error;
2616 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2617 	if (rt->peer) {
2618 		id = rt->peer->ip_id_count;
2619 		if (rt->peer->tcp_ts_stamp) {
2620 			ts = rt->peer->tcp_ts;
2621 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2622 		}
2623 	}
2624 
2625 	if (rt->fl.iif) {
2626 #ifdef CONFIG_IP_MROUTE
2627 		__be32 dst = rt->rt_dst;
2628 
2629 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2630 		    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2631 			int err = ipmr_get_route(skb, r, nowait);
2632 			if (err <= 0) {
2633 				if (!nowait) {
2634 					if (err == 0)
2635 						return 0;
2636 					goto nla_put_failure;
2637 				} else {
2638 					if (err == -EMSGSIZE)
2639 						goto nla_put_failure;
2640 					error = err;
2641 				}
2642 			}
2643 		} else
2644 #endif
2645 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2646 	}
2647 
2648 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2649 			       expires, error) < 0)
2650 		goto nla_put_failure;
2651 
2652 	return nlmsg_end(skb, nlh);
2653 
2654 nla_put_failure:
2655 	nlmsg_cancel(skb, nlh);
2656 	return -EMSGSIZE;
2657 }
2658 
2659 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2660 {
2661 	struct net *net = in_skb->sk->sk_net;
2662 	struct rtmsg *rtm;
2663 	struct nlattr *tb[RTA_MAX+1];
2664 	struct rtable *rt = NULL;
2665 	__be32 dst = 0;
2666 	__be32 src = 0;
2667 	u32 iif;
2668 	int err;
2669 	struct sk_buff *skb;
2670 
2671 	if (net != &init_net)
2672 		return -EINVAL;
2673 
2674 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2675 	if (err < 0)
2676 		goto errout;
2677 
2678 	rtm = nlmsg_data(nlh);
2679 
2680 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2681 	if (skb == NULL) {
2682 		err = -ENOBUFS;
2683 		goto errout;
2684 	}
2685 
2686 	/* Reserve room for dummy headers; this skb can pass
2687 	   through a good chunk of the routing engine.
2688 	 */
2689 	skb_reset_mac_header(skb);
2690 	skb_reset_network_header(skb);
2691 
2692 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2693 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2694 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2695 
2696 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2697 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2698 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2699 
2700 	if (iif) {
2701 		struct net_device *dev;
2702 
2703 		dev = __dev_get_by_index(&init_net, iif);
2704 		if (dev == NULL) {
2705 			err = -ENODEV;
2706 			goto errout_free;
2707 		}
2708 
2709 		skb->protocol	= htons(ETH_P_IP);
2710 		skb->dev	= dev;
2711 		local_bh_disable();
2712 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2713 		local_bh_enable();
2714 
2715 		rt = (struct rtable*) skb->dst;
2716 		if (err == 0 && rt->u.dst.error)
2717 			err = -rt->u.dst.error;
2718 	} else {
2719 		struct flowi fl = {
2720 			.nl_u = {
2721 				.ip4_u = {
2722 					.daddr = dst,
2723 					.saddr = src,
2724 					.tos = rtm->rtm_tos,
2725 				},
2726 			},
2727 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2728 		};
2729 		err = ip_route_output_key(&init_net, &rt, &fl);
2730 	}
2731 
2732 	if (err)
2733 		goto errout_free;
2734 
2735 	skb->dst = &rt->u.dst;
2736 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2737 		rt->rt_flags |= RTCF_NOTIFY;
2738 
2739 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2740 				RTM_NEWROUTE, 0, 0);
2741 	if (err <= 0)
2742 		goto errout_free;
2743 
2744 	err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2745 errout:
2746 	return err;
2747 
2748 errout_free:
2749 	kfree_skb(skb);
2750 	goto errout;
2751 }
2752 
2753 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2754 {
2755 	struct rtable *rt;
2756 	int h, s_h;
2757 	int idx, s_idx;
2758 
2759 	s_h = cb->args[0];
2760 	if (s_h < 0)
2761 		s_h = 0;
2762 	s_idx = idx = cb->args[1];
2763 	for (h = s_h; h <= rt_hash_mask; h++) {
2764 		rcu_read_lock_bh();
2765 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2766 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2767 			if (idx < s_idx)
2768 				continue;
2769 			if (rt->rt_genid != atomic_read(&rt_genid))
2770 				continue;
2771 			skb->dst = dst_clone(&rt->u.dst);
2772 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2773 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2774 					 1, NLM_F_MULTI) <= 0) {
2775 				dst_release(xchg(&skb->dst, NULL));
2776 				rcu_read_unlock_bh();
2777 				goto done;
2778 			}
2779 			dst_release(xchg(&skb->dst, NULL));
2780 		}
2781 		rcu_read_unlock_bh();
2782 		s_idx = 0;
2783 	}
2784 
2785 done:
2786 	cb->args[0] = h;
2787 	cb->args[1] = idx;
2788 	return skb->len;
2789 }
2790 
2791 void ip_rt_multicast_event(struct in_device *in_dev)
2792 {
2793 	rt_cache_flush(0);
2794 }
2795 
2796 #ifdef CONFIG_SYSCTL
2797 static int flush_delay;
2798 
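/*
 * Writing an integer to /proc/sys/net/ipv4/route/flush triggers a flush of
 * the routing cache; the written value is passed to rt_cache_flush() as the
 * delay.  Reads are rejected (the entry is write-only, mode 0200 below).
 */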
2799 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2800 					struct file *filp, void __user *buffer,
2801 					size_t *lenp, loff_t *ppos)
2802 {
2803 	if (write) {
2804 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2805 		rt_cache_flush(flush_delay);
2806 		return 0;
2807 	}
2808 
2809 	return -EINVAL;
2810 }
2811 
2812 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2813 						int __user *name,
2814 						int nlen,
2815 						void __user *oldval,
2816 						size_t __user *oldlenp,
2817 						void __user *newval,
2818 						size_t newlen)
2819 {
2820 	int delay;
2821 	if (newlen != sizeof(int))
2822 		return -EINVAL;
2823 	if (get_user(delay, (int __user *)newval))
2824 		return -EFAULT;
2825 	rt_cache_flush(delay);
2826 	return 0;
2827 }
2828 
2829 ctl_table ipv4_route_table[] = {
2830 	{
2831 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2832 		.procname	= "flush",
2833 		.data		= &flush_delay,
2834 		.maxlen		= sizeof(int),
2835 		.mode		= 0200,
2836 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2837 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2838 	},
2839 	{
2840 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2841 		.procname	= "gc_thresh",
2842 		.data		= &ipv4_dst_ops.gc_thresh,
2843 		.maxlen		= sizeof(int),
2844 		.mode		= 0644,
2845 		.proc_handler	= &proc_dointvec,
2846 	},
2847 	{
2848 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2849 		.procname	= "max_size",
2850 		.data		= &ip_rt_max_size,
2851 		.maxlen		= sizeof(int),
2852 		.mode		= 0644,
2853 		.proc_handler	= &proc_dointvec,
2854 	},
2855 	{
2856 		/*  Deprecated. Use gc_min_interval_ms */
2857 
2858 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2859 		.procname	= "gc_min_interval",
2860 		.data		= &ip_rt_gc_min_interval,
2861 		.maxlen		= sizeof(int),
2862 		.mode		= 0644,
2863 		.proc_handler	= &proc_dointvec_jiffies,
2864 		.strategy	= &sysctl_jiffies,
2865 	},
2866 	{
2867 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2868 		.procname	= "gc_min_interval_ms",
2869 		.data		= &ip_rt_gc_min_interval,
2870 		.maxlen		= sizeof(int),
2871 		.mode		= 0644,
2872 		.proc_handler	= &proc_dointvec_ms_jiffies,
2873 		.strategy	= &sysctl_ms_jiffies,
2874 	},
2875 	{
2876 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2877 		.procname	= "gc_timeout",
2878 		.data		= &ip_rt_gc_timeout,
2879 		.maxlen		= sizeof(int),
2880 		.mode		= 0644,
2881 		.proc_handler	= &proc_dointvec_jiffies,
2882 		.strategy	= &sysctl_jiffies,
2883 	},
2884 	{
2885 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2886 		.procname	= "gc_interval",
2887 		.data		= &ip_rt_gc_interval,
2888 		.maxlen		= sizeof(int),
2889 		.mode		= 0644,
2890 		.proc_handler	= &proc_dointvec_jiffies,
2891 		.strategy	= &sysctl_jiffies,
2892 	},
2893 	{
2894 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2895 		.procname	= "redirect_load",
2896 		.data		= &ip_rt_redirect_load,
2897 		.maxlen		= sizeof(int),
2898 		.mode		= 0644,
2899 		.proc_handler	= &proc_dointvec,
2900 	},
2901 	{
2902 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2903 		.procname	= "redirect_number",
2904 		.data		= &ip_rt_redirect_number,
2905 		.maxlen		= sizeof(int),
2906 		.mode		= 0644,
2907 		.proc_handler	= &proc_dointvec,
2908 	},
2909 	{
2910 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2911 		.procname	= "redirect_silence",
2912 		.data		= &ip_rt_redirect_silence,
2913 		.maxlen		= sizeof(int),
2914 		.mode		= 0644,
2915 		.proc_handler	= &proc_dointvec,
2916 	},
2917 	{
2918 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2919 		.procname	= "error_cost",
2920 		.data		= &ip_rt_error_cost,
2921 		.maxlen		= sizeof(int),
2922 		.mode		= 0644,
2923 		.proc_handler	= &proc_dointvec,
2924 	},
2925 	{
2926 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
2927 		.procname	= "error_burst",
2928 		.data		= &ip_rt_error_burst,
2929 		.maxlen		= sizeof(int),
2930 		.mode		= 0644,
2931 		.proc_handler	= &proc_dointvec,
2932 	},
2933 	{
2934 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
2935 		.procname	= "gc_elasticity",
2936 		.data		= &ip_rt_gc_elasticity,
2937 		.maxlen		= sizeof(int),
2938 		.mode		= 0644,
2939 		.proc_handler	= &proc_dointvec,
2940 	},
2941 	{
2942 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
2943 		.procname	= "mtu_expires",
2944 		.data		= &ip_rt_mtu_expires,
2945 		.maxlen		= sizeof(int),
2946 		.mode		= 0644,
2947 		.proc_handler	= &proc_dointvec_jiffies,
2948 		.strategy	= &sysctl_jiffies,
2949 	},
2950 	{
2951 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
2952 		.procname	= "min_pmtu",
2953 		.data		= &ip_rt_min_pmtu,
2954 		.maxlen		= sizeof(int),
2955 		.mode		= 0644,
2956 		.proc_handler	= &proc_dointvec,
2957 	},
2958 	{
2959 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
2960 		.procname	= "min_adv_mss",
2961 		.data		= &ip_rt_min_advmss,
2962 		.maxlen		= sizeof(int),
2963 		.mode		= 0644,
2964 		.proc_handler	= &proc_dointvec,
2965 	},
2966 	{
2967 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
2968 		.procname	= "secret_interval",
2969 		.data		= &ip_rt_secret_interval,
2970 		.maxlen		= sizeof(int),
2971 		.mode		= 0644,
2972 		.proc_handler	= &proc_dointvec_jiffies,
2973 		.strategy	= &sysctl_jiffies,
2974 	},
2975 	{ .ctl_name = 0 }
2976 };
2977 #endif
2978 
2979 #ifdef CONFIG_NET_CLS_ROUTE
2980 struct ip_rt_acct *ip_rt_acct __read_mostly;
2981 #endif /* CONFIG_NET_CLS_ROUTE */
2982 
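/*
 * "rhash_entries=N" on the kernel command line overrides the automatic
 * sizing of the IP route cache hash table handed to
 * alloc_large_system_hash() in ip_rt_init() below (an illustrative boot
 * parameter would be "rhash_entries=65536").
 */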
2983 static __initdata unsigned long rhash_entries;
2984 static int __init set_rhash_entries(char *str)
2985 {
2986 	if (!str)
2987 		return 0;
2988 	rhash_entries = simple_strtoul(str, &str, 0);
2989 	return 1;
2990 }
2991 __setup("rhash_entries=", set_rhash_entries);
2992 
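/*
 *	ip_rt_init() sets up IPv4 routing at boot: it seeds rt_genid,
 *	creates the dst slab cache, allocates and zeroes the route cache
 *	hash table, initializes devinet and the FIB, arms the periodic
 *	expiry work and the secret-rebuild timer, registers the /proc
 *	entries and hooks inet_rtm_getroute() up to RTM_GETROUTE.
 */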
2993 int __init ip_rt_init(void)
2994 {
2995 	int rc = 0;
2996 
2997 	atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
2998 			     (jiffies ^ (jiffies >> 7))));
2999 
3000 #ifdef CONFIG_NET_CLS_ROUTE
3001 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3002 	if (!ip_rt_acct)
3003 		panic("IP: failed to allocate ip_rt_acct\n");
3004 #endif
3005 
3006 	ipv4_dst_ops.kmem_cachep =
3007 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3008 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3009 
3010 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3011 
3012 	rt_hash_table = (struct rt_hash_bucket *)
3013 		alloc_large_system_hash("IP route cache",
3014 					sizeof(struct rt_hash_bucket),
3015 					rhash_entries,
3016 					(num_physpages >= 128 * 1024) ?
3017 					15 : 17,
3018 					0,
3019 					&rt_hash_log,
3020 					&rt_hash_mask,
3021 					0);
3022 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3023 	rt_hash_lock_init();
3024 
3025 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3026 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3027 
3028 	devinet_init();
3029 	ip_fib_init();
3030 
3031 	setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3032 
3033 	/* All the timers started at system startup tend
3034 	   to synchronize. Perturb them a bit.
3035 	 */
3036 	schedule_delayed_work(&expires_work,
3037 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3038 
3039 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3040 		ip_rt_secret_interval;
3041 	add_timer(&rt_secret_timer);
3042 
3043 	if (ip_rt_proc_init(&init_net))
3044 		printk(KERN_ERR "Unable to create route proc files\n");
3045 #ifdef CONFIG_XFRM
3046 	xfrm_init();
3047 	xfrm4_init();
3048 #endif
3049 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3050 
3051 	return rc;
3052 }
3053 
3054 EXPORT_SYMBOL(__ip_select_ident);
3055 EXPORT_SYMBOL(ip_route_input);
3056 EXPORT_SYMBOL(ip_route_output_key);
3057