xref: /openbmc/linux/net/ipv4/route.c (revision 4e0c1159d83a658d1ffba5bc3442f4ec4cadb436)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD;
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/config.h>
66 #include <linux/module.h>
67 #include <asm/uaccess.h>
68 #include <asm/system.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/sched.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/rtnetlink.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/ip_mp_alg.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 
110 #define RT_FL_TOS(oldflp) \
111     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
112 
113 #define IP_MAX_MTU	0xFFF0
114 
115 #define RT_GC_TIMEOUT (300*HZ)
116 
117 static int ip_rt_min_delay		= 2 * HZ;
118 static int ip_rt_max_delay		= 10 * HZ;
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval		= 60 * HZ;
122 static int ip_rt_gc_min_interval	= HZ / 2;
123 static int ip_rt_redirect_number	= 9;
124 static int ip_rt_redirect_load		= HZ / 50;
125 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost		= HZ;
127 static int ip_rt_error_burst		= 5 * HZ;
128 static int ip_rt_gc_elasticity		= 8;
129 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
130 static int ip_rt_min_pmtu		= 512 + 20 + 20;
131 static int ip_rt_min_advmss		= 256;
132 static int ip_rt_secret_interval	= 10 * 60 * HZ;
133 static unsigned long rt_deadline;
134 
135 #define RTprint(a...)	printk(KERN_DEBUG a)
136 
137 static struct timer_list rt_flush_timer;
138 static struct timer_list rt_periodic_timer;
139 static struct timer_list rt_secret_timer;
140 
141 /*
142  *	Interface to generic destination cache.
143  */
144 
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void		 ipv4_dst_destroy(struct dst_entry *dst);
147 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
148 					 struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void		 ipv4_link_failure(struct sk_buff *skb);
151 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(void);
153 
154 
155 static struct dst_ops ipv4_dst_ops = {
156 	.family =		AF_INET,
157 	.protocol =		__constant_htons(ETH_P_IP),
158 	.gc =			rt_garbage_collect,
159 	.check =		ipv4_dst_check,
160 	.destroy =		ipv4_dst_destroy,
161 	.ifdown =		ipv4_dst_ifdown,
162 	.negative_advice =	ipv4_negative_advice,
163 	.link_failure =		ipv4_link_failure,
164 	.update_pmtu =		ip_rt_update_pmtu,
165 	.entry_size =		sizeof(struct rtable),
166 };
167 
168 #define ECN_OR_COST(class)	TC_PRIO_##class
169 
170 __u8 ip_tos2prio[16] = {
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(FILLER),
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(BESTEFFORT),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK)
187 };
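
/*
 * A worked example of how this table is indexed (illustrative; callers
 * outside this file use a helper along the lines of
 * ip_tos2prio[IPTOS_TOS(tos) >> 1]): for an IP TOS byte of 0x10 (low
 * delay), IPTOS_TOS(0x10) = 0x10 and 0x10 >> 1 = 8, so the packet is
 * queued with TC_PRIO_INTERACTIVE.  The odd-numbered slots, filled via
 * ECN_OR_COST(), cover TOS values with the 0x02 bit set, which may be
 * either the old "minimize cost" TOS bit or an ECN codepoint, hence the
 * macro name.
 */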
188 
189 
190 /*
191  * Route cache.
192  */
193 
194 /* The locking scheme is rather straightforward:
195  *
196  * 1) Read-Copy Update protects the buckets of the central route hash.
197  * 2) Only writers remove entries, and they hold the lock
198  *    as they look at rtable reference counts.
199  * 3) Only readers acquire references to rtable entries,
200  *    they do so with atomic increments and with the
201  *    lock held.
202  */
203 
204 struct rt_hash_bucket {
205 	struct rtable	*chain;
206 };
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208 /*
209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210  * The size of this table is a power of two and depends on the number of CPUs.
211  */
212 #if NR_CPUS >= 32
213 #define RT_HASH_LOCK_SZ	4096
214 #elif NR_CPUS >= 16
215 #define RT_HASH_LOCK_SZ	2048
216 #elif NR_CPUS >= 8
217 #define RT_HASH_LOCK_SZ	1024
218 #elif NR_CPUS >= 4
219 #define RT_HASH_LOCK_SZ	512
220 #else
221 #define RT_HASH_LOCK_SZ	256
222 #endif
223 
224 static spinlock_t	*rt_hash_locks;
225 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226 # define rt_hash_lock_init()	{ \
227 		int i; \
228 		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 			spin_lock_init(&rt_hash_locks[i]); \
232 		}
233 #else
234 # define rt_hash_lock_addr(slot) NULL
235 # define rt_hash_lock_init()
236 #endif
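
/*
 * A small worked example of the lock mapping above (illustrative only):
 * with RT_HASH_LOCK_SZ == 256, rt_hash_lock_addr() masks the bucket index
 * with 0xFF, so buckets 0x1234 and 0x2034 both map to lock slot 0x34 and
 * therefore share one spinlock.  Writers on different buckets may contend
 * on the same lock, but the lock table stays small no matter how large
 * rt_hash_mask grows.
 */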
237 
238 static struct rt_hash_bucket 	*rt_hash_table;
239 static unsigned			rt_hash_mask;
240 static int			rt_hash_log;
241 static unsigned int		rt_hash_rnd;
242 
243 static struct rt_cache_stat *rt_cache_stat;
244 #define RT_CACHE_STAT_INC(field)					  \
245 		(per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
246 
247 static int rt_intern_hash(unsigned hash, struct rtable *rth,
248 				struct rtable **res);
249 
250 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
251 {
252 	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
253 		& rt_hash_mask);
254 }
255 
256 #ifdef CONFIG_PROC_FS
257 struct rt_cache_iter_state {
258 	int bucket;
259 };
260 
261 static struct rtable *rt_cache_get_first(struct seq_file *seq)
262 {
263 	struct rtable *r = NULL;
264 	struct rt_cache_iter_state *st = seq->private;
265 
266 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
267 		rcu_read_lock_bh();
268 		r = rt_hash_table[st->bucket].chain;
269 		if (r)
270 			break;
271 		rcu_read_unlock_bh();
272 	}
273 	return r;
274 }
275 
276 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
277 {
278 	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
279 
280 	r = r->u.rt_next;
281 	while (!r) {
282 		rcu_read_unlock_bh();
283 		if (--st->bucket < 0)
284 			break;
285 		rcu_read_lock_bh();
286 		r = rt_hash_table[st->bucket].chain;
287 	}
288 	return r;
289 }
290 
291 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
292 {
293 	struct rtable *r = rt_cache_get_first(seq);
294 
295 	if (r)
296 		while (pos && (r = rt_cache_get_next(seq, r)))
297 			--pos;
298 	return pos ? NULL : r;
299 }
300 
301 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
302 {
303 	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
304 }
305 
306 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
307 {
308 	struct rtable *r = NULL;
309 
310 	if (v == SEQ_START_TOKEN)
311 		r = rt_cache_get_first(seq);
312 	else
313 		r = rt_cache_get_next(seq, v);
314 	++*pos;
315 	return r;
316 }
317 
318 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
319 {
320 	if (v && v != SEQ_START_TOKEN)
321 		rcu_read_unlock_bh();
322 }
323 
324 static int rt_cache_seq_show(struct seq_file *seq, void *v)
325 {
326 	if (v == SEQ_START_TOKEN)
327 		seq_printf(seq, "%-127s\n",
328 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
329 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
330 			   "HHUptod\tSpecDst");
331 	else {
332 		struct rtable *r = v;
333 		char temp[256];
334 
335 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
336 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
337 			r->u.dst.dev ? r->u.dst.dev->name : "*",
338 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
339 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
340 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
341 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
342 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
343 			dst_metric(&r->u.dst, RTAX_WINDOW),
344 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
345 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
346 			r->fl.fl4_tos,
347 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
348 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
349 				       dev_queue_xmit) : 0,
350 			r->rt_spec_dst);
351 		seq_printf(seq, "%-127s\n", temp);
352 	}
353 	return 0;
354 }
355 
356 static struct seq_operations rt_cache_seq_ops = {
357 	.start  = rt_cache_seq_start,
358 	.next   = rt_cache_seq_next,
359 	.stop   = rt_cache_seq_stop,
360 	.show   = rt_cache_seq_show,
361 };
362 
363 static int rt_cache_seq_open(struct inode *inode, struct file *file)
364 {
365 	struct seq_file *seq;
366 	int rc = -ENOMEM;
367 	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
368 
369 	if (!s)
370 		goto out;
371 	rc = seq_open(file, &rt_cache_seq_ops);
372 	if (rc)
373 		goto out_kfree;
374 	seq          = file->private_data;
375 	seq->private = s;
376 	memset(s, 0, sizeof(*s));
377 out:
378 	return rc;
379 out_kfree:
380 	kfree(s);
381 	goto out;
382 }
383 
384 static struct file_operations rt_cache_seq_fops = {
385 	.owner	 = THIS_MODULE,
386 	.open	 = rt_cache_seq_open,
387 	.read	 = seq_read,
388 	.llseek	 = seq_lseek,
389 	.release = seq_release_private,
390 };
391 
392 
393 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
394 {
395 	int cpu;
396 
397 	if (*pos == 0)
398 		return SEQ_START_TOKEN;
399 
400 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
401 		if (!cpu_possible(cpu))
402 			continue;
403 		*pos = cpu+1;
404 		return per_cpu_ptr(rt_cache_stat, cpu);
405 	}
406 	return NULL;
407 }
408 
409 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
410 {
411 	int cpu;
412 
413 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
414 		if (!cpu_possible(cpu))
415 			continue;
416 		*pos = cpu+1;
417 		return per_cpu_ptr(rt_cache_stat, cpu);
418 	}
419 	return NULL;
420 
421 }
422 
423 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
424 {
425 
426 }
427 
428 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
429 {
430 	struct rt_cache_stat *st = v;
431 
432 	if (v == SEQ_START_TOKEN) {
433 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
434 		return 0;
435 	}
436 
437 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
438 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
439 		   atomic_read(&ipv4_dst_ops.entries),
440 		   st->in_hit,
441 		   st->in_slow_tot,
442 		   st->in_slow_mc,
443 		   st->in_no_route,
444 		   st->in_brd,
445 		   st->in_martian_dst,
446 		   st->in_martian_src,
447 
448 		   st->out_hit,
449 		   st->out_slow_tot,
450 		   st->out_slow_mc,
451 
452 		   st->gc_total,
453 		   st->gc_ignored,
454 		   st->gc_goal_miss,
455 		   st->gc_dst_overflow,
456 		   st->in_hlist_search,
457 		   st->out_hlist_search
458 		);
459 	return 0;
460 }
461 
462 static struct seq_operations rt_cpu_seq_ops = {
463 	.start  = rt_cpu_seq_start,
464 	.next   = rt_cpu_seq_next,
465 	.stop   = rt_cpu_seq_stop,
466 	.show   = rt_cpu_seq_show,
467 };
468 
469 
470 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
471 {
472 	return seq_open(file, &rt_cpu_seq_ops);
473 }
474 
475 static struct file_operations rt_cpu_seq_fops = {
476 	.owner	 = THIS_MODULE,
477 	.open	 = rt_cpu_seq_open,
478 	.read	 = seq_read,
479 	.llseek	 = seq_lseek,
480 	.release = seq_release,
481 };
482 
483 #endif /* CONFIG_PROC_FS */
484 
485 static __inline__ void rt_free(struct rtable *rt)
486 {
487 	multipath_remove(rt);
488 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
489 }
490 
491 static __inline__ void rt_drop(struct rtable *rt)
492 {
493 	multipath_remove(rt);
494 	ip_rt_put(rt);
495 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
496 }
497 
498 static __inline__ int rt_fast_clean(struct rtable *rth)
499 {
500 	/* Kill broadcast/multicast entries very aggressively if they
501 	   collide in the hash table with more useful entries */
502 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
503 		rth->fl.iif && rth->u.rt_next;
504 }
505 
506 static __inline__ int rt_valuable(struct rtable *rth)
507 {
508 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
509 		rth->u.dst.expires;
510 }
511 
512 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
513 {
514 	unsigned long age;
515 	int ret = 0;
516 
517 	if (atomic_read(&rth->u.dst.__refcnt))
518 		goto out;
519 
520 	ret = 1;
521 	if (rth->u.dst.expires &&
522 	    time_after_eq(jiffies, rth->u.dst.expires))
523 		goto out;
524 
525 	age = jiffies - rth->u.dst.lastuse;
526 	ret = 0;
527 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
528 	    (age <= tmo2 && rt_valuable(rth)))
529 		goto out;
530 	ret = 1;
531 out:	return ret;
532 }
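
/*
 * How rt_may_expire() is used here (a summary of the logic above): an
 * entry that is still referenced never expires; an entry whose hard expiry
 * time has passed always may; otherwise it is kept while younger than tmo1
 * (unless it is a colliding broadcast/multicast entry, see rt_fast_clean())
 * or while younger than tmo2 if it is "valuable" (redirected, notified or
 * carrying an expiry).  rt_check_expire() halves tmo for every entry it
 * keeps in a chain, so entries deep in a long chain get a shorter grace.
 */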
533 
534 /* Bits of score are:
535  * 31: very valuable
536  * 30: not quite useless
537  * 29..0: inverted age of the entry (fresher entries score higher)
538  */
539 static inline u32 rt_score(struct rtable *rt)
540 {
541 	u32 score = jiffies - rt->u.dst.lastuse;
542 
543 	score = ~score & ~(3<<30);
544 
545 	if (rt_valuable(rt))
546 		score |= (1<<31);
547 
548 	if (!rt->fl.iif ||
549 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
550 		score |= (1<<30);
551 
552 	return score;
553 }
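
/*
 * A worked example of the score layout (illustrative numbers): for a free
 * output-route entry last used 100 jiffies ago, ~100 & ~(3<<30) gives
 * 0x3FFFFF9B, and the "not quite useless" bit raises it to 0x7FFFFF9B;
 * a valuable entry would also get bit 31.  Fresher entries therefore score
 * higher, and rt_intern_hash() below evicts the unreferenced entry with
 * the lowest score when a chain grows too long.
 */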
554 
555 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
556 {
557 	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
558 	       fl1->oif     == fl2->oif &&
559 	       fl1->iif     == fl2->iif;
560 }
561 
562 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
563 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
564 						struct rtable *expentry,
565 						int *removed_count)
566 {
567 	int passedexpired = 0;
568 	struct rtable **nextstep = NULL;
569 	struct rtable **rthp = chain_head;
570 	struct rtable *rth;
571 
572 	if (removed_count)
573 		*removed_count = 0;
574 
575 	while ((rth = *rthp) != NULL) {
576 		if (rth == expentry)
577 			passedexpired = 1;
578 
579 		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
580 		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
581 			if (*rthp == expentry) {
582 				*rthp = rth->u.rt_next;
583 				continue;
584 			} else {
585 				*rthp = rth->u.rt_next;
586 				rt_free(rth);
587 				if (removed_count)
588 					++(*removed_count);
589 			}
590 		} else {
591 			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
592 			    passedexpired && !nextstep)
593 				nextstep = &rth->u.rt_next;
594 
595 			rthp = &rth->u.rt_next;
596 		}
597 	}
598 
599 	rt_free(expentry);
600 	if (removed_count)
601 		++(*removed_count);
602 
603 	return nextstep;
604 }
605 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
606 
607 
608 /* This runs via a timer and thus is always in BH context. */
609 static void rt_check_expire(unsigned long dummy)
610 {
611 	static unsigned int rover;
612 	unsigned int i = rover, goal;
613 	struct rtable *rth, **rthp;
614 	unsigned long now = jiffies;
615 	u64 mult;
616 
617 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
618 	if (ip_rt_gc_timeout > 1)
619 		do_div(mult, ip_rt_gc_timeout);
620 	goal = (unsigned int)mult;
621 	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
622 	for (; goal > 0; goal--) {
623 		unsigned long tmo = ip_rt_gc_timeout;
624 
625 		i = (i + 1) & rt_hash_mask;
626 		rthp = &rt_hash_table[i].chain;
627 
628 		if (*rthp == 0)
629 			continue;
630 		spin_lock(rt_hash_lock_addr(i));
631 		while ((rth = *rthp) != NULL) {
632 			if (rth->u.dst.expires) {
633 				/* Entry is expired even if it is in use */
634 				if (time_before_eq(now, rth->u.dst.expires)) {
635 					tmo >>= 1;
636 					rthp = &rth->u.rt_next;
637 					continue;
638 				}
639 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
640 				tmo >>= 1;
641 				rthp = &rth->u.rt_next;
642 				continue;
643 			}
644 
645 			/* Clean up aged-off entries. */
646 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
647 			/* remove all related balanced entries if necessary */
648 			if (rth->u.dst.flags & DST_BALANCED) {
649 				rthp = rt_remove_balanced_route(
650 					&rt_hash_table[i].chain,
651 					rth, NULL);
652 				if (!rthp)
653 					break;
654 			} else {
655 				*rthp = rth->u.rt_next;
656 				rt_free(rth);
657 			}
658 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
659  			*rthp = rth->u.rt_next;
660  			rt_free(rth);
661 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
662 		}
663 		spin_unlock(rt_hash_lock_addr(i));
664 
665 		/* Fallback loop breaker. */
666 		if (time_after(jiffies, now))
667 			break;
668 	}
669 	rover = i;
670 	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
671 }
672 
673 /* This can run from both BH and non-BH contexts, the latter
674  * in the case of a forced flush event.
675  */
676 static void rt_run_flush(unsigned long dummy)
677 {
678 	int i;
679 	struct rtable *rth, *next;
680 
681 	rt_deadline = 0;
682 
683 	get_random_bytes(&rt_hash_rnd, 4);
684 
685 	for (i = rt_hash_mask; i >= 0; i--) {
686 		spin_lock_bh(rt_hash_lock_addr(i));
687 		rth = rt_hash_table[i].chain;
688 		if (rth)
689 			rt_hash_table[i].chain = NULL;
690 		spin_unlock_bh(rt_hash_lock_addr(i));
691 
692 		for (; rth; rth = next) {
693 			next = rth->u.rt_next;
694 			rt_free(rth);
695 		}
696 	}
697 }
698 
699 static DEFINE_SPINLOCK(rt_flush_lock);
700 
701 void rt_cache_flush(int delay)
702 {
703 	unsigned long now = jiffies;
704 	int user_mode = !in_softirq();
705 
706 	if (delay < 0)
707 		delay = ip_rt_min_delay;
708 
709 	/* flush existing multipath state */
710 	multipath_flush();
711 
712 	spin_lock_bh(&rt_flush_lock);
713 
714 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
715 		long tmo = (long)(rt_deadline - now);
716 
717 	/* If the flush timer is already running
718 	   and the flush request is not immediate (delay > 0):
719 
720 	   if the deadline has not been reached yet, prolong the timer to "delay",
721 	   otherwise fire it at the deadline time.
722 		 */
723 
724 		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
725 			tmo = 0;
726 
727 		if (delay > tmo)
728 			delay = tmo;
729 	}
730 
731 	if (delay <= 0) {
732 		spin_unlock_bh(&rt_flush_lock);
733 		rt_run_flush(0);
734 		return;
735 	}
736 
737 	if (rt_deadline == 0)
738 		rt_deadline = now + ip_rt_max_delay;
739 
740 	mod_timer(&rt_flush_timer, now+delay);
741 	spin_unlock_bh(&rt_flush_lock);
742 }
743 
744 static void rt_secret_rebuild(unsigned long dummy)
745 {
746 	unsigned long now = jiffies;
747 
748 	rt_cache_flush(0);
749 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
750 }
751 
752 /*
753    Short description of GC goals.
754 
755    We want to build an algorithm which keeps the routing cache
756    at an equilibrium point, where the number of aged-off entries
757    stays approximately equal to the number of newly generated ones.
758 
759    The current expiration strength is the variable "expire".
760    We try to adjust it dynamically, so that when the network is idle,
761    expire is large enough to keep enough warm entries, and when load
762    increases it shrinks to limit the cache size.
763  */
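
/*
 * A numeric sketch of the goal computation below (assuming rt_hash_log = 10,
 * i.e. 1024 buckets, and the default ip_rt_gc_elasticity = 8): the soft
 * ceiling is 8 << 10 = 8192 entries.  With 9000 cached entries the initial
 * goal is 9000 - 8192 = 808 > 0, so we take the aggressive branch:
 * goal = max(808 / 2, 1024) = 1024 and equilibrium = 9000 - 1024 = 7976,
 * i.e. roughly one whole hash table worth of entries is scheduled for
 * eviction in this pass.
 */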
764 
765 static int rt_garbage_collect(void)
766 {
767 	static unsigned long expire = RT_GC_TIMEOUT;
768 	static unsigned long last_gc;
769 	static int rover;
770 	static int equilibrium;
771 	struct rtable *rth, **rthp;
772 	unsigned long now = jiffies;
773 	int goal;
774 
775 	/*
776 	 * Garbage collection is pretty expensive,
777 	 * do not make it too frequently.
778 	 */
779 
780 	RT_CACHE_STAT_INC(gc_total);
781 
782 	if (now - last_gc < ip_rt_gc_min_interval &&
783 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
784 		RT_CACHE_STAT_INC(gc_ignored);
785 		goto out;
786 	}
787 
788 	/* Calculate the number of entries which we want to expire now. */
789 	goal = atomic_read(&ipv4_dst_ops.entries) -
790 		(ip_rt_gc_elasticity << rt_hash_log);
791 	if (goal <= 0) {
792 		if (equilibrium < ipv4_dst_ops.gc_thresh)
793 			equilibrium = ipv4_dst_ops.gc_thresh;
794 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
795 		if (goal > 0) {
796 			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
797 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
798 		}
799 	} else {
800 		/* We are in a dangerous area. Try to reduce the cache really
801 		 * aggressively.
802 		 */
803 		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
804 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
805 	}
806 
807 	if (now - last_gc >= ip_rt_gc_min_interval)
808 		last_gc = now;
809 
810 	if (goal <= 0) {
811 		equilibrium += goal;
812 		goto work_done;
813 	}
814 
815 	do {
816 		int i, k;
817 
818 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
819 			unsigned long tmo = expire;
820 
821 			k = (k + 1) & rt_hash_mask;
822 			rthp = &rt_hash_table[k].chain;
823 			spin_lock_bh(rt_hash_lock_addr(k));
824 			while ((rth = *rthp) != NULL) {
825 				if (!rt_may_expire(rth, tmo, expire)) {
826 					tmo >>= 1;
827 					rthp = &rth->u.rt_next;
828 					continue;
829 				}
830 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
831 				/* remove all related balanced entries
832 				 * if necessary
833 				 */
834 				if (rth->u.dst.flags & DST_BALANCED) {
835 					int r;
836 
837 					rthp = rt_remove_balanced_route(
838 						&rt_hash_table[i].chain,
839 						rth,
840 						&r);
841 					goal -= r;
842 					if (!rthp)
843 						break;
844 				} else {
845 					*rthp = rth->u.rt_next;
846 					rt_free(rth);
847 					goal--;
848 				}
849 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
850 				*rthp = rth->u.rt_next;
851 				rt_free(rth);
852 				goal--;
853 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
854 			}
855 			spin_unlock_bh(rt_hash_lock_addr(k));
856 			if (goal <= 0)
857 				break;
858 		}
859 		rover = k;
860 
861 		if (goal <= 0)
862 			goto work_done;
863 
864 		/* The goal was not achieved. We stop the process if:
865 
866 		   - expire was reduced to zero; otherwise expire is halved.
867 		   - the table is not full.
868 		   - we were called from interrupt context.
869 		   - the jiffies check is just a fallback/debug loop breaker;
870 		     we will not spin here for a long time in any case.
871 		 */
872 
873 		RT_CACHE_STAT_INC(gc_goal_miss);
874 
875 		if (expire == 0)
876 			break;
877 
878 		expire >>= 1;
879 #if RT_CACHE_DEBUG >= 2
880 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
881 				atomic_read(&ipv4_dst_ops.entries), goal, i);
882 #endif
883 
884 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
885 			goto out;
886 	} while (!in_softirq() && time_before_eq(jiffies, now));
887 
888 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
889 		goto out;
890 	if (net_ratelimit())
891 		printk(KERN_WARNING "dst cache overflow\n");
892 	RT_CACHE_STAT_INC(gc_dst_overflow);
893 	return 1;
894 
895 work_done:
896 	expire += ip_rt_gc_min_interval;
897 	if (expire > ip_rt_gc_timeout ||
898 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
899 		expire = ip_rt_gc_timeout;
900 #if RT_CACHE_DEBUG >= 2
901 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
902 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
903 #endif
904 out:	return 0;
905 }
906 
907 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
908 {
909 	struct rtable	*rth, **rthp;
910 	unsigned long	now;
911 	struct rtable *cand, **candp;
912 	u32 		min_score;
913 	int		chain_length;
914 	int attempts = !in_softirq();
915 
916 restart:
917 	chain_length = 0;
918 	min_score = ~(u32)0;
919 	cand = NULL;
920 	candp = NULL;
921 	now = jiffies;
922 
923 	rthp = &rt_hash_table[hash].chain;
924 
925 	spin_lock_bh(rt_hash_lock_addr(hash));
926 	while ((rth = *rthp) != NULL) {
927 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
928 		if (!(rth->u.dst.flags & DST_BALANCED) &&
929 		    compare_keys(&rth->fl, &rt->fl)) {
930 #else
931 		if (compare_keys(&rth->fl, &rt->fl)) {
932 #endif
933 			/* Put it first */
934 			*rthp = rth->u.rt_next;
935 			/*
936 			 * Since lookup is lockfree, the deletion
937 			 * must be visible to another weakly ordered CPU before
938 			 * the insertion at the start of the hash chain.
939 			 */
940 			rcu_assign_pointer(rth->u.rt_next,
941 					   rt_hash_table[hash].chain);
942 			/*
943 			 * Since lookup is lockfree, the update writes
944 			 * must be ordered for consistency on SMP.
945 			 */
946 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
947 
948 			rth->u.dst.__use++;
949 			dst_hold(&rth->u.dst);
950 			rth->u.dst.lastuse = now;
951 			spin_unlock_bh(rt_hash_lock_addr(hash));
952 
953 			rt_drop(rt);
954 			*rp = rth;
955 			return 0;
956 		}
957 
958 		if (!atomic_read(&rth->u.dst.__refcnt)) {
959 			u32 score = rt_score(rth);
960 
961 			if (score <= min_score) {
962 				cand = rth;
963 				candp = rthp;
964 				min_score = score;
965 			}
966 		}
967 
968 		chain_length++;
969 
970 		rthp = &rth->u.rt_next;
971 	}
972 
973 	if (cand) {
974 		/* ip_rt_gc_elasticity used to be the average chain length;
975 		 * when it is exceeded, gc becomes really aggressive.
976 		 *
977 		 * The second limit is less certain. At the moment it allows
978 		 * only 2 entries per bucket. We will see.
979 		 */
980 		if (chain_length > ip_rt_gc_elasticity) {
981 			*candp = cand->u.rt_next;
982 			rt_free(cand);
983 		}
984 	}
985 
986 	/* Try to bind the route to an ARP neighbour only if it is an
987 	   output route or on the unicast forwarding path.
988 	 */
989 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
990 		int err = arp_bind_neighbour(&rt->u.dst);
991 		if (err) {
992 			spin_unlock_bh(rt_hash_lock_addr(hash));
993 
994 			if (err != -ENOBUFS) {
995 				rt_drop(rt);
996 				return err;
997 			}
998 
999 			/* Neighbour tables are full and nothing
1000 			   can be released. Try to shrink the route cache;
1001 			   it most likely holds some neighbour records.
1002 			 */
1003 			if (attempts-- > 0) {
1004 				int saved_elasticity = ip_rt_gc_elasticity;
1005 				int saved_int = ip_rt_gc_min_interval;
1006 				ip_rt_gc_elasticity	= 1;
1007 				ip_rt_gc_min_interval	= 0;
1008 				rt_garbage_collect();
1009 				ip_rt_gc_min_interval	= saved_int;
1010 				ip_rt_gc_elasticity	= saved_elasticity;
1011 				goto restart;
1012 			}
1013 
1014 			if (net_ratelimit())
1015 				printk(KERN_WARNING "Neighbour table overflow.\n");
1016 			rt_drop(rt);
1017 			return -ENOBUFS;
1018 		}
1019 	}
1020 
1021 	rt->u.rt_next = rt_hash_table[hash].chain;
1022 #if RT_CACHE_DEBUG >= 2
1023 	if (rt->u.rt_next) {
1024 		struct rtable *trt;
1025 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1026 		       NIPQUAD(rt->rt_dst));
1027 		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1028 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1029 		printk("\n");
1030 	}
1031 #endif
1032 	rt_hash_table[hash].chain = rt;
1033 	spin_unlock_bh(rt_hash_lock_addr(hash));
1034 	*rp = rt;
1035 	return 0;
1036 }
1037 
1038 void rt_bind_peer(struct rtable *rt, int create)
1039 {
1040 	static DEFINE_SPINLOCK(rt_peer_lock);
1041 	struct inet_peer *peer;
1042 
1043 	peer = inet_getpeer(rt->rt_dst, create);
1044 
1045 	spin_lock_bh(&rt_peer_lock);
1046 	if (rt->peer == NULL) {
1047 		rt->peer = peer;
1048 		peer = NULL;
1049 	}
1050 	spin_unlock_bh(&rt_peer_lock);
1051 	if (peer)
1052 		inet_putpeer(peer);
1053 }
1054 
1055 /*
1056  * Peer allocation may fail only in serious out-of-memory conditions.  However
1057  * we can still generate some output.
1058  * Random ID selection looks a bit dangerous because we have no way to
1059  * guarantee that the selected ID is unique over a reasonable period of time.
1060  * But a broken packet identifier may be better than no packet at all.
1061  */
1062 static void ip_select_fb_ident(struct iphdr *iph)
1063 {
1064 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1065 	static u32 ip_fallback_id;
1066 	u32 salt;
1067 
1068 	spin_lock_bh(&ip_fb_id_lock);
1069 	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1070 	iph->id = htons(salt & 0xFFFF);
1071 	ip_fallback_id = salt;
1072 	spin_unlock_bh(&ip_fb_id_lock);
1073 }
1074 
1075 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1076 {
1077 	struct rtable *rt = (struct rtable *) dst;
1078 
1079 	if (rt) {
1080 		if (rt->peer == NULL)
1081 			rt_bind_peer(rt, 1);
1082 
1083 		/* Once a peer is attached to a destination, it is never detached,
1084 		   so we do not need to grab a lock to dereference it.
1085 		 */
1086 		if (rt->peer) {
1087 			iph->id = htons(inet_getid(rt->peer, more));
1088 			return;
1089 		}
1090 	} else
1091 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1092 		       __builtin_return_address(0));
1093 
1094 	ip_select_fb_ident(iph);
1095 }
1096 
1097 static void rt_del(unsigned hash, struct rtable *rt)
1098 {
1099 	struct rtable **rthp;
1100 
1101 	spin_lock_bh(rt_hash_lock_addr(hash));
1102 	ip_rt_put(rt);
1103 	for (rthp = &rt_hash_table[hash].chain; *rthp;
1104 	     rthp = &(*rthp)->u.rt_next)
1105 		if (*rthp == rt) {
1106 			*rthp = rt->u.rt_next;
1107 			rt_free(rt);
1108 			break;
1109 		}
1110 	spin_unlock_bh(rt_hash_lock_addr(hash));
1111 }
1112 
1113 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1114 		    u32 saddr, u8 tos, struct net_device *dev)
1115 {
1116 	int i, k;
1117 	struct in_device *in_dev = in_dev_get(dev);
1118 	struct rtable *rth, **rthp;
1119 	u32  skeys[2] = { saddr, 0 };
1120 	int  ikeys[2] = { dev->ifindex, 0 };
1121 
1122 	tos &= IPTOS_RT_MASK;
1123 
1124 	if (!in_dev)
1125 		return;
1126 
1127 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1128 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1129 		goto reject_redirect;
1130 
1131 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1132 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1133 			goto reject_redirect;
1134 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1135 			goto reject_redirect;
1136 	} else {
1137 		if (inet_addr_type(new_gw) != RTN_UNICAST)
1138 			goto reject_redirect;
1139 	}
1140 
1141 	for (i = 0; i < 2; i++) {
1142 		for (k = 0; k < 2; k++) {
1143 			unsigned hash = rt_hash_code(daddr,
1144 						     skeys[i] ^ (ikeys[k] << 5),
1145 						     tos);
1146 
1147 			rthp=&rt_hash_table[hash].chain;
1148 
1149 			rcu_read_lock();
1150 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1151 				struct rtable *rt;
1152 
1153 				if (rth->fl.fl4_dst != daddr ||
1154 				    rth->fl.fl4_src != skeys[i] ||
1155 				    rth->fl.fl4_tos != tos ||
1156 				    rth->fl.oif != ikeys[k] ||
1157 				    rth->fl.iif != 0) {
1158 					rthp = &rth->u.rt_next;
1159 					continue;
1160 				}
1161 
1162 				if (rth->rt_dst != daddr ||
1163 				    rth->rt_src != saddr ||
1164 				    rth->u.dst.error ||
1165 				    rth->rt_gateway != old_gw ||
1166 				    rth->u.dst.dev != dev)
1167 					break;
1168 
1169 				dst_hold(&rth->u.dst);
1170 				rcu_read_unlock();
1171 
1172 				rt = dst_alloc(&ipv4_dst_ops);
1173 				if (rt == NULL) {
1174 					ip_rt_put(rth);
1175 					in_dev_put(in_dev);
1176 					return;
1177 				}
1178 
1179 				/* Copy all the information. */
1180 				*rt = *rth;
1181  				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1182 				rt->u.dst.__use		= 1;
1183 				atomic_set(&rt->u.dst.__refcnt, 1);
1184 				rt->u.dst.child		= NULL;
1185 				if (rt->u.dst.dev)
1186 					dev_hold(rt->u.dst.dev);
1187 				if (rt->idev)
1188 					in_dev_hold(rt->idev);
1189 				rt->u.dst.obsolete	= 0;
1190 				rt->u.dst.lastuse	= jiffies;
1191 				rt->u.dst.path		= &rt->u.dst;
1192 				rt->u.dst.neighbour	= NULL;
1193 				rt->u.dst.hh		= NULL;
1194 				rt->u.dst.xfrm		= NULL;
1195 
1196 				rt->rt_flags		|= RTCF_REDIRECTED;
1197 
1198 				/* Gateway is different ... */
1199 				rt->rt_gateway		= new_gw;
1200 
1201 				/* Redirect received -> path was valid */
1202 				dst_confirm(&rth->u.dst);
1203 
1204 				if (rt->peer)
1205 					atomic_inc(&rt->peer->refcnt);
1206 
1207 				if (arp_bind_neighbour(&rt->u.dst) ||
1208 				    !(rt->u.dst.neighbour->nud_state &
1209 					    NUD_VALID)) {
1210 					if (rt->u.dst.neighbour)
1211 						neigh_event_send(rt->u.dst.neighbour, NULL);
1212 					ip_rt_put(rth);
1213 					rt_drop(rt);
1214 					goto do_next;
1215 				}
1216 
1217 				rt_del(hash, rth);
1218 				if (!rt_intern_hash(hash, rt, &rt))
1219 					ip_rt_put(rt);
1220 				goto do_next;
1221 			}
1222 			rcu_read_unlock();
1223 		do_next:
1224 			;
1225 		}
1226 	}
1227 	in_dev_put(in_dev);
1228 	return;
1229 
1230 reject_redirect:
1231 #ifdef CONFIG_IP_ROUTE_VERBOSE
1232 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1233 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1234 			"%u.%u.%u.%u ignored.\n"
1235 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1236 			"tos %02x\n",
1237 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1238 		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
1239 #endif
1240 	in_dev_put(in_dev);
1241 }
1242 
1243 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1244 {
1245 	struct rtable *rt = (struct rtable*)dst;
1246 	struct dst_entry *ret = dst;
1247 
1248 	if (rt) {
1249 		if (dst->obsolete) {
1250 			ip_rt_put(rt);
1251 			ret = NULL;
1252 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1253 			   rt->u.dst.expires) {
1254 			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1255 						     rt->fl.fl4_src ^
1256 							(rt->fl.oif << 5),
1257 						     rt->fl.fl4_tos);
1258 #if RT_CACHE_DEBUG >= 1
1259 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
1260 					  "%u.%u.%u.%u/%02x dropped\n",
1261 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1262 #endif
1263 			rt_del(hash, rt);
1264 			ret = NULL;
1265 		}
1266 	}
1267 	return ret;
1268 }
1269 
1270 /*
1271  * Algorithm:
1272  *	1. The first ip_rt_redirect_number redirects are sent
1273  *	   with exponential backoff, then we stop sending them at all,
1274  *	   assuming that the host ignores our redirects.
1275  *	2. If we did not see packets requiring redirects
1276  *	   during ip_rt_redirect_silence, we assume that the host
1277  *	   forgot the redirected route, and we start sending redirects again.
1278  *
1279  * This algorithm is much cheaper and more intelligent than dumb load limiting
1280  * in icmp.c.
1281  *
1282  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1283  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1284  */
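
/*
 * A worked example of the schedule above, assuming HZ = 1000 and the
 * defaults at the top of this file: ip_rt_redirect_load = HZ/50 = 20
 * jiffies, so successive redirects for one destination are allowed after
 * 20 << 0, 20 << 1, 20 << 2, ... jiffies (20 ms, 40 ms, 80 ms, ...).  After
 * ip_rt_redirect_number (9) unanswered redirects we go silent, and only
 * ip_rt_redirect_silence = 20 << 10 = 20480 jiffies (about 20 s) without
 * redirect-worthy traffic re-arms the algorithm.
 */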
1285 
1286 void ip_rt_send_redirect(struct sk_buff *skb)
1287 {
1288 	struct rtable *rt = (struct rtable*)skb->dst;
1289 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1290 
1291 	if (!in_dev)
1292 		return;
1293 
1294 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1295 		goto out;
1296 
1297 	/* No redirected packets during ip_rt_redirect_silence;
1298 	 * reset the algorithm.
1299 	 */
1300 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1301 		rt->u.dst.rate_tokens = 0;
1302 
1303 	/* Too many ignored redirects; do not send anything.
1304 	 * Set u.dst.rate_last to the last seen redirected packet.
1305 	 */
1306 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1307 		rt->u.dst.rate_last = jiffies;
1308 		goto out;
1309 	}
1310 
1311 	/* Check for load limit; set rate_last to the latest sent
1312 	 * redirect.
1313 	 */
1314 	if (time_after(jiffies,
1315 		       (rt->u.dst.rate_last +
1316 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1317 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1318 		rt->u.dst.rate_last = jiffies;
1319 		++rt->u.dst.rate_tokens;
1320 #ifdef CONFIG_IP_ROUTE_VERBOSE
1321 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1322 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1323 		    net_ratelimit())
1324 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1325 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1326 				NIPQUAD(rt->rt_src), rt->rt_iif,
1327 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1328 #endif
1329 	}
1330 out:
1331         in_dev_put(in_dev);
1332 }
1333 
1334 static int ip_error(struct sk_buff *skb)
1335 {
1336 	struct rtable *rt = (struct rtable*)skb->dst;
1337 	unsigned long now;
1338 	int code;
1339 
1340 	switch (rt->u.dst.error) {
1341 		case EINVAL:
1342 		default:
1343 			goto out;
1344 		case EHOSTUNREACH:
1345 			code = ICMP_HOST_UNREACH;
1346 			break;
1347 		case ENETUNREACH:
1348 			code = ICMP_NET_UNREACH;
1349 			break;
1350 		case EACCES:
1351 			code = ICMP_PKT_FILTERED;
1352 			break;
1353 	}
1354 
1355 	now = jiffies;
1356 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1357 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1358 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1359 	rt->u.dst.rate_last = now;
1360 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1361 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1362 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1363 	}
1364 
1365 out:	kfree_skb(skb);
1366 	return 0;
1367 }
1368 
1369 /*
1370  *	The last two values are not from the RFC but
1371  *	are needed for AMPRnet AX.25 paths.
1372  */
1373 
1374 static unsigned short mtu_plateau[] =
1375 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1376 
1377 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1378 {
1379 	int i;
1380 
1381 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1382 		if (old_mtu > mtu_plateau[i])
1383 			return mtu_plateau[i];
1384 	return 68;
1385 }
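
/*
 * Example (illustrative): if the too-big datagram that triggered the ICMP
 * error was 1500 bytes and the router did not report a next-hop MTU,
 * guess_mtu(1500) returns 1492, the first plateau strictly below 1500;
 * guess_mtu(576) returns 296, and anything at or below the last plateau of
 * 128 falls back to the IPv4 minimum of 68.
 */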
1386 
1387 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1388 {
1389 	int i;
1390 	unsigned short old_mtu = ntohs(iph->tot_len);
1391 	struct rtable *rth;
1392 	u32  skeys[2] = { iph->saddr, 0, };
1393 	u32  daddr = iph->daddr;
1394 	u8   tos = iph->tos & IPTOS_RT_MASK;
1395 	unsigned short est_mtu = 0;
1396 
1397 	if (ipv4_config.no_pmtu_disc)
1398 		return 0;
1399 
1400 	for (i = 0; i < 2; i++) {
1401 		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1402 
1403 		rcu_read_lock();
1404 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1405 		     rth = rcu_dereference(rth->u.rt_next)) {
1406 			if (rth->fl.fl4_dst == daddr &&
1407 			    rth->fl.fl4_src == skeys[i] &&
1408 			    rth->rt_dst  == daddr &&
1409 			    rth->rt_src  == iph->saddr &&
1410 			    rth->fl.fl4_tos == tos &&
1411 			    rth->fl.iif == 0 &&
1412 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1413 				unsigned short mtu = new_mtu;
1414 
1415 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1416 
1417 					/* BSD 4.2 compatibility hack :-( */
1418 					if (mtu == 0 &&
1419 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1420 					    old_mtu >= 68 + (iph->ihl << 2))
1421 						old_mtu -= iph->ihl << 2;
1422 
1423 					mtu = guess_mtu(old_mtu);
1424 				}
1425 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1426 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1427 						dst_confirm(&rth->u.dst);
1428 						if (mtu < ip_rt_min_pmtu) {
1429 							mtu = ip_rt_min_pmtu;
1430 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1431 								(1 << RTAX_MTU);
1432 						}
1433 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1434 						dst_set_expires(&rth->u.dst,
1435 							ip_rt_mtu_expires);
1436 					}
1437 					est_mtu = mtu;
1438 				}
1439 			}
1440 		}
1441 		rcu_read_unlock();
1442 	}
1443 	return est_mtu ? : new_mtu;
1444 }
1445 
1446 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1447 {
1448 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1449 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1450 		if (mtu < ip_rt_min_pmtu) {
1451 			mtu = ip_rt_min_pmtu;
1452 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1453 		}
1454 		dst->metrics[RTAX_MTU-1] = mtu;
1455 		dst_set_expires(dst, ip_rt_mtu_expires);
1456 	}
1457 }
1458 
1459 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1460 {
1461 	return NULL;
1462 }
1463 
1464 static void ipv4_dst_destroy(struct dst_entry *dst)
1465 {
1466 	struct rtable *rt = (struct rtable *) dst;
1467 	struct inet_peer *peer = rt->peer;
1468 	struct in_device *idev = rt->idev;
1469 
1470 	if (peer) {
1471 		rt->peer = NULL;
1472 		inet_putpeer(peer);
1473 	}
1474 
1475 	if (idev) {
1476 		rt->idev = NULL;
1477 		in_dev_put(idev);
1478 	}
1479 }
1480 
1481 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1482 			    int how)
1483 {
1484 	struct rtable *rt = (struct rtable *) dst;
1485 	struct in_device *idev = rt->idev;
1486 	if (dev != &loopback_dev && idev && idev->dev == dev) {
1487 		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1488 		if (loopback_idev) {
1489 			rt->idev = loopback_idev;
1490 			in_dev_put(idev);
1491 		}
1492 	}
1493 }
1494 
1495 static void ipv4_link_failure(struct sk_buff *skb)
1496 {
1497 	struct rtable *rt;
1498 
1499 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1500 
1501 	rt = (struct rtable *) skb->dst;
1502 	if (rt)
1503 		dst_set_expires(&rt->u.dst, 0);
1504 }
1505 
1506 static int ip_rt_bug(struct sk_buff *skb)
1507 {
1508 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1509 		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1510 		skb->dev ? skb->dev->name : "?");
1511 	kfree_skb(skb);
1512 	return 0;
1513 }
1514 
1515 /*
1516    We do not cache the source address of the outgoing interface,
1517    because it is used only by the IP RR, TS and SRR options,
1518    so it is out of the fast path.
1519 
1520    BTW remember: "addr" is allowed to be unaligned
1521    in IP options!
1522  */
1523 
1524 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1525 {
1526 	u32 src;
1527 	struct fib_result res;
1528 
1529 	if (rt->fl.iif == 0)
1530 		src = rt->rt_src;
1531 	else if (fib_lookup(&rt->fl, &res) == 0) {
1532 		src = FIB_RES_PREFSRC(res);
1533 		fib_res_put(&res);
1534 	} else
1535 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1536 					RT_SCOPE_UNIVERSE);
1537 	memcpy(addr, &src, 4);
1538 }
1539 
1540 #ifdef CONFIG_NET_CLS_ROUTE
1541 static void set_class_tag(struct rtable *rt, u32 tag)
1542 {
1543 	if (!(rt->u.dst.tclassid & 0xFFFF))
1544 		rt->u.dst.tclassid |= tag & 0xFFFF;
1545 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1546 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1547 }
1548 #endif
1549 
1550 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1551 {
1552 	struct fib_info *fi = res->fi;
1553 
1554 	if (fi) {
1555 		if (FIB_RES_GW(*res) &&
1556 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1557 			rt->rt_gateway = FIB_RES_GW(*res);
1558 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1559 		       sizeof(rt->u.dst.metrics));
1560 		if (fi->fib_mtu == 0) {
1561 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1562 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1563 			    rt->rt_gateway != rt->rt_dst &&
1564 			    rt->u.dst.dev->mtu > 576)
1565 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1566 		}
1567 #ifdef CONFIG_NET_CLS_ROUTE
1568 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1569 #endif
1570 	} else
1571 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1572 
1573 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1574 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1575 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1576 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1577 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1578 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1579 				       ip_rt_min_advmss);
1580 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1581 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1582 
1583 #ifdef CONFIG_NET_CLS_ROUTE
1584 #ifdef CONFIG_IP_MULTIPLE_TABLES
1585 	set_class_tag(rt, fib_rules_tclass(res));
1586 #endif
1587 	set_class_tag(rt, itag);
1588 #endif
1589         rt->rt_type = res->type;
1590 }
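
/*
 * Numeric illustration of the metric defaults filled in above: for a route
 * over a 1500-byte-MTU device with no explicit metrics, RTAX_MTU becomes
 * 1500 and RTAX_ADVMSS becomes max(1500 - 40, ip_rt_min_advmss) = 1460,
 * the usual Ethernet TCP MSS; the 40 accounts for the minimal IPv4 and TCP
 * headers, and the advertised MSS is further capped at 65535 - 40.
 */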
1591 
1592 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1593 				u8 tos, struct net_device *dev, int our)
1594 {
1595 	unsigned hash;
1596 	struct rtable *rth;
1597 	u32 spec_dst;
1598 	struct in_device *in_dev = in_dev_get(dev);
1599 	u32 itag = 0;
1600 
1601 	/* Primary sanity checks. */
1602 
1603 	if (in_dev == NULL)
1604 		return -EINVAL;
1605 
1606 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1607 	    skb->protocol != htons(ETH_P_IP))
1608 		goto e_inval;
1609 
1610 	if (ZERONET(saddr)) {
1611 		if (!LOCAL_MCAST(daddr))
1612 			goto e_inval;
1613 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1614 	} else if (fib_validate_source(saddr, 0, tos, 0,
1615 					dev, &spec_dst, &itag) < 0)
1616 		goto e_inval;
1617 
1618 	rth = dst_alloc(&ipv4_dst_ops);
1619 	if (!rth)
1620 		goto e_nobufs;
1621 
1622 	rth->u.dst.output= ip_rt_bug;
1623 
1624 	atomic_set(&rth->u.dst.__refcnt, 1);
1625 	rth->u.dst.flags= DST_HOST;
1626 	if (in_dev->cnf.no_policy)
1627 		rth->u.dst.flags |= DST_NOPOLICY;
1628 	rth->fl.fl4_dst	= daddr;
1629 	rth->rt_dst	= daddr;
1630 	rth->fl.fl4_tos	= tos;
1631 #ifdef CONFIG_IP_ROUTE_FWMARK
1632 	rth->fl.fl4_fwmark= skb->nfmark;
1633 #endif
1634 	rth->fl.fl4_src	= saddr;
1635 	rth->rt_src	= saddr;
1636 #ifdef CONFIG_NET_CLS_ROUTE
1637 	rth->u.dst.tclassid = itag;
1638 #endif
1639 	rth->rt_iif	=
1640 	rth->fl.iif	= dev->ifindex;
1641 	rth->u.dst.dev	= &loopback_dev;
1642 	dev_hold(rth->u.dst.dev);
1643 	rth->idev	= in_dev_get(rth->u.dst.dev);
1644 	rth->fl.oif	= 0;
1645 	rth->rt_gateway	= daddr;
1646 	rth->rt_spec_dst= spec_dst;
1647 	rth->rt_type	= RTN_MULTICAST;
1648 	rth->rt_flags	= RTCF_MULTICAST;
1649 	if (our) {
1650 		rth->u.dst.input= ip_local_deliver;
1651 		rth->rt_flags |= RTCF_LOCAL;
1652 	}
1653 
1654 #ifdef CONFIG_IP_MROUTE
1655 	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1656 		rth->u.dst.input = ip_mr_input;
1657 #endif
1658 	RT_CACHE_STAT_INC(in_slow_mc);
1659 
1660 	in_dev_put(in_dev);
1661 	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1662 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1663 
1664 e_nobufs:
1665 	in_dev_put(in_dev);
1666 	return -ENOBUFS;
1667 
1668 e_inval:
1669 	in_dev_put(in_dev);
1670 	return -EINVAL;
1671 }
1672 
1673 
1674 static void ip_handle_martian_source(struct net_device *dev,
1675 				     struct in_device *in_dev,
1676 				     struct sk_buff *skb,
1677 				     u32 daddr,
1678 				     u32 saddr)
1679 {
1680 	RT_CACHE_STAT_INC(in_martian_src);
1681 #ifdef CONFIG_IP_ROUTE_VERBOSE
1682 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1683 		/*
1684 		 *	RFC1812 recommendation: if the source is martian,
1685 		 *	the only hint is the MAC header.
1686 		 */
1687 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1688 			"%u.%u.%u.%u, on dev %s\n",
1689 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1690 		if (dev->hard_header_len && skb->mac.raw) {
1691 			int i;
1692 			unsigned char *p = skb->mac.raw;
1693 			printk(KERN_WARNING "ll header: ");
1694 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1695 				printk("%02x", *p);
1696 				if (i < (dev->hard_header_len - 1))
1697 					printk(":");
1698 			}
1699 			printk("\n");
1700 		}
1701 	}
1702 #endif
1703 }
1704 
1705 static inline int __mkroute_input(struct sk_buff *skb,
1706 				  struct fib_result* res,
1707 				  struct in_device *in_dev,
1708 				  u32 daddr, u32 saddr, u32 tos,
1709 				  struct rtable **result)
1710 {
1711 
1712 	struct rtable *rth;
1713 	int err;
1714 	struct in_device *out_dev;
1715 	unsigned flags = 0;
1716 	u32 spec_dst, itag;
1717 
1718 	/* get a working reference to the output device */
1719 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1720 	if (out_dev == NULL) {
1721 		if (net_ratelimit())
1722 			printk(KERN_CRIT "Bug in ip_route_input" \
1723 			       "_slow(). Please, report\n");
1724 		return -EINVAL;
1725 	}
1726 
1727 
1728 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1729 				  in_dev->dev, &spec_dst, &itag);
1730 	if (err < 0) {
1731 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1732 					 saddr);
1733 
1734 		err = -EINVAL;
1735 		goto cleanup;
1736 	}
1737 
1738 	if (err)
1739 		flags |= RTCF_DIRECTSRC;
1740 
1741 	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1742 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1743 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1744 		flags |= RTCF_DOREDIRECT;
1745 
1746 	if (skb->protocol != htons(ETH_P_IP)) {
1747 		/* Not IP (i.e. ARP). Do not create a route if it is
1748 		 * invalid for proxy ARP. DNAT routes are always valid.
1749 		 */
1750 		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1751 			err = -EINVAL;
1752 			goto cleanup;
1753 		}
1754 	}
1755 
1756 
1757 	rth = dst_alloc(&ipv4_dst_ops);
1758 	if (!rth) {
1759 		err = -ENOBUFS;
1760 		goto cleanup;
1761 	}
1762 
1763 	atomic_set(&rth->u.dst.__refcnt, 1);
1764 	rth->u.dst.flags= DST_HOST;
1765 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1766 	if (res->fi->fib_nhs > 1)
1767 		rth->u.dst.flags |= DST_BALANCED;
1768 #endif
1769 	if (in_dev->cnf.no_policy)
1770 		rth->u.dst.flags |= DST_NOPOLICY;
1771 	if (in_dev->cnf.no_xfrm)
1772 		rth->u.dst.flags |= DST_NOXFRM;
1773 	rth->fl.fl4_dst	= daddr;
1774 	rth->rt_dst	= daddr;
1775 	rth->fl.fl4_tos	= tos;
1776 #ifdef CONFIG_IP_ROUTE_FWMARK
1777 	rth->fl.fl4_fwmark= skb->nfmark;
1778 #endif
1779 	rth->fl.fl4_src	= saddr;
1780 	rth->rt_src	= saddr;
1781 	rth->rt_gateway	= daddr;
1782 	rth->rt_iif 	=
1783 		rth->fl.iif	= in_dev->dev->ifindex;
1784 	rth->u.dst.dev	= (out_dev)->dev;
1785 	dev_hold(rth->u.dst.dev);
1786 	rth->idev	= in_dev_get(rth->u.dst.dev);
1787 	rth->fl.oif 	= 0;
1788 	rth->rt_spec_dst= spec_dst;
1789 
1790 	rth->u.dst.input = ip_forward;
1791 	rth->u.dst.output = ip_output;
1792 
1793 	rt_set_nexthop(rth, res, itag);
1794 
1795 	rth->rt_flags = flags;
1796 
1797 	*result = rth;
1798 	err = 0;
1799  cleanup:
1800 	/* release the working reference to the output device */
1801 	in_dev_put(out_dev);
1802 	return err;
1803 }
1804 
1805 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1806 				       struct fib_result* res,
1807 				       const struct flowi *fl,
1808 				       struct in_device *in_dev,
1809 				       u32 daddr, u32 saddr, u32 tos)
1810 {
1811 	struct rtable* rth = NULL;
1812 	int err;
1813 	unsigned hash;
1814 
1815 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1816 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1817 		fib_select_multipath(fl, res);
1818 #endif
1819 
1820 	/* create a routing cache entry */
1821 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1822 	if (err)
1823 		return err;
1824 
1825 	/* put it into the cache */
1826 	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1827 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1828 }
1829 
1830 static inline int ip_mkroute_input(struct sk_buff *skb,
1831 				   struct fib_result* res,
1832 				   const struct flowi *fl,
1833 				   struct in_device *in_dev,
1834 				   u32 daddr, u32 saddr, u32 tos)
1835 {
1836 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1837 	struct rtable* rth = NULL, *rtres;
1838 	unsigned char hop, hopcount;
1839 	int err = -EINVAL;
1840 	unsigned int hash;
1841 
1842 	if (res->fi)
1843 		hopcount = res->fi->fib_nhs;
1844 	else
1845 		hopcount = 1;
1846 
1847 	/* distinguish between multipath and singlepath */
1848 	if (hopcount < 2)
1849 		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1850 					    saddr, tos);
1851 
1852 	/* add all alternatives to the routing cache */
1853 	for (hop = 0; hop < hopcount; hop++) {
1854 		res->nh_sel = hop;
1855 
1856 		/* put reference to previous result */
1857 		if (hop)
1858 			ip_rt_put(rtres);
1859 
1860 		/* create a routing cache entry */
1861 		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1862 				      &rth);
1863 		if (err)
1864 			return err;
1865 
1866 		/* put it into the cache */
1867 		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1868 		err = rt_intern_hash(hash, rth, &rtres);
1869 		if (err)
1870 			return err;
1871 
1872 		/* forward hop information to multipath impl. */
1873 		multipath_set_nhinfo(rth,
1874 				     FIB_RES_NETWORK(*res),
1875 				     FIB_RES_NETMASK(*res),
1876 				     res->prefixlen,
1877 				     &FIB_RES_NH(*res));
1878 	}
1879 	skb->dst = &rtres->u.dst;
1880 	return err;
1881 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1882 	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1883 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1884 }
1885 
1886 
1887 /*
1888  *	NOTE. We drop all packets that have local source
1889  *	addresses, because every properly looped-back packet
1890  *	must already have the correct destination attached by the output routine.
1891  *
1892  *	This approach solves two big problems:
1893  *	1. Non-simplex devices are handled properly.
1894  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1895  */
1896 
1897 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1898 			       u8 tos, struct net_device *dev)
1899 {
1900 	struct fib_result res;
1901 	struct in_device *in_dev = in_dev_get(dev);
1902 	struct flowi fl = { .nl_u = { .ip4_u =
1903 				      { .daddr = daddr,
1904 					.saddr = saddr,
1905 					.tos = tos,
1906 					.scope = RT_SCOPE_UNIVERSE,
1907 #ifdef CONFIG_IP_ROUTE_FWMARK
1908 					.fwmark = skb->nfmark
1909 #endif
1910 				      } },
1911 			    .iif = dev->ifindex };
1912 	unsigned	flags = 0;
1913 	u32		itag = 0;
1914 	struct rtable * rth;
1915 	unsigned	hash;
1916 	u32		spec_dst;
1917 	int		err = -EINVAL;
1918 	int		free_res = 0;
1919 
1920 	/* IP on this device is disabled. */
1921 
1922 	if (!in_dev)
1923 		goto out;
1924 
1925 	/* Check for the weirdest martians, which cannot be detected
1926 	   by fib_lookup.
1927 	 */
1928 
1929 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1930 		goto martian_source;
1931 
1932 	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1933 		goto brd_input;
1934 
1935 	/* Accept zero addresses only for limited broadcast;
1936 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1937 	 */
1938 	if (ZERONET(saddr))
1939 		goto martian_source;
1940 
1941 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1942 		goto martian_destination;
1943 
1944 	/*
1945 	 *	Now we are ready to route the packet.
1946 	 */
1947 	if ((err = fib_lookup(&fl, &res)) != 0) {
1948 		if (!IN_DEV_FORWARD(in_dev))
1949 			goto e_hostunreach;
1950 		goto no_route;
1951 	}
1952 	free_res = 1;
1953 
1954 	RT_CACHE_STAT_INC(in_slow_tot);
1955 
1956 	if (res.type == RTN_BROADCAST)
1957 		goto brd_input;
1958 
1959 	if (res.type == RTN_LOCAL) {
1960 		int result;
1961 		result = fib_validate_source(saddr, daddr, tos,
1962 					     loopback_dev.ifindex,
1963 					     dev, &spec_dst, &itag);
1964 		if (result < 0)
1965 			goto martian_source;
1966 		if (result)
1967 			flags |= RTCF_DIRECTSRC;
1968 		spec_dst = daddr;
1969 		goto local_input;
1970 	}
1971 
1972 	if (!IN_DEV_FORWARD(in_dev))
1973 		goto e_hostunreach;
1974 	if (res.type != RTN_UNICAST)
1975 		goto martian_destination;
1976 
1977 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1978 	if (err == -ENOBUFS)
1979 		goto e_nobufs;
1980 	if (err == -EINVAL)
1981 		goto e_inval;
1982 
1983 done:
1984 	in_dev_put(in_dev);
1985 	if (free_res)
1986 		fib_res_put(&res);
1987 out:	return err;
1988 
1989 brd_input:
1990 	if (skb->protocol != htons(ETH_P_IP))
1991 		goto e_inval;
1992 
1993 	if (ZERONET(saddr))
1994 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1995 	else {
1996 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1997 					  &itag);
1998 		if (err < 0)
1999 			goto martian_source;
2000 		if (err)
2001 			flags |= RTCF_DIRECTSRC;
2002 	}
2003 	flags |= RTCF_BROADCAST;
2004 	res.type = RTN_BROADCAST;
2005 	RT_CACHE_STAT_INC(in_brd);
2006 
2007 local_input:
2008 	rth = dst_alloc(&ipv4_dst_ops);
2009 	if (!rth)
2010 		goto e_nobufs;
2011 
2012 	rth->u.dst.output= ip_rt_bug;
2013 
2014 	atomic_set(&rth->u.dst.__refcnt, 1);
2015 	rth->u.dst.flags= DST_HOST;
2016 	if (in_dev->cnf.no_policy)
2017 		rth->u.dst.flags |= DST_NOPOLICY;
2018 	rth->fl.fl4_dst	= daddr;
2019 	rth->rt_dst	= daddr;
2020 	rth->fl.fl4_tos	= tos;
2021 #ifdef CONFIG_IP_ROUTE_FWMARK
2022 	rth->fl.fl4_fwmark= skb->nfmark;
2023 #endif
2024 	rth->fl.fl4_src	= saddr;
2025 	rth->rt_src	= saddr;
2026 #ifdef CONFIG_NET_CLS_ROUTE
2027 	rth->u.dst.tclassid = itag;
2028 #endif
2029 	rth->rt_iif	=
2030 	rth->fl.iif	= dev->ifindex;
2031 	rth->u.dst.dev	= &loopback_dev;
2032 	dev_hold(rth->u.dst.dev);
2033 	rth->idev	= in_dev_get(rth->u.dst.dev);
2034 	rth->rt_gateway	= daddr;
2035 	rth->rt_spec_dst= spec_dst;
2036 	rth->u.dst.input= ip_local_deliver;
2037 	rth->rt_flags 	= flags|RTCF_LOCAL;
2038 	if (res.type == RTN_UNREACHABLE) {
2039 		rth->u.dst.input= ip_error;
2040 		rth->u.dst.error= -err;
2041 		rth->rt_flags 	&= ~RTCF_LOCAL;
2042 	}
2043 	rth->rt_type	= res.type;
2044 	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2045 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2046 	goto done;
2047 
2048 no_route:
2049 	RT_CACHE_STAT_INC(in_no_route);
2050 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2051 	res.type = RTN_UNREACHABLE;
2052 	goto local_input;
2053 
2054 	/*
2055 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2056 	 */
2057 martian_destination:
2058 	RT_CACHE_STAT_INC(in_martian_dst);
2059 #ifdef CONFIG_IP_ROUTE_VERBOSE
2060 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2061 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2062 			"%u.%u.%u.%u, dev %s\n",
2063 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2064 #endif
2065 
2066 e_hostunreach:
2067 	err = -EHOSTUNREACH;
2068 	goto done;
2069 
2070 e_inval:
2071 	err = -EINVAL;
2072 	goto done;
2073 
2074 e_nobufs:
2075 	err = -ENOBUFS;
2076 	goto done;
2077 
2078 martian_source:
2079 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2080 	goto e_inval;
2081 }
2082 
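/*
 * Input route lookup.  First search the routing cache under RCU; on a
 * miss, multicast destinations are handled directly and everything else
 * falls back to ip_route_input_slow().
 */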
2083 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2084 		   u8 tos, struct net_device *dev)
2085 {
2086 	struct rtable * rth;
2087 	unsigned	hash;
2088 	int iif = dev->ifindex;
2089 
2090 	tos &= IPTOS_RT_MASK;
2091 	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2092 
2093 	rcu_read_lock();
2094 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2095 	     rth = rcu_dereference(rth->u.rt_next)) {
2096 		if (rth->fl.fl4_dst == daddr &&
2097 		    rth->fl.fl4_src == saddr &&
2098 		    rth->fl.iif == iif &&
2099 		    rth->fl.oif == 0 &&
2100 #ifdef CONFIG_IP_ROUTE_FWMARK
2101 		    rth->fl.fl4_fwmark == skb->nfmark &&
2102 #endif
2103 		    rth->fl.fl4_tos == tos) {
2104 			rth->u.dst.lastuse = jiffies;
2105 			dst_hold(&rth->u.dst);
2106 			rth->u.dst.__use++;
2107 			RT_CACHE_STAT_INC(in_hit);
2108 			rcu_read_unlock();
2109 			skb->dst = (struct dst_entry*)rth;
2110 			return 0;
2111 		}
2112 		RT_CACHE_STAT_INC(in_hlist_search);
2113 	}
2114 	rcu_read_unlock();
2115 
2116 	/* Multicast recognition logic is moved from the route cache to here.
2117 	   The problem was that too many Ethernet cards have broken/missing
2118 	   hardware multicast filters :-( As a result, a host on a multicast
2119 	   network acquires a lot of useless route cache entries, e.g. for
2120 	   SDR messages from all over the world. Now we try to get rid of them.
2121 	   Really, provided the software IP multicast filter is organized
2122 	   reasonably (at least, hashed), it does not result in a slowdown
2123 	   compared with route cache reject entries.
2124 	   Note that multicast routers are not affected, because
2125 	   a route cache entry is created for them eventually.
2126 	 */
2127 	if (MULTICAST(daddr)) {
2128 		struct in_device *in_dev;
2129 
2130 		rcu_read_lock();
2131 		if ((in_dev = __in_dev_get(dev)) != NULL) {
2132 			int our = ip_check_mc(in_dev, daddr, saddr,
2133 				skb->nh.iph->protocol);
2134 			if (our
2135 #ifdef CONFIG_IP_MROUTE
2136 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2137 #endif
2138 			    ) {
2139 				rcu_read_unlock();
2140 				return ip_route_input_mc(skb, daddr, saddr,
2141 							 tos, dev, our);
2142 			}
2143 		}
2144 		rcu_read_unlock();
2145 		return -EINVAL;
2146 	}
2147 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2148 }
2149 
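/*
 * Allocate and fill an output routing cache entry for the given FIB
 * result, classifying the destination (broadcast/multicast/unicast) and
 * selecting the appropriate output and input handlers.
 */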
2150 static inline int __mkroute_output(struct rtable **result,
2151 				   struct fib_result* res,
2152 				   const struct flowi *fl,
2153 				   const struct flowi *oldflp,
2154 				   struct net_device *dev_out,
2155 				   unsigned flags)
2156 {
2157 	struct rtable *rth;
2158 	struct in_device *in_dev;
2159 	u32 tos = RT_FL_TOS(oldflp);
2160 	int err = 0;
2161 
2162 	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2163 		return -EINVAL;
2164 
2165 	if (fl->fl4_dst == 0xFFFFFFFF)
2166 		res->type = RTN_BROADCAST;
2167 	else if (MULTICAST(fl->fl4_dst))
2168 		res->type = RTN_MULTICAST;
2169 	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2170 		return -EINVAL;
2171 
2172 	if (dev_out->flags & IFF_LOOPBACK)
2173 		flags |= RTCF_LOCAL;
2174 
2175 	/* get work reference to inet device */
2176 	in_dev = in_dev_get(dev_out);
2177 	if (!in_dev)
2178 		return -EINVAL;
2179 
2180 	if (res->type == RTN_BROADCAST) {
2181 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2182 		if (res->fi) {
2183 			fib_info_put(res->fi);
2184 			res->fi = NULL;
2185 		}
2186 	} else if (res->type == RTN_MULTICAST) {
2187 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2188 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2189 				 oldflp->proto))
2190 			flags &= ~RTCF_LOCAL;
2191 		/* If a multicast route does not exist, use the
2192 		   default one, but do not gateway in this case.
2193 		   Yes, it is a hack.
2194 		 */
2195 		if (res->fi && res->prefixlen < 4) {
2196 			fib_info_put(res->fi);
2197 			res->fi = NULL;
2198 		}
2199 	}
2200 
2201 
2202 	rth = dst_alloc(&ipv4_dst_ops);
2203 	if (!rth) {
2204 		err = -ENOBUFS;
2205 		goto cleanup;
2206 	}
2207 
2208 	atomic_set(&rth->u.dst.__refcnt, 1);
2209 	rth->u.dst.flags= DST_HOST;
2210 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2211 	if (res->fi) {
2212 		rth->rt_multipath_alg = res->fi->fib_mp_alg;
2213 		if (res->fi->fib_nhs > 1)
2214 			rth->u.dst.flags |= DST_BALANCED;
2215 	}
2216 #endif
2217 	if (in_dev->cnf.no_xfrm)
2218 		rth->u.dst.flags |= DST_NOXFRM;
2219 	if (in_dev->cnf.no_policy)
2220 		rth->u.dst.flags |= DST_NOPOLICY;
2221 
2222 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2223 	rth->fl.fl4_tos	= tos;
2224 	rth->fl.fl4_src	= oldflp->fl4_src;
2225 	rth->fl.oif	= oldflp->oif;
2226 #ifdef CONFIG_IP_ROUTE_FWMARK
2227 	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2228 #endif
2229 	rth->rt_dst	= fl->fl4_dst;
2230 	rth->rt_src	= fl->fl4_src;
2231 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2232 	/* get references to the devices that are to be held by the routing
2233 	   cache entry */
2234 	rth->u.dst.dev	= dev_out;
2235 	dev_hold(dev_out);
2236 	rth->idev	= in_dev_get(dev_out);
2237 	rth->rt_gateway = fl->fl4_dst;
2238 	rth->rt_spec_dst= fl->fl4_src;
2239 
2240 	rth->u.dst.output=ip_output;
2241 
2242 	RT_CACHE_STAT_INC(out_slow_tot);
2243 
2244 	if (flags & RTCF_LOCAL) {
2245 		rth->u.dst.input = ip_local_deliver;
2246 		rth->rt_spec_dst = fl->fl4_dst;
2247 	}
2248 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2249 		rth->rt_spec_dst = fl->fl4_src;
2250 		if (flags & RTCF_LOCAL &&
2251 		    !(dev_out->flags & IFF_LOOPBACK)) {
2252 			rth->u.dst.output = ip_mc_output;
2253 			RT_CACHE_STAT_INC(out_slow_mc);
2254 		}
2255 #ifdef CONFIG_IP_MROUTE
2256 		if (res->type == RTN_MULTICAST) {
2257 			if (IN_DEV_MFORWARD(in_dev) &&
2258 			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2259 				rth->u.dst.input = ip_mr_input;
2260 				rth->u.dst.output = ip_mc_output;
2261 			}
2262 		}
2263 #endif
2264 	}
2265 
2266 	rt_set_nexthop(rth, res, 0);
2267 
2268 	rth->rt_flags = flags;
2269 
2270 	*result = rth;
2271  cleanup:
2272 	/* release work reference to inet device */
2273 	in_dev_put(in_dev);
2274 
2275 	return err;
2276 }
2277 
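/*
 * Create a single output cache entry and intern it into the hash table.
 */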
2278 static inline int ip_mkroute_output_def(struct rtable **rp,
2279 					struct fib_result* res,
2280 					const struct flowi *fl,
2281 					const struct flowi *oldflp,
2282 					struct net_device *dev_out,
2283 					unsigned flags)
2284 {
2285 	struct rtable *rth = NULL;
2286 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2287 	unsigned hash;
2288 	if (err == 0) {
2289 		u32 tos = RT_FL_TOS(oldflp);
2290 
2291 		hash = rt_hash_code(oldflp->fl4_dst,
2292 				    oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2293 		err = rt_intern_hash(hash, rth, rp);
2294 	}
2295 
2296 	return err;
2297 }
2298 
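/*
 * Output counterpart of ip_mkroute_input(): with CONFIG_IP_ROUTE_MULTIPATH_CACHED
 * one cache entry is interned per next hop of a multipath route; otherwise
 * the default variant above is used.
 */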
2299 static inline int ip_mkroute_output(struct rtable** rp,
2300 				    struct fib_result* res,
2301 				    const struct flowi *fl,
2302 				    const struct flowi *oldflp,
2303 				    struct net_device *dev_out,
2304 				    unsigned flags)
2305 {
2306 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2307 	u32 tos = RT_FL_TOS(oldflp);
2308 	unsigned char hop;
2309 	unsigned hash;
2310 	int err = -EINVAL;
2311 	struct rtable *rth = NULL;
2312 
2313 	if (res->fi && res->fi->fib_nhs > 1) {
2314 		unsigned char hopcount = res->fi->fib_nhs;
2315 
2316 		for (hop = 0; hop < hopcount; hop++) {
2317 			struct net_device *dev2nexthop;
2318 
2319 			res->nh_sel = hop;
2320 
2321 			/* hold a work reference to the output device */
2322 			dev2nexthop = FIB_RES_DEV(*res);
2323 			dev_hold(dev2nexthop);
2324 
2325 			/* put reference to previous result */
2326 			if (hop)
2327 				ip_rt_put(*rp);
2328 
2329 			err = __mkroute_output(&rth, res, fl, oldflp,
2330 					       dev2nexthop, flags);
2331 
2332 			if (err != 0)
2333 				goto cleanup;
2334 
2335 			hash = rt_hash_code(oldflp->fl4_dst,
2336 					    oldflp->fl4_src ^
2337 					    (oldflp->oif << 5), tos);
2338 			err = rt_intern_hash(hash, rth, rp);
2339 
2340 			/* forward hop information to multipath impl. */
2341 			multipath_set_nhinfo(rth,
2342 					     FIB_RES_NETWORK(*res),
2343 					     FIB_RES_NETMASK(*res),
2344 					     res->prefixlen,
2345 					     &FIB_RES_NH(*res));
2346 		cleanup:
2347 			/* release work reference to output device */
2348 			dev_put(dev2nexthop);
2349 
2350 			if (err != 0)
2351 				return err;
2352 		}
2353 		return err;
2354 	} else {
2355 		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2356 					     flags);
2357 	}
2358 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2359 	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2360 #endif
2361 }
2362 
2363 /*
2364  * Major route resolver routine.
2365  */
2366 
2367 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2368 {
2369 	u32 tos	= RT_FL_TOS(oldflp);
2370 	struct flowi fl = { .nl_u = { .ip4_u =
2371 				      { .daddr = oldflp->fl4_dst,
2372 					.saddr = oldflp->fl4_src,
2373 					.tos = tos & IPTOS_RT_MASK,
2374 					.scope = ((tos & RTO_ONLINK) ?
2375 						  RT_SCOPE_LINK :
2376 						  RT_SCOPE_UNIVERSE),
2377 #ifdef CONFIG_IP_ROUTE_FWMARK
2378 					.fwmark = oldflp->fl4_fwmark
2379 #endif
2380 				      } },
2381 			    .iif = loopback_dev.ifindex,
2382 			    .oif = oldflp->oif };
2383 	struct fib_result res;
2384 	unsigned flags = 0;
2385 	struct net_device *dev_out = NULL;
2386 	int free_res = 0;
2387 	int err;
2388 
2389 
2390 	res.fi		= NULL;
2391 #ifdef CONFIG_IP_MULTIPLE_TABLES
2392 	res.r		= NULL;
2393 #endif
2394 
2395 	if (oldflp->fl4_src) {
2396 		err = -EINVAL;
2397 		if (MULTICAST(oldflp->fl4_src) ||
2398 		    BADCLASS(oldflp->fl4_src) ||
2399 		    ZERONET(oldflp->fl4_src))
2400 			goto out;
2401 
2402 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2403 		dev_out = ip_dev_find(oldflp->fl4_src);
2404 		if (dev_out == NULL)
2405 			goto out;
2406 
2407 		/* I removed the check for oif == dev_out->oif here.
2408 		   It was wrong for two reasons:
2409 		   1. ip_dev_find(saddr) can return the wrong iface if saddr
2410 		      is assigned to multiple interfaces.
2411 		   2. Moreover, we are allowed to send packets with a saddr
2412 		      of another iface. --ANK
2413 		 */
2414 
2415 		if (oldflp->oif == 0
2416 		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2417 			/* Special hack: the user can direct multicasts
2418 			   and limited broadcast via the necessary interface
2419 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2420 			   This hack is not just for fun, it allows
2421 			   vic, vat and friends to work.
2422 			   They bind a socket to the loopback device, set ttl to zero
2423 			   and expect that it will work.
2424 			   From the viewpoint of the routing cache they are broken,
2425 			   because we are not allowed to build a multicast path
2426 			   with a loopback source address (the routing cache
2427 			   cannot know that ttl is zero, so the packet
2428 			   will not leave this host and the route is valid).
2429 			   Luckily, this hack is a good workaround.
2430 			 */
2431 
2432 			fl.oif = dev_out->ifindex;
2433 			goto make_route;
2434 		}
2435 		if (dev_out)
2436 			dev_put(dev_out);
2437 		dev_out = NULL;
2438 	}
2439 
2440 
2441 	if (oldflp->oif) {
2442 		dev_out = dev_get_by_index(oldflp->oif);
2443 		err = -ENODEV;
2444 		if (dev_out == NULL)
2445 			goto out;
2446 		if (__in_dev_get(dev_out) == NULL) {
2447 			dev_put(dev_out);
2448 			goto out;	/* Wrong error code */
2449 		}
2450 
2451 		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2452 			if (!fl.fl4_src)
2453 				fl.fl4_src = inet_select_addr(dev_out, 0,
2454 							      RT_SCOPE_LINK);
2455 			goto make_route;
2456 		}
2457 		if (!fl.fl4_src) {
2458 			if (MULTICAST(oldflp->fl4_dst))
2459 				fl.fl4_src = inet_select_addr(dev_out, 0,
2460 							      fl.fl4_scope);
2461 			else if (!oldflp->fl4_dst)
2462 				fl.fl4_src = inet_select_addr(dev_out, 0,
2463 							      RT_SCOPE_HOST);
2464 		}
2465 	}
2466 
2467 	if (!fl.fl4_dst) {
2468 		fl.fl4_dst = fl.fl4_src;
2469 		if (!fl.fl4_dst)
2470 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2471 		if (dev_out)
2472 			dev_put(dev_out);
2473 		dev_out = &loopback_dev;
2474 		dev_hold(dev_out);
2475 		fl.oif = loopback_dev.ifindex;
2476 		res.type = RTN_LOCAL;
2477 		flags |= RTCF_LOCAL;
2478 		goto make_route;
2479 	}
2480 
2481 	if (fib_lookup(&fl, &res)) {
2482 		res.fi = NULL;
2483 		if (oldflp->oif) {
2484 			/* Apparently, the routing tables are wrong. Assume
2485 			   that the destination is on-link.
2486 
2487 			   WHY? DW.
2488 			   Because we are allowed to send to an iface
2489 			   even if it has NO routes and NO assigned
2490 			   addresses. When oif is specified, routing
2491 			   tables are looked up with only one purpose:
2492 			   to catch whether the destination is gatewayed
2493 			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2494 			   we send the packet, ignoring both routing tables
2495 			   and ifaddr state. --ANK
2496 
2497 
2498 			   We could make it even if oif is unknown,
2499 			   likely IPv6, but we do not.
2500 			 */
2501 
2502 			if (fl.fl4_src == 0)
2503 				fl.fl4_src = inet_select_addr(dev_out, 0,
2504 							      RT_SCOPE_LINK);
2505 			res.type = RTN_UNICAST;
2506 			goto make_route;
2507 		}
2508 		if (dev_out)
2509 			dev_put(dev_out);
2510 		err = -ENETUNREACH;
2511 		goto out;
2512 	}
2513 	free_res = 1;
2514 
2515 	if (res.type == RTN_LOCAL) {
2516 		if (!fl.fl4_src)
2517 			fl.fl4_src = fl.fl4_dst;
2518 		if (dev_out)
2519 			dev_put(dev_out);
2520 		dev_out = &loopback_dev;
2521 		dev_hold(dev_out);
2522 		fl.oif = dev_out->ifindex;
2523 		if (res.fi)
2524 			fib_info_put(res.fi);
2525 		res.fi = NULL;
2526 		flags |= RTCF_LOCAL;
2527 		goto make_route;
2528 	}
2529 
2530 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2531 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2532 		fib_select_multipath(&fl, &res);
2533 	else
2534 #endif
2535 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2536 		fib_select_default(&fl, &res);
2537 
2538 	if (!fl.fl4_src)
2539 		fl.fl4_src = FIB_RES_PREFSRC(res);
2540 
2541 	if (dev_out)
2542 		dev_put(dev_out);
2543 	dev_out = FIB_RES_DEV(res);
2544 	dev_hold(dev_out);
2545 	fl.oif = dev_out->ifindex;
2546 
2547 
2548 make_route:
2549 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2550 
2551 
2552 	if (free_res)
2553 		fib_res_put(&res);
2554 	if (dev_out)
2555 		dev_put(dev_out);
2556 out:	return err;
2557 }
2558 
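/*
 * Output route lookup.  Search the routing cache (with bottom halves
 * disabled) and fall back to ip_route_output_slow() on a miss.
 */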
2559 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2560 {
2561 	unsigned hash;
2562 	struct rtable *rth;
2563 
2564 	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2565 
2566 	rcu_read_lock_bh();
2567 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2568 		rth = rcu_dereference(rth->u.rt_next)) {
2569 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2570 		    rth->fl.fl4_src == flp->fl4_src &&
2571 		    rth->fl.iif == 0 &&
2572 		    rth->fl.oif == flp->oif &&
2573 #ifdef CONFIG_IP_ROUTE_FWMARK
2574 		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2575 #endif
2576 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2577 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2578 
2579 			/* check for multipath routes and choose one if
2580 			 * necessary
2581 			 */
2582 			if (multipath_select_route(flp, rth, rp)) {
2583 				dst_hold(&(*rp)->u.dst);
2584 				RT_CACHE_STAT_INC(out_hit);
2585 				rcu_read_unlock_bh();
2586 				return 0;
2587 			}
2588 
2589 			rth->u.dst.lastuse = jiffies;
2590 			dst_hold(&rth->u.dst);
2591 			rth->u.dst.__use++;
2592 			RT_CACHE_STAT_INC(out_hit);
2593 			rcu_read_unlock_bh();
2594 			*rp = rth;
2595 			return 0;
2596 		}
2597 		RT_CACHE_STAT_INC(out_hlist_search);
2598 	}
2599 	rcu_read_unlock_bh();
2600 
2601 	return ip_route_output_slow(rp, flp);
2602 }
2603 
2604 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2605 
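/*
 * Like __ip_route_output_key(), but also fills in missing flow addresses
 * and, when a protocol is given, passes the result through xfrm_lookup()
 * so that IPsec policy is applied.
 */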
2606 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2607 {
2608 	int err;
2609 
2610 	if ((err = __ip_route_output_key(rp, flp)) != 0)
2611 		return err;
2612 
2613 	if (flp->proto) {
2614 		if (!flp->fl4_src)
2615 			flp->fl4_src = (*rp)->rt_src;
2616 		if (!flp->fl4_dst)
2617 			flp->fl4_dst = (*rp)->rt_dst;
2618 		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2619 	}
2620 
2621 	return 0;
2622 }
2623 
2624 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2625 
2626 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2627 {
2628 	return ip_route_output_flow(rp, flp, NULL, 0);
2629 }
2630 
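/*
 * Fill a netlink RTM_NEWROUTE message describing the routing cache entry
 * attached to skb->dst.
 */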
2631 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2632 			int nowait, unsigned int flags)
2633 {
2634 	struct rtable *rt = (struct rtable*)skb->dst;
2635 	struct rtmsg *r;
2636 	struct nlmsghdr  *nlh;
2637 	unsigned char	 *b = skb->tail;
2638 	struct rta_cacheinfo ci;
2639 #ifdef CONFIG_IP_MROUTE
2640 	struct rtattr *eptr;
2641 #endif
2642 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2643 	r = NLMSG_DATA(nlh);
2644 	r->rtm_family	 = AF_INET;
2645 	r->rtm_dst_len	= 32;
2646 	r->rtm_src_len	= 0;
2647 	r->rtm_tos	= rt->fl.fl4_tos;
2648 	r->rtm_table	= RT_TABLE_MAIN;
2649 	r->rtm_type	= rt->rt_type;
2650 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2651 	r->rtm_protocol = RTPROT_UNSPEC;
2652 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2653 	if (rt->rt_flags & RTCF_NOTIFY)
2654 		r->rtm_flags |= RTM_F_NOTIFY;
2655 	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2656 	if (rt->fl.fl4_src) {
2657 		r->rtm_src_len = 32;
2658 		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2659 	}
2660 	if (rt->u.dst.dev)
2661 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2662 #ifdef CONFIG_NET_CLS_ROUTE
2663 	if (rt->u.dst.tclassid)
2664 		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2665 #endif
2666 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2667 	if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2668 		__u32 alg = rt->rt_multipath_alg;
2669 
2670 		RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2671 	}
2672 #endif
2673 	if (rt->fl.iif)
2674 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2675 	else if (rt->rt_src != rt->fl.fl4_src)
2676 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2677 	if (rt->rt_dst != rt->rt_gateway)
2678 		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2679 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2680 		goto rtattr_failure;
2681 	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2682 	ci.rta_used	= rt->u.dst.__use;
2683 	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
2684 	if (rt->u.dst.expires)
2685 		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2686 	else
2687 		ci.rta_expires = 0;
2688 	ci.rta_error	= rt->u.dst.error;
2689 	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
2690 	if (rt->peer) {
2691 		ci.rta_id = rt->peer->ip_id_count;
2692 		if (rt->peer->tcp_ts_stamp) {
2693 			ci.rta_ts = rt->peer->tcp_ts;
2694 			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2695 		}
2696 	}
2697 #ifdef CONFIG_IP_MROUTE
2698 	eptr = (struct rtattr*)skb->tail;
2699 #endif
2700 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2701 	if (rt->fl.iif) {
2702 #ifdef CONFIG_IP_MROUTE
2703 		u32 dst = rt->rt_dst;
2704 
2705 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2706 		    ipv4_devconf.mc_forwarding) {
2707 			int err = ipmr_get_route(skb, r, nowait);
2708 			if (err <= 0) {
2709 				if (!nowait) {
2710 					if (err == 0)
2711 						return 0;
2712 					goto nlmsg_failure;
2713 				} else {
2714 					if (err == -EMSGSIZE)
2715 						goto nlmsg_failure;
2716 					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2717 				}
2718 			}
2719 		} else
2720 #endif
2721 			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2722 	}
2723 
2724 	nlh->nlmsg_len = skb->tail - b;
2725 	return skb->len;
2726 
2727 nlmsg_failure:
2728 rtattr_failure:
2729 	skb_trim(skb, b - skb->data);
2730 	return -1;
2731 }
2732 
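/*
 * Handle an RTM_GETROUTE request: perform an input or output route lookup
 * for the given addresses and reply with an RTM_NEWROUTE message built by
 * rt_fill_info().
 */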
2733 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2734 {
2735 	struct rtattr **rta = arg;
2736 	struct rtmsg *rtm = NLMSG_DATA(nlh);
2737 	struct rtable *rt = NULL;
2738 	u32 dst = 0;
2739 	u32 src = 0;
2740 	int iif = 0;
2741 	int err = -ENOBUFS;
2742 	struct sk_buff *skb;
2743 
2744 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2745 	if (!skb)
2746 		goto out;
2747 
2748 	/* Reserve room for dummy headers; this skb can pass
2749 	   through a good chunk of the routing engine.
2750 	 */
2751 	skb->mac.raw = skb->data;
2752 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2753 
2754 	if (rta[RTA_SRC - 1])
2755 		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2756 	if (rta[RTA_DST - 1])
2757 		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2758 	if (rta[RTA_IIF - 1])
2759 		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2760 
2761 	if (iif) {
2762 		struct net_device *dev = __dev_get_by_index(iif);
2763 		err = -ENODEV;
2764 		if (!dev)
2765 			goto out_free;
2766 		skb->protocol	= htons(ETH_P_IP);
2767 		skb->dev	= dev;
2768 		local_bh_disable();
2769 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2770 		local_bh_enable();
2771 		rt = (struct rtable*)skb->dst;
2772 		if (!err && rt->u.dst.error)
2773 			err = -rt->u.dst.error;
2774 	} else {
2775 		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2776 							 .saddr = src,
2777 							 .tos = rtm->rtm_tos } } };
2778 		int oif = 0;
2779 		if (rta[RTA_OIF - 1])
2780 			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2781 		fl.oif = oif;
2782 		err = ip_route_output_key(&rt, &fl);
2783 	}
2784 	if (err)
2785 		goto out_free;
2786 
2787 	skb->dst = &rt->u.dst;
2788 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2789 		rt->rt_flags |= RTCF_NOTIFY;
2790 
2791 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2792 
2793 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2794 				RTM_NEWROUTE, 0, 0);
2795 	if (!err)
2796 		goto out_free;
2797 	if (err < 0) {
2798 		err = -EMSGSIZE;
2799 		goto out_free;
2800 	}
2801 
2802 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2803 	if (err > 0)
2804 		err = 0;
2805 out:	return err;
2806 
2807 out_free:
2808 	kfree_skb(skb);
2809 	goto out;
2810 }
2811 
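/*
 * Dump the whole routing cache over netlink, resuming from the hash
 * bucket and chain index saved in cb->args[].
 */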
2812 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2813 {
2814 	struct rtable *rt;
2815 	int h, s_h;
2816 	int idx, s_idx;
2817 
2818 	s_h = cb->args[0];
2819 	s_idx = idx = cb->args[1];
2820 	for (h = 0; h <= rt_hash_mask; h++) {
2821 		if (h < s_h) continue;
2822 		if (h > s_h)
2823 			s_idx = 0;
2824 		rcu_read_lock_bh();
2825 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2826 		     rt = rcu_dereference(rt->u.rt_next), idx++) {
2827 			if (idx < s_idx)
2828 				continue;
2829 			skb->dst = dst_clone(&rt->u.dst);
2830 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2831 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2832 					 1, NLM_F_MULTI) <= 0) {
2833 				dst_release(xchg(&skb->dst, NULL));
2834 				rcu_read_unlock_bh();
2835 				goto done;
2836 			}
2837 			dst_release(xchg(&skb->dst, NULL));
2838 		}
2839 		rcu_read_unlock_bh();
2840 	}
2841 
2842 done:
2843 	cb->args[0] = h;
2844 	cb->args[1] = idx;
2845 	return skb->len;
2846 }
2847 
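/*
 * Called when the multicast configuration of a device changes; simply
 * flushes the routing cache.
 */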
2848 void ip_rt_multicast_event(struct in_device *in_dev)
2849 {
2850 	rt_cache_flush(0);
2851 }
2852 
2853 #ifdef CONFIG_SYSCTL
2854 static int flush_delay;
2855 
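/*
 * proc handler for the write-only "flush" sysctl below: writing a delay
 * value triggers rt_cache_flush() with that delay.
 */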
2856 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2857 					struct file *filp, void __user *buffer,
2858 					size_t *lenp, loff_t *ppos)
2859 {
2860 	if (write) {
2861 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2862 		rt_cache_flush(flush_delay);
2863 		return 0;
2864 	}
2865 
2866 	return -EINVAL;
2867 }
2868 
2869 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2870 						int __user *name,
2871 						int nlen,
2872 						void __user *oldval,
2873 						size_t __user *oldlenp,
2874 						void __user *newval,
2875 						size_t newlen,
2876 						void **context)
2877 {
2878 	int delay;
2879 	if (newlen != sizeof(int))
2880 		return -EINVAL;
2881 	if (get_user(delay, (int __user *)newval))
2882 		return -EFAULT;
2883 	rt_cache_flush(delay);
2884 	return 0;
2885 }
2886 
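/* sysctl knobs exported under /proc/sys/net/ipv4/route/ */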
2887 ctl_table ipv4_route_table[] = {
2888 	{
2889 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2890 		.procname	= "flush",
2891 		.data		= &flush_delay,
2892 		.maxlen		= sizeof(int),
2893 		.mode		= 0200,
2894 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2895 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2896 	},
2897 	{
2898 		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2899 		.procname	= "min_delay",
2900 		.data		= &ip_rt_min_delay,
2901 		.maxlen		= sizeof(int),
2902 		.mode		= 0644,
2903 		.proc_handler	= &proc_dointvec_jiffies,
2904 		.strategy	= &sysctl_jiffies,
2905 	},
2906 	{
2907 		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2908 		.procname	= "max_delay",
2909 		.data		= &ip_rt_max_delay,
2910 		.maxlen		= sizeof(int),
2911 		.mode		= 0644,
2912 		.proc_handler	= &proc_dointvec_jiffies,
2913 		.strategy	= &sysctl_jiffies,
2914 	},
2915 	{
2916 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2917 		.procname	= "gc_thresh",
2918 		.data		= &ipv4_dst_ops.gc_thresh,
2919 		.maxlen		= sizeof(int),
2920 		.mode		= 0644,
2921 		.proc_handler	= &proc_dointvec,
2922 	},
2923 	{
2924 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2925 		.procname	= "max_size",
2926 		.data		= &ip_rt_max_size,
2927 		.maxlen		= sizeof(int),
2928 		.mode		= 0644,
2929 		.proc_handler	= &proc_dointvec,
2930 	},
2931 	{
2932 		/* Deprecated. Use gc_min_interval_ms. */
2933 
2934 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2935 		.procname	= "gc_min_interval",
2936 		.data		= &ip_rt_gc_min_interval,
2937 		.maxlen		= sizeof(int),
2938 		.mode		= 0644,
2939 		.proc_handler	= &proc_dointvec_jiffies,
2940 		.strategy	= &sysctl_jiffies,
2941 	},
2942 	{
2943 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2944 		.procname	= "gc_min_interval_ms",
2945 		.data		= &ip_rt_gc_min_interval,
2946 		.maxlen		= sizeof(int),
2947 		.mode		= 0644,
2948 		.proc_handler	= &proc_dointvec_ms_jiffies,
2949 		.strategy	= &sysctl_ms_jiffies,
2950 	},
2951 	{
2952 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2953 		.procname	= "gc_timeout",
2954 		.data		= &ip_rt_gc_timeout,
2955 		.maxlen		= sizeof(int),
2956 		.mode		= 0644,
2957 		.proc_handler	= &proc_dointvec_jiffies,
2958 		.strategy	= &sysctl_jiffies,
2959 	},
2960 	{
2961 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2962 		.procname	= "gc_interval",
2963 		.data		= &ip_rt_gc_interval,
2964 		.maxlen		= sizeof(int),
2965 		.mode		= 0644,
2966 		.proc_handler	= &proc_dointvec_jiffies,
2967 		.strategy	= &sysctl_jiffies,
2968 	},
2969 	{
2970 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2971 		.procname	= "redirect_load",
2972 		.data		= &ip_rt_redirect_load,
2973 		.maxlen		= sizeof(int),
2974 		.mode		= 0644,
2975 		.proc_handler	= &proc_dointvec,
2976 	},
2977 	{
2978 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2979 		.procname	= "redirect_number",
2980 		.data		= &ip_rt_redirect_number,
2981 		.maxlen		= sizeof(int),
2982 		.mode		= 0644,
2983 		.proc_handler	= &proc_dointvec,
2984 	},
2985 	{
2986 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2987 		.procname	= "redirect_silence",
2988 		.data		= &ip_rt_redirect_silence,
2989 		.maxlen		= sizeof(int),
2990 		.mode		= 0644,
2991 		.proc_handler	= &proc_dointvec,
2992 	},
2993 	{
2994 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2995 		.procname	= "error_cost",
2996 		.data		= &ip_rt_error_cost,
2997 		.maxlen		= sizeof(int),
2998 		.mode		= 0644,
2999 		.proc_handler	= &proc_dointvec,
3000 	},
3001 	{
3002 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3003 		.procname	= "error_burst",
3004 		.data		= &ip_rt_error_burst,
3005 		.maxlen		= sizeof(int),
3006 		.mode		= 0644,
3007 		.proc_handler	= &proc_dointvec,
3008 	},
3009 	{
3010 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3011 		.procname	= "gc_elasticity",
3012 		.data		= &ip_rt_gc_elasticity,
3013 		.maxlen		= sizeof(int),
3014 		.mode		= 0644,
3015 		.proc_handler	= &proc_dointvec,
3016 	},
3017 	{
3018 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3019 		.procname	= "mtu_expires",
3020 		.data		= &ip_rt_mtu_expires,
3021 		.maxlen		= sizeof(int),
3022 		.mode		= 0644,
3023 		.proc_handler	= &proc_dointvec_jiffies,
3024 		.strategy	= &sysctl_jiffies,
3025 	},
3026 	{
3027 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3028 		.procname	= "min_pmtu",
3029 		.data		= &ip_rt_min_pmtu,
3030 		.maxlen		= sizeof(int),
3031 		.mode		= 0644,
3032 		.proc_handler	= &proc_dointvec,
3033 	},
3034 	{
3035 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3036 		.procname	= "min_adv_mss",
3037 		.data		= &ip_rt_min_advmss,
3038 		.maxlen		= sizeof(int),
3039 		.mode		= 0644,
3040 		.proc_handler	= &proc_dointvec,
3041 	},
3042 	{
3043 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3044 		.procname	= "secret_interval",
3045 		.data		= &ip_rt_secret_interval,
3046 		.maxlen		= sizeof(int),
3047 		.mode		= 0644,
3048 		.proc_handler	= &proc_dointvec_jiffies,
3049 		.strategy	= &sysctl_jiffies,
3050 	},
3051 	{ .ctl_name = 0 }
3052 };
3053 #endif
3054 
3055 #ifdef CONFIG_NET_CLS_ROUTE
3056 struct ip_rt_acct *ip_rt_acct;
3057 
3058 /* This code sucks.  But you should have seen it before! --RR */
3059 
3060 /* IP route accounting ptr for this logical cpu number. */
3061 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3062 
3063 #ifdef CONFIG_PROC_FS
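/*
 * proc read handler for /proc/net/rt_acct: accumulates the per-CPU route
 * accounting counters into the caller's buffer, one 32-bit word at a time.
 */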
3064 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3065 			   int length, int *eof, void *data)
3066 {
3067 	unsigned int i;
3068 
3069 	if ((offset & 3) || (length & 3))
3070 		return -EIO;
3071 
3072 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
3073 		*eof = 1;
3074 		return 0;
3075 	}
3076 
3077 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3078 		length = sizeof(struct ip_rt_acct) * 256 - offset;
3079 		*eof = 1;
3080 	}
3081 
3082 	offset /= sizeof(u32);
3083 
3084 	if (length > 0) {
3085 		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3086 		u32 *dst = (u32 *) buffer;
3087 
3088 		/* Copy first cpu. */
3089 		*start = buffer;
3090 		memcpy(dst, src, length);
3091 
3092 		/* Add the other cpus in, one int at a time */
3093 		for_each_cpu(i) {
3094 			unsigned int j;
3095 
3096 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3097 
3098 			for (j = 0; j < length/4; j++)
3099 				dst[j] += src[j];
3100 		}
3101 	}
3102 	return length;
3103 }
3104 #endif /* CONFIG_PROC_FS */
3105 #endif /* CONFIG_NET_CLS_ROUTE */
3106 
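/* "rhash_entries=" boot parameter: overrides the size of the route cache hash table. */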
3107 static __initdata unsigned long rhash_entries;
3108 static int __init set_rhash_entries(char *str)
3109 {
3110 	if (!str)
3111 		return 0;
3112 	rhash_entries = simple_strtoul(str, &str, 0);
3113 	return 1;
3114 }
3115 __setup("rhash_entries=", set_rhash_entries);
3116 
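/*
 * One-time initialization: seed the hash secret, create the dst slab and
 * the route cache hash table, start the flush/expire/secret timers and
 * register the /proc entries.
 */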
3117 int __init ip_rt_init(void)
3118 {
3119 	int rc = 0;
3120 
3121 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3122 			     (jiffies ^ (jiffies >> 7)));
3123 
3124 #ifdef CONFIG_NET_CLS_ROUTE
3125 	{
3126 	int order;
3127 	for (order = 0;
3128 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3129 		/* NOTHING */;
3130 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3131 	if (!ip_rt_acct)
3132 		panic("IP: failed to allocate ip_rt_acct\n");
3133 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
3134 	}
3135 #endif
3136 
3137 	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3138 						     sizeof(struct rtable),
3139 						     0, SLAB_HWCACHE_ALIGN,
3140 						     NULL, NULL);
3141 
3142 	if (!ipv4_dst_ops.kmem_cachep)
3143 		panic("IP: failed to allocate ip_dst_cache\n");
3144 
3145 	rt_hash_table = (struct rt_hash_bucket *)
3146 		alloc_large_system_hash("IP route cache",
3147 					sizeof(struct rt_hash_bucket),
3148 					rhash_entries,
3149 					(num_physpages >= 128 * 1024) ?
3150 						(27 - PAGE_SHIFT) :
3151 						(29 - PAGE_SHIFT),
3152 					HASH_HIGHMEM,
3153 					&rt_hash_log,
3154 					&rt_hash_mask,
3155 					0);
3156 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3157 	rt_hash_lock_init();
3158 
3159 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3160 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3161 
3162 	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3163 	if (!rt_cache_stat)
3164 		return -ENOMEM;
3165 
3166 	devinet_init();
3167 	ip_fib_init();
3168 
3169 	init_timer(&rt_flush_timer);
3170 	rt_flush_timer.function = rt_run_flush;
3171 	init_timer(&rt_periodic_timer);
3172 	rt_periodic_timer.function = rt_check_expire;
3173 	init_timer(&rt_secret_timer);
3174 	rt_secret_timer.function = rt_secret_rebuild;
3175 
3176 	/* All the timers started at system startup tend
3177 	   to synchronize. Perturb them a bit.
3178 	 */
3179 	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3180 					ip_rt_gc_interval;
3181 	add_timer(&rt_periodic_timer);
3182 
3183 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3184 		ip_rt_secret_interval;
3185 	add_timer(&rt_secret_timer);
3186 
3187 #ifdef CONFIG_PROC_FS
3188 	{
3189 	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3190 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3191 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3192 			    		     proc_net_stat))) {
3193 		free_percpu(rt_cache_stat);
3194 		return -ENOMEM;
3195 	}
3196 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3197 	}
3198 #ifdef CONFIG_NET_CLS_ROUTE
3199 	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3200 #endif
3201 #endif
3202 #ifdef CONFIG_XFRM
3203 	xfrm_init();
3204 	xfrm4_init();
3205 #endif
3206 	return rc;
3207 }
3208 
3209 EXPORT_SYMBOL(__ip_select_ident);
3210 EXPORT_SYMBOL(ip_route_input);
3211 EXPORT_SYMBOL(ip_route_output_key);
3212