xref: /openbmc/linux/net/ipv4/route.c (revision d5cb9783536a41df9f9cba5b0a1d78047ed787f7)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/config.h>
66 #include <linux/module.h>
67 #include <asm/uaccess.h>
68 #include <asm/system.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/sched.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/rtnetlink.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/ip_mp_alg.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 
110 #define RT_FL_TOS(oldflp) \
111     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
112 
113 #define IP_MAX_MTU	0xFFF0
114 
115 #define RT_GC_TIMEOUT (300*HZ)
116 
117 static int ip_rt_min_delay		= 2 * HZ;
118 static int ip_rt_max_delay		= 10 * HZ;
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval		= 60 * HZ;
122 static int ip_rt_gc_min_interval	= HZ / 2;
123 static int ip_rt_redirect_number	= 9;
124 static int ip_rt_redirect_load		= HZ / 50;
125 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost		= HZ;
127 static int ip_rt_error_burst		= 5 * HZ;
128 static int ip_rt_gc_elasticity		= 8;
129 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
130 static int ip_rt_min_pmtu		= 512 + 20 + 20;
131 static int ip_rt_min_advmss		= 256;
132 static int ip_rt_secret_interval	= 10 * 60 * HZ;
133 static unsigned long rt_deadline;
134 
135 #define RTprint(a...)	printk(KERN_DEBUG a)
136 
137 static struct timer_list rt_flush_timer;
138 static struct timer_list rt_periodic_timer;
139 static struct timer_list rt_secret_timer;
140 
141 /*
142  *	Interface to generic destination cache.
143  */
144 
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void		 ipv4_dst_destroy(struct dst_entry *dst);
147 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
148 					 struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void		 ipv4_link_failure(struct sk_buff *skb);
151 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(void);
153 
154 
155 static struct dst_ops ipv4_dst_ops = {
156 	.family =		AF_INET,
157 	.protocol =		__constant_htons(ETH_P_IP),
158 	.gc =			rt_garbage_collect,
159 	.check =		ipv4_dst_check,
160 	.destroy =		ipv4_dst_destroy,
161 	.ifdown =		ipv4_dst_ifdown,
162 	.negative_advice =	ipv4_negative_advice,
163 	.link_failure =		ipv4_link_failure,
164 	.update_pmtu =		ip_rt_update_pmtu,
165 	.entry_size =		sizeof(struct rtable),
166 };
167 
168 #define ECN_OR_COST(class)	TC_PRIO_##class
169 
170 __u8 ip_tos2prio[16] = {
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(FILLER),
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(BESTEFFORT),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK)
187 };
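/*
 * Illustrative example: the table above is indexed by the four TOS bits of
 * the IP header.  Assuming the rt_tos2priority() helper in
 * include/net/route.h is defined roughly as
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * a TOS of IPTOS_LOWDELAY (0x10) indexes entry 8 and maps to
 * TC_PRIO_INTERACTIVE.
 */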
188 
189 
190 /*
191  * Route cache.
192  */
193 
194 /* The locking scheme is rather straightforward:
195  *
196  * 1) Read-Copy Update protects the buckets of the central route hash.
197  * 2) Only writers remove entries, and they hold the lock
198  *    as they look at rtable reference counts.
199  * 3) Only readers acquire references to rtable entries,
200  *    they do so with atomic increments and with the
201  *    lock held.
202  */
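/*
 * Illustrative sketch of the pattern this implies (compare rt_cache_get_first()
 * and rt_del() below).  A lookup never takes the bucket lock:
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.rt_next))
 *		if (matches(rth)) {		// hypothetical match predicate
 *			dst_hold(&rth->u.dst);	// atomic refcount increment
 *			break;
 *		}
 *	rcu_read_unlock_bh();
 *
 * while removal unlinks under the per-bucket lock and defers the actual free
 * to an RCU callback (rt_free() -> call_rcu_bh()):
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	*rthp = rth->u.rt_next;
 *	rt_free(rth);
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 */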
203 
204 struct rt_hash_bucket {
205 	struct rtable	*chain;
206 };
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208 /*
209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210  * The size of this table is a power of two and depends on the number of CPUs.
211  */
212 #if NR_CPUS >= 32
213 #define RT_HASH_LOCK_SZ	4096
214 #elif NR_CPUS >= 16
215 #define RT_HASH_LOCK_SZ	2048
216 #elif NR_CPUS >= 8
217 #define RT_HASH_LOCK_SZ	1024
218 #elif NR_CPUS >= 4
219 #define RT_HASH_LOCK_SZ	512
220 #else
221 #define RT_HASH_LOCK_SZ	256
222 #endif
223 
224 static spinlock_t	*rt_hash_locks;
225 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226 # define rt_hash_lock_init()	{ \
227 		int i; \
228 		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 			spin_lock_init(&rt_hash_locks[i]); \
232 		}
233 #else
234 # define rt_hash_lock_addr(slot) NULL
235 # define rt_hash_lock_init()
236 #endif
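/*
 * Illustrative example: because the slot index is masked with
 * (RT_HASH_LOCK_SZ - 1), distinct buckets may share a lock,
 *
 *	spinlock_t *a = rt_hash_lock_addr(5);
 *	spinlock_t *b = rt_hash_lock_addr(5 + RT_HASH_LOCK_SZ);
 *	// a == b
 *
 * trading a little extra contention for a lock table much smaller than one
 * spinlock per hash bucket.
 */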
237 
238 static struct rt_hash_bucket 	*rt_hash_table;
239 static unsigned			rt_hash_mask;
240 static int			rt_hash_log;
241 static unsigned int		rt_hash_rnd;
242 
243 static struct rt_cache_stat *rt_cache_stat;
244 #define RT_CACHE_STAT_INC(field)					  \
245 		(per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
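/*
 * Usage sketch (illustrative): the slow input path bumps its local counter
 * with RT_CACHE_STAT_INC(in_slow_tot); no lock is needed because each CPU
 * only touches its own rt_cache_stat instance.  rt_cpu_seq_show() below
 * prints one row per possible CPU when the rt_cache statistics file under
 * /proc/net/stat is read.
 */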
246 
247 static int rt_intern_hash(unsigned hash, struct rtable *rth,
248 				struct rtable **res);
249 
250 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
251 {
252 	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
253 		& rt_hash_mask);
254 }
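/*
 * Usage sketch (illustrative): callers fold the interface index into the
 * source key before hashing, as the input and output paths below do, e.g.
 *
 *	unsigned hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
 *	rth = rt_hash_table[hash].chain;
 *
 * so that the same addresses seen via different interfaces land in
 * different buckets.
 */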
255 
256 #ifdef CONFIG_PROC_FS
257 struct rt_cache_iter_state {
258 	int bucket;
259 };
260 
261 static struct rtable *rt_cache_get_first(struct seq_file *seq)
262 {
263 	struct rtable *r = NULL;
264 	struct rt_cache_iter_state *st = seq->private;
265 
266 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
267 		rcu_read_lock_bh();
268 		r = rt_hash_table[st->bucket].chain;
269 		if (r)
270 			break;
271 		rcu_read_unlock_bh();
272 	}
273 	return r;
274 }
275 
276 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
277 {
278 	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
279 
280 	r = r->u.rt_next;
281 	while (!r) {
282 		rcu_read_unlock_bh();
283 		if (--st->bucket < 0)
284 			break;
285 		rcu_read_lock_bh();
286 		r = rt_hash_table[st->bucket].chain;
287 	}
288 	return r;
289 }
290 
291 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
292 {
293 	struct rtable *r = rt_cache_get_first(seq);
294 
295 	if (r)
296 		while (pos && (r = rt_cache_get_next(seq, r)))
297 			--pos;
298 	return pos ? NULL : r;
299 }
300 
301 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
302 {
303 	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
304 }
305 
306 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
307 {
308 	struct rtable *r = NULL;
309 
310 	if (v == SEQ_START_TOKEN)
311 		r = rt_cache_get_first(seq);
312 	else
313 		r = rt_cache_get_next(seq, v);
314 	++*pos;
315 	return r;
316 }
317 
318 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
319 {
320 	if (v && v != SEQ_START_TOKEN)
321 		rcu_read_unlock_bh();
322 }
323 
324 static int rt_cache_seq_show(struct seq_file *seq, void *v)
325 {
326 	if (v == SEQ_START_TOKEN)
327 		seq_printf(seq, "%-127s\n",
328 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
329 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
330 			   "HHUptod\tSpecDst");
331 	else {
332 		struct rtable *r = v;
333 		char temp[256];
334 
335 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
336 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
337 			r->u.dst.dev ? r->u.dst.dev->name : "*",
338 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
339 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
340 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
341 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
342 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
343 			dst_metric(&r->u.dst, RTAX_WINDOW),
344 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
345 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
346 			r->fl.fl4_tos,
347 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
348 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
349 				       dev_queue_xmit) : 0,
350 			r->rt_spec_dst);
351 		seq_printf(seq, "%-127s\n", temp);
352 	}
353 	return 0;
354 }
355 
356 static struct seq_operations rt_cache_seq_ops = {
357 	.start  = rt_cache_seq_start,
358 	.next   = rt_cache_seq_next,
359 	.stop   = rt_cache_seq_stop,
360 	.show   = rt_cache_seq_show,
361 };
362 
363 static int rt_cache_seq_open(struct inode *inode, struct file *file)
364 {
365 	struct seq_file *seq;
366 	int rc = -ENOMEM;
367 	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
368 
369 	if (!s)
370 		goto out;
371 	rc = seq_open(file, &rt_cache_seq_ops);
372 	if (rc)
373 		goto out_kfree;
374 	seq          = file->private_data;
375 	seq->private = s;
376 	memset(s, 0, sizeof(*s));
377 out:
378 	return rc;
379 out_kfree:
380 	kfree(s);
381 	goto out;
382 }
383 
384 static struct file_operations rt_cache_seq_fops = {
385 	.owner	 = THIS_MODULE,
386 	.open	 = rt_cache_seq_open,
387 	.read	 = seq_read,
388 	.llseek	 = seq_lseek,
389 	.release = seq_release_private,
390 };
391 
392 
393 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
394 {
395 	int cpu;
396 
397 	if (*pos == 0)
398 		return SEQ_START_TOKEN;
399 
400 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
401 		if (!cpu_possible(cpu))
402 			continue;
403 		*pos = cpu+1;
404 		return per_cpu_ptr(rt_cache_stat, cpu);
405 	}
406 	return NULL;
407 }
408 
409 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
410 {
411 	int cpu;
412 
413 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
414 		if (!cpu_possible(cpu))
415 			continue;
416 		*pos = cpu+1;
417 		return per_cpu_ptr(rt_cache_stat, cpu);
418 	}
419 	return NULL;
420 
421 }
422 
423 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
424 {
425 
426 }
427 
428 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
429 {
430 	struct rt_cache_stat *st = v;
431 
432 	if (v == SEQ_START_TOKEN) {
433 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
434 		return 0;
435 	}
436 
437 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
438 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
439 		   atomic_read(&ipv4_dst_ops.entries),
440 		   st->in_hit,
441 		   st->in_slow_tot,
442 		   st->in_slow_mc,
443 		   st->in_no_route,
444 		   st->in_brd,
445 		   st->in_martian_dst,
446 		   st->in_martian_src,
447 
448 		   st->out_hit,
449 		   st->out_slow_tot,
450 		   st->out_slow_mc,
451 
452 		   st->gc_total,
453 		   st->gc_ignored,
454 		   st->gc_goal_miss,
455 		   st->gc_dst_overflow,
456 		   st->in_hlist_search,
457 		   st->out_hlist_search
458 		);
459 	return 0;
460 }
461 
462 static struct seq_operations rt_cpu_seq_ops = {
463 	.start  = rt_cpu_seq_start,
464 	.next   = rt_cpu_seq_next,
465 	.stop   = rt_cpu_seq_stop,
466 	.show   = rt_cpu_seq_show,
467 };
468 
469 
470 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
471 {
472 	return seq_open(file, &rt_cpu_seq_ops);
473 }
474 
475 static struct file_operations rt_cpu_seq_fops = {
476 	.owner	 = THIS_MODULE,
477 	.open	 = rt_cpu_seq_open,
478 	.read	 = seq_read,
479 	.llseek	 = seq_lseek,
480 	.release = seq_release,
481 };
482 
483 #endif /* CONFIG_PROC_FS */
484 
485 static __inline__ void rt_free(struct rtable *rt)
486 {
487 	multipath_remove(rt);
488 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
489 }
490 
491 static __inline__ void rt_drop(struct rtable *rt)
492 {
493 	multipath_remove(rt);
494 	ip_rt_put(rt);
495 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
496 }
497 
498 static __inline__ int rt_fast_clean(struct rtable *rth)
499 {
500 	/* Kill broadcast/multicast entries very aggressively if they
501 	   collide in the hash table with more useful entries */
502 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
503 		rth->fl.iif && rth->u.rt_next;
504 }
505 
506 static __inline__ int rt_valuable(struct rtable *rth)
507 {
508 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
509 		rth->u.dst.expires;
510 }
511 
512 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
513 {
514 	unsigned long age;
515 	int ret = 0;
516 
517 	if (atomic_read(&rth->u.dst.__refcnt))
518 		goto out;
519 
520 	ret = 1;
521 	if (rth->u.dst.expires &&
522 	    time_after_eq(jiffies, rth->u.dst.expires))
523 		goto out;
524 
525 	age = jiffies - rth->u.dst.lastuse;
526 	ret = 0;
527 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
528 	    (age <= tmo2 && rt_valuable(rth)))
529 		goto out;
530 	ret = 1;
531 out:	return ret;
532 }
533 
534 /* Bits of score are:
535  * 31: very valuable
536  * 30: not quite useless
537  * 29..0: usage counter
538  */
539 static inline u32 rt_score(struct rtable *rt)
540 {
541 	u32 score = jiffies - rt->u.dst.lastuse;
542 
543 	score = ~score & ~(3<<30);
544 
545 	if (rt_valuable(rt))
546 		score |= (1<<31);
547 
548 	if (!rt->fl.iif ||
549 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
550 		score |= (1<<30);
551 
552 	return score;
553 }
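/*
 * Worked example (illustrative): an entry last used 100 jiffies ago starts
 * with score = ~100 & ~(3<<30) = 0x3fffff9b, so recently used entries score
 * higher than stale ones.  Bit 30 is then set for output routes and plain
 * unicast input routes, and bit 31 for "valuable" entries (redirected,
 * notified, or carrying an expiry), which makes them the last candidates
 * for eviction in rt_intern_hash().
 */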
554 
555 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
556 {
557 	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
558 	       fl1->oif     == fl2->oif &&
559 	       fl1->iif     == fl2->iif;
560 }
561 
562 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
563 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
564 						struct rtable *expentry,
565 						int *removed_count)
566 {
567 	int passedexpired = 0;
568 	struct rtable **nextstep = NULL;
569 	struct rtable **rthp = chain_head;
570 	struct rtable *rth;
571 
572 	if (removed_count)
573 		*removed_count = 0;
574 
575 	while ((rth = *rthp) != NULL) {
576 		if (rth == expentry)
577 			passedexpired = 1;
578 
579 		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
580 		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
581 			if (*rthp == expentry) {
582 				*rthp = rth->u.rt_next;
583 				continue;
584 			} else {
585 				*rthp = rth->u.rt_next;
586 				rt_free(rth);
587 				if (removed_count)
588 					++(*removed_count);
589 			}
590 		} else {
591 			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
592 			    passedexpired && !nextstep)
593 				nextstep = &rth->u.rt_next;
594 
595 			rthp = &rth->u.rt_next;
596 		}
597 	}
598 
599 	rt_free(expentry);
600 	if (removed_count)
601 		++(*removed_count);
602 
603 	return nextstep;
604 }
605 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
606 
607 
608 /* This runs via a timer and thus is always in BH context. */
609 static void rt_check_expire(unsigned long dummy)
610 {
611 	static unsigned int rover;
612 	unsigned int i = rover, goal;
613 	struct rtable *rth, **rthp;
614 	unsigned long now = jiffies;
615 	u64 mult;
616 
617 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
618 	if (ip_rt_gc_timeout > 1)
619 		do_div(mult, ip_rt_gc_timeout);
620 	goal = (unsigned int)mult;
621 	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
622 	for (; goal > 0; goal--) {
623 		unsigned long tmo = ip_rt_gc_timeout;
624 
625 		i = (i + 1) & rt_hash_mask;
626 		rthp = &rt_hash_table[i].chain;
627 
628 		if (*rthp == NULL)
629 			continue;
630 		spin_lock(rt_hash_lock_addr(i));
631 		while ((rth = *rthp) != NULL) {
632 			if (rth->u.dst.expires) {
633 				/* Entry is expired even if it is in use */
634 				if (time_before_eq(now, rth->u.dst.expires)) {
635 					tmo >>= 1;
636 					rthp = &rth->u.rt_next;
637 					continue;
638 				}
639 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
640 				tmo >>= 1;
641 				rthp = &rth->u.rt_next;
642 				continue;
643 			}
644 
645 			/* Cleanup aged off entries. */
646 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
647 			/* remove all related balanced entries if necessary */
648 			if (rth->u.dst.flags & DST_BALANCED) {
649 				rthp = rt_remove_balanced_route(
650 					&rt_hash_table[i].chain,
651 					rth, NULL);
652 				if (!rthp)
653 					break;
654 			} else {
655 				*rthp = rth->u.rt_next;
656 				rt_free(rth);
657 			}
658 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
659  			*rthp = rth->u.rt_next;
660  			rt_free(rth);
661 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
662 		}
663 		spin_unlock(rt_hash_lock_addr(i));
664 
665 		/* Fallback loop breaker. */
666 		if (time_after(jiffies, now))
667 			break;
668 	}
669 	rover = i;
670 	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
671 }
672 
673 /* This can run from both BH and non-BH contexts, the latter
674  * in the case of a forced flush event.
675  */
676 static void rt_run_flush(unsigned long dummy)
677 {
678 	int i;
679 	struct rtable *rth, *next;
680 
681 	rt_deadline = 0;
682 
683 	get_random_bytes(&rt_hash_rnd, 4);
684 
685 	for (i = rt_hash_mask; i >= 0; i--) {
686 		spin_lock_bh(rt_hash_lock_addr(i));
687 		rth = rt_hash_table[i].chain;
688 		if (rth)
689 			rt_hash_table[i].chain = NULL;
690 		spin_unlock_bh(rt_hash_lock_addr(i));
691 
692 		for (; rth; rth = next) {
693 			next = rth->u.rt_next;
694 			rt_free(rth);
695 		}
696 	}
697 }
698 
699 static DEFINE_SPINLOCK(rt_flush_lock);
700 
701 void rt_cache_flush(int delay)
702 {
703 	unsigned long now = jiffies;
704 	int user_mode = !in_softirq();
705 
706 	if (delay < 0)
707 		delay = ip_rt_min_delay;
708 
709 	/* flush existing multipath state */
710 	multipath_flush();
711 
712 	spin_lock_bh(&rt_flush_lock);
713 
714 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
715 		long tmo = (long)(rt_deadline - now);
716 
717 		/* If the flush timer is already running
718 		   and the flush request is not immediate (delay > 0):
719 
720 		   if the deadline has not been reached, re-arm the timer to fire after "delay",
721 		   otherwise fire it at the deadline.
722 		 */
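		/* Worked example (illustrative): with the defaults above
		 * (ip_rt_min_delay = 2*HZ, ip_rt_max_delay = 10*HZ), a flush
		 * requested from process context fires immediately unless the
		 * pending deadline is still at least 8*HZ away, because tmo is
		 * forced to zero whenever it falls below max_delay - min_delay.
		 */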
723 
724 		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
725 			tmo = 0;
726 
727 		if (delay > tmo)
728 			delay = tmo;
729 	}
730 
731 	if (delay <= 0) {
732 		spin_unlock_bh(&rt_flush_lock);
733 		rt_run_flush(0);
734 		return;
735 	}
736 
737 	if (rt_deadline == 0)
738 		rt_deadline = now + ip_rt_max_delay;
739 
740 	mod_timer(&rt_flush_timer, now+delay);
741 	spin_unlock_bh(&rt_flush_lock);
742 }
743 
744 static void rt_secret_rebuild(unsigned long dummy)
745 {
746 	unsigned long now = jiffies;
747 
748 	rt_cache_flush(0);
749 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
750 }
751 
752 /*
753    Short description of GC goals.
754 
755    We want an algorithm that keeps the routing cache at an
756    equilibrium point, where the number of aged-off entries stays
757    approximately equal to the number of newly generated ones.
758 
759    The current expiration strength is the variable "expire".
760    We try to adjust it dynamically: when the network is idle,
761    "expire" is large enough to keep plenty of warm entries, and
762    when load increases it shrinks to limit the cache size.
763  */
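/*
 * Rough sketch of the feedback loop (the authoritative logic is in
 * rt_garbage_collect() below):
 *
 *	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 *	while the goal is not reached and we may keep spinning:
 *		expire >>= 1;			// expire more aggressively
 *	on success (work_done):
 *		expire += ip_rt_gc_min_interval;	// relax again, capped at
 *						// ip_rt_gc_timeout
 */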
764 
765 static int rt_garbage_collect(void)
766 {
767 	static unsigned long expire = RT_GC_TIMEOUT;
768 	static unsigned long last_gc;
769 	static int rover;
770 	static int equilibrium;
771 	struct rtable *rth, **rthp;
772 	unsigned long now = jiffies;
773 	int goal;
774 
775 	/*
776 	 * Garbage collection is pretty expensive,
777 	 * so do not run it too frequently.
778 	 */
779 
780 	RT_CACHE_STAT_INC(gc_total);
781 
782 	if (now - last_gc < ip_rt_gc_min_interval &&
783 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
784 		RT_CACHE_STAT_INC(gc_ignored);
785 		goto out;
786 	}
787 
788 	/* Calculate number of entries, which we want to expire now. */
789 	goal = atomic_read(&ipv4_dst_ops.entries) -
790 		(ip_rt_gc_elasticity << rt_hash_log);
791 	if (goal <= 0) {
792 		if (equilibrium < ipv4_dst_ops.gc_thresh)
793 			equilibrium = ipv4_dst_ops.gc_thresh;
794 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
795 		if (goal > 0) {
796 			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
797 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
798 		}
799 	} else {
800 		/* We are in a dangerous area. Try to reduce the cache really
801 		 * aggressively.
802 		 */
803 		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
804 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
805 	}
806 
807 	if (now - last_gc >= ip_rt_gc_min_interval)
808 		last_gc = now;
809 
810 	if (goal <= 0) {
811 		equilibrium += goal;
812 		goto work_done;
813 	}
814 
815 	do {
816 		int i, k;
817 
818 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
819 			unsigned long tmo = expire;
820 
821 			k = (k + 1) & rt_hash_mask;
822 			rthp = &rt_hash_table[k].chain;
823 			spin_lock_bh(rt_hash_lock_addr(k));
824 			while ((rth = *rthp) != NULL) {
825 				if (!rt_may_expire(rth, tmo, expire)) {
826 					tmo >>= 1;
827 					rthp = &rth->u.rt_next;
828 					continue;
829 				}
830 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
831 				/* remove all related balanced entries
832 				 * if necessary
833 				 */
834 				if (rth->u.dst.flags & DST_BALANCED) {
835 					int r;
836 
837 					rthp = rt_remove_balanced_route(
838 						&rt_hash_table[i].chain,
839 						rth,
840 						&r);
841 					goal -= r;
842 					if (!rthp)
843 						break;
844 				} else {
845 					*rthp = rth->u.rt_next;
846 					rt_free(rth);
847 					goal--;
848 				}
849 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
850 				*rthp = rth->u.rt_next;
851 				rt_free(rth);
852 				goal--;
853 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
854 			}
855 			spin_unlock_bh(rt_hash_lock_addr(k));
856 			if (goal <= 0)
857 				break;
858 		}
859 		rover = k;
860 
861 		if (goal <= 0)
862 			goto work_done;
863 
864 		/* The goal was not achieved. We stop the process if:
865 
866 		   - expire has been reduced to zero (otherwise expire is halved).
867 		   - the table is not full.
868 		   - we are called from interrupt context.
869 		   - the jiffies check is just a fallback/debug loop breaker;
870 		     we will not spin here for a long time in any case.
871 		 */
872 
873 		RT_CACHE_STAT_INC(gc_goal_miss);
874 
875 		if (expire == 0)
876 			break;
877 
878 		expire >>= 1;
879 #if RT_CACHE_DEBUG >= 2
880 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
881 				atomic_read(&ipv4_dst_ops.entries), goal, i);
882 #endif
883 
884 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
885 			goto out;
886 	} while (!in_softirq() && time_before_eq(jiffies, now));
887 
888 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
889 		goto out;
890 	if (net_ratelimit())
891 		printk(KERN_WARNING "dst cache overflow\n");
892 	RT_CACHE_STAT_INC(gc_dst_overflow);
893 	return 1;
894 
895 work_done:
896 	expire += ip_rt_gc_min_interval;
897 	if (expire > ip_rt_gc_timeout ||
898 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
899 		expire = ip_rt_gc_timeout;
900 #if RT_CACHE_DEBUG >= 2
901 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
902 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
903 #endif
904 out:	return 0;
905 }
906 
907 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
908 {
909 	struct rtable	*rth, **rthp;
910 	unsigned long	now;
911 	struct rtable *cand, **candp;
912 	u32 		min_score;
913 	int		chain_length;
914 	int attempts = !in_softirq();
915 
916 restart:
917 	chain_length = 0;
918 	min_score = ~(u32)0;
919 	cand = NULL;
920 	candp = NULL;
921 	now = jiffies;
922 
923 	rthp = &rt_hash_table[hash].chain;
924 
925 	spin_lock_bh(rt_hash_lock_addr(hash));
926 	while ((rth = *rthp) != NULL) {
927 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
928 		if (!(rth->u.dst.flags & DST_BALANCED) &&
929 		    compare_keys(&rth->fl, &rt->fl)) {
930 #else
931 		if (compare_keys(&rth->fl, &rt->fl)) {
932 #endif
933 			/* Put it first */
934 			*rthp = rth->u.rt_next;
935 			/*
936 			 * Since lookup is lockfree, the deletion
937 			 * must be visible to another weakly ordered CPU before
938 			 * the insertion at the start of the hash chain.
939 			 */
940 			rcu_assign_pointer(rth->u.rt_next,
941 					   rt_hash_table[hash].chain);
942 			/*
943 			 * Since lookup is lockfree, the update writes
944 			 * must be ordered for consistency on SMP.
945 			 */
946 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
947 
948 			rth->u.dst.__use++;
949 			dst_hold(&rth->u.dst);
950 			rth->u.dst.lastuse = now;
951 			spin_unlock_bh(rt_hash_lock_addr(hash));
952 
953 			rt_drop(rt);
954 			*rp = rth;
955 			return 0;
956 		}
957 
958 		if (!atomic_read(&rth->u.dst.__refcnt)) {
959 			u32 score = rt_score(rth);
960 
961 			if (score <= min_score) {
962 				cand = rth;
963 				candp = rthp;
964 				min_score = score;
965 			}
966 		}
967 
968 		chain_length++;
969 
970 		rthp = &rth->u.rt_next;
971 	}
972 
973 	if (cand) {
974 		/* ip_rt_gc_elasticity used to be the average chain length;
975 		 * when it is exceeded, gc becomes really aggressive.
976 		 *
977 		 * The second limit is less certain. At the moment it allows
978 		 * only 2 entries per bucket. We will see.
979 		 */
980 		if (chain_length > ip_rt_gc_elasticity) {
981 			*candp = cand->u.rt_next;
982 			rt_free(cand);
983 		}
984 	}
985 
986 	/* Try to bind the route to arp only if it is an output
987 	   route or a unicast forwarding path.
988 	 */
989 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
990 		int err = arp_bind_neighbour(&rt->u.dst);
991 		if (err) {
992 			spin_unlock_bh(rt_hash_lock_addr(hash));
993 
994 			if (err != -ENOBUFS) {
995 				rt_drop(rt);
996 				return err;
997 			}
998 
999 			/* Neighbour tables are full and nothing
1000 			   can be released. Try to shrink the route cache;
1001 			   it most likely holds some neighbour records.
1002 			 */
1003 			if (attempts-- > 0) {
1004 				int saved_elasticity = ip_rt_gc_elasticity;
1005 				int saved_int = ip_rt_gc_min_interval;
1006 				ip_rt_gc_elasticity	= 1;
1007 				ip_rt_gc_min_interval	= 0;
1008 				rt_garbage_collect();
1009 				ip_rt_gc_min_interval	= saved_int;
1010 				ip_rt_gc_elasticity	= saved_elasticity;
1011 				goto restart;
1012 			}
1013 
1014 			if (net_ratelimit())
1015 				printk(KERN_WARNING "Neighbour table overflow.\n");
1016 			rt_drop(rt);
1017 			return -ENOBUFS;
1018 		}
1019 	}
1020 
1021 	rt->u.rt_next = rt_hash_table[hash].chain;
1022 #if RT_CACHE_DEBUG >= 2
1023 	if (rt->u.rt_next) {
1024 		struct rtable *trt;
1025 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1026 		       NIPQUAD(rt->rt_dst));
1027 		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1028 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1029 		printk("\n");
1030 	}
1031 #endif
1032 	rt_hash_table[hash].chain = rt;
1033 	spin_unlock_bh(rt_hash_lock_addr(hash));
1034 	*rp = rt;
1035 	return 0;
1036 }
1037 
1038 void rt_bind_peer(struct rtable *rt, int create)
1039 {
1040 	static DEFINE_SPINLOCK(rt_peer_lock);
1041 	struct inet_peer *peer;
1042 
1043 	peer = inet_getpeer(rt->rt_dst, create);
1044 
1045 	spin_lock_bh(&rt_peer_lock);
1046 	if (rt->peer == NULL) {
1047 		rt->peer = peer;
1048 		peer = NULL;
1049 	}
1050 	spin_unlock_bh(&rt_peer_lock);
1051 	if (peer)
1052 		inet_putpeer(peer);
1053 }
1054 
1055 /*
1056  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1057  * we can still generate some output.
1058  * Random ID selection looks a bit dangerous because we have no chance of
1059  * selecting an ID that stays unique for a reasonable period of time.
1060  * But a broken packet identifier may be better than no packet at all.
1061  */
1062 static void ip_select_fb_ident(struct iphdr *iph)
1063 {
1064 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1065 	static u32 ip_fallback_id;
1066 	u32 salt;
1067 
1068 	spin_lock_bh(&ip_fb_id_lock);
1069 	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1070 	iph->id = htons(salt & 0xFFFF);
1071 	ip_fallback_id = salt;
1072 	spin_unlock_bh(&ip_fb_id_lock);
1073 }
1074 
1075 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1076 {
1077 	struct rtable *rt = (struct rtable *) dst;
1078 
1079 	if (rt) {
1080 		if (rt->peer == NULL)
1081 			rt_bind_peer(rt, 1);
1082 
1083 		/* If a peer is attached to the destination, it is never detached,
1084 		   so we do not need to grab a lock to dereference it.
1085 		 */
1086 		if (rt->peer) {
1087 			iph->id = htons(inet_getid(rt->peer, more));
1088 			return;
1089 		}
1090 	} else
1091 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1092 		       __builtin_return_address(0));
1093 
1094 	ip_select_fb_ident(iph);
1095 }
1096 
1097 static void rt_del(unsigned hash, struct rtable *rt)
1098 {
1099 	struct rtable **rthp;
1100 
1101 	spin_lock_bh(rt_hash_lock_addr(hash));
1102 	ip_rt_put(rt);
1103 	for (rthp = &rt_hash_table[hash].chain; *rthp;
1104 	     rthp = &(*rthp)->u.rt_next)
1105 		if (*rthp == rt) {
1106 			*rthp = rt->u.rt_next;
1107 			rt_free(rt);
1108 			break;
1109 		}
1110 	spin_unlock_bh(rt_hash_lock_addr(hash));
1111 }
1112 
1113 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1114 		    u32 saddr, u8 tos, struct net_device *dev)
1115 {
1116 	int i, k;
1117 	struct in_device *in_dev = in_dev_get(dev);
1118 	struct rtable *rth, **rthp;
1119 	u32  skeys[2] = { saddr, 0 };
1120 	int  ikeys[2] = { dev->ifindex, 0 };
1121 
1122 	tos &= IPTOS_RT_MASK;
1123 
1124 	if (!in_dev)
1125 		return;
1126 
1127 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1128 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1129 		goto reject_redirect;
1130 
1131 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1132 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1133 			goto reject_redirect;
1134 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1135 			goto reject_redirect;
1136 	} else {
1137 		if (inet_addr_type(new_gw) != RTN_UNICAST)
1138 			goto reject_redirect;
1139 	}
1140 
1141 	for (i = 0; i < 2; i++) {
1142 		for (k = 0; k < 2; k++) {
1143 			unsigned hash = rt_hash_code(daddr,
1144 						     skeys[i] ^ (ikeys[k] << 5),
1145 						     tos);
1146 
1147 			rthp=&rt_hash_table[hash].chain;
1148 
1149 			rcu_read_lock();
1150 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1151 				struct rtable *rt;
1152 
1153 				if (rth->fl.fl4_dst != daddr ||
1154 				    rth->fl.fl4_src != skeys[i] ||
1155 				    rth->fl.fl4_tos != tos ||
1156 				    rth->fl.oif != ikeys[k] ||
1157 				    rth->fl.iif != 0) {
1158 					rthp = &rth->u.rt_next;
1159 					continue;
1160 				}
1161 
1162 				if (rth->rt_dst != daddr ||
1163 				    rth->rt_src != saddr ||
1164 				    rth->u.dst.error ||
1165 				    rth->rt_gateway != old_gw ||
1166 				    rth->u.dst.dev != dev)
1167 					break;
1168 
1169 				dst_hold(&rth->u.dst);
1170 				rcu_read_unlock();
1171 
1172 				rt = dst_alloc(&ipv4_dst_ops);
1173 				if (rt == NULL) {
1174 					ip_rt_put(rth);
1175 					in_dev_put(in_dev);
1176 					return;
1177 				}
1178 
1179 				/* Copy all the information. */
1180 				*rt = *rth;
1181  				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1182 				rt->u.dst.__use		= 1;
1183 				atomic_set(&rt->u.dst.__refcnt, 1);
1184 				rt->u.dst.child		= NULL;
1185 				if (rt->u.dst.dev)
1186 					dev_hold(rt->u.dst.dev);
1187 				if (rt->idev)
1188 					in_dev_hold(rt->idev);
1189 				rt->u.dst.obsolete	= 0;
1190 				rt->u.dst.lastuse	= jiffies;
1191 				rt->u.dst.path		= &rt->u.dst;
1192 				rt->u.dst.neighbour	= NULL;
1193 				rt->u.dst.hh		= NULL;
1194 				rt->u.dst.xfrm		= NULL;
1195 
1196 				rt->rt_flags		|= RTCF_REDIRECTED;
1197 
1198 				/* Gateway is different ... */
1199 				rt->rt_gateway		= new_gw;
1200 
1201 				/* Redirect received -> path was valid */
1202 				dst_confirm(&rth->u.dst);
1203 
1204 				if (rt->peer)
1205 					atomic_inc(&rt->peer->refcnt);
1206 
1207 				if (arp_bind_neighbour(&rt->u.dst) ||
1208 				    !(rt->u.dst.neighbour->nud_state &
1209 					    NUD_VALID)) {
1210 					if (rt->u.dst.neighbour)
1211 						neigh_event_send(rt->u.dst.neighbour, NULL);
1212 					ip_rt_put(rth);
1213 					rt_drop(rt);
1214 					goto do_next;
1215 				}
1216 
1217 				rt_del(hash, rth);
1218 				if (!rt_intern_hash(hash, rt, &rt))
1219 					ip_rt_put(rt);
1220 				goto do_next;
1221 			}
1222 			rcu_read_unlock();
1223 		do_next:
1224 			;
1225 		}
1226 	}
1227 	in_dev_put(in_dev);
1228 	return;
1229 
1230 reject_redirect:
1231 #ifdef CONFIG_IP_ROUTE_VERBOSE
1232 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1233 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1234 			"%u.%u.%u.%u ignored.\n"
1235 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1236 			"tos %02x\n",
1237 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1238 		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
1239 #endif
1240 	in_dev_put(in_dev);
1241 }
1242 
1243 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1244 {
1245 	struct rtable *rt = (struct rtable*)dst;
1246 	struct dst_entry *ret = dst;
1247 
1248 	if (rt) {
1249 		if (dst->obsolete) {
1250 			ip_rt_put(rt);
1251 			ret = NULL;
1252 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1253 			   rt->u.dst.expires) {
1254 			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1255 						     rt->fl.fl4_src ^
1256 							(rt->fl.oif << 5),
1257 						     rt->fl.fl4_tos);
1258 #if RT_CACHE_DEBUG >= 1
1259 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
1260 					  "%u.%u.%u.%u/%02x dropped\n",
1261 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1262 #endif
1263 			rt_del(hash, rt);
1264 			ret = NULL;
1265 		}
1266 	}
1267 	return ret;
1268 }
1269 
1270 /*
1271  * Algorithm:
1272  *	1. The first ip_rt_redirect_number redirects are sent
1273  *	   with exponential backoff; after that we stop sending them entirely,
1274  *	   assuming that the host ignores our redirects.
1275  *	2. If we did not see any packets requiring redirects
1276  *	   during ip_rt_redirect_silence, we assume that the host has
1277  *	   forgotten the redirected route and start sending redirects again.
1278  *
1279  * This algorithm is much cheaper and more intelligent than dumb load limiting
1280  * in icmp.c.
1281  *
1282  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1283  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1284  */
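/*
 * Worked example (illustrative): with the defaults ip_rt_redirect_load = HZ/50
 * and ip_rt_redirect_number = 9, successive redirects are spaced
 * (HZ/50) << rate_tokens jiffies apart, so the gaps roughly double
 * (20ms, 40ms, 80ms, ...) and after nine redirects we stop entirely.
 * A quiet period of ip_rt_redirect_silence = (HZ/50) << 10 (roughly 20
 * seconds) resets rate_tokens and the cycle starts over.
 */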
1285 
1286 void ip_rt_send_redirect(struct sk_buff *skb)
1287 {
1288 	struct rtable *rt = (struct rtable*)skb->dst;
1289 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1290 
1291 	if (!in_dev)
1292 		return;
1293 
1294 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1295 		goto out;
1296 
1297 	/* No redirected packets during ip_rt_redirect_silence;
1298 	 * reset the algorithm.
1299 	 */
1300 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1301 		rt->u.dst.rate_tokens = 0;
1302 
1303 	/* Too many ignored redirects; do not send anything and
1304 	 * set u.dst.rate_last to the last seen redirected packet.
1305 	 */
1306 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1307 		rt->u.dst.rate_last = jiffies;
1308 		goto out;
1309 	}
1310 
1311 	/* Check for load limit; set rate_last to the latest sent
1312 	 * redirect.
1313 	 */
1314 	if (time_after(jiffies,
1315 		       (rt->u.dst.rate_last +
1316 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1317 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1318 		rt->u.dst.rate_last = jiffies;
1319 		++rt->u.dst.rate_tokens;
1320 #ifdef CONFIG_IP_ROUTE_VERBOSE
1321 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1322 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1323 		    net_ratelimit())
1324 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1325 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1326 				NIPQUAD(rt->rt_src), rt->rt_iif,
1327 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1328 #endif
1329 	}
1330 out:
1331 	in_dev_put(in_dev);
1332 }
1333 
1334 static int ip_error(struct sk_buff *skb)
1335 {
1336 	struct rtable *rt = (struct rtable*)skb->dst;
1337 	unsigned long now;
1338 	int code;
1339 
1340 	switch (rt->u.dst.error) {
1341 		case EINVAL:
1342 		default:
1343 			goto out;
1344 		case EHOSTUNREACH:
1345 			code = ICMP_HOST_UNREACH;
1346 			break;
1347 		case ENETUNREACH:
1348 			code = ICMP_NET_UNREACH;
1349 			break;
1350 		case EACCES:
1351 			code = ICMP_PKT_FILTERED;
1352 			break;
1353 	}
1354 
1355 	now = jiffies;
1356 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1357 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1358 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1359 	rt->u.dst.rate_last = now;
1360 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1361 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1362 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1363 	}
1364 
1365 out:	kfree_skb(skb);
1366 	return 0;
1367 }
1368 
1369 /*
1370  *	The last two values are not from the RFC but
1371  *	are needed for AMPRnet AX.25 paths.
1372  */
1373 
1374 static unsigned short mtu_plateau[] =
1375 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1376 
1377 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1378 {
1379 	int i;
1380 
1381 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1382 		if (old_mtu > mtu_plateau[i])
1383 			return mtu_plateau[i];
1384 	return 68;
1385 }
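/*
 * Usage sketch (illustrative): guess_mtu() returns the largest plateau
 * strictly below the old MTU, falling back to the IPv4 minimum of 68:
 *
 *	guess_mtu(1500);	// -> 1492
 *	guess_mtu(576);		// -> 296
 *	guess_mtu(68);		// -> 68
 *
 * It is consulted by ip_rt_frag_needed() below when an ICMP "fragmentation
 * needed" message arrives without a usable next-hop MTU.
 */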
1386 
1387 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1388 {
1389 	int i;
1390 	unsigned short old_mtu = ntohs(iph->tot_len);
1391 	struct rtable *rth;
1392 	u32  skeys[2] = { iph->saddr, 0, };
1393 	u32  daddr = iph->daddr;
1394 	u8   tos = iph->tos & IPTOS_RT_MASK;
1395 	unsigned short est_mtu = 0;
1396 
1397 	if (ipv4_config.no_pmtu_disc)
1398 		return 0;
1399 
1400 	for (i = 0; i < 2; i++) {
1401 		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1402 
1403 		rcu_read_lock();
1404 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1405 		     rth = rcu_dereference(rth->u.rt_next)) {
1406 			if (rth->fl.fl4_dst == daddr &&
1407 			    rth->fl.fl4_src == skeys[i] &&
1408 			    rth->rt_dst  == daddr &&
1409 			    rth->rt_src  == iph->saddr &&
1410 			    rth->fl.fl4_tos == tos &&
1411 			    rth->fl.iif == 0 &&
1412 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1413 				unsigned short mtu = new_mtu;
1414 
1415 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1416 
1417 					/* BSD 4.2 compatibility hack :-( */
1418 					if (mtu == 0 &&
1419 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1420 					    old_mtu >= 68 + (iph->ihl << 2))
1421 						old_mtu -= iph->ihl << 2;
1422 
1423 					mtu = guess_mtu(old_mtu);
1424 				}
1425 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1426 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1427 						dst_confirm(&rth->u.dst);
1428 						if (mtu < ip_rt_min_pmtu) {
1429 							mtu = ip_rt_min_pmtu;
1430 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1431 								(1 << RTAX_MTU);
1432 						}
1433 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1434 						dst_set_expires(&rth->u.dst,
1435 							ip_rt_mtu_expires);
1436 					}
1437 					est_mtu = mtu;
1438 				}
1439 			}
1440 		}
1441 		rcu_read_unlock();
1442 	}
1443 	return est_mtu ? : new_mtu;
1444 }
1445 
1446 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1447 {
1448 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1449 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1450 		if (mtu < ip_rt_min_pmtu) {
1451 			mtu = ip_rt_min_pmtu;
1452 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1453 		}
1454 		dst->metrics[RTAX_MTU-1] = mtu;
1455 		dst_set_expires(dst, ip_rt_mtu_expires);
1456 	}
1457 }
1458 
1459 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1460 {
1461 	return NULL;
1462 }
1463 
1464 static void ipv4_dst_destroy(struct dst_entry *dst)
1465 {
1466 	struct rtable *rt = (struct rtable *) dst;
1467 	struct inet_peer *peer = rt->peer;
1468 	struct in_device *idev = rt->idev;
1469 
1470 	if (peer) {
1471 		rt->peer = NULL;
1472 		inet_putpeer(peer);
1473 	}
1474 
1475 	if (idev) {
1476 		rt->idev = NULL;
1477 		in_dev_put(idev);
1478 	}
1479 }
1480 
1481 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1482 			    int how)
1483 {
1484 	struct rtable *rt = (struct rtable *) dst;
1485 	struct in_device *idev = rt->idev;
1486 	if (dev != &loopback_dev && idev && idev->dev == dev) {
1487 		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1488 		if (loopback_idev) {
1489 			rt->idev = loopback_idev;
1490 			in_dev_put(idev);
1491 		}
1492 	}
1493 }
1494 
1495 static void ipv4_link_failure(struct sk_buff *skb)
1496 {
1497 	struct rtable *rt;
1498 
1499 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1500 
1501 	rt = (struct rtable *) skb->dst;
1502 	if (rt)
1503 		dst_set_expires(&rt->u.dst, 0);
1504 }
1505 
1506 static int ip_rt_bug(struct sk_buff *skb)
1507 {
1508 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1509 		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1510 		skb->dev ? skb->dev->name : "?");
1511 	kfree_skb(skb);
1512 	return 0;
1513 }
1514 
1515 /*
1516    We do not cache the source address of the outgoing interface,
1517    because it is used only by the IP RR, TS and SRR options,
1518    so it is out of the fast path.
1519 
1520    BTW remember: "addr" may be unaligned when it comes
1521    from IP options!
1522  */
1523 
1524 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1525 {
1526 	u32 src;
1527 	struct fib_result res;
1528 
1529 	if (rt->fl.iif == 0)
1530 		src = rt->rt_src;
1531 	else if (fib_lookup(&rt->fl, &res) == 0) {
1532 		src = FIB_RES_PREFSRC(res);
1533 		fib_res_put(&res);
1534 	} else
1535 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1536 					RT_SCOPE_UNIVERSE);
1537 	memcpy(addr, &src, 4);
1538 }
1539 
1540 #ifdef CONFIG_NET_CLS_ROUTE
1541 static void set_class_tag(struct rtable *rt, u32 tag)
1542 {
1543 	if (!(rt->u.dst.tclassid & 0xFFFF))
1544 		rt->u.dst.tclassid |= tag & 0xFFFF;
1545 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1546 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1547 }
1548 #endif
1549 
1550 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1551 {
1552 	struct fib_info *fi = res->fi;
1553 
1554 	if (fi) {
1555 		if (FIB_RES_GW(*res) &&
1556 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1557 			rt->rt_gateway = FIB_RES_GW(*res);
1558 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1559 		       sizeof(rt->u.dst.metrics));
1560 		if (fi->fib_mtu == 0) {
1561 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1562 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1563 			    rt->rt_gateway != rt->rt_dst &&
1564 			    rt->u.dst.dev->mtu > 576)
1565 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1566 		}
1567 #ifdef CONFIG_NET_CLS_ROUTE
1568 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1569 #endif
1570 	} else
1571 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1572 
1573 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1574 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1575 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1576 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1577 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1578 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1579 				       ip_rt_min_advmss);
1580 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1581 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1582 
1583 #ifdef CONFIG_NET_CLS_ROUTE
1584 #ifdef CONFIG_IP_MULTIPLE_TABLES
1585 	set_class_tag(rt, fib_rules_tclass(res));
1586 #endif
1587 	set_class_tag(rt, itag);
1588 #endif
1589 	rt->rt_type = res->type;
1590 }
1591 
1592 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1593 				u8 tos, struct net_device *dev, int our)
1594 {
1595 	unsigned hash;
1596 	struct rtable *rth;
1597 	u32 spec_dst;
1598 	struct in_device *in_dev = in_dev_get(dev);
1599 	u32 itag = 0;
1600 
1601 	/* Primary sanity checks. */
1602 
1603 	if (in_dev == NULL)
1604 		return -EINVAL;
1605 
1606 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1607 	    skb->protocol != htons(ETH_P_IP))
1608 		goto e_inval;
1609 
1610 	if (ZERONET(saddr)) {
1611 		if (!LOCAL_MCAST(daddr))
1612 			goto e_inval;
1613 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1614 	} else if (fib_validate_source(saddr, 0, tos, 0,
1615 					dev, &spec_dst, &itag) < 0)
1616 		goto e_inval;
1617 
1618 	rth = dst_alloc(&ipv4_dst_ops);
1619 	if (!rth)
1620 		goto e_nobufs;
1621 
1622 	rth->u.dst.output= ip_rt_bug;
1623 
1624 	atomic_set(&rth->u.dst.__refcnt, 1);
1625 	rth->u.dst.flags= DST_HOST;
1626 	if (in_dev->cnf.no_policy)
1627 		rth->u.dst.flags |= DST_NOPOLICY;
1628 	rth->fl.fl4_dst	= daddr;
1629 	rth->rt_dst	= daddr;
1630 	rth->fl.fl4_tos	= tos;
1631 #ifdef CONFIG_IP_ROUTE_FWMARK
1632 	rth->fl.fl4_fwmark= skb->nfmark;
1633 #endif
1634 	rth->fl.fl4_src	= saddr;
1635 	rth->rt_src	= saddr;
1636 #ifdef CONFIG_NET_CLS_ROUTE
1637 	rth->u.dst.tclassid = itag;
1638 #endif
1639 	rth->rt_iif	=
1640 	rth->fl.iif	= dev->ifindex;
1641 	rth->u.dst.dev	= &loopback_dev;
1642 	dev_hold(rth->u.dst.dev);
1643 	rth->idev	= in_dev_get(rth->u.dst.dev);
1644 	rth->fl.oif	= 0;
1645 	rth->rt_gateway	= daddr;
1646 	rth->rt_spec_dst= spec_dst;
1647 	rth->rt_type	= RTN_MULTICAST;
1648 	rth->rt_flags	= RTCF_MULTICAST;
1649 	if (our) {
1650 		rth->u.dst.input= ip_local_deliver;
1651 		rth->rt_flags |= RTCF_LOCAL;
1652 	}
1653 
1654 #ifdef CONFIG_IP_MROUTE
1655 	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1656 		rth->u.dst.input = ip_mr_input;
1657 #endif
1658 	RT_CACHE_STAT_INC(in_slow_mc);
1659 
1660 	in_dev_put(in_dev);
1661 	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1662 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1663 
1664 e_nobufs:
1665 	in_dev_put(in_dev);
1666 	return -ENOBUFS;
1667 
1668 e_inval:
1669 	in_dev_put(in_dev);
1670 	return -EINVAL;
1671 }
1672 
1673 
1674 static void ip_handle_martian_source(struct net_device *dev,
1675 				     struct in_device *in_dev,
1676 				     struct sk_buff *skb,
1677 				     u32 daddr,
1678 				     u32 saddr)
1679 {
1680 	RT_CACHE_STAT_INC(in_martian_src);
1681 #ifdef CONFIG_IP_ROUTE_VERBOSE
1682 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1683 		/*
1684 		 *	RFC 1812 recommendation: if the source is martian,
1685 		 *	the only hint is the MAC header.
1686 		 */
1687 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1688 			"%u.%u.%u.%u, on dev %s\n",
1689 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1690 		if (dev->hard_header_len && skb->mac.raw) {
1691 			int i;
1692 			unsigned char *p = skb->mac.raw;
1693 			printk(KERN_WARNING "ll header: ");
1694 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1695 				printk("%02x", *p);
1696 				if (i < (dev->hard_header_len - 1))
1697 					printk(":");
1698 			}
1699 			printk("\n");
1700 		}
1701 	}
1702 #endif
1703 }
1704 
1705 static inline int __mkroute_input(struct sk_buff *skb,
1706 				  struct fib_result* res,
1707 				  struct in_device *in_dev,
1708 				  u32 daddr, u32 saddr, u32 tos,
1709 				  struct rtable **result)
1710 {
1711 
1712 	struct rtable *rth;
1713 	int err;
1714 	struct in_device *out_dev;
1715 	unsigned flags = 0;
1716 	u32 spec_dst, itag;
1717 
1718 	/* get a working reference to the output device */
1719 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1720 	if (out_dev == NULL) {
1721 		if (net_ratelimit())
1722 			printk(KERN_CRIT "Bug in ip_route_input" \
1723 			       "_slow(). Please, report\n");
1724 		return -EINVAL;
1725 	}
1726 
1727 
1728 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1729 				  in_dev->dev, &spec_dst, &itag);
1730 	if (err < 0) {
1731 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1732 					 saddr);
1733 
1734 		err = -EINVAL;
1735 		goto cleanup;
1736 	}
1737 
1738 	if (err)
1739 		flags |= RTCF_DIRECTSRC;
1740 
1741 	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1742 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1743 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1744 		flags |= RTCF_DOREDIRECT;
1745 
1746 	if (skb->protocol != htons(ETH_P_IP)) {
1747 		/* Not IP (i.e. ARP). Do not create a route if it is
1748 		 * invalid for proxy arp. DNAT routes are always valid.
1749 		 */
1750 		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1751 			err = -EINVAL;
1752 			goto cleanup;
1753 		}
1754 	}
1755 
1756 
1757 	rth = dst_alloc(&ipv4_dst_ops);
1758 	if (!rth) {
1759 		err = -ENOBUFS;
1760 		goto cleanup;
1761 	}
1762 
1763 	atomic_set(&rth->u.dst.__refcnt, 1);
1764 	rth->u.dst.flags= DST_HOST;
1765 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1766 	if (res->fi->fib_nhs > 1)
1767 		rth->u.dst.flags |= DST_BALANCED;
1768 #endif
1769 	if (in_dev->cnf.no_policy)
1770 		rth->u.dst.flags |= DST_NOPOLICY;
1771 	if (in_dev->cnf.no_xfrm)
1772 		rth->u.dst.flags |= DST_NOXFRM;
1773 	rth->fl.fl4_dst	= daddr;
1774 	rth->rt_dst	= daddr;
1775 	rth->fl.fl4_tos	= tos;
1776 #ifdef CONFIG_IP_ROUTE_FWMARK
1777 	rth->fl.fl4_fwmark= skb->nfmark;
1778 #endif
1779 	rth->fl.fl4_src	= saddr;
1780 	rth->rt_src	= saddr;
1781 	rth->rt_gateway	= daddr;
1782 	rth->rt_iif 	=
1783 		rth->fl.iif	= in_dev->dev->ifindex;
1784 	rth->u.dst.dev	= (out_dev)->dev;
1785 	dev_hold(rth->u.dst.dev);
1786 	rth->idev	= in_dev_get(rth->u.dst.dev);
1787 	rth->fl.oif 	= 0;
1788 	rth->rt_spec_dst= spec_dst;
1789 
1790 	rth->u.dst.input = ip_forward;
1791 	rth->u.dst.output = ip_output;
1792 
1793 	rt_set_nexthop(rth, res, itag);
1794 
1795 	rth->rt_flags = flags;
1796 
1797 	*result = rth;
1798 	err = 0;
1799  cleanup:
1800 	/* release the working reference to the output device */
1801 	in_dev_put(out_dev);
1802 	return err;
1803 }
1804 
1805 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1806 				       struct fib_result* res,
1807 				       const struct flowi *fl,
1808 				       struct in_device *in_dev,
1809 				       u32 daddr, u32 saddr, u32 tos)
1810 {
1811 	struct rtable* rth = NULL;
1812 	int err;
1813 	unsigned hash;
1814 
1815 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1816 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1817 		fib_select_multipath(fl, res);
1818 #endif
1819 
1820 	/* create a routing cache entry */
1821 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1822 	if (err)
1823 		return err;
1824 
1825 	/* put it into the cache */
1826 	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1827 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1828 }
1829 
1830 static inline int ip_mkroute_input(struct sk_buff *skb,
1831 				   struct fib_result* res,
1832 				   const struct flowi *fl,
1833 				   struct in_device *in_dev,
1834 				   u32 daddr, u32 saddr, u32 tos)
1835 {
1836 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1837 	struct rtable* rth = NULL, *rtres;
1838 	unsigned char hop, hopcount;
1839 	int err = -EINVAL;
1840 	unsigned int hash;
1841 
1842 	if (res->fi)
1843 		hopcount = res->fi->fib_nhs;
1844 	else
1845 		hopcount = 1;
1846 
1847 	/* distinguish between multipath and singlepath */
1848 	if (hopcount < 2)
1849 		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1850 					    saddr, tos);
1851 
1852 	/* add all alternatives to the routing cache */
1853 	for (hop = 0; hop < hopcount; hop++) {
1854 		res->nh_sel = hop;
1855 
1856 		/* put reference to previous result */
1857 		if (hop)
1858 			ip_rt_put(rtres);
1859 
1860 		/* create a routing cache entry */
1861 		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1862 				      &rth);
1863 		if (err)
1864 			return err;
1865 
1866 		/* put it into the cache */
1867 		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1868 		err = rt_intern_hash(hash, rth, &rtres);
1869 		if (err)
1870 			return err;
1871 
1872 		/* forward hop information to multipath impl. */
1873 		multipath_set_nhinfo(rth,
1874 				     FIB_RES_NETWORK(*res),
1875 				     FIB_RES_NETMASK(*res),
1876 				     res->prefixlen,
1877 				     &FIB_RES_NH(*res));
1878 	}
1879 	skb->dst = &rtres->u.dst;
1880 	return err;
1881 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1882 	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1883 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1884 }
1885 
1886 
1887 /*
1888  *	NOTE. We drop all packets that have a local source
1889  *	address, because every properly looped-back packet
1890  *	must already have the correct destination attached by the output routine.
1891  *
1892  *	This approach solves two big problems:
1893  *	1. Non-simplex devices are handled properly.
1894  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1895  */
1896 
1897 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1898 			       u8 tos, struct net_device *dev)
1899 {
1900 	struct fib_result res;
1901 	struct in_device *in_dev = in_dev_get(dev);
1902 	struct flowi fl = { .nl_u = { .ip4_u =
1903 				      { .daddr = daddr,
1904 					.saddr = saddr,
1905 					.tos = tos,
1906 					.scope = RT_SCOPE_UNIVERSE,
1907 #ifdef CONFIG_IP_ROUTE_FWMARK
1908 					.fwmark = skb->nfmark
1909 #endif
1910 				      } },
1911 			    .iif = dev->ifindex };
1912 	unsigned	flags = 0;
1913 	u32		itag = 0;
1914 	struct rtable * rth;
1915 	unsigned	hash;
1916 	u32		spec_dst;
1917 	int		err = -EINVAL;
1918 	int		free_res = 0;
1919 
1920 	/* IP on this device is disabled. */
1921 
1922 	if (!in_dev)
1923 		goto out;
1924 
1925 	/* Check for the weirdest martians, which cannot be detected
1926 	   by fib_lookup.
1927 	 */
1928 
1929 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1930 		goto martian_source;
1931 
1932 	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1933 		goto brd_input;
1934 
1935 	/* Accept zero source addresses only for the limited broadcast;
1936 	 * it is not clear whether this should be fixed. Waiting for complaints :-)
1937 	 */
1938 	if (ZERONET(saddr))
1939 		goto martian_source;
1940 
1941 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1942 		goto martian_destination;
1943 
1944 	/*
1945 	 *	Now we are ready to route the packet.
1946 	 */
1947 	if ((err = fib_lookup(&fl, &res)) != 0) {
1948 		if (!IN_DEV_FORWARD(in_dev))
1949 			goto e_hostunreach;
1950 		goto no_route;
1951 	}
1952 	free_res = 1;
1953 
1954 	RT_CACHE_STAT_INC(in_slow_tot);
1955 
1956 	if (res.type == RTN_BROADCAST)
1957 		goto brd_input;
1958 
1959 	if (res.type == RTN_LOCAL) {
1960 		int result;
1961 		result = fib_validate_source(saddr, daddr, tos,
1962 					     loopback_dev.ifindex,
1963 					     dev, &spec_dst, &itag);
1964 		if (result < 0)
1965 			goto martian_source;
1966 		if (result)
1967 			flags |= RTCF_DIRECTSRC;
1968 		spec_dst = daddr;
1969 		goto local_input;
1970 	}
1971 
1972 	if (!IN_DEV_FORWARD(in_dev))
1973 		goto e_hostunreach;
1974 	if (res.type != RTN_UNICAST)
1975 		goto martian_destination;
1976 
1977 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1978 	if (err == -ENOBUFS)
1979 		goto e_nobufs;
1980 	if (err == -EINVAL)
1981 		goto e_inval;
1982 
1983 done:
1984 	in_dev_put(in_dev);
1985 	if (free_res)
1986 		fib_res_put(&res);
1987 out:	return err;
1988 
1989 brd_input:
1990 	if (skb->protocol != htons(ETH_P_IP))
1991 		goto e_inval;
1992 
1993 	if (ZERONET(saddr))
1994 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1995 	else {
1996 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1997 					  &itag);
1998 		if (err < 0)
1999 			goto martian_source;
2000 		if (err)
2001 			flags |= RTCF_DIRECTSRC;
2002 	}
2003 	flags |= RTCF_BROADCAST;
2004 	res.type = RTN_BROADCAST;
2005 	RT_CACHE_STAT_INC(in_brd);
2006 
2007 local_input:
2008 	rth = dst_alloc(&ipv4_dst_ops);
2009 	if (!rth)
2010 		goto e_nobufs;
2011 
2012 	rth->u.dst.output= ip_rt_bug;
2013 
2014 	atomic_set(&rth->u.dst.__refcnt, 1);
2015 	rth->u.dst.flags= DST_HOST;
2016 	if (in_dev->cnf.no_policy)
2017 		rth->u.dst.flags |= DST_NOPOLICY;
2018 	rth->fl.fl4_dst	= daddr;
2019 	rth->rt_dst	= daddr;
2020 	rth->fl.fl4_tos	= tos;
2021 #ifdef CONFIG_IP_ROUTE_FWMARK
2022 	rth->fl.fl4_fwmark= skb->nfmark;
2023 #endif
2024 	rth->fl.fl4_src	= saddr;
2025 	rth->rt_src	= saddr;
2026 #ifdef CONFIG_NET_CLS_ROUTE
2027 	rth->u.dst.tclassid = itag;
2028 #endif
2029 	rth->rt_iif	=
2030 	rth->fl.iif	= dev->ifindex;
2031 	rth->u.dst.dev	= &loopback_dev;
2032 	dev_hold(rth->u.dst.dev);
2033 	rth->idev	= in_dev_get(rth->u.dst.dev);
2034 	rth->rt_gateway	= daddr;
2035 	rth->rt_spec_dst= spec_dst;
2036 	rth->u.dst.input= ip_local_deliver;
2037 	rth->rt_flags 	= flags|RTCF_LOCAL;
2038 	if (res.type == RTN_UNREACHABLE) {
2039 		rth->u.dst.input= ip_error;
2040 		rth->u.dst.error= -err;
2041 		rth->rt_flags 	&= ~RTCF_LOCAL;
2042 	}
2043 	rth->rt_type	= res.type;
2044 	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2045 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2046 	goto done;
2047 
2048 no_route:
2049 	RT_CACHE_STAT_INC(in_no_route);
2050 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2051 	res.type = RTN_UNREACHABLE;
2052 	goto local_input;
2053 
2054 	/*
2055 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2056 	 */
2057 martian_destination:
2058 	RT_CACHE_STAT_INC(in_martian_dst);
2059 #ifdef CONFIG_IP_ROUTE_VERBOSE
2060 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2061 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2062 			"%u.%u.%u.%u, dev %s\n",
2063 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2064 #endif
2065 
2066 e_hostunreach:
2067 	err = -EHOSTUNREACH;
2068 	goto done;
2069 
2070 e_inval:
2071 	err = -EINVAL;
2072 	goto done;
2073 
2074 e_nobufs:
2075 	err = -ENOBUFS;
2076 	goto done;
2077 
2078 martian_source:
2079 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2080 	goto e_inval;
2081 }
2082 
2083 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2084 		   u8 tos, struct net_device *dev)
2085 {
2086 	struct rtable * rth;
2087 	unsigned	hash;
2088 	int iif = dev->ifindex;
2089 
2090 	tos &= IPTOS_RT_MASK;
2091 	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2092 
2093 	rcu_read_lock();
2094 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2095 	     rth = rcu_dereference(rth->u.rt_next)) {
2096 		if (rth->fl.fl4_dst == daddr &&
2097 		    rth->fl.fl4_src == saddr &&
2098 		    rth->fl.iif == iif &&
2099 		    rth->fl.oif == 0 &&
2100 #ifdef CONFIG_IP_ROUTE_FWMARK
2101 		    rth->fl.fl4_fwmark == skb->nfmark &&
2102 #endif
2103 		    rth->fl.fl4_tos == tos) {
2104 			rth->u.dst.lastuse = jiffies;
2105 			dst_hold(&rth->u.dst);
2106 			rth->u.dst.__use++;
2107 			RT_CACHE_STAT_INC(in_hit);
2108 			rcu_read_unlock();
2109 			skb->dst = (struct dst_entry*)rth;
2110 			return 0;
2111 		}
2112 		RT_CACHE_STAT_INC(in_hlist_search);
2113 	}
2114 	rcu_read_unlock();
2115 
2116 	/* Multicast recognition logic was moved from the route cache to here.
2117 	   The problem was that too many Ethernet cards have broken/missing
2118 	   hardware multicast filters :-( As a result, a host on a multicast
2119 	   network acquires a lot of useless route cache entries, e.g. from
2120 	   SDR messages from all over the world. Now we try to get rid of them.
2121 	   Really, provided the software IP multicast filter is organized
2122 	   reasonably (at least, hashed), this does not result in a slowdown
2123 	   compared with route cache reject entries.
2124 	   Note that multicast routers are not affected, because a
2125 	   route cache entry is created for them eventually.
2126 	 */
2127 	if (MULTICAST(daddr)) {
2128 		struct in_device *in_dev;
2129 
2130 		rcu_read_lock();
2131 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2132 			int our = ip_check_mc(in_dev, daddr, saddr,
2133 				skb->nh.iph->protocol);
2134 			if (our
2135 #ifdef CONFIG_IP_MROUTE
2136 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2137 #endif
2138 			    ) {
2139 				rcu_read_unlock();
2140 				return ip_route_input_mc(skb, daddr, saddr,
2141 							 tos, dev, our);
2142 			}
2143 		}
2144 		rcu_read_unlock();
2145 		return -EINVAL;
2146 	}
2147 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2148 }
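
/*
 * A typical caller on the receive path (e.g. ip_rcv_finish()) resolves
 * the input route roughly as follows, dropping the packet on failure:
 *
 *	struct iphdr *iph = skb->nh.iph;
 *
 *	if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
 *		goto drop;
 */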
2149 
2150 static inline int __mkroute_output(struct rtable **result,
2151 				   struct fib_result* res,
2152 				   const struct flowi *fl,
2153 				   const struct flowi *oldflp,
2154 				   struct net_device *dev_out,
2155 				   unsigned flags)
2156 {
2157 	struct rtable *rth;
2158 	struct in_device *in_dev;
2159 	u32 tos = RT_FL_TOS(oldflp);
2160 	int err = 0;
2161 
2162 	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2163 		return -EINVAL;
2164 
2165 	if (fl->fl4_dst == 0xFFFFFFFF)
2166 		res->type = RTN_BROADCAST;
2167 	else if (MULTICAST(fl->fl4_dst))
2168 		res->type = RTN_MULTICAST;
2169 	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2170 		return -EINVAL;
2171 
2172 	if (dev_out->flags & IFF_LOOPBACK)
2173 		flags |= RTCF_LOCAL;
2174 
2175 	/* get work reference to inet device */
2176 	in_dev = in_dev_get(dev_out);
2177 	if (!in_dev)
2178 		return -EINVAL;
2179 
2180 	if (res->type == RTN_BROADCAST) {
2181 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2182 		if (res->fi) {
2183 			fib_info_put(res->fi);
2184 			res->fi = NULL;
2185 		}
2186 	} else if (res->type == RTN_MULTICAST) {
2187 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2188 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2189 				 oldflp->proto))
2190 			flags &= ~RTCF_LOCAL;
2191 		/* If a multicast route does not exist, use
2192 		   the default one, but do not gateway in this case.
2193 		   Yes, it is a hack.
2194 		 */
2195 		if (res->fi && res->prefixlen < 4) {
2196 			fib_info_put(res->fi);
2197 			res->fi = NULL;
2198 		}
2199 	}
2200 
2201 
2202 	rth = dst_alloc(&ipv4_dst_ops);
2203 	if (!rth) {
2204 		err = -ENOBUFS;
2205 		goto cleanup;
2206 	}
2207 
2208 	atomic_set(&rth->u.dst.__refcnt, 1);
2209 	rth->u.dst.flags= DST_HOST;
2210 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2211 	if (res->fi) {
2212 		rth->rt_multipath_alg = res->fi->fib_mp_alg;
2213 		if (res->fi->fib_nhs > 1)
2214 			rth->u.dst.flags |= DST_BALANCED;
2215 	}
2216 #endif
2217 	if (in_dev->cnf.no_xfrm)
2218 		rth->u.dst.flags |= DST_NOXFRM;
2219 	if (in_dev->cnf.no_policy)
2220 		rth->u.dst.flags |= DST_NOPOLICY;
2221 
2222 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2223 	rth->fl.fl4_tos	= tos;
2224 	rth->fl.fl4_src	= oldflp->fl4_src;
2225 	rth->fl.oif	= oldflp->oif;
2226 #ifdef CONFIG_IP_ROUTE_FWMARK
2227 	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2228 #endif
2229 	rth->rt_dst	= fl->fl4_dst;
2230 	rth->rt_src	= fl->fl4_src;
2231 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2232 	/* get references to the devices that are to be held by the routing
2233 	   cache entry */
2234 	rth->u.dst.dev	= dev_out;
2235 	dev_hold(dev_out);
2236 	rth->idev	= in_dev_get(dev_out);
2237 	rth->rt_gateway = fl->fl4_dst;
2238 	rth->rt_spec_dst= fl->fl4_src;
2239 
2240 	rth->u.dst.output=ip_output;
2241 
2242 	RT_CACHE_STAT_INC(out_slow_tot);
2243 
2244 	if (flags & RTCF_LOCAL) {
2245 		rth->u.dst.input = ip_local_deliver;
2246 		rth->rt_spec_dst = fl->fl4_dst;
2247 	}
2248 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2249 		rth->rt_spec_dst = fl->fl4_src;
2250 		if (flags & RTCF_LOCAL &&
2251 		    !(dev_out->flags & IFF_LOOPBACK)) {
2252 			rth->u.dst.output = ip_mc_output;
2253 			RT_CACHE_STAT_INC(out_slow_mc);
2254 		}
2255 #ifdef CONFIG_IP_MROUTE
2256 		if (res->type == RTN_MULTICAST) {
2257 			if (IN_DEV_MFORWARD(in_dev) &&
2258 			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2259 				rth->u.dst.input = ip_mr_input;
2260 				rth->u.dst.output = ip_mc_output;
2261 			}
2262 		}
2263 #endif
2264 	}
2265 
2266 	rt_set_nexthop(rth, res, 0);
2267 
2268 	rth->rt_flags = flags;
2269 
2270 	*result = rth;
2271  cleanup:
2272 	/* release work reference to inet device */
2273 	in_dev_put(in_dev);
2274 
2275 	return err;
2276 }
2277 
2278 static inline int ip_mkroute_output_def(struct rtable **rp,
2279 					struct fib_result* res,
2280 					const struct flowi *fl,
2281 					const struct flowi *oldflp,
2282 					struct net_device *dev_out,
2283 					unsigned flags)
2284 {
2285 	struct rtable *rth = NULL;
2286 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2287 	unsigned hash;
2288 	if (err == 0) {
2289 		u32 tos = RT_FL_TOS(oldflp);
2290 
2291 		hash = rt_hash_code(oldflp->fl4_dst,
2292 				    oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2293 		err = rt_intern_hash(hash, rth, rp);
2294 	}
2295 
2296 	return err;
2297 }
2298 
2299 static inline int ip_mkroute_output(struct rtable** rp,
2300 				    struct fib_result* res,
2301 				    const struct flowi *fl,
2302 				    const struct flowi *oldflp,
2303 				    struct net_device *dev_out,
2304 				    unsigned flags)
2305 {
2306 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2307 	u32 tos = RT_FL_TOS(oldflp);
2308 	unsigned char hop;
2309 	unsigned hash;
2310 	int err = -EINVAL;
2311 	struct rtable *rth = NULL;
2312 
2313 	if (res->fi && res->fi->fib_nhs > 1) {
2314 		unsigned char hopcount = res->fi->fib_nhs;
2315 
2316 		for (hop = 0; hop < hopcount; hop++) {
2317 			struct net_device *dev2nexthop;
2318 
2319 			res->nh_sel = hop;
2320 
2321 			/* hold a work reference to the output device */
2322 			dev2nexthop = FIB_RES_DEV(*res);
2323 			dev_hold(dev2nexthop);
2324 
2325 			/* put reference to previous result */
2326 			if (hop)
2327 				ip_rt_put(*rp);
2328 
2329 			err = __mkroute_output(&rth, res, fl, oldflp,
2330 					       dev2nexthop, flags);
2331 
2332 			if (err != 0)
2333 				goto cleanup;
2334 
2335 			hash = rt_hash_code(oldflp->fl4_dst,
2336 					    oldflp->fl4_src ^
2337 					    (oldflp->oif << 5), tos);
2338 			err = rt_intern_hash(hash, rth, rp);
2339 
2340 			/* forward hop information to multipath impl. */
2341 			multipath_set_nhinfo(rth,
2342 					     FIB_RES_NETWORK(*res),
2343 					     FIB_RES_NETMASK(*res),
2344 					     res->prefixlen,
2345 					     &FIB_RES_NH(*res));
2346 		cleanup:
2347 			/* release work reference to output device */
2348 			dev_put(dev2nexthop);
2349 
2350 			if (err != 0)
2351 				return err;
2352 		}
2353 		return err;
2354 	} else {
2355 		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2356 					     flags);
2357 	}
2358 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2359 	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2360 #endif
2361 }
2362 
2363 /*
2364  * Major route resolver routine.
2365  */
2366 
2367 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2368 {
2369 	u32 tos	= RT_FL_TOS(oldflp);
2370 	struct flowi fl = { .nl_u = { .ip4_u =
2371 				      { .daddr = oldflp->fl4_dst,
2372 					.saddr = oldflp->fl4_src,
2373 					.tos = tos & IPTOS_RT_MASK,
2374 					.scope = ((tos & RTO_ONLINK) ?
2375 						  RT_SCOPE_LINK :
2376 						  RT_SCOPE_UNIVERSE),
2377 #ifdef CONFIG_IP_ROUTE_FWMARK
2378 					.fwmark = oldflp->fl4_fwmark
2379 #endif
2380 				      } },
2381 			    .iif = loopback_dev.ifindex,
2382 			    .oif = oldflp->oif };
2383 	struct fib_result res;
2384 	unsigned flags = 0;
2385 	struct net_device *dev_out = NULL;
2386 	int free_res = 0;
2387 	int err;
2388 
2389 
2390 	res.fi		= NULL;
2391 #ifdef CONFIG_IP_MULTIPLE_TABLES
2392 	res.r		= NULL;
2393 #endif
2394 
2395 	if (oldflp->fl4_src) {
2396 		err = -EINVAL;
2397 		if (MULTICAST(oldflp->fl4_src) ||
2398 		    BADCLASS(oldflp->fl4_src) ||
2399 		    ZERONET(oldflp->fl4_src))
2400 			goto out;
2401 
2402 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2403 		dev_out = ip_dev_find(oldflp->fl4_src);
2404 		if (dev_out == NULL)
2405 			goto out;
2406 
2407 		/* I removed the check for oif == dev_out->oif here.
2408 		   It was wrong for two reasons:
2409 		   1. ip_dev_find(saddr) can return the wrong iface if saddr is
2410 		      assigned to multiple interfaces.
2411 		   2. Moreover, we are allowed to send packets with a saddr
2412 		      of another iface. --ANK
2413 		 */
2414 
2415 		if (oldflp->oif == 0
2416 		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2417 			/* Special hack: the user can direct multicasts
2418 			   and limited broadcast via the necessary interface
2419 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2420 			   This hack is not just for fun; it allows
2421 			   vic, vat and friends to work.
2422 			   They bind a socket to loopback, set the ttl to zero
2423 			   and expect that it will work.
2424 			   From the viewpoint of the routing cache they are broken,
2425 			   because we are not allowed to build a multicast path
2426 			   with a loopback source address (the routing cache
2427 			   cannot know that the ttl is zero, in which case the
2428 			   packet would not leave this host and the route would be valid).
2429 			   Luckily, this hack is a good workaround.
2430 			 */
2431 
2432 			fl.oif = dev_out->ifindex;
2433 			goto make_route;
2434 		}
2435 		if (dev_out)
2436 			dev_put(dev_out);
2437 		dev_out = NULL;
2438 	}
2439 
2440 
2441 	if (oldflp->oif) {
2442 		dev_out = dev_get_by_index(oldflp->oif);
2443 		err = -ENODEV;
2444 		if (dev_out == NULL)
2445 			goto out;
2446 
2447 		/* RACE: Check return value of inet_select_addr instead. */
2448 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2449 			dev_put(dev_out);
2450 			goto out;	/* Wrong error code */
2451 		}
2452 
2453 		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2454 			if (!fl.fl4_src)
2455 				fl.fl4_src = inet_select_addr(dev_out, 0,
2456 							      RT_SCOPE_LINK);
2457 			goto make_route;
2458 		}
2459 		if (!fl.fl4_src) {
2460 			if (MULTICAST(oldflp->fl4_dst))
2461 				fl.fl4_src = inet_select_addr(dev_out, 0,
2462 							      fl.fl4_scope);
2463 			else if (!oldflp->fl4_dst)
2464 				fl.fl4_src = inet_select_addr(dev_out, 0,
2465 							      RT_SCOPE_HOST);
2466 		}
2467 	}
2468 
2469 	if (!fl.fl4_dst) {
2470 		fl.fl4_dst = fl.fl4_src;
2471 		if (!fl.fl4_dst)
2472 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2473 		if (dev_out)
2474 			dev_put(dev_out);
2475 		dev_out = &loopback_dev;
2476 		dev_hold(dev_out);
2477 		fl.oif = loopback_dev.ifindex;
2478 		res.type = RTN_LOCAL;
2479 		flags |= RTCF_LOCAL;
2480 		goto make_route;
2481 	}
2482 
2483 	if (fib_lookup(&fl, &res)) {
2484 		res.fi = NULL;
2485 		if (oldflp->oif) {
2486 			/* Apparently, the routing tables are wrong. Assume
2487 			   that the destination is on-link.
2488 
2489 			   WHY? DW.
2490 			   Because we are allowed to send to an iface
2491 			   even if it has NO routes and NO assigned
2492 			   addresses. When oif is specified, the routing
2493 			   tables are looked up with only one purpose:
2494 			   to catch whether the destination is gatewayed rather
2495 			   than direct. Moreover, if MSG_DONTROUTE is set,
2496 			   we send the packet, ignoring both routing tables
2497 			   and ifaddr state. --ANK
2498 
2499 
2500 			   We could do this even if oif is unknown,
2501 			   as IPv6 likely does, but we do not.
2502 			 */
2503 
2504 			if (fl.fl4_src == 0)
2505 				fl.fl4_src = inet_select_addr(dev_out, 0,
2506 							      RT_SCOPE_LINK);
2507 			res.type = RTN_UNICAST;
2508 			goto make_route;
2509 		}
2510 		if (dev_out)
2511 			dev_put(dev_out);
2512 		err = -ENETUNREACH;
2513 		goto out;
2514 	}
2515 	free_res = 1;
2516 
2517 	if (res.type == RTN_LOCAL) {
2518 		if (!fl.fl4_src)
2519 			fl.fl4_src = fl.fl4_dst;
2520 		if (dev_out)
2521 			dev_put(dev_out);
2522 		dev_out = &loopback_dev;
2523 		dev_hold(dev_out);
2524 		fl.oif = dev_out->ifindex;
2525 		if (res.fi)
2526 			fib_info_put(res.fi);
2527 		res.fi = NULL;
2528 		flags |= RTCF_LOCAL;
2529 		goto make_route;
2530 	}
2531 
2532 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2533 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2534 		fib_select_multipath(&fl, &res);
2535 	else
2536 #endif
2537 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2538 		fib_select_default(&fl, &res);
2539 
2540 	if (!fl.fl4_src)
2541 		fl.fl4_src = FIB_RES_PREFSRC(res);
2542 
2543 	if (dev_out)
2544 		dev_put(dev_out);
2545 	dev_out = FIB_RES_DEV(res);
2546 	dev_hold(dev_out);
2547 	fl.oif = dev_out->ifindex;
2548 
2549 
2550 make_route:
2551 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2552 
2553 
2554 	if (free_res)
2555 		fib_res_put(&res);
2556 	if (dev_out)
2557 		dev_put(dev_out);
2558 out:	return err;
2559 }
2560 
2561 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2562 {
2563 	unsigned hash;
2564 	struct rtable *rth;
2565 
2566 	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2567 
2568 	rcu_read_lock_bh();
2569 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2570 		rth = rcu_dereference(rth->u.rt_next)) {
2571 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2572 		    rth->fl.fl4_src == flp->fl4_src &&
2573 		    rth->fl.iif == 0 &&
2574 		    rth->fl.oif == flp->oif &&
2575 #ifdef CONFIG_IP_ROUTE_FWMARK
2576 		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2577 #endif
2578 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2579 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2580 
2581 			/* check for multipath routes and choose one if
2582 			 * necessary
2583 			 */
2584 			if (multipath_select_route(flp, rth, rp)) {
2585 				dst_hold(&(*rp)->u.dst);
2586 				RT_CACHE_STAT_INC(out_hit);
2587 				rcu_read_unlock_bh();
2588 				return 0;
2589 			}
2590 
2591 			rth->u.dst.lastuse = jiffies;
2592 			dst_hold(&rth->u.dst);
2593 			rth->u.dst.__use++;
2594 			RT_CACHE_STAT_INC(out_hit);
2595 			rcu_read_unlock_bh();
2596 			*rp = rth;
2597 			return 0;
2598 		}
2599 		RT_CACHE_STAT_INC(out_hlist_search);
2600 	}
2601 	rcu_read_unlock_bh();
2602 
2603 	return ip_route_output_slow(rp, flp);
2604 }
2605 
2606 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2607 
2608 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2609 {
2610 	int err;
2611 
2612 	if ((err = __ip_route_output_key(rp, flp)) != 0)
2613 		return err;
2614 
2615 	if (flp->proto) {
2616 		if (!flp->fl4_src)
2617 			flp->fl4_src = (*rp)->rt_src;
2618 		if (!flp->fl4_dst)
2619 			flp->fl4_dst = (*rp)->rt_dst;
2620 		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2621 	}
2622 
2623 	return 0;
2624 }
2625 
2626 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2627 
2628 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2629 {
2630 	return ip_route_output_flow(rp, flp, NULL, 0);
2631 }
2632 
2633 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2634 			int nowait, unsigned int flags)
2635 {
2636 	struct rtable *rt = (struct rtable*)skb->dst;
2637 	struct rtmsg *r;
2638 	struct nlmsghdr  *nlh;
2639 	unsigned char	 *b = skb->tail;
2640 	struct rta_cacheinfo ci;
2641 #ifdef CONFIG_IP_MROUTE
2642 	struct rtattr *eptr;
2643 #endif
2644 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2645 	r = NLMSG_DATA(nlh);
2646 	r->rtm_family	 = AF_INET;
2647 	r->rtm_dst_len	= 32;
2648 	r->rtm_src_len	= 0;
2649 	r->rtm_tos	= rt->fl.fl4_tos;
2650 	r->rtm_table	= RT_TABLE_MAIN;
2651 	r->rtm_type	= rt->rt_type;
2652 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2653 	r->rtm_protocol = RTPROT_UNSPEC;
2654 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2655 	if (rt->rt_flags & RTCF_NOTIFY)
2656 		r->rtm_flags |= RTM_F_NOTIFY;
2657 	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2658 	if (rt->fl.fl4_src) {
2659 		r->rtm_src_len = 32;
2660 		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2661 	}
2662 	if (rt->u.dst.dev)
2663 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2664 #ifdef CONFIG_NET_CLS_ROUTE
2665 	if (rt->u.dst.tclassid)
2666 		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2667 #endif
2668 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2669 	if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2670 		__u32 alg = rt->rt_multipath_alg;
2671 
2672 		RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2673 	}
2674 #endif
2675 	if (rt->fl.iif)
2676 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2677 	else if (rt->rt_src != rt->fl.fl4_src)
2678 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2679 	if (rt->rt_dst != rt->rt_gateway)
2680 		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2681 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2682 		goto rtattr_failure;
2683 	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2684 	ci.rta_used	= rt->u.dst.__use;
2685 	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
2686 	if (rt->u.dst.expires)
2687 		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2688 	else
2689 		ci.rta_expires = 0;
2690 	ci.rta_error	= rt->u.dst.error;
2691 	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
2692 	if (rt->peer) {
2693 		ci.rta_id = rt->peer->ip_id_count;
2694 		if (rt->peer->tcp_ts_stamp) {
2695 			ci.rta_ts = rt->peer->tcp_ts;
2696 			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2697 		}
2698 	}
2699 #ifdef CONFIG_IP_MROUTE
2700 	eptr = (struct rtattr*)skb->tail;
2701 #endif
2702 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2703 	if (rt->fl.iif) {
2704 #ifdef CONFIG_IP_MROUTE
2705 		u32 dst = rt->rt_dst;
2706 
2707 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2708 		    ipv4_devconf.mc_forwarding) {
2709 			int err = ipmr_get_route(skb, r, nowait);
2710 			if (err <= 0) {
2711 				if (!nowait) {
2712 					if (err == 0)
2713 						return 0;
2714 					goto nlmsg_failure;
2715 				} else {
2716 					if (err == -EMSGSIZE)
2717 						goto nlmsg_failure;
2718 					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2719 				}
2720 			}
2721 		} else
2722 #endif
2723 			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2724 	}
2725 
2726 	nlh->nlmsg_len = skb->tail - b;
2727 	return skb->len;
2728 
2729 nlmsg_failure:
2730 rtattr_failure:
2731 	skb_trim(skb, b - skb->data);
2732 	return -1;
2733 }
2734 
2735 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2736 {
2737 	struct rtattr **rta = arg;
2738 	struct rtmsg *rtm = NLMSG_DATA(nlh);
2739 	struct rtable *rt = NULL;
2740 	u32 dst = 0;
2741 	u32 src = 0;
2742 	int iif = 0;
2743 	int err = -ENOBUFS;
2744 	struct sk_buff *skb;
2745 
2746 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2747 	if (!skb)
2748 		goto out;
2749 
2750 	/* Reserve room for dummy headers; this skb can pass
2751 	   through a good chunk of the routing engine.
2752 	 */
2753 	skb->mac.raw = skb->data;
2754 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2755 
2756 	if (rta[RTA_SRC - 1])
2757 		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2758 	if (rta[RTA_DST - 1])
2759 		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2760 	if (rta[RTA_IIF - 1])
2761 		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2762 
2763 	if (iif) {
2764 		struct net_device *dev = __dev_get_by_index(iif);
2765 		err = -ENODEV;
2766 		if (!dev)
2767 			goto out_free;
2768 		skb->protocol	= htons(ETH_P_IP);
2769 		skb->dev	= dev;
2770 		local_bh_disable();
2771 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2772 		local_bh_enable();
2773 		rt = (struct rtable*)skb->dst;
2774 		if (!err && rt->u.dst.error)
2775 			err = -rt->u.dst.error;
2776 	} else {
2777 		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2778 							 .saddr = src,
2779 							 .tos = rtm->rtm_tos } } };
2780 		int oif = 0;
2781 		if (rta[RTA_OIF - 1])
2782 			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2783 		fl.oif = oif;
2784 		err = ip_route_output_key(&rt, &fl);
2785 	}
2786 	if (err)
2787 		goto out_free;
2788 
2789 	skb->dst = &rt->u.dst;
2790 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2791 		rt->rt_flags |= RTCF_NOTIFY;
2792 
2793 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2794 
2795 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2796 				RTM_NEWROUTE, 0, 0);
2797 	if (!err)
2798 		goto out_free;
2799 	if (err < 0) {
2800 		err = -EMSGSIZE;
2801 		goto out_free;
2802 	}
2803 
2804 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2805 	if (err > 0)
2806 		err = 0;
2807 out:	return err;
2808 
2809 out_free:
2810 	kfree_skb(skb);
2811 	goto out;
2812 }
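
/*
 * inet_rtm_getroute() answers RTM_GETROUTE requests; this is what backs
 * userspace queries such as "ip route get <addr>", which send a netlink
 * message carrying RTA_DST (and optionally RTA_SRC, RTA_IIF or RTA_OIF)
 * and receive a single RTM_NEWROUTE reply built by rt_fill_info() above.
 */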
2813 
2814 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2815 {
2816 	struct rtable *rt;
2817 	int h, s_h;
2818 	int idx, s_idx;
2819 
2820 	s_h = cb->args[0];
2821 	s_idx = idx = cb->args[1];
2822 	for (h = 0; h <= rt_hash_mask; h++) {
2823 		if (h < s_h) continue;
2824 		if (h > s_h)
2825 			s_idx = 0;
2826 		rcu_read_lock_bh();
2827 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2828 		     rt = rcu_dereference(rt->u.rt_next), idx++) {
2829 			if (idx < s_idx)
2830 				continue;
2831 			skb->dst = dst_clone(&rt->u.dst);
2832 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2833 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2834 					 1, NLM_F_MULTI) <= 0) {
2835 				dst_release(xchg(&skb->dst, NULL));
2836 				rcu_read_unlock_bh();
2837 				goto done;
2838 			}
2839 			dst_release(xchg(&skb->dst, NULL));
2840 		}
2841 		rcu_read_unlock_bh();
2842 	}
2843 
2844 done:
2845 	cb->args[0] = h;
2846 	cb->args[1] = idx;
2847 	return skb->len;
2848 }
2849 
2850 void ip_rt_multicast_event(struct in_device *in_dev)
2851 {
2852 	rt_cache_flush(0);
2853 }
2854 
2855 #ifdef CONFIG_SYSCTL
2856 static int flush_delay;
2857 
2858 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2859 					struct file *filp, void __user *buffer,
2860 					size_t *lenp, loff_t *ppos)
2861 {
2862 	if (write) {
2863 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2864 		rt_cache_flush(flush_delay);
2865 		return 0;
2866 	}
2867 
2868 	return -EINVAL;
2869 }
2870 
2871 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2872 						int __user *name,
2873 						int nlen,
2874 						void __user *oldval,
2875 						size_t __user *oldlenp,
2876 						void __user *newval,
2877 						size_t newlen,
2878 						void **context)
2879 {
2880 	int delay;
2881 	if (newlen != sizeof(int))
2882 		return -EINVAL;
2883 	if (get_user(delay, (int __user *)newval))
2884 		return -EFAULT;
2885 	rt_cache_flush(delay);
2886 	return 0;
2887 }
2888 
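/*
 * The table below is exported under /proc/sys/net/ipv4/route/.  Most
 * entries are plain integers; the write-only "flush" entry is special:
 * writing an integer delay to it invokes ipv4_sysctl_rtcache_flush()
 * above and flushes the routing cache, e.g. from userspace:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */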
2889 ctl_table ipv4_route_table[] = {
2890 	{
2891 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2892 		.procname	= "flush",
2893 		.data		= &flush_delay,
2894 		.maxlen		= sizeof(int),
2895 		.mode		= 0200,
2896 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2897 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2898 	},
2899 	{
2900 		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2901 		.procname	= "min_delay",
2902 		.data		= &ip_rt_min_delay,
2903 		.maxlen		= sizeof(int),
2904 		.mode		= 0644,
2905 		.proc_handler	= &proc_dointvec_jiffies,
2906 		.strategy	= &sysctl_jiffies,
2907 	},
2908 	{
2909 		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2910 		.procname	= "max_delay",
2911 		.data		= &ip_rt_max_delay,
2912 		.maxlen		= sizeof(int),
2913 		.mode		= 0644,
2914 		.proc_handler	= &proc_dointvec_jiffies,
2915 		.strategy	= &sysctl_jiffies,
2916 	},
2917 	{
2918 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2919 		.procname	= "gc_thresh",
2920 		.data		= &ipv4_dst_ops.gc_thresh,
2921 		.maxlen		= sizeof(int),
2922 		.mode		= 0644,
2923 		.proc_handler	= &proc_dointvec,
2924 	},
2925 	{
2926 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2927 		.procname	= "max_size",
2928 		.data		= &ip_rt_max_size,
2929 		.maxlen		= sizeof(int),
2930 		.mode		= 0644,
2931 		.proc_handler	= &proc_dointvec,
2932 	},
2933 	{
2934 		/*  Deprecated. Use gc_min_interval_ms */
2935 
2936 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2937 		.procname	= "gc_min_interval",
2938 		.data		= &ip_rt_gc_min_interval,
2939 		.maxlen		= sizeof(int),
2940 		.mode		= 0644,
2941 		.proc_handler	= &proc_dointvec_jiffies,
2942 		.strategy	= &sysctl_jiffies,
2943 	},
2944 	{
2945 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2946 		.procname	= "gc_min_interval_ms",
2947 		.data		= &ip_rt_gc_min_interval,
2948 		.maxlen		= sizeof(int),
2949 		.mode		= 0644,
2950 		.proc_handler	= &proc_dointvec_ms_jiffies,
2951 		.strategy	= &sysctl_ms_jiffies,
2952 	},
2953 	{
2954 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2955 		.procname	= "gc_timeout",
2956 		.data		= &ip_rt_gc_timeout,
2957 		.maxlen		= sizeof(int),
2958 		.mode		= 0644,
2959 		.proc_handler	= &proc_dointvec_jiffies,
2960 		.strategy	= &sysctl_jiffies,
2961 	},
2962 	{
2963 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2964 		.procname	= "gc_interval",
2965 		.data		= &ip_rt_gc_interval,
2966 		.maxlen		= sizeof(int),
2967 		.mode		= 0644,
2968 		.proc_handler	= &proc_dointvec_jiffies,
2969 		.strategy	= &sysctl_jiffies,
2970 	},
2971 	{
2972 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2973 		.procname	= "redirect_load",
2974 		.data		= &ip_rt_redirect_load,
2975 		.maxlen		= sizeof(int),
2976 		.mode		= 0644,
2977 		.proc_handler	= &proc_dointvec,
2978 	},
2979 	{
2980 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2981 		.procname	= "redirect_number",
2982 		.data		= &ip_rt_redirect_number,
2983 		.maxlen		= sizeof(int),
2984 		.mode		= 0644,
2985 		.proc_handler	= &proc_dointvec,
2986 	},
2987 	{
2988 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2989 		.procname	= "redirect_silence",
2990 		.data		= &ip_rt_redirect_silence,
2991 		.maxlen		= sizeof(int),
2992 		.mode		= 0644,
2993 		.proc_handler	= &proc_dointvec,
2994 	},
2995 	{
2996 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2997 		.procname	= "error_cost",
2998 		.data		= &ip_rt_error_cost,
2999 		.maxlen		= sizeof(int),
3000 		.mode		= 0644,
3001 		.proc_handler	= &proc_dointvec,
3002 	},
3003 	{
3004 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3005 		.procname	= "error_burst",
3006 		.data		= &ip_rt_error_burst,
3007 		.maxlen		= sizeof(int),
3008 		.mode		= 0644,
3009 		.proc_handler	= &proc_dointvec,
3010 	},
3011 	{
3012 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3013 		.procname	= "gc_elasticity",
3014 		.data		= &ip_rt_gc_elasticity,
3015 		.maxlen		= sizeof(int),
3016 		.mode		= 0644,
3017 		.proc_handler	= &proc_dointvec,
3018 	},
3019 	{
3020 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3021 		.procname	= "mtu_expires",
3022 		.data		= &ip_rt_mtu_expires,
3023 		.maxlen		= sizeof(int),
3024 		.mode		= 0644,
3025 		.proc_handler	= &proc_dointvec_jiffies,
3026 		.strategy	= &sysctl_jiffies,
3027 	},
3028 	{
3029 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3030 		.procname	= "min_pmtu",
3031 		.data		= &ip_rt_min_pmtu,
3032 		.maxlen		= sizeof(int),
3033 		.mode		= 0644,
3034 		.proc_handler	= &proc_dointvec,
3035 	},
3036 	{
3037 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3038 		.procname	= "min_adv_mss",
3039 		.data		= &ip_rt_min_advmss,
3040 		.maxlen		= sizeof(int),
3041 		.mode		= 0644,
3042 		.proc_handler	= &proc_dointvec,
3043 	},
3044 	{
3045 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3046 		.procname	= "secret_interval",
3047 		.data		= &ip_rt_secret_interval,
3048 		.maxlen		= sizeof(int),
3049 		.mode		= 0644,
3050 		.proc_handler	= &proc_dointvec_jiffies,
3051 		.strategy	= &sysctl_jiffies,
3052 	},
3053 	{ .ctl_name = 0 }
3054 };
3055 #endif
3056 
3057 #ifdef CONFIG_NET_CLS_ROUTE
3058 struct ip_rt_acct *ip_rt_acct;
3059 
3060 /* This code sucks.  But you should have seen it before! --RR */
3061 
3062 /* IP route accounting ptr for this logical cpu number. */
3063 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3064 
3065 #ifdef CONFIG_PROC_FS
3066 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3067 			   int length, int *eof, void *data)
3068 {
3069 	unsigned int i;
3070 
3071 	if ((offset & 3) || (length & 3))
3072 		return -EIO;
3073 
3074 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
3075 		*eof = 1;
3076 		return 0;
3077 	}
3078 
3079 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3080 		length = sizeof(struct ip_rt_acct) * 256 - offset;
3081 		*eof = 1;
3082 	}
3083 
3084 	offset /= sizeof(u32);
3085 
3086 	if (length > 0) {
3087 		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3088 		u32 *dst = (u32 *) buffer;
3089 
3090 		/* Copy first cpu. */
3091 		*start = buffer;
3092 		memcpy(dst, src, length);
3093 
3094 		/* Add the other cpus in, one int at a time */
3095 		for_each_cpu(i) {
3096 			unsigned int j;
3097 
3098 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3099 
3100 			for (j = 0; j < length/4; j++)
3101 				dst[j] += src[j];
3102 		}
3103 	}
3104 	return length;
3105 }
3106 #endif /* CONFIG_PROC_FS */
3107 #endif /* CONFIG_NET_CLS_ROUTE */
3108 
3109 static __initdata unsigned long rhash_entries;
3110 static int __init set_rhash_entries(char *str)
3111 {
3112 	if (!str)
3113 		return 0;
3114 	rhash_entries = simple_strtoul(str, &str, 0);
3115 	return 1;
3116 }
3117 __setup("rhash_entries=", set_rhash_entries);
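
/*
 * "rhash_entries=N" on the kernel command line overrides the automatic
 * sizing of the route cache hash table done by alloc_large_system_hash()
 * in ip_rt_init() below, e.g. booting with:
 *
 *	rhash_entries=262144
 */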
3118 
3119 int __init ip_rt_init(void)
3120 {
3121 	int rc = 0;
3122 
3123 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3124 			     (jiffies ^ (jiffies >> 7)));
3125 
3126 #ifdef CONFIG_NET_CLS_ROUTE
3127 	{
3128 	int order;
3129 	for (order = 0;
3130 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3131 		/* NOTHING */;
3132 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3133 	if (!ip_rt_acct)
3134 		panic("IP: failed to allocate ip_rt_acct\n");
3135 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
3136 	}
3137 #endif
3138 
3139 	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3140 						     sizeof(struct rtable),
3141 						     0, SLAB_HWCACHE_ALIGN,
3142 						     NULL, NULL);
3143 
3144 	if (!ipv4_dst_ops.kmem_cachep)
3145 		panic("IP: failed to allocate ip_dst_cache\n");
3146 
3147 	rt_hash_table = (struct rt_hash_bucket *)
3148 		alloc_large_system_hash("IP route cache",
3149 					sizeof(struct rt_hash_bucket),
3150 					rhash_entries,
3151 					(num_physpages >= 128 * 1024) ?
3152 						(27 - PAGE_SHIFT) :
3153 						(29 - PAGE_SHIFT),
3154 					HASH_HIGHMEM,
3155 					&rt_hash_log,
3156 					&rt_hash_mask,
3157 					0);
3158 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3159 	rt_hash_lock_init();
3160 
3161 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3162 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3163 
3164 	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3165 	if (!rt_cache_stat)
3166 		return -ENOMEM;
3167 
3168 	devinet_init();
3169 	ip_fib_init();
3170 
3171 	init_timer(&rt_flush_timer);
3172 	rt_flush_timer.function = rt_run_flush;
3173 	init_timer(&rt_periodic_timer);
3174 	rt_periodic_timer.function = rt_check_expire;
3175 	init_timer(&rt_secret_timer);
3176 	rt_secret_timer.function = rt_secret_rebuild;
3177 
3178 	/* All the timers started at system startup tend
3179 	   to synchronize. Perturb it a bit.
3180 	 */
3181 	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3182 					ip_rt_gc_interval;
3183 	add_timer(&rt_periodic_timer);
3184 
3185 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3186 		ip_rt_secret_interval;
3187 	add_timer(&rt_secret_timer);
3188 
3189 #ifdef CONFIG_PROC_FS
3190 	{
3191 	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3192 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3193 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3194 			    		     proc_net_stat))) {
3195 		free_percpu(rt_cache_stat);
3196 		return -ENOMEM;
3197 	}
3198 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3199 	}
3200 #ifdef CONFIG_NET_CLS_ROUTE
3201 	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3202 #endif
3203 #endif
3204 #ifdef CONFIG_XFRM
3205 	xfrm_init();
3206 	xfrm4_init();
3207 #endif
3208 	return rc;
3209 }
3210 
3211 EXPORT_SYMBOL(__ip_select_ident);
3212 EXPORT_SYMBOL(ip_route_input);
3213 EXPORT_SYMBOL(ip_route_output_key);
3214