xref: /openbmc/linux/net/ipv4/route.c (revision e868d61272caa648214046a096e5a6bfc068dc8c)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
59  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
60  *
61  *		This program is free software; you can redistribute it and/or
62  *		modify it under the terms of the GNU General Public License
63  *		as published by the Free Software Foundation; either version
64  *		2 of the License, or (at your option) any later version.
65  */
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/ip_mp_alg.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 
111 #define RT_FL_TOS(oldflp) \
112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 
114 #define IP_MAX_MTU	0xFFF0
115 
116 #define RT_GC_TIMEOUT (300*HZ)
117 
118 static int ip_rt_min_delay		= 2 * HZ;
119 static int ip_rt_max_delay		= 10 * HZ;
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval		= 60 * HZ;
123 static int ip_rt_gc_min_interval	= HZ / 2;
124 static int ip_rt_redirect_number	= 9;
125 static int ip_rt_redirect_load		= HZ / 50;
126 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost		= HZ;
128 static int ip_rt_error_burst		= 5 * HZ;
129 static int ip_rt_gc_elasticity		= 8;
130 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
131 static int ip_rt_min_pmtu		= 512 + 20 + 20;
132 static int ip_rt_min_advmss		= 256;
133 static int ip_rt_secret_interval	= 10 * 60 * HZ;
134 static unsigned long rt_deadline;
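/*
 * Most of the tunables above are exported as the ipv4 route sysctls
 * (under /proc/sys/net/ipv4/route/, e.g. gc_timeout, min_pmtu and
 * secret_interval) by the sysctl table later in this file; rt_deadline
 * is internal state for the delayed-flush timer, not a sysctl.
 */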
135 
136 #define RTprint(a...)	printk(KERN_DEBUG a)
137 
138 static struct timer_list rt_flush_timer;
139 static struct timer_list rt_periodic_timer;
140 static struct timer_list rt_secret_timer;
141 
142 /*
143  *	Interface to generic destination cache.
144  */
145 
146 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
147 static void		 ipv4_dst_destroy(struct dst_entry *dst);
148 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
149 					 struct net_device *dev, int how);
150 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
151 static void		 ipv4_link_failure(struct sk_buff *skb);
152 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
153 static int rt_garbage_collect(void);
154 
155 
156 static struct dst_ops ipv4_dst_ops = {
157 	.family =		AF_INET,
158 	.protocol =		__constant_htons(ETH_P_IP),
159 	.gc =			rt_garbage_collect,
160 	.check =		ipv4_dst_check,
161 	.destroy =		ipv4_dst_destroy,
162 	.ifdown =		ipv4_dst_ifdown,
163 	.negative_advice =	ipv4_negative_advice,
164 	.link_failure =		ipv4_link_failure,
165 	.update_pmtu =		ip_rt_update_pmtu,
166 	.entry_size =		sizeof(struct rtable),
167 };
168 
169 #define ECN_OR_COST(class)	TC_PRIO_##class
170 
171 __u8 ip_tos2prio[16] = {
172 	TC_PRIO_BESTEFFORT,
173 	ECN_OR_COST(FILLER),
174 	TC_PRIO_BESTEFFORT,
175 	ECN_OR_COST(BESTEFFORT),
176 	TC_PRIO_BULK,
177 	ECN_OR_COST(BULK),
178 	TC_PRIO_BULK,
179 	ECN_OR_COST(BULK),
180 	TC_PRIO_INTERACTIVE,
181 	ECN_OR_COST(INTERACTIVE),
182 	TC_PRIO_INTERACTIVE,
183 	ECN_OR_COST(INTERACTIVE),
184 	TC_PRIO_INTERACTIVE_BULK,
185 	ECN_OR_COST(INTERACTIVE_BULK),
186 	TC_PRIO_INTERACTIVE_BULK,
187 	ECN_OR_COST(INTERACTIVE_BULK)
188 };
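/*
 * ip_tos2prio maps the four IPv4 TOS bits onto a traffic-control
 * priority; callers go through rt_tos2priority(), which is roughly
 * ip_tos2prio[IPTOS_TOS(tos) >> 1].  For example, TOS 0x10 ("low
 * delay") selects TC_PRIO_INTERACTIVE and TOS 0x08 ("maximize
 * throughput") selects TC_PRIO_BULK.
 */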
189 
190 
191 /*
192  * Route cache.
193  */
194 
195 /* The locking scheme is rather straightforward:
196  *
197  * 1) Read-Copy Update protects the buckets of the central route hash.
198  * 2) Only writers remove entries, and they hold the lock
199  *    as they look at rtable reference counts.
200  * 3) Only readers acquire references to rtable entries,
201  *    they do so with atomic increments and with the
202  *    lock held.
203  */
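/*
 * A minimal sketch of the pattern this implies (match() stands in for
 * the real key comparison): lookups walk a chain under RCU and take a
 * reference with dst_hold(),
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next))
 *		if (match(rth)) {
 *			dst_hold(&rth->u.dst);
 *			break;
 *		}
 *	rcu_read_unlock_bh();
 *
 * while writers (rt_intern_hash(), rt_del(), the garbage collectors)
 * take the per-bucket lock, rt_hash_lock_addr(hash), before unlinking
 * entries.
 */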
204 
205 struct rt_hash_bucket {
206 	struct rtable	*chain;
207 };
208 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
209 	defined(CONFIG_PROVE_LOCKING)
210 /*
211  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
212  * The size of this table is a power of two and depends on the number of CPUs.
213  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
214  */
215 #ifdef CONFIG_LOCKDEP
216 # define RT_HASH_LOCK_SZ	256
217 #else
218 # if NR_CPUS >= 32
219 #  define RT_HASH_LOCK_SZ	4096
220 # elif NR_CPUS >= 16
221 #  define RT_HASH_LOCK_SZ	2048
222 # elif NR_CPUS >= 8
223 #  define RT_HASH_LOCK_SZ	1024
224 # elif NR_CPUS >= 4
225 #  define RT_HASH_LOCK_SZ	512
226 # else
227 #  define RT_HASH_LOCK_SZ	256
228 # endif
229 #endif
230 
231 static spinlock_t	*rt_hash_locks;
232 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
233 # define rt_hash_lock_init()	{ \
234 		int i; \
235 		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
236 		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
237 		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
238 			spin_lock_init(&rt_hash_locks[i]); \
239 		}
240 #else
241 # define rt_hash_lock_addr(slot) NULL
242 # define rt_hash_lock_init()
243 #endif
244 
245 static struct rt_hash_bucket 	*rt_hash_table;
246 static unsigned			rt_hash_mask;
247 static int			rt_hash_log;
248 static unsigned int		rt_hash_rnd;
249 
250 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
251 #define RT_CACHE_STAT_INC(field) \
252 	(__raw_get_cpu_var(rt_cache_stat).field++)
253 
254 static int rt_intern_hash(unsigned hash, struct rtable *rth,
255 				struct rtable **res);
256 
257 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
258 {
259 	return (jhash_2words(daddr, saddr, rt_hash_rnd)
260 		& rt_hash_mask);
261 }
262 
263 #define rt_hash(daddr, saddr, idx) \
264 	rt_hash_code((__force u32)(__be32)(daddr),\
265 		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
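/*
 * rt_hash() folds destination, source and interface index together with
 * the per-boot random seed rt_hash_rnd, so a lookup must present the
 * same triple the entry was inserted with.  This is why, for example,
 * ip_rt_redirect() below probes rt_hash(daddr, skeys[i], ikeys[k]) for
 * both the real saddr/ifindex and the zero wildcards.
 */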
266 
267 #ifdef CONFIG_PROC_FS
268 struct rt_cache_iter_state {
269 	int bucket;
270 };
271 
272 static struct rtable *rt_cache_get_first(struct seq_file *seq)
273 {
274 	struct rtable *r = NULL;
275 	struct rt_cache_iter_state *st = seq->private;
276 
277 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
278 		rcu_read_lock_bh();
279 		r = rt_hash_table[st->bucket].chain;
280 		if (r)
281 			break;
282 		rcu_read_unlock_bh();
283 	}
284 	return r;
285 }
286 
287 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
288 {
289 	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
290 
291 	r = r->u.dst.rt_next;
292 	while (!r) {
293 		rcu_read_unlock_bh();
294 		if (--st->bucket < 0)
295 			break;
296 		rcu_read_lock_bh();
297 		r = rt_hash_table[st->bucket].chain;
298 	}
299 	return r;
300 }
301 
302 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
303 {
304 	struct rtable *r = rt_cache_get_first(seq);
305 
306 	if (r)
307 		while (pos && (r = rt_cache_get_next(seq, r)))
308 			--pos;
309 	return pos ? NULL : r;
310 }
311 
312 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
313 {
314 	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
315 }
316 
317 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
318 {
319 	struct rtable *r = NULL;
320 
321 	if (v == SEQ_START_TOKEN)
322 		r = rt_cache_get_first(seq);
323 	else
324 		r = rt_cache_get_next(seq, v);
325 	++*pos;
326 	return r;
327 }
328 
329 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
330 {
331 	if (v && v != SEQ_START_TOKEN)
332 		rcu_read_unlock_bh();
333 }
334 
335 static int rt_cache_seq_show(struct seq_file *seq, void *v)
336 {
337 	if (v == SEQ_START_TOKEN)
338 		seq_printf(seq, "%-127s\n",
339 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
340 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
341 			   "HHUptod\tSpecDst");
342 	else {
343 		struct rtable *r = v;
344 		char temp[256];
345 
346 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
347 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
348 			r->u.dst.dev ? r->u.dst.dev->name : "*",
349 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
350 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
351 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
352 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
353 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
354 			dst_metric(&r->u.dst, RTAX_WINDOW),
355 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
356 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
357 			r->fl.fl4_tos,
358 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
359 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
360 				       dev_queue_xmit) : 0,
361 			r->rt_spec_dst);
362 		seq_printf(seq, "%-127s\n", temp);
363 	}
364 	return 0;
365 }
366 
367 static const struct seq_operations rt_cache_seq_ops = {
368 	.start  = rt_cache_seq_start,
369 	.next   = rt_cache_seq_next,
370 	.stop   = rt_cache_seq_stop,
371 	.show   = rt_cache_seq_show,
372 };
373 
374 static int rt_cache_seq_open(struct inode *inode, struct file *file)
375 {
376 	struct seq_file *seq;
377 	int rc = -ENOMEM;
378 	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
379 
380 	if (!s)
381 		goto out;
382 	rc = seq_open(file, &rt_cache_seq_ops);
383 	if (rc)
384 		goto out_kfree;
385 	seq          = file->private_data;
386 	seq->private = s;
387 	memset(s, 0, sizeof(*s));
388 out:
389 	return rc;
390 out_kfree:
391 	kfree(s);
392 	goto out;
393 }
394 
395 static const struct file_operations rt_cache_seq_fops = {
396 	.owner	 = THIS_MODULE,
397 	.open	 = rt_cache_seq_open,
398 	.read	 = seq_read,
399 	.llseek	 = seq_lseek,
400 	.release = seq_release_private,
401 };
402 
403 
404 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
405 {
406 	int cpu;
407 
408 	if (*pos == 0)
409 		return SEQ_START_TOKEN;
410 
411 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
412 		if (!cpu_possible(cpu))
413 			continue;
414 		*pos = cpu+1;
415 		return &per_cpu(rt_cache_stat, cpu);
416 	}
417 	return NULL;
418 }
419 
420 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
421 {
422 	int cpu;
423 
424 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
425 		if (!cpu_possible(cpu))
426 			continue;
427 		*pos = cpu+1;
428 		return &per_cpu(rt_cache_stat, cpu);
429 	}
430 	return NULL;
431 
432 }
433 
434 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
435 {
436 
437 }
438 
439 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
440 {
441 	struct rt_cache_stat *st = v;
442 
443 	if (v == SEQ_START_TOKEN) {
444 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
445 		return 0;
446 	}
447 
448 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
449 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
450 		   atomic_read(&ipv4_dst_ops.entries),
451 		   st->in_hit,
452 		   st->in_slow_tot,
453 		   st->in_slow_mc,
454 		   st->in_no_route,
455 		   st->in_brd,
456 		   st->in_martian_dst,
457 		   st->in_martian_src,
458 
459 		   st->out_hit,
460 		   st->out_slow_tot,
461 		   st->out_slow_mc,
462 
463 		   st->gc_total,
464 		   st->gc_ignored,
465 		   st->gc_goal_miss,
466 		   st->gc_dst_overflow,
467 		   st->in_hlist_search,
468 		   st->out_hlist_search
469 		);
470 	return 0;
471 }
472 
473 static const struct seq_operations rt_cpu_seq_ops = {
474 	.start  = rt_cpu_seq_start,
475 	.next   = rt_cpu_seq_next,
476 	.stop   = rt_cpu_seq_stop,
477 	.show   = rt_cpu_seq_show,
478 };
479 
480 
481 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
482 {
483 	return seq_open(file, &rt_cpu_seq_ops);
484 }
485 
486 static const struct file_operations rt_cpu_seq_fops = {
487 	.owner	 = THIS_MODULE,
488 	.open	 = rt_cpu_seq_open,
489 	.read	 = seq_read,
490 	.llseek	 = seq_lseek,
491 	.release = seq_release,
492 };
493 
494 #endif /* CONFIG_PROC_FS */
495 
496 static __inline__ void rt_free(struct rtable *rt)
497 {
498 	multipath_remove(rt);
499 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
500 }
501 
502 static __inline__ void rt_drop(struct rtable *rt)
503 {
504 	multipath_remove(rt);
505 	ip_rt_put(rt);
506 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
507 }
508 
509 static __inline__ int rt_fast_clean(struct rtable *rth)
510 {
511 	/* Kill broadcast/multicast entries very aggressively, if they
512 	   collide in the hash table with more useful entries */
513 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
514 		rth->fl.iif && rth->u.dst.rt_next;
515 }
516 
517 static __inline__ int rt_valuable(struct rtable *rth)
518 {
519 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
520 		rth->u.dst.expires;
521 }
522 
523 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
524 {
525 	unsigned long age;
526 	int ret = 0;
527 
528 	if (atomic_read(&rth->u.dst.__refcnt))
529 		goto out;
530 
531 	ret = 1;
532 	if (rth->u.dst.expires &&
533 	    time_after_eq(jiffies, rth->u.dst.expires))
534 		goto out;
535 
536 	age = jiffies - rth->u.dst.lastuse;
537 	ret = 0;
538 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
539 	    (age <= tmo2 && rt_valuable(rth)))
540 		goto out;
541 	ret = 1;
542 out:	return ret;
543 }
544 
545 /* Bits of score are:
546  * 31: very valuable
547  * 30: not quite useless
548  * 29..0: usage counter
549  */
550 static inline u32 rt_score(struct rtable *rt)
551 {
552 	u32 score = jiffies - rt->u.dst.lastuse;
553 
554 	score = ~score & ~(3<<30);
555 
556 	if (rt_valuable(rt))
557 		score |= (1<<31);
558 
559 	if (!rt->fl.iif ||
560 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
561 		score |= (1<<30);
562 
563 	return score;
564 }
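/*
 * Example: an unreferenced output route (fl.iif == 0) last used 100
 * jiffies ago, with neither RTCF_REDIRECTED/RTCF_NOTIFY set nor a hard
 * expiry, scores (~100 & ~(3<<30)) | (1<<30) = 0x7FFFFF9B.  Older and
 * less useful entries score lower, and rt_intern_hash() below uses the
 * minimum-score entry of an over-long chain as its eviction candidate.
 */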
565 
566 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
567 {
568 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
569 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
570 		(fl1->mark ^ fl2->mark) |
571 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
572 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
573 		(fl1->oif ^ fl2->oif) |
574 		(fl1->iif ^ fl2->iif)) == 0;
575 }
576 
577 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
578 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
579 						struct rtable *expentry,
580 						int *removed_count)
581 {
582 	int passedexpired = 0;
583 	struct rtable **nextstep = NULL;
584 	struct rtable **rthp = chain_head;
585 	struct rtable *rth;
586 
587 	if (removed_count)
588 		*removed_count = 0;
589 
590 	while ((rth = *rthp) != NULL) {
591 		if (rth == expentry)
592 			passedexpired = 1;
593 
594 		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
595 		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
596 			if (*rthp == expentry) {
597 				*rthp = rth->u.dst.rt_next;
598 				continue;
599 			} else {
600 				*rthp = rth->u.dst.rt_next;
601 				rt_free(rth);
602 				if (removed_count)
603 					++(*removed_count);
604 			}
605 		} else {
606 			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
607 			    passedexpired && !nextstep)
608 				nextstep = &rth->u.dst.rt_next;
609 
610 			rthp = &rth->u.dst.rt_next;
611 		}
612 	}
613 
614 	rt_free(expentry);
615 	if (removed_count)
616 		++(*removed_count);
617 
618 	return nextstep;
619 }
620 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
621 
622 
623 /* This runs via a timer and thus is always in BH context. */
624 static void rt_check_expire(unsigned long dummy)
625 {
626 	static unsigned int rover;
627 	unsigned int i = rover, goal;
628 	struct rtable *rth, **rthp;
629 	unsigned long now = jiffies;
630 	u64 mult;
631 
632 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
633 	if (ip_rt_gc_timeout > 1)
634 		do_div(mult, ip_rt_gc_timeout);
635 	goal = (unsigned int)mult;
636 	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
637 	for (; goal > 0; goal--) {
638 		unsigned long tmo = ip_rt_gc_timeout;
639 
640 		i = (i + 1) & rt_hash_mask;
641 		rthp = &rt_hash_table[i].chain;
642 
643 		if (*rthp == 0)
644 			continue;
645 		spin_lock(rt_hash_lock_addr(i));
646 		while ((rth = *rthp) != NULL) {
647 			if (rth->u.dst.expires) {
648 				/* An entry with a hard expiry is expired even if it is in use */
649 				if (time_before_eq(now, rth->u.dst.expires)) {
650 					tmo >>= 1;
651 					rthp = &rth->u.dst.rt_next;
652 					continue;
653 				}
654 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
655 				tmo >>= 1;
656 				rthp = &rth->u.dst.rt_next;
657 				continue;
658 			}
659 
660 			/* Clean up aged-off entries. */
661 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
662 			/* remove all related balanced entries if necessary */
663 			if (rth->u.dst.flags & DST_BALANCED) {
664 				rthp = rt_remove_balanced_route(
665 					&rt_hash_table[i].chain,
666 					rth, NULL);
667 				if (!rthp)
668 					break;
669 			} else {
670 				*rthp = rth->u.dst.rt_next;
671 				rt_free(rth);
672 			}
673 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
674 			*rthp = rth->u.dst.rt_next;
675 			rt_free(rth);
676 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
677 		}
678 		spin_unlock(rt_hash_lock_addr(i));
679 
680 		/* Fallback loop breaker. */
681 		if (time_after(jiffies, now))
682 			break;
683 	}
684 	rover = i;
685 	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
686 }
687 
688 /* This can run from both BH and non-BH contexts, the latter
689  * in the case of a forced flush event.
690  */
691 static void rt_run_flush(unsigned long dummy)
692 {
693 	int i;
694 	struct rtable *rth, *next;
695 
696 	rt_deadline = 0;
697 
698 	get_random_bytes(&rt_hash_rnd, 4);
699 
700 	for (i = rt_hash_mask; i >= 0; i--) {
701 		spin_lock_bh(rt_hash_lock_addr(i));
702 		rth = rt_hash_table[i].chain;
703 		if (rth)
704 			rt_hash_table[i].chain = NULL;
705 		spin_unlock_bh(rt_hash_lock_addr(i));
706 
707 		for (; rth; rth = next) {
708 			next = rth->u.dst.rt_next;
709 			rt_free(rth);
710 		}
711 	}
712 }
713 
714 static DEFINE_SPINLOCK(rt_flush_lock);
715 
716 void rt_cache_flush(int delay)
717 {
718 	unsigned long now = jiffies;
719 	int user_mode = !in_softirq();
720 
721 	if (delay < 0)
722 		delay = ip_rt_min_delay;
723 
724 	/* flush existing multipath state*/
725 	multipath_flush();
726 
727 	spin_lock_bh(&rt_flush_lock);
728 
729 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
730 		long tmo = (long)(rt_deadline - now);
731 
732 		/* If the flush timer is already running
733 		   and the flush request is not immediate (delay > 0):
734 
735 		   if the deadline has not been reached, prolong the timer to "delay",
736 		   otherwise fire it at the deadline time.
737 		 */
738 
739 		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
740 			tmo = 0;
741 
742 		if (delay > tmo)
743 			delay = tmo;
744 	}
745 
746 	if (delay <= 0) {
747 		spin_unlock_bh(&rt_flush_lock);
748 		rt_run_flush(0);
749 		return;
750 	}
751 
752 	if (rt_deadline == 0)
753 		rt_deadline = now + ip_rt_max_delay;
754 
755 	mod_timer(&rt_flush_timer, now+delay);
756 	spin_unlock_bh(&rt_flush_lock);
757 }
758 
759 static void rt_secret_rebuild(unsigned long dummy)
760 {
761 	unsigned long now = jiffies;
762 
763 	rt_cache_flush(0);
764 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
765 }
766 
767 /*
768    Short description of GC goals.
769 
770    We want to build an algorithm which keeps the routing cache
771    at an equilibrium point, where the number of aged-off entries
772    stays approximately equal to the number of newly generated ones.
773 
774    The current expiration strength is the variable "expire".
775    We try to adjust it dynamically, so that when the network is idle
776    "expire" is large enough to keep plenty of warm entries,
777    and when load increases it shrinks to limit the cache size.
778  */
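/*
 * Concretely, rt_garbage_collect() below first aims to trim the cache to
 * "ip_rt_gc_elasticity << rt_hash_log" entries (an average of 8 entries
 * per hash bucket with the default elasticity); "expire" is halved each
 * time that goal is missed and is grown again by ip_rt_gc_min_interval
 * in work_done, which implements the feedback described above.
 */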
779 
780 static int rt_garbage_collect(void)
781 {
782 	static unsigned long expire = RT_GC_TIMEOUT;
783 	static unsigned long last_gc;
784 	static int rover;
785 	static int equilibrium;
786 	struct rtable *rth, **rthp;
787 	unsigned long now = jiffies;
788 	int goal;
789 
790 	/*
791 	 * Garbage collection is pretty expensive,
792 	 * do not make it too frequently.
793 	 */
794 
795 	RT_CACHE_STAT_INC(gc_total);
796 
797 	if (now - last_gc < ip_rt_gc_min_interval &&
798 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
799 		RT_CACHE_STAT_INC(gc_ignored);
800 		goto out;
801 	}
802 
803 	/* Calculate the number of entries we want to expire now. */
804 	goal = atomic_read(&ipv4_dst_ops.entries) -
805 		(ip_rt_gc_elasticity << rt_hash_log);
806 	if (goal <= 0) {
807 		if (equilibrium < ipv4_dst_ops.gc_thresh)
808 			equilibrium = ipv4_dst_ops.gc_thresh;
809 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
810 		if (goal > 0) {
811 			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
812 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
813 		}
814 	} else {
815 		/* We are in a dangerous area. Try to reduce the cache really
816 		 * aggressively.
817 		 */
818 		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
819 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
820 	}
821 
822 	if (now - last_gc >= ip_rt_gc_min_interval)
823 		last_gc = now;
824 
825 	if (goal <= 0) {
826 		equilibrium += goal;
827 		goto work_done;
828 	}
829 
830 	do {
831 		int i, k;
832 
833 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
834 			unsigned long tmo = expire;
835 
836 			k = (k + 1) & rt_hash_mask;
837 			rthp = &rt_hash_table[k].chain;
838 			spin_lock_bh(rt_hash_lock_addr(k));
839 			while ((rth = *rthp) != NULL) {
840 				if (!rt_may_expire(rth, tmo, expire)) {
841 					tmo >>= 1;
842 					rthp = &rth->u.dst.rt_next;
843 					continue;
844 				}
845 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
846 				/* remove all related balanced entries
847 				 * if necessary
848 				 */
849 				if (rth->u.dst.flags & DST_BALANCED) {
850 					int r;
851 
852 					rthp = rt_remove_balanced_route(
853 						&rt_hash_table[k].chain,
854 						rth,
855 						&r);
856 					goal -= r;
857 					if (!rthp)
858 						break;
859 				} else {
860 					*rthp = rth->u.dst.rt_next;
861 					rt_free(rth);
862 					goal--;
863 				}
864 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
865 				*rthp = rth->u.dst.rt_next;
866 				rt_free(rth);
867 				goal--;
868 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
869 			}
870 			spin_unlock_bh(rt_hash_lock_addr(k));
871 			if (goal <= 0)
872 				break;
873 		}
874 		rover = k;
875 
876 		if (goal <= 0)
877 			goto work_done;
878 
879 		/* Goal is not achieved. We stop the process if:
880 
881 		   - expire has been reduced to zero (otherwise expire is halved);
882 		   - the table is not full;
883 		   - we are called from interrupt context;
884 		   - the jiffies check is just a fallback/debug loop breaker.
885 		     We will not spin here for a long time in any case.
886 		 */
887 
888 		RT_CACHE_STAT_INC(gc_goal_miss);
889 
890 		if (expire == 0)
891 			break;
892 
893 		expire >>= 1;
894 #if RT_CACHE_DEBUG >= 2
895 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
896 				atomic_read(&ipv4_dst_ops.entries), goal, i);
897 #endif
898 
899 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
900 			goto out;
901 	} while (!in_softirq() && time_before_eq(jiffies, now));
902 
903 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
904 		goto out;
905 	if (net_ratelimit())
906 		printk(KERN_WARNING "dst cache overflow\n");
907 	RT_CACHE_STAT_INC(gc_dst_overflow);
908 	return 1;
909 
910 work_done:
911 	expire += ip_rt_gc_min_interval;
912 	if (expire > ip_rt_gc_timeout ||
913 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
914 		expire = ip_rt_gc_timeout;
915 #if RT_CACHE_DEBUG >= 2
916 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
917 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
918 #endif
919 out:	return 0;
920 }
921 
922 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
923 {
924 	struct rtable	*rth, **rthp;
925 	unsigned long	now;
926 	struct rtable *cand, **candp;
927 	u32 		min_score;
928 	int		chain_length;
929 	int attempts = !in_softirq();
930 
931 restart:
932 	chain_length = 0;
933 	min_score = ~(u32)0;
934 	cand = NULL;
935 	candp = NULL;
936 	now = jiffies;
937 
938 	rthp = &rt_hash_table[hash].chain;
939 
940 	spin_lock_bh(rt_hash_lock_addr(hash));
941 	while ((rth = *rthp) != NULL) {
942 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
943 		if (!(rth->u.dst.flags & DST_BALANCED) &&
944 		    compare_keys(&rth->fl, &rt->fl)) {
945 #else
946 		if (compare_keys(&rth->fl, &rt->fl)) {
947 #endif
948 			/* Put it first */
949 			*rthp = rth->u.dst.rt_next;
950 			/*
951 			 * Since lookup is lockfree, the deletion
952 			 * must be visible to another weakly ordered CPU before
953 			 * the insertion at the start of the hash chain.
954 			 */
955 			rcu_assign_pointer(rth->u.dst.rt_next,
956 					   rt_hash_table[hash].chain);
957 			/*
958 			 * Since lookup is lockfree, the update writes
959 			 * must be ordered for consistency on SMP.
960 			 */
961 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
962 
963 			rth->u.dst.__use++;
964 			dst_hold(&rth->u.dst);
965 			rth->u.dst.lastuse = now;
966 			spin_unlock_bh(rt_hash_lock_addr(hash));
967 
968 			rt_drop(rt);
969 			*rp = rth;
970 			return 0;
971 		}
972 
973 		if (!atomic_read(&rth->u.dst.__refcnt)) {
974 			u32 score = rt_score(rth);
975 
976 			if (score <= min_score) {
977 				cand = rth;
978 				candp = rthp;
979 				min_score = score;
980 			}
981 		}
982 
983 		chain_length++;
984 
985 		rthp = &rth->u.dst.rt_next;
986 	}
987 
988 	if (cand) {
989 		/* ip_rt_gc_elasticity used to be the average chain length;
990 		 * when exceeded, GC becomes really aggressive.
991 		 *
992 		 * The second limit is less certain. At the moment it allows
993 		 * only 2 entries per bucket. We will see.
994 		 */
995 		if (chain_length > ip_rt_gc_elasticity) {
996 			*candp = cand->u.dst.rt_next;
997 			rt_free(cand);
998 		}
999 	}
1000 
1001 	/* Try to bind the route to ARP only if it is an output
1002 	   route or a unicast forwarding path.
1003 	 */
1004 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1005 		int err = arp_bind_neighbour(&rt->u.dst);
1006 		if (err) {
1007 			spin_unlock_bh(rt_hash_lock_addr(hash));
1008 
1009 			if (err != -ENOBUFS) {
1010 				rt_drop(rt);
1011 				return err;
1012 			}
1013 
1014 			/* Neighbour tables are full and nothing
1015 			   can be released. Try to shrink the route cache;
1016 			   it most likely holds some neighbour records.
1017 			 */
1018 			if (attempts-- > 0) {
1019 				int saved_elasticity = ip_rt_gc_elasticity;
1020 				int saved_int = ip_rt_gc_min_interval;
1021 				ip_rt_gc_elasticity	= 1;
1022 				ip_rt_gc_min_interval	= 0;
1023 				rt_garbage_collect();
1024 				ip_rt_gc_min_interval	= saved_int;
1025 				ip_rt_gc_elasticity	= saved_elasticity;
1026 				goto restart;
1027 			}
1028 
1029 			if (net_ratelimit())
1030 				printk(KERN_WARNING "Neighbour table overflow.\n");
1031 			rt_drop(rt);
1032 			return -ENOBUFS;
1033 		}
1034 	}
1035 
1036 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1037 #if RT_CACHE_DEBUG >= 2
1038 	if (rt->u.dst.rt_next) {
1039 		struct rtable *trt;
1040 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1041 		       NIPQUAD(rt->rt_dst));
1042 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1043 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1044 		printk("\n");
1045 	}
1046 #endif
1047 	rt_hash_table[hash].chain = rt;
1048 	spin_unlock_bh(rt_hash_lock_addr(hash));
1049 	*rp = rt;
1050 	return 0;
1051 }
1052 
1053 void rt_bind_peer(struct rtable *rt, int create)
1054 {
1055 	static DEFINE_SPINLOCK(rt_peer_lock);
1056 	struct inet_peer *peer;
1057 
1058 	peer = inet_getpeer(rt->rt_dst, create);
1059 
1060 	spin_lock_bh(&rt_peer_lock);
1061 	if (rt->peer == NULL) {
1062 		rt->peer = peer;
1063 		peer = NULL;
1064 	}
1065 	spin_unlock_bh(&rt_peer_lock);
1066 	if (peer)
1067 		inet_putpeer(peer);
1068 }
1069 
1070 /*
1071  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1072  * we can still generate some output.
1073  * Random ID selection looks a bit dangerous because we have no chance of
1074  * selecting an ID that is unique within a reasonable period of time.
1075  * But a broken packet identifier may be better than no packet at all.
1076  */
1077 static void ip_select_fb_ident(struct iphdr *iph)
1078 {
1079 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1080 	static u32 ip_fallback_id;
1081 	u32 salt;
1082 
1083 	spin_lock_bh(&ip_fb_id_lock);
1084 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1085 	iph->id = htons(salt & 0xFFFF);
1086 	ip_fallback_id = salt;
1087 	spin_unlock_bh(&ip_fb_id_lock);
1088 }
1089 
1090 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1091 {
1092 	struct rtable *rt = (struct rtable *) dst;
1093 
1094 	if (rt) {
1095 		if (rt->peer == NULL)
1096 			rt_bind_peer(rt, 1);
1097 
1098 		/* If the peer is attached to the destination, it is never detached,
1099 		   so we do not need to grab a lock to dereference it.
1100 		 */
1101 		if (rt->peer) {
1102 			iph->id = htons(inet_getid(rt->peer, more));
1103 			return;
1104 		}
1105 	} else
1106 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1107 		       __builtin_return_address(0));
1108 
1109 	ip_select_fb_ident(iph);
1110 }
1111 
1112 static void rt_del(unsigned hash, struct rtable *rt)
1113 {
1114 	struct rtable **rthp;
1115 
1116 	spin_lock_bh(rt_hash_lock_addr(hash));
1117 	ip_rt_put(rt);
1118 	for (rthp = &rt_hash_table[hash].chain; *rthp;
1119 	     rthp = &(*rthp)->u.dst.rt_next)
1120 		if (*rthp == rt) {
1121 			*rthp = rt->u.dst.rt_next;
1122 			rt_free(rt);
1123 			break;
1124 		}
1125 	spin_unlock_bh(rt_hash_lock_addr(hash));
1126 }
1127 
1128 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1129 		    __be32 saddr, struct net_device *dev)
1130 {
1131 	int i, k;
1132 	struct in_device *in_dev = in_dev_get(dev);
1133 	struct rtable *rth, **rthp;
1134 	__be32  skeys[2] = { saddr, 0 };
1135 	int  ikeys[2] = { dev->ifindex, 0 };
1136 	struct netevent_redirect netevent;
1137 
1138 	if (!in_dev)
1139 		return;
1140 
1141 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1142 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1143 		goto reject_redirect;
1144 
1145 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1146 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1147 			goto reject_redirect;
1148 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1149 			goto reject_redirect;
1150 	} else {
1151 		if (inet_addr_type(new_gw) != RTN_UNICAST)
1152 			goto reject_redirect;
1153 	}
1154 
1155 	for (i = 0; i < 2; i++) {
1156 		for (k = 0; k < 2; k++) {
1157 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1158 
1159 			rthp=&rt_hash_table[hash].chain;
1160 
1161 			rcu_read_lock();
1162 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1163 				struct rtable *rt;
1164 
1165 				if (rth->fl.fl4_dst != daddr ||
1166 				    rth->fl.fl4_src != skeys[i] ||
1167 				    rth->fl.oif != ikeys[k] ||
1168 				    rth->fl.iif != 0) {
1169 					rthp = &rth->u.dst.rt_next;
1170 					continue;
1171 				}
1172 
1173 				if (rth->rt_dst != daddr ||
1174 				    rth->rt_src != saddr ||
1175 				    rth->u.dst.error ||
1176 				    rth->rt_gateway != old_gw ||
1177 				    rth->u.dst.dev != dev)
1178 					break;
1179 
1180 				dst_hold(&rth->u.dst);
1181 				rcu_read_unlock();
1182 
1183 				rt = dst_alloc(&ipv4_dst_ops);
1184 				if (rt == NULL) {
1185 					ip_rt_put(rth);
1186 					in_dev_put(in_dev);
1187 					return;
1188 				}
1189 
1190 				/* Copy all the information. */
1191 				*rt = *rth;
1192 				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1193 				rt->u.dst.__use		= 1;
1194 				atomic_set(&rt->u.dst.__refcnt, 1);
1195 				rt->u.dst.child		= NULL;
1196 				if (rt->u.dst.dev)
1197 					dev_hold(rt->u.dst.dev);
1198 				if (rt->idev)
1199 					in_dev_hold(rt->idev);
1200 				rt->u.dst.obsolete	= 0;
1201 				rt->u.dst.lastuse	= jiffies;
1202 				rt->u.dst.path		= &rt->u.dst;
1203 				rt->u.dst.neighbour	= NULL;
1204 				rt->u.dst.hh		= NULL;
1205 				rt->u.dst.xfrm		= NULL;
1206 
1207 				rt->rt_flags		|= RTCF_REDIRECTED;
1208 
1209 				/* Gateway is different ... */
1210 				rt->rt_gateway		= new_gw;
1211 
1212 				/* Redirect received -> path was valid */
1213 				dst_confirm(&rth->u.dst);
1214 
1215 				if (rt->peer)
1216 					atomic_inc(&rt->peer->refcnt);
1217 
1218 				if (arp_bind_neighbour(&rt->u.dst) ||
1219 				    !(rt->u.dst.neighbour->nud_state &
1220 					    NUD_VALID)) {
1221 					if (rt->u.dst.neighbour)
1222 						neigh_event_send(rt->u.dst.neighbour, NULL);
1223 					ip_rt_put(rth);
1224 					rt_drop(rt);
1225 					goto do_next;
1226 				}
1227 
1228 				netevent.old = &rth->u.dst;
1229 				netevent.new = &rt->u.dst;
1230 				call_netevent_notifiers(NETEVENT_REDIRECT,
1231 							&netevent);
1232 
1233 				rt_del(hash, rth);
1234 				if (!rt_intern_hash(hash, rt, &rt))
1235 					ip_rt_put(rt);
1236 				goto do_next;
1237 			}
1238 			rcu_read_unlock();
1239 		do_next:
1240 			;
1241 		}
1242 	}
1243 	in_dev_put(in_dev);
1244 	return;
1245 
1246 reject_redirect:
1247 #ifdef CONFIG_IP_ROUTE_VERBOSE
1248 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1249 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1250 			"%u.%u.%u.%u ignored.\n"
1251 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1252 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1253 		       NIPQUAD(saddr), NIPQUAD(daddr));
1254 #endif
1255 	in_dev_put(in_dev);
1256 }
1257 
1258 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1259 {
1260 	struct rtable *rt = (struct rtable*)dst;
1261 	struct dst_entry *ret = dst;
1262 
1263 	if (rt) {
1264 		if (dst->obsolete) {
1265 			ip_rt_put(rt);
1266 			ret = NULL;
1267 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1268 			   rt->u.dst.expires) {
1269 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1270 						rt->fl.oif);
1271 #if RT_CACHE_DEBUG >= 1
1272 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
1273 					  "%u.%u.%u.%u/%02x dropped\n",
1274 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1275 #endif
1276 			rt_del(hash, rt);
1277 			ret = NULL;
1278 		}
1279 	}
1280 	return ret;
1281 }
1282 
1283 /*
1284  * Algorithm:
1285  *	1. The first ip_rt_redirect_number redirects are sent
1286  *	   with exponential backoff, then we stop sending them at all,
1287  *	   assuming that the host ignores our redirects.
1288  *	2. If we did not see packets requiring redirects
1289  *	   during ip_rt_redirect_silence, we assume that the host
1290  *	   forgot redirected route and start to send redirects again.
1291  *
1292  * This algorithm is much cheaper and more intelligent than dumb load limiting
1293  * in icmp.c.
1294  *
1295  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1296  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1297  */
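/*
 * With the defaults above (ip_rt_redirect_number = 9, ip_rt_redirect_load
 * = HZ/50, ip_rt_redirect_silence = (HZ/50) << 10), the wait required
 * before redirect n+1 is ip_rt_redirect_load << n, so the backoff roughly
 * doubles each time; after 9 redirects we stay silent until
 * ip_rt_redirect_silence jiffies pass without another triggering packet,
 * at which point rate_tokens is reset and the cycle restarts.
 */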
1298 
1299 void ip_rt_send_redirect(struct sk_buff *skb)
1300 {
1301 	struct rtable *rt = (struct rtable*)skb->dst;
1302 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1303 
1304 	if (!in_dev)
1305 		return;
1306 
1307 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1308 		goto out;
1309 
1310 	/* No redirected packets during ip_rt_redirect_silence;
1311 	 * reset the algorithm.
1312 	 */
1313 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1314 		rt->u.dst.rate_tokens = 0;
1315 
1316 	/* Too many ignored redirects; do not send anything.
1317 	 * Set u.dst.rate_last to the last seen redirected packet.
1318 	 */
1319 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1320 		rt->u.dst.rate_last = jiffies;
1321 		goto out;
1322 	}
1323 
1324 	/* Check for load limit; set rate_last to the latest sent
1325 	 * redirect.
1326 	 */
1327 	if (rt->u.dst.rate_tokens == 0 ||
1328 	    time_after(jiffies,
1329 		       (rt->u.dst.rate_last +
1330 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1331 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1332 		rt->u.dst.rate_last = jiffies;
1333 		++rt->u.dst.rate_tokens;
1334 #ifdef CONFIG_IP_ROUTE_VERBOSE
1335 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1336 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1337 		    net_ratelimit())
1338 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1339 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1340 				NIPQUAD(rt->rt_src), rt->rt_iif,
1341 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1342 #endif
1343 	}
1344 out:
1345 	in_dev_put(in_dev);
1346 }
1347 
1348 static int ip_error(struct sk_buff *skb)
1349 {
1350 	struct rtable *rt = (struct rtable*)skb->dst;
1351 	unsigned long now;
1352 	int code;
1353 
1354 	switch (rt->u.dst.error) {
1355 		case EINVAL:
1356 		default:
1357 			goto out;
1358 		case EHOSTUNREACH:
1359 			code = ICMP_HOST_UNREACH;
1360 			break;
1361 		case ENETUNREACH:
1362 			code = ICMP_NET_UNREACH;
1363 			break;
1364 		case EACCES:
1365 			code = ICMP_PKT_FILTERED;
1366 			break;
1367 	}
1368 
1369 	now = jiffies;
1370 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1371 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1372 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1373 	rt->u.dst.rate_last = now;
1374 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1375 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1376 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1377 	}
1378 
1379 out:	kfree_skb(skb);
1380 	return 0;
1381 }
1382 
1383 /*
1384  *	The last two values are not from the RFC but
1385  *	are needed for AMPRnet AX.25 paths.
1386  */
1387 
1388 static const unsigned short mtu_plateau[] =
1389 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1390 
1391 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1392 {
1393 	int i;
1394 
1395 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1396 		if (old_mtu > mtu_plateau[i])
1397 			return mtu_plateau[i];
1398 	return 68;
1399 }
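/*
 * Example: an ICMP "fragmentation needed" carrying a bogus next-hop MTU
 * of 0 for a 1500-byte packet ends up here and returns 1492, the first
 * plateau below 1500; an old_mtu of 128 or less falls through to the
 * 68-byte IPv4 minimum.
 */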
1400 
1401 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1402 {
1403 	int i;
1404 	unsigned short old_mtu = ntohs(iph->tot_len);
1405 	struct rtable *rth;
1406 	__be32  skeys[2] = { iph->saddr, 0, };
1407 	__be32  daddr = iph->daddr;
1408 	unsigned short est_mtu = 0;
1409 
1410 	if (ipv4_config.no_pmtu_disc)
1411 		return 0;
1412 
1413 	for (i = 0; i < 2; i++) {
1414 		unsigned hash = rt_hash(daddr, skeys[i], 0);
1415 
1416 		rcu_read_lock();
1417 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1418 		     rth = rcu_dereference(rth->u.dst.rt_next)) {
1419 			if (rth->fl.fl4_dst == daddr &&
1420 			    rth->fl.fl4_src == skeys[i] &&
1421 			    rth->rt_dst  == daddr &&
1422 			    rth->rt_src  == iph->saddr &&
1423 			    rth->fl.iif == 0 &&
1424 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1425 				unsigned short mtu = new_mtu;
1426 
1427 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1428 
1429 					/* BSD 4.2 compatibility hack :-( */
1430 					if (mtu == 0 &&
1431 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1432 					    old_mtu >= 68 + (iph->ihl << 2))
1433 						old_mtu -= iph->ihl << 2;
1434 
1435 					mtu = guess_mtu(old_mtu);
1436 				}
1437 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1438 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1439 						dst_confirm(&rth->u.dst);
1440 						if (mtu < ip_rt_min_pmtu) {
1441 							mtu = ip_rt_min_pmtu;
1442 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1443 								(1 << RTAX_MTU);
1444 						}
1445 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1446 						dst_set_expires(&rth->u.dst,
1447 							ip_rt_mtu_expires);
1448 					}
1449 					est_mtu = mtu;
1450 				}
1451 			}
1452 		}
1453 		rcu_read_unlock();
1454 	}
1455 	return est_mtu ? : new_mtu;
1456 }
1457 
1458 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1459 {
1460 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1461 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1462 		if (mtu < ip_rt_min_pmtu) {
1463 			mtu = ip_rt_min_pmtu;
1464 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1465 		}
1466 		dst->metrics[RTAX_MTU-1] = mtu;
1467 		dst_set_expires(dst, ip_rt_mtu_expires);
1468 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1469 	}
1470 }
1471 
1472 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1473 {
1474 	return NULL;
1475 }
1476 
1477 static void ipv4_dst_destroy(struct dst_entry *dst)
1478 {
1479 	struct rtable *rt = (struct rtable *) dst;
1480 	struct inet_peer *peer = rt->peer;
1481 	struct in_device *idev = rt->idev;
1482 
1483 	if (peer) {
1484 		rt->peer = NULL;
1485 		inet_putpeer(peer);
1486 	}
1487 
1488 	if (idev) {
1489 		rt->idev = NULL;
1490 		in_dev_put(idev);
1491 	}
1492 }
1493 
1494 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1495 			    int how)
1496 {
1497 	struct rtable *rt = (struct rtable *) dst;
1498 	struct in_device *idev = rt->idev;
1499 	if (dev != &loopback_dev && idev && idev->dev == dev) {
1500 		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1501 		if (loopback_idev) {
1502 			rt->idev = loopback_idev;
1503 			in_dev_put(idev);
1504 		}
1505 	}
1506 }
1507 
1508 static void ipv4_link_failure(struct sk_buff *skb)
1509 {
1510 	struct rtable *rt;
1511 
1512 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1513 
1514 	rt = (struct rtable *) skb->dst;
1515 	if (rt)
1516 		dst_set_expires(&rt->u.dst, 0);
1517 }
1518 
1519 static int ip_rt_bug(struct sk_buff *skb)
1520 {
1521 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1522 		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1523 		skb->dev ? skb->dev->name : "?");
1524 	kfree_skb(skb);
1525 	return 0;
1526 }
1527 
1528 /*
1529    We do not cache the source address of the outgoing interface,
1530    because it is used only by the IP RR, TS and SRR options,
1531    so it is out of the fast path.
1532 
1533    BTW remember: "addr" is allowed to be unaligned
1534    in IP options!
1535  */
1536 
1537 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1538 {
1539 	__be32 src;
1540 	struct fib_result res;
1541 
1542 	if (rt->fl.iif == 0)
1543 		src = rt->rt_src;
1544 	else if (fib_lookup(&rt->fl, &res) == 0) {
1545 		src = FIB_RES_PREFSRC(res);
1546 		fib_res_put(&res);
1547 	} else
1548 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1549 					RT_SCOPE_UNIVERSE);
1550 	memcpy(addr, &src, 4);
1551 }
1552 
1553 #ifdef CONFIG_NET_CLS_ROUTE
1554 static void set_class_tag(struct rtable *rt, u32 tag)
1555 {
1556 	if (!(rt->u.dst.tclassid & 0xFFFF))
1557 		rt->u.dst.tclassid |= tag & 0xFFFF;
1558 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1559 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1560 }
1561 #endif
1562 
1563 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1564 {
1565 	struct fib_info *fi = res->fi;
1566 
1567 	if (fi) {
1568 		if (FIB_RES_GW(*res) &&
1569 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1570 			rt->rt_gateway = FIB_RES_GW(*res);
1571 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1572 		       sizeof(rt->u.dst.metrics));
1573 		if (fi->fib_mtu == 0) {
1574 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1575 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1576 			    rt->rt_gateway != rt->rt_dst &&
1577 			    rt->u.dst.dev->mtu > 576)
1578 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1579 		}
1580 #ifdef CONFIG_NET_CLS_ROUTE
1581 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1582 #endif
1583 	} else
1584 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1585 
1586 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1587 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1588 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1589 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1590 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1591 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1592 				       ip_rt_min_advmss);
1593 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1594 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1595 
1596 #ifdef CONFIG_NET_CLS_ROUTE
1597 #ifdef CONFIG_IP_MULTIPLE_TABLES
1598 	set_class_tag(rt, fib_rules_tclass(res));
1599 #endif
1600 	set_class_tag(rt, itag);
1601 #endif
1602 	rt->rt_type = res->type;
1603 }
1604 
1605 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1606 				u8 tos, struct net_device *dev, int our)
1607 {
1608 	unsigned hash;
1609 	struct rtable *rth;
1610 	__be32 spec_dst;
1611 	struct in_device *in_dev = in_dev_get(dev);
1612 	u32 itag = 0;
1613 
1614 	/* Primary sanity checks. */
1615 
1616 	if (in_dev == NULL)
1617 		return -EINVAL;
1618 
1619 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1620 	    skb->protocol != htons(ETH_P_IP))
1621 		goto e_inval;
1622 
1623 	if (ZERONET(saddr)) {
1624 		if (!LOCAL_MCAST(daddr))
1625 			goto e_inval;
1626 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1627 	} else if (fib_validate_source(saddr, 0, tos, 0,
1628 					dev, &spec_dst, &itag) < 0)
1629 		goto e_inval;
1630 
1631 	rth = dst_alloc(&ipv4_dst_ops);
1632 	if (!rth)
1633 		goto e_nobufs;
1634 
1635 	rth->u.dst.output= ip_rt_bug;
1636 
1637 	atomic_set(&rth->u.dst.__refcnt, 1);
1638 	rth->u.dst.flags= DST_HOST;
1639 	if (in_dev->cnf.no_policy)
1640 		rth->u.dst.flags |= DST_NOPOLICY;
1641 	rth->fl.fl4_dst	= daddr;
1642 	rth->rt_dst	= daddr;
1643 	rth->fl.fl4_tos	= tos;
1644 	rth->fl.mark    = skb->mark;
1645 	rth->fl.fl4_src	= saddr;
1646 	rth->rt_src	= saddr;
1647 #ifdef CONFIG_NET_CLS_ROUTE
1648 	rth->u.dst.tclassid = itag;
1649 #endif
1650 	rth->rt_iif	=
1651 	rth->fl.iif	= dev->ifindex;
1652 	rth->u.dst.dev	= &loopback_dev;
1653 	dev_hold(rth->u.dst.dev);
1654 	rth->idev	= in_dev_get(rth->u.dst.dev);
1655 	rth->fl.oif	= 0;
1656 	rth->rt_gateway	= daddr;
1657 	rth->rt_spec_dst= spec_dst;
1658 	rth->rt_type	= RTN_MULTICAST;
1659 	rth->rt_flags	= RTCF_MULTICAST;
1660 	if (our) {
1661 		rth->u.dst.input= ip_local_deliver;
1662 		rth->rt_flags |= RTCF_LOCAL;
1663 	}
1664 
1665 #ifdef CONFIG_IP_MROUTE
1666 	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1667 		rth->u.dst.input = ip_mr_input;
1668 #endif
1669 	RT_CACHE_STAT_INC(in_slow_mc);
1670 
1671 	in_dev_put(in_dev);
1672 	hash = rt_hash(daddr, saddr, dev->ifindex);
1673 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1674 
1675 e_nobufs:
1676 	in_dev_put(in_dev);
1677 	return -ENOBUFS;
1678 
1679 e_inval:
1680 	in_dev_put(in_dev);
1681 	return -EINVAL;
1682 }
1683 
1684 
1685 static void ip_handle_martian_source(struct net_device *dev,
1686 				     struct in_device *in_dev,
1687 				     struct sk_buff *skb,
1688 				     __be32 daddr,
1689 				     __be32 saddr)
1690 {
1691 	RT_CACHE_STAT_INC(in_martian_src);
1692 #ifdef CONFIG_IP_ROUTE_VERBOSE
1693 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1694 		/*
1695 		 *	RFC1812 recommendation: if the source is martian,
1696 		 *	the only hint is the MAC header.
1697 		 */
1698 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1699 			"%u.%u.%u.%u, on dev %s\n",
1700 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1701 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1702 			int i;
1703 			const unsigned char *p = skb_mac_header(skb);
1704 			printk(KERN_WARNING "ll header: ");
1705 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1706 				printk("%02x", *p);
1707 				if (i < (dev->hard_header_len - 1))
1708 					printk(":");
1709 			}
1710 			printk("\n");
1711 		}
1712 	}
1713 #endif
1714 }
1715 
1716 static inline int __mkroute_input(struct sk_buff *skb,
1717 				  struct fib_result* res,
1718 				  struct in_device *in_dev,
1719 				  __be32 daddr, __be32 saddr, u32 tos,
1720 				  struct rtable **result)
1721 {
1722 
1723 	struct rtable *rth;
1724 	int err;
1725 	struct in_device *out_dev;
1726 	unsigned flags = 0;
1727 	__be32 spec_dst;
1728 	u32 itag;
1729 
1730 	/* get a working reference to the output device */
1731 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1732 	if (out_dev == NULL) {
1733 		if (net_ratelimit())
1734 			printk(KERN_CRIT "Bug in ip_route_input" \
1735 			       "_slow(). Please, report\n");
1736 		return -EINVAL;
1737 	}
1738 
1739 
1740 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1741 				  in_dev->dev, &spec_dst, &itag);
1742 	if (err < 0) {
1743 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1744 					 saddr);
1745 
1746 		err = -EINVAL;
1747 		goto cleanup;
1748 	}
1749 
1750 	if (err)
1751 		flags |= RTCF_DIRECTSRC;
1752 
1753 	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1754 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1755 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1756 		flags |= RTCF_DOREDIRECT;
1757 
1758 	if (skb->protocol != htons(ETH_P_IP)) {
1759 		/* Not IP (i.e. ARP). Do not create a route if it is
1760 		 * invalid for proxy ARP. DNAT routes are always valid.
1761 		 */
1762 		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1763 			err = -EINVAL;
1764 			goto cleanup;
1765 		}
1766 	}
1767 
1768 
1769 	rth = dst_alloc(&ipv4_dst_ops);
1770 	if (!rth) {
1771 		err = -ENOBUFS;
1772 		goto cleanup;
1773 	}
1774 
1775 	atomic_set(&rth->u.dst.__refcnt, 1);
1776 	rth->u.dst.flags= DST_HOST;
1777 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1778 	if (res->fi->fib_nhs > 1)
1779 		rth->u.dst.flags |= DST_BALANCED;
1780 #endif
1781 	if (in_dev->cnf.no_policy)
1782 		rth->u.dst.flags |= DST_NOPOLICY;
1783 	if (out_dev->cnf.no_xfrm)
1784 		rth->u.dst.flags |= DST_NOXFRM;
1785 	rth->fl.fl4_dst	= daddr;
1786 	rth->rt_dst	= daddr;
1787 	rth->fl.fl4_tos	= tos;
1788 	rth->fl.mark    = skb->mark;
1789 	rth->fl.fl4_src	= saddr;
1790 	rth->rt_src	= saddr;
1791 	rth->rt_gateway	= daddr;
1792 	rth->rt_iif 	=
1793 		rth->fl.iif	= in_dev->dev->ifindex;
1794 	rth->u.dst.dev	= (out_dev)->dev;
1795 	dev_hold(rth->u.dst.dev);
1796 	rth->idev	= in_dev_get(rth->u.dst.dev);
1797 	rth->fl.oif 	= 0;
1798 	rth->rt_spec_dst= spec_dst;
1799 
1800 	rth->u.dst.input = ip_forward;
1801 	rth->u.dst.output = ip_output;
1802 
1803 	rt_set_nexthop(rth, res, itag);
1804 
1805 	rth->rt_flags = flags;
1806 
1807 	*result = rth;
1808 	err = 0;
1809  cleanup:
1810 	/* release the working reference to the output device */
1811 	in_dev_put(out_dev);
1812 	return err;
1813 }
1814 
1815 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1816 				       struct fib_result* res,
1817 				       const struct flowi *fl,
1818 				       struct in_device *in_dev,
1819 				       __be32 daddr, __be32 saddr, u32 tos)
1820 {
1821 	struct rtable* rth = NULL;
1822 	int err;
1823 	unsigned hash;
1824 
1825 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1826 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1827 		fib_select_multipath(fl, res);
1828 #endif
1829 
1830 	/* create a routing cache entry */
1831 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1832 	if (err)
1833 		return err;
1834 
1835 	/* put it into the cache */
1836 	hash = rt_hash(daddr, saddr, fl->iif);
1837 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1838 }
1839 
1840 static inline int ip_mkroute_input(struct sk_buff *skb,
1841 				   struct fib_result* res,
1842 				   const struct flowi *fl,
1843 				   struct in_device *in_dev,
1844 				   __be32 daddr, __be32 saddr, u32 tos)
1845 {
1846 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1847 	struct rtable* rth = NULL, *rtres;
1848 	unsigned char hop, hopcount;
1849 	int err = -EINVAL;
1850 	unsigned int hash;
1851 
1852 	if (res->fi)
1853 		hopcount = res->fi->fib_nhs;
1854 	else
1855 		hopcount = 1;
1856 
1857 	/* distinguish between multipath and singlepath */
1858 	if (hopcount < 2)
1859 		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1860 					    saddr, tos);
1861 
1862 	/* add all alternatives to the routing cache */
1863 	for (hop = 0; hop < hopcount; hop++) {
1864 		res->nh_sel = hop;
1865 
1866 		/* put reference to previous result */
1867 		if (hop)
1868 			ip_rt_put(rtres);
1869 
1870 		/* create a routing cache entry */
1871 		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1872 				      &rth);
1873 		if (err)
1874 			return err;
1875 
1876 		/* put it into the cache */
1877 		hash = rt_hash(daddr, saddr, fl->iif);
1878 		err = rt_intern_hash(hash, rth, &rtres);
1879 		if (err)
1880 			return err;
1881 
1882 		/* forward hop information to multipath impl. */
1883 		multipath_set_nhinfo(rth,
1884 				     FIB_RES_NETWORK(*res),
1885 				     FIB_RES_NETMASK(*res),
1886 				     res->prefixlen,
1887 				     &FIB_RES_NH(*res));
1888 	}
1889 	skb->dst = &rtres->u.dst;
1890 	return err;
1891 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1892 	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1893 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1894 }
1895 
1896 
1897 /*
1898  *	NOTE. We drop all packets that have local source
1899  *	addresses, because every properly looped-back packet
1900  *	must already have the correct destination attached by the output routine.
1901  *
1902  *	Such an approach solves two big problems:
1903  *	1. Non-simplex devices are handled properly.
1904  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1905  */
1906 
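/*
 * Slow path for input routing: screen out martian source/destination
 * addresses, look the flow up in the FIB and then either forward the
 * packet (ip_mkroute_input), deliver it locally, or handle it as
 * broadcast.  The resulting route is interned in the cache and attached
 * to skb->dst.
 */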
1907 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1908 			       u8 tos, struct net_device *dev)
1909 {
1910 	struct fib_result res;
1911 	struct in_device *in_dev = in_dev_get(dev);
1912 	struct flowi fl = { .nl_u = { .ip4_u =
1913 				      { .daddr = daddr,
1914 					.saddr = saddr,
1915 					.tos = tos,
1916 					.scope = RT_SCOPE_UNIVERSE,
1917 				      } },
1918 			    .mark = skb->mark,
1919 			    .iif = dev->ifindex };
1920 	unsigned	flags = 0;
1921 	u32		itag = 0;
1922 	struct rtable * rth;
1923 	unsigned	hash;
1924 	__be32		spec_dst;
1925 	int		err = -EINVAL;
1926 	int		free_res = 0;
1927 
1928 	/* IP on this device is disabled. */
1929 
1930 	if (!in_dev)
1931 		goto out;
1932 
1933 	/* Check for the weirdest martians, which cannot be detected
1934 	   by fib_lookup.
1935 	 */
1936 
1937 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1938 		goto martian_source;
1939 
1940 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1941 		goto brd_input;
1942 
1943 	/* Accept zero source addresses only for limited broadcast;
1944 	 * it is not clear whether this should be fixed. Waiting for complaints :-)
1945 	 */
1946 	if (ZERONET(saddr))
1947 		goto martian_source;
1948 
1949 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1950 		goto martian_destination;
1951 
1952 	/*
1953 	 *	Now we are ready to route packet.
1954 	 */
1955 	if ((err = fib_lookup(&fl, &res)) != 0) {
1956 		if (!IN_DEV_FORWARD(in_dev))
1957 			goto e_hostunreach;
1958 		goto no_route;
1959 	}
1960 	free_res = 1;
1961 
1962 	RT_CACHE_STAT_INC(in_slow_tot);
1963 
1964 	if (res.type == RTN_BROADCAST)
1965 		goto brd_input;
1966 
1967 	if (res.type == RTN_LOCAL) {
1968 		int result;
1969 		result = fib_validate_source(saddr, daddr, tos,
1970 					     loopback_dev.ifindex,
1971 					     dev, &spec_dst, &itag);
1972 		if (result < 0)
1973 			goto martian_source;
1974 		if (result)
1975 			flags |= RTCF_DIRECTSRC;
1976 		spec_dst = daddr;
1977 		goto local_input;
1978 	}
1979 
1980 	if (!IN_DEV_FORWARD(in_dev))
1981 		goto e_hostunreach;
1982 	if (res.type != RTN_UNICAST)
1983 		goto martian_destination;
1984 
1985 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1986 	if (err == -ENOBUFS)
1987 		goto e_nobufs;
1988 	if (err == -EINVAL)
1989 		goto e_inval;
1990 
1991 done:
1992 	in_dev_put(in_dev);
1993 	if (free_res)
1994 		fib_res_put(&res);
1995 out:	return err;
1996 
1997 brd_input:
1998 	if (skb->protocol != htons(ETH_P_IP))
1999 		goto e_inval;
2000 
2001 	if (ZERONET(saddr))
2002 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2003 	else {
2004 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2005 					  &itag);
2006 		if (err < 0)
2007 			goto martian_source;
2008 		if (err)
2009 			flags |= RTCF_DIRECTSRC;
2010 	}
2011 	flags |= RTCF_BROADCAST;
2012 	res.type = RTN_BROADCAST;
2013 	RT_CACHE_STAT_INC(in_brd);
2014 
2015 local_input:
2016 	rth = dst_alloc(&ipv4_dst_ops);
2017 	if (!rth)
2018 		goto e_nobufs;
2019 
2020 	rth->u.dst.output= ip_rt_bug;
2021 
2022 	atomic_set(&rth->u.dst.__refcnt, 1);
2023 	rth->u.dst.flags= DST_HOST;
2024 	if (in_dev->cnf.no_policy)
2025 		rth->u.dst.flags |= DST_NOPOLICY;
2026 	rth->fl.fl4_dst	= daddr;
2027 	rth->rt_dst	= daddr;
2028 	rth->fl.fl4_tos	= tos;
2029 	rth->fl.mark    = skb->mark;
2030 	rth->fl.fl4_src	= saddr;
2031 	rth->rt_src	= saddr;
2032 #ifdef CONFIG_NET_CLS_ROUTE
2033 	rth->u.dst.tclassid = itag;
2034 #endif
2035 	rth->rt_iif	=
2036 	rth->fl.iif	= dev->ifindex;
2037 	rth->u.dst.dev	= &loopback_dev;
2038 	dev_hold(rth->u.dst.dev);
2039 	rth->idev	= in_dev_get(rth->u.dst.dev);
2040 	rth->rt_gateway	= daddr;
2041 	rth->rt_spec_dst= spec_dst;
2042 	rth->u.dst.input= ip_local_deliver;
2043 	rth->rt_flags 	= flags|RTCF_LOCAL;
2044 	if (res.type == RTN_UNREACHABLE) {
2045 		rth->u.dst.input= ip_error;
2046 		rth->u.dst.error= -err;
2047 		rth->rt_flags 	&= ~RTCF_LOCAL;
2048 	}
2049 	rth->rt_type	= res.type;
2050 	hash = rt_hash(daddr, saddr, fl.iif);
2051 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2052 	goto done;
2053 
2054 no_route:
2055 	RT_CACHE_STAT_INC(in_no_route);
2056 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2057 	res.type = RTN_UNREACHABLE;
2058 	goto local_input;
2059 
2060 	/*
2061 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2062 	 */
2063 martian_destination:
2064 	RT_CACHE_STAT_INC(in_martian_dst);
2065 #ifdef CONFIG_IP_ROUTE_VERBOSE
2066 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2067 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2068 			"%u.%u.%u.%u, dev %s\n",
2069 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2070 #endif
2071 
2072 e_hostunreach:
2073 	err = -EHOSTUNREACH;
2074 	goto done;
2075 
2076 e_inval:
2077 	err = -EINVAL;
2078 	goto done;
2079 
2080 e_nobufs:
2081 	err = -ENOBUFS;
2082 	goto done;
2083 
2084 martian_source:
2085 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2086 	goto e_inval;
2087 }
2088 
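/*
 * Fast path for input routing: look the flow up in the route cache under
 * RCU and fall back to the multicast handler or ip_route_input_slow() on
 * a miss.  A minimal caller sketch (an illustration only, assuming the IP
 * header and skb->dev have already been set up):
 *
 *	if (ip_route_input(skb, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr,
 *			   ip_hdr(skb)->tos, skb->dev) == 0)
 *		return dst_input(skb);
 *
 * On success skb->dst holds the cached or newly built route.
 */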
2089 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2090 		   u8 tos, struct net_device *dev)
2091 {
2092 	struct rtable * rth;
2093 	unsigned	hash;
2094 	int iif = dev->ifindex;
2095 
2096 	tos &= IPTOS_RT_MASK;
2097 	hash = rt_hash(daddr, saddr, iif);
2098 
2099 	rcu_read_lock();
2100 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2101 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2102 		if (rth->fl.fl4_dst == daddr &&
2103 		    rth->fl.fl4_src == saddr &&
2104 		    rth->fl.iif == iif &&
2105 		    rth->fl.oif == 0 &&
2106 		    rth->fl.mark == skb->mark &&
2107 		    rth->fl.fl4_tos == tos) {
2108 			rth->u.dst.lastuse = jiffies;
2109 			dst_hold(&rth->u.dst);
2110 			rth->u.dst.__use++;
2111 			RT_CACHE_STAT_INC(in_hit);
2112 			rcu_read_unlock();
2113 			skb->dst = (struct dst_entry*)rth;
2114 			return 0;
2115 		}
2116 		RT_CACHE_STAT_INC(in_hlist_search);
2117 	}
2118 	rcu_read_unlock();
2119 
2120 	/* Multicast recognition logic was moved from the route cache to here.
2121 	   The problem was that too many Ethernet cards have broken/missing
2122 	   hardware multicast filters :-( As a result, a host on a multicast
2123 	   network acquires a lot of useless route cache entries, e.g. for
2124 	   SDR messages from all over the world. Now we try to get rid of them.
2125 	   Provided the software IP multicast filter is organized reasonably
2126 	   (at least, hashed), this does not result in a slowdown compared
2127 	   with route cache reject entries.
2128 	   Note that multicast routers are not affected, because a
2129 	   route cache entry is created eventually.
2130 	 */
2131 	if (MULTICAST(daddr)) {
2132 		struct in_device *in_dev;
2133 
2134 		rcu_read_lock();
2135 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2136 			int our = ip_check_mc(in_dev, daddr, saddr,
2137 				ip_hdr(skb)->protocol);
2138 			if (our
2139 #ifdef CONFIG_IP_MROUTE
2140 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2141 #endif
2142 			    ) {
2143 				rcu_read_unlock();
2144 				return ip_route_input_mc(skb, daddr, saddr,
2145 							 tos, dev, our);
2146 			}
2147 		}
2148 		rcu_read_unlock();
2149 		return -EINVAL;
2150 	}
2151 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2152 }
2153 
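/*
 * Build one route cache entry for the output path: classify the
 * destination (broadcast/multicast/unicast), take a working reference to
 * the output inet device, fill in the rtable from the flow keys and wire
 * up the dst input/output handlers (ip_output, ip_local_deliver,
 * ip_mc_output or ip_mr_input as appropriate).
 */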
2154 static inline int __mkroute_output(struct rtable **result,
2155 				   struct fib_result* res,
2156 				   const struct flowi *fl,
2157 				   const struct flowi *oldflp,
2158 				   struct net_device *dev_out,
2159 				   unsigned flags)
2160 {
2161 	struct rtable *rth;
2162 	struct in_device *in_dev;
2163 	u32 tos = RT_FL_TOS(oldflp);
2164 	int err = 0;
2165 
2166 	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2167 		return -EINVAL;
2168 
2169 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2170 		res->type = RTN_BROADCAST;
2171 	else if (MULTICAST(fl->fl4_dst))
2172 		res->type = RTN_MULTICAST;
2173 	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2174 		return -EINVAL;
2175 
2176 	if (dev_out->flags & IFF_LOOPBACK)
2177 		flags |= RTCF_LOCAL;
2178 
2179 	/* get work reference to inet device */
2180 	in_dev = in_dev_get(dev_out);
2181 	if (!in_dev)
2182 		return -EINVAL;
2183 
2184 	if (res->type == RTN_BROADCAST) {
2185 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2186 		if (res->fi) {
2187 			fib_info_put(res->fi);
2188 			res->fi = NULL;
2189 		}
2190 	} else if (res->type == RTN_MULTICAST) {
2191 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2192 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2193 				 oldflp->proto))
2194 			flags &= ~RTCF_LOCAL;
2195 		/* If a multicast route does not exist, use
2196 		   the default one, but do not gateway in this case.
2197 		   Yes, it is a hack.
2198 		 */
2199 		if (res->fi && res->prefixlen < 4) {
2200 			fib_info_put(res->fi);
2201 			res->fi = NULL;
2202 		}
2203 	}
2204 
2205 
2206 	rth = dst_alloc(&ipv4_dst_ops);
2207 	if (!rth) {
2208 		err = -ENOBUFS;
2209 		goto cleanup;
2210 	}
2211 
2212 	atomic_set(&rth->u.dst.__refcnt, 1);
2213 	rth->u.dst.flags= DST_HOST;
2214 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2215 	if (res->fi) {
2216 		rth->rt_multipath_alg = res->fi->fib_mp_alg;
2217 		if (res->fi->fib_nhs > 1)
2218 			rth->u.dst.flags |= DST_BALANCED;
2219 	}
2220 #endif
2221 	if (in_dev->cnf.no_xfrm)
2222 		rth->u.dst.flags |= DST_NOXFRM;
2223 	if (in_dev->cnf.no_policy)
2224 		rth->u.dst.flags |= DST_NOPOLICY;
2225 
2226 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2227 	rth->fl.fl4_tos	= tos;
2228 	rth->fl.fl4_src	= oldflp->fl4_src;
2229 	rth->fl.oif	= oldflp->oif;
2230 	rth->fl.mark    = oldflp->mark;
2231 	rth->rt_dst	= fl->fl4_dst;
2232 	rth->rt_src	= fl->fl4_src;
2233 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2234 	/* get references to the devices that are to be held by the routing
2235 	   cache entry */
2236 	rth->u.dst.dev	= dev_out;
2237 	dev_hold(dev_out);
2238 	rth->idev	= in_dev_get(dev_out);
2239 	rth->rt_gateway = fl->fl4_dst;
2240 	rth->rt_spec_dst= fl->fl4_src;
2241 
2242 	rth->u.dst.output = ip_output;
2243 
2244 	RT_CACHE_STAT_INC(out_slow_tot);
2245 
2246 	if (flags & RTCF_LOCAL) {
2247 		rth->u.dst.input = ip_local_deliver;
2248 		rth->rt_spec_dst = fl->fl4_dst;
2249 	}
2250 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2251 		rth->rt_spec_dst = fl->fl4_src;
2252 		if (flags & RTCF_LOCAL &&
2253 		    !(dev_out->flags & IFF_LOOPBACK)) {
2254 			rth->u.dst.output = ip_mc_output;
2255 			RT_CACHE_STAT_INC(out_slow_mc);
2256 		}
2257 #ifdef CONFIG_IP_MROUTE
2258 		if (res->type == RTN_MULTICAST) {
2259 			if (IN_DEV_MFORWARD(in_dev) &&
2260 			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2261 				rth->u.dst.input = ip_mr_input;
2262 				rth->u.dst.output = ip_mc_output;
2263 			}
2264 		}
2265 #endif
2266 	}
2267 
2268 	rt_set_nexthop(rth, res, 0);
2269 
2270 	rth->rt_flags = flags;
2271 
2272 	*result = rth;
2273  cleanup:
2274 	/* release work reference to inet device */
2275 	in_dev_put(in_dev);
2276 
2277 	return err;
2278 }
2279 
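/*
 * Default output-route construction: build a single entry with
 * __mkroute_output() and intern it under the (daddr, saddr, oif) key of
 * the original flow.
 */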
2280 static inline int ip_mkroute_output_def(struct rtable **rp,
2281 					struct fib_result* res,
2282 					const struct flowi *fl,
2283 					const struct flowi *oldflp,
2284 					struct net_device *dev_out,
2285 					unsigned flags)
2286 {
2287 	struct rtable *rth = NULL;
2288 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2289 	unsigned hash;
2290 	if (err == 0) {
2291 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2292 		err = rt_intern_hash(hash, rth, rp);
2293 	}
2294 
2295 	return err;
2296 }
2297 
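/*
 * With CONFIG_IP_ROUTE_MULTIPATH_CACHED each next hop of a multipath FIB
 * entry is turned into its own cache entry (mirroring the input side);
 * otherwise this defers to ip_mkroute_output_def().
 */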
2298 static inline int ip_mkroute_output(struct rtable** rp,
2299 				    struct fib_result* res,
2300 				    const struct flowi *fl,
2301 				    const struct flowi *oldflp,
2302 				    struct net_device *dev_out,
2303 				    unsigned flags)
2304 {
2305 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2306 	unsigned char hop;
2307 	unsigned hash;
2308 	int err = -EINVAL;
2309 	struct rtable *rth = NULL;
2310 
2311 	if (res->fi && res->fi->fib_nhs > 1) {
2312 		unsigned char hopcount = res->fi->fib_nhs;
2313 
2314 		for (hop = 0; hop < hopcount; hop++) {
2315 			struct net_device *dev2nexthop;
2316 
2317 			res->nh_sel = hop;
2318 
2319 			/* hold a work reference to the output device */
2320 			dev2nexthop = FIB_RES_DEV(*res);
2321 			dev_hold(dev2nexthop);
2322 
2323 			/* put reference to previous result */
2324 			if (hop)
2325 				ip_rt_put(*rp);
2326 
2327 			err = __mkroute_output(&rth, res, fl, oldflp,
2328 					       dev2nexthop, flags);
2329 
2330 			if (err != 0)
2331 				goto cleanup;
2332 
2333 			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
2334 					oldflp->oif);
2335 			err = rt_intern_hash(hash, rth, rp);
2336 
2337 			/* forward hop information to multipath impl. */
2338 			multipath_set_nhinfo(rth,
2339 					     FIB_RES_NETWORK(*res),
2340 					     FIB_RES_NETMASK(*res),
2341 					     res->prefixlen,
2342 					     &FIB_RES_NH(*res));
2343 		cleanup:
2344 			/* release work reference to output device */
2345 			dev_put(dev2nexthop);
2346 
2347 			if (err != 0)
2348 				return err;
2349 		}
2350 		return err;
2351 	} else {
2352 		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2353 					     flags);
2354 	}
2355 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2356 	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2357 #endif
2358 }
2359 
2360 /*
2361  * Major route resolver routine.
2362  */
2363 
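/*
 * Roughly: validate and resolve the requested source address, honour an
 * explicit output interface, short-circuit local/loopback destinations,
 * consult the FIB, pick a multipath next hop or the default route when
 * needed, and finally build and cache the route via ip_mkroute_output().
 */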
2364 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2365 {
2366 	u32 tos	= RT_FL_TOS(oldflp);
2367 	struct flowi fl = { .nl_u = { .ip4_u =
2368 				      { .daddr = oldflp->fl4_dst,
2369 					.saddr = oldflp->fl4_src,
2370 					.tos = tos & IPTOS_RT_MASK,
2371 					.scope = ((tos & RTO_ONLINK) ?
2372 						  RT_SCOPE_LINK :
2373 						  RT_SCOPE_UNIVERSE),
2374 				      } },
2375 			    .mark = oldflp->mark,
2376 			    .iif = loopback_dev.ifindex,
2377 			    .oif = oldflp->oif };
2378 	struct fib_result res;
2379 	unsigned flags = 0;
2380 	struct net_device *dev_out = NULL;
2381 	int free_res = 0;
2382 	int err;
2383 
2384 
2385 	res.fi		= NULL;
2386 #ifdef CONFIG_IP_MULTIPLE_TABLES
2387 	res.r		= NULL;
2388 #endif
2389 
2390 	if (oldflp->fl4_src) {
2391 		err = -EINVAL;
2392 		if (MULTICAST(oldflp->fl4_src) ||
2393 		    BADCLASS(oldflp->fl4_src) ||
2394 		    ZERONET(oldflp->fl4_src))
2395 			goto out;
2396 
2397 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2398 		dev_out = ip_dev_find(oldflp->fl4_src);
2399 		if ((dev_out == NULL) && !(sysctl_ip_nonlocal_bind))
2400 			goto out;
2401 
2402 		/* I removed the check for oif == dev_out->oif here.
2403 		   It was wrong for two reasons:
2404 		   1. ip_dev_find(saddr) can return the wrong iface if saddr
2405 		      is assigned to multiple interfaces.
2406 		   2. Moreover, we are allowed to send packets with the saddr
2407 		      of another iface. --ANK
2408 		 */
2409 
2410 		if (dev_out && oldflp->oif == 0
2411 		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2412 			/* Special hack: the user can direct multicasts
2413 			   and limited broadcast via the necessary interface
2414 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2415 			   This hack is not just for fun, it allows
2416 			   vic, vat and friends to work.
2417 			   They bind the socket to loopback, set the ttl to zero
2418 			   and expect that it will work.
2419 			   From the viewpoint of the routing cache they are broken,
2420 			   because we are not allowed to build a multicast path
2421 			   with a loopback source addr (the routing cache
2422 			   cannot know that the ttl is zero, so the packet
2423 			   will not leave this host and the route is valid).
2424 			   Luckily, this hack is a good workaround.
2425 			 */
2426 
2427 			fl.oif = dev_out->ifindex;
2428 			goto make_route;
2429 		}
2430 		if (dev_out)
2431 			dev_put(dev_out);
2432 		dev_out = NULL;
2433 	}
2434 
2435 
2436 	if (oldflp->oif) {
2437 		dev_out = dev_get_by_index(oldflp->oif);
2438 		err = -ENODEV;
2439 		if (dev_out == NULL)
2440 			goto out;
2441 
2442 		/* RACE: Check return value of inet_select_addr instead. */
2443 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2444 			dev_put(dev_out);
2445 			goto out;	/* Wrong error code */
2446 		}
2447 
2448 		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2449 			if (!fl.fl4_src)
2450 				fl.fl4_src = inet_select_addr(dev_out, 0,
2451 							      RT_SCOPE_LINK);
2452 			goto make_route;
2453 		}
2454 		if (!fl.fl4_src) {
2455 			if (MULTICAST(oldflp->fl4_dst))
2456 				fl.fl4_src = inet_select_addr(dev_out, 0,
2457 							      fl.fl4_scope);
2458 			else if (!oldflp->fl4_dst)
2459 				fl.fl4_src = inet_select_addr(dev_out, 0,
2460 							      RT_SCOPE_HOST);
2461 		}
2462 	}
2463 
2464 	if (!fl.fl4_dst) {
2465 		fl.fl4_dst = fl.fl4_src;
2466 		if (!fl.fl4_dst)
2467 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2468 		if (dev_out)
2469 			dev_put(dev_out);
2470 		dev_out = &loopback_dev;
2471 		dev_hold(dev_out);
2472 		fl.oif = loopback_dev.ifindex;
2473 		res.type = RTN_LOCAL;
2474 		flags |= RTCF_LOCAL;
2475 		goto make_route;
2476 	}
2477 
2478 	if (fib_lookup(&fl, &res)) {
2479 		res.fi = NULL;
2480 		if (oldflp->oif) {
2481 			/* Apparently, the routing tables are wrong. Assume
2482 			   that the destination is on link.
2483 
2484 			   WHY? DW.
2485 			   Because we are allowed to send to an iface
2486 			   even if it has NO routes and NO assigned
2487 			   addresses. When oif is specified, the routing
2488 			   tables are looked up with only one purpose:
2489 			   to catch whether the destination is gatewayed,
2490 			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2491 			   we send the packet, ignoring both the routing tables
2492 			   and the ifaddr state. --ANK
2493 
2494 
2495 			   We could do this even if oif is unknown
2496 			   (IPv6 likely does), but we do not.
2497 			 */
2498 
2499 			if (fl.fl4_src == 0)
2500 				fl.fl4_src = inet_select_addr(dev_out, 0,
2501 							      RT_SCOPE_LINK);
2502 			res.type = RTN_UNICAST;
2503 			goto make_route;
2504 		}
2505 		if (dev_out)
2506 			dev_put(dev_out);
2507 		err = -ENETUNREACH;
2508 		goto out;
2509 	}
2510 	free_res = 1;
2511 
2512 	if (res.type == RTN_LOCAL) {
2513 		if (!fl.fl4_src)
2514 			fl.fl4_src = fl.fl4_dst;
2515 		if (dev_out)
2516 			dev_put(dev_out);
2517 		dev_out = &loopback_dev;
2518 		dev_hold(dev_out);
2519 		fl.oif = dev_out->ifindex;
2520 		if (res.fi)
2521 			fib_info_put(res.fi);
2522 		res.fi = NULL;
2523 		flags |= RTCF_LOCAL;
2524 		goto make_route;
2525 	}
2526 
2527 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2528 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2529 		fib_select_multipath(&fl, &res);
2530 	else
2531 #endif
2532 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2533 		fib_select_default(&fl, &res);
2534 
2535 	if (!fl.fl4_src)
2536 		fl.fl4_src = FIB_RES_PREFSRC(res);
2537 
2538 	if (dev_out)
2539 		dev_put(dev_out);
2540 	dev_out = FIB_RES_DEV(res);
2541 	dev_hold(dev_out);
2542 	fl.oif = dev_out->ifindex;
2543 
2544 
2545 make_route:
2546 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2547 
2548 
2549 	if (free_res)
2550 		fib_res_put(&res);
2551 	if (dev_out)
2552 		dev_put(dev_out);
2553 out:	return err;
2554 }
2555 
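/*
 * Output-route cache lookup (BH-safe RCU walk of the hash chain); on a
 * miss the route is resolved by ip_route_output_slow().  For cached
 * multipath routes, multipath_select_route() may substitute one of the
 * alternative entries.
 */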
2556 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2557 {
2558 	unsigned hash;
2559 	struct rtable *rth;
2560 
2561 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2562 
2563 	rcu_read_lock_bh();
2564 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2565 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2566 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2567 		    rth->fl.fl4_src == flp->fl4_src &&
2568 		    rth->fl.iif == 0 &&
2569 		    rth->fl.oif == flp->oif &&
2570 		    rth->fl.mark == flp->mark &&
2571 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2572 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2573 
2574 			/* check for multipath routes and choose one if
2575 			 * necessary
2576 			 */
2577 			if (multipath_select_route(flp, rth, rp)) {
2578 				dst_hold(&(*rp)->u.dst);
2579 				RT_CACHE_STAT_INC(out_hit);
2580 				rcu_read_unlock_bh();
2581 				return 0;
2582 			}
2583 
2584 			rth->u.dst.lastuse = jiffies;
2585 			dst_hold(&rth->u.dst);
2586 			rth->u.dst.__use++;
2587 			RT_CACHE_STAT_INC(out_hit);
2588 			rcu_read_unlock_bh();
2589 			*rp = rth;
2590 			return 0;
2591 		}
2592 		RT_CACHE_STAT_INC(out_hlist_search);
2593 	}
2594 	rcu_read_unlock_bh();
2595 
2596 	return ip_route_output_slow(rp, flp);
2597 }
2598 
2599 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2600 
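/*
 * Resolve an output route and, when a transport protocol is given in the
 * flow, pass the result through xfrm_lookup() so that IPsec policy can be
 * applied.  A minimal usage sketch (an illustration only; dst_ip is a
 * placeholder address, a socket may be passed instead of NULL, and error
 * handling is elided):
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst_ip } },
 *			    .proto = IPPROTO_UDP };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_flow(&rt, &fl, NULL, 0) == 0) {
 *		... transmit via rt->u.dst ...
 *		ip_rt_put(rt);
 *	}
 */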
2601 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2602 {
2603 	int err;
2604 
2605 	if ((err = __ip_route_output_key(rp, flp)) != 0)
2606 		return err;
2607 
2608 	if (flp->proto) {
2609 		if (!flp->fl4_src)
2610 			flp->fl4_src = (*rp)->rt_src;
2611 		if (!flp->fl4_dst)
2612 			flp->fl4_dst = (*rp)->rt_dst;
2613 		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2614 	}
2615 
2616 	return 0;
2617 }
2618 
2619 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2620 
2621 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2622 {
2623 	return ip_route_output_flow(rp, flp, NULL, 0);
2624 }
2625 
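/*
 * Serialize one cached route (taken from skb->dst) into an rtnetlink
 * message: an rtmsg header followed by RTA_* attributes such as the
 * destination, source, output interface, gateway, metrics and cache
 * info.  Used both for RTM_GETROUTE replies and for cache dumps.
 */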
2626 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2627 			int nowait, unsigned int flags)
2628 {
2629 	struct rtable *rt = (struct rtable*)skb->dst;
2630 	struct rtmsg *r;
2631 	struct nlmsghdr *nlh;
2632 	long expires;
2633 	u32 id = 0, ts = 0, tsage = 0, error;
2634 
2635 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2636 	if (nlh == NULL)
2637 		return -EMSGSIZE;
2638 
2639 	r = nlmsg_data(nlh);
2640 	r->rtm_family	 = AF_INET;
2641 	r->rtm_dst_len	= 32;
2642 	r->rtm_src_len	= 0;
2643 	r->rtm_tos	= rt->fl.fl4_tos;
2644 	r->rtm_table	= RT_TABLE_MAIN;
2645 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2646 	r->rtm_type	= rt->rt_type;
2647 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2648 	r->rtm_protocol = RTPROT_UNSPEC;
2649 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2650 	if (rt->rt_flags & RTCF_NOTIFY)
2651 		r->rtm_flags |= RTM_F_NOTIFY;
2652 
2653 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2654 
2655 	if (rt->fl.fl4_src) {
2656 		r->rtm_src_len = 32;
2657 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2658 	}
2659 	if (rt->u.dst.dev)
2660 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2661 #ifdef CONFIG_NET_CLS_ROUTE
2662 	if (rt->u.dst.tclassid)
2663 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2664 #endif
2665 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2666 	if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2667 		NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2668 #endif
2669 	if (rt->fl.iif)
2670 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2671 	else if (rt->rt_src != rt->fl.fl4_src)
2672 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2673 
2674 	if (rt->rt_dst != rt->rt_gateway)
2675 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2676 
2677 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2678 		goto nla_put_failure;
2679 
2680 	error = rt->u.dst.error;
2681 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2682 	if (rt->peer) {
2683 		id = rt->peer->ip_id_count;
2684 		if (rt->peer->tcp_ts_stamp) {
2685 			ts = rt->peer->tcp_ts;
2686 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2687 		}
2688 	}
2689 
2690 	if (rt->fl.iif) {
2691 #ifdef CONFIG_IP_MROUTE
2692 		__be32 dst = rt->rt_dst;
2693 
2694 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2695 		    ipv4_devconf.mc_forwarding) {
2696 			int err = ipmr_get_route(skb, r, nowait);
2697 			if (err <= 0) {
2698 				if (!nowait) {
2699 					if (err == 0)
2700 						return 0;
2701 					goto nla_put_failure;
2702 				} else {
2703 					if (err == -EMSGSIZE)
2704 						goto nla_put_failure;
2705 					error = err;
2706 				}
2707 			}
2708 		} else
2709 #endif
2710 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2711 	}
2712 
2713 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2714 			       expires, error) < 0)
2715 		goto nla_put_failure;
2716 
2717 	return nlmsg_end(skb, nlh);
2718 
2719 nla_put_failure:
2720 	nlmsg_cancel(skb, nlh);
2721 	return -EMSGSIZE;
2722 }
2723 
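/*
 * RTM_GETROUTE handler: parse the request attributes, perform either an
 * input lookup (when RTA_IIF is given, using a dummy skb) or an output
 * lookup, fill the result in with rt_fill_info() and unicast it back to
 * the requester.
 */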
2724 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2725 {
2726 	struct rtmsg *rtm;
2727 	struct nlattr *tb[RTA_MAX+1];
2728 	struct rtable *rt = NULL;
2729 	__be32 dst = 0;
2730 	__be32 src = 0;
2731 	u32 iif;
2732 	int err;
2733 	struct sk_buff *skb;
2734 
2735 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2736 	if (err < 0)
2737 		goto errout;
2738 
2739 	rtm = nlmsg_data(nlh);
2740 
2741 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2742 	if (skb == NULL) {
2743 		err = -ENOBUFS;
2744 		goto errout;
2745 	}
2746 
2747 	/* Reserve room for dummy headers; this skb can pass
2748 	   through a good chunk of the routing engine.
2749 	 */
2750 	skb_reset_mac_header(skb);
2751 	skb_reset_network_header(skb);
2752 
2753 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2754 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2755 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2756 
2757 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2758 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2759 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2760 
2761 	if (iif) {
2762 		struct net_device *dev;
2763 
2764 		dev = __dev_get_by_index(iif);
2765 		if (dev == NULL) {
2766 			err = -ENODEV;
2767 			goto errout_free;
2768 		}
2769 
2770 		skb->protocol	= htons(ETH_P_IP);
2771 		skb->dev	= dev;
2772 		local_bh_disable();
2773 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2774 		local_bh_enable();
2775 
2776 		rt = (struct rtable*) skb->dst;
2777 		if (err == 0 && rt->u.dst.error)
2778 			err = -rt->u.dst.error;
2779 	} else {
2780 		struct flowi fl = {
2781 			.nl_u = {
2782 				.ip4_u = {
2783 					.daddr = dst,
2784 					.saddr = src,
2785 					.tos = rtm->rtm_tos,
2786 				},
2787 			},
2788 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2789 		};
2790 		err = ip_route_output_key(&rt, &fl);
2791 	}
2792 
2793 	if (err)
2794 		goto errout_free;
2795 
2796 	skb->dst = &rt->u.dst;
2797 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2798 		rt->rt_flags |= RTCF_NOTIFY;
2799 
2800 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2801 				RTM_NEWROUTE, 0, 0);
2802 	if (err <= 0)
2803 		goto errout_free;
2804 
2805 	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2806 errout:
2807 	return err;
2808 
2809 errout_free:
2810 	kfree_skb(skb);
2811 	goto errout;
2812 }
2813 
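/*
 * Netlink dump of the whole route cache: walk every hash bucket under
 * BH-safe RCU, emitting one RTM_NEWROUTE message per entry and resuming
 * from the position saved in cb->args[] on the next call.
 */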
2814 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2815 {
2816 	struct rtable *rt;
2817 	int h, s_h;
2818 	int idx, s_idx;
2819 
2820 	s_h = cb->args[0];
2821 	s_idx = idx = cb->args[1];
2822 	for (h = 0; h <= rt_hash_mask; h++) {
2823 		if (h < s_h) continue;
2824 		if (h > s_h)
2825 			s_idx = 0;
2826 		rcu_read_lock_bh();
2827 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2828 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2829 			if (idx < s_idx)
2830 				continue;
2831 			skb->dst = dst_clone(&rt->u.dst);
2832 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2833 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2834 					 1, NLM_F_MULTI) <= 0) {
2835 				dst_release(xchg(&skb->dst, NULL));
2836 				rcu_read_unlock_bh();
2837 				goto done;
2838 			}
2839 			dst_release(xchg(&skb->dst, NULL));
2840 		}
2841 		rcu_read_unlock_bh();
2842 	}
2843 
2844 done:
2845 	cb->args[0] = h;
2846 	cb->args[1] = idx;
2847 	return skb->len;
2848 }
2849 
2850 void ip_rt_multicast_event(struct in_device *in_dev)
2851 {
2852 	rt_cache_flush(0);
2853 }
2854 
2855 #ifdef CONFIG_SYSCTL
2856 static int flush_delay;
2857 
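/*
 * Writing to the write-only "flush" sysctl (e.g.
 * /proc/sys/net/ipv4/route/flush) triggers rt_cache_flush() with the
 * written value as the flush delay; reading it is not supported.
 */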
2858 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2859 					struct file *filp, void __user *buffer,
2860 					size_t *lenp, loff_t *ppos)
2861 {
2862 	if (write) {
2863 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2864 		rt_cache_flush(flush_delay);
2865 		return 0;
2866 	}
2867 
2868 	return -EINVAL;
2869 }
2870 
2871 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2872 						int __user *name,
2873 						int nlen,
2874 						void __user *oldval,
2875 						size_t __user *oldlenp,
2876 						void __user *newval,
2877 						size_t newlen)
2878 {
2879 	int delay;
2880 	if (newlen != sizeof(int))
2881 		return -EINVAL;
2882 	if (get_user(delay, (int __user *)newval))
2883 		return -EFAULT;
2884 	rt_cache_flush(delay);
2885 	return 0;
2886 }
2887 
2888 ctl_table ipv4_route_table[] = {
2889 	{
2890 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2891 		.procname	= "flush",
2892 		.data		= &flush_delay,
2893 		.maxlen		= sizeof(int),
2894 		.mode		= 0200,
2895 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2896 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2897 	},
2898 	{
2899 		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2900 		.procname	= "min_delay",
2901 		.data		= &ip_rt_min_delay,
2902 		.maxlen		= sizeof(int),
2903 		.mode		= 0644,
2904 		.proc_handler	= &proc_dointvec_jiffies,
2905 		.strategy	= &sysctl_jiffies,
2906 	},
2907 	{
2908 		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2909 		.procname	= "max_delay",
2910 		.data		= &ip_rt_max_delay,
2911 		.maxlen		= sizeof(int),
2912 		.mode		= 0644,
2913 		.proc_handler	= &proc_dointvec_jiffies,
2914 		.strategy	= &sysctl_jiffies,
2915 	},
2916 	{
2917 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2918 		.procname	= "gc_thresh",
2919 		.data		= &ipv4_dst_ops.gc_thresh,
2920 		.maxlen		= sizeof(int),
2921 		.mode		= 0644,
2922 		.proc_handler	= &proc_dointvec,
2923 	},
2924 	{
2925 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2926 		.procname	= "max_size",
2927 		.data		= &ip_rt_max_size,
2928 		.maxlen		= sizeof(int),
2929 		.mode		= 0644,
2930 		.proc_handler	= &proc_dointvec,
2931 	},
2932 	{
2933 		/* Deprecated. Use gc_min_interval_ms */
2934 
2935 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2936 		.procname	= "gc_min_interval",
2937 		.data		= &ip_rt_gc_min_interval,
2938 		.maxlen		= sizeof(int),
2939 		.mode		= 0644,
2940 		.proc_handler	= &proc_dointvec_jiffies,
2941 		.strategy	= &sysctl_jiffies,
2942 	},
2943 	{
2944 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2945 		.procname	= "gc_min_interval_ms",
2946 		.data		= &ip_rt_gc_min_interval,
2947 		.maxlen		= sizeof(int),
2948 		.mode		= 0644,
2949 		.proc_handler	= &proc_dointvec_ms_jiffies,
2950 		.strategy	= &sysctl_ms_jiffies,
2951 	},
2952 	{
2953 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2954 		.procname	= "gc_timeout",
2955 		.data		= &ip_rt_gc_timeout,
2956 		.maxlen		= sizeof(int),
2957 		.mode		= 0644,
2958 		.proc_handler	= &proc_dointvec_jiffies,
2959 		.strategy	= &sysctl_jiffies,
2960 	},
2961 	{
2962 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2963 		.procname	= "gc_interval",
2964 		.data		= &ip_rt_gc_interval,
2965 		.maxlen		= sizeof(int),
2966 		.mode		= 0644,
2967 		.proc_handler	= &proc_dointvec_jiffies,
2968 		.strategy	= &sysctl_jiffies,
2969 	},
2970 	{
2971 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2972 		.procname	= "redirect_load",
2973 		.data		= &ip_rt_redirect_load,
2974 		.maxlen		= sizeof(int),
2975 		.mode		= 0644,
2976 		.proc_handler	= &proc_dointvec,
2977 	},
2978 	{
2979 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2980 		.procname	= "redirect_number",
2981 		.data		= &ip_rt_redirect_number,
2982 		.maxlen		= sizeof(int),
2983 		.mode		= 0644,
2984 		.proc_handler	= &proc_dointvec,
2985 	},
2986 	{
2987 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2988 		.procname	= "redirect_silence",
2989 		.data		= &ip_rt_redirect_silence,
2990 		.maxlen		= sizeof(int),
2991 		.mode		= 0644,
2992 		.proc_handler	= &proc_dointvec,
2993 	},
2994 	{
2995 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2996 		.procname	= "error_cost",
2997 		.data		= &ip_rt_error_cost,
2998 		.maxlen		= sizeof(int),
2999 		.mode		= 0644,
3000 		.proc_handler	= &proc_dointvec,
3001 	},
3002 	{
3003 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3004 		.procname	= "error_burst",
3005 		.data		= &ip_rt_error_burst,
3006 		.maxlen		= sizeof(int),
3007 		.mode		= 0644,
3008 		.proc_handler	= &proc_dointvec,
3009 	},
3010 	{
3011 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3012 		.procname	= "gc_elasticity",
3013 		.data		= &ip_rt_gc_elasticity,
3014 		.maxlen		= sizeof(int),
3015 		.mode		= 0644,
3016 		.proc_handler	= &proc_dointvec,
3017 	},
3018 	{
3019 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3020 		.procname	= "mtu_expires",
3021 		.data		= &ip_rt_mtu_expires,
3022 		.maxlen		= sizeof(int),
3023 		.mode		= 0644,
3024 		.proc_handler	= &proc_dointvec_jiffies,
3025 		.strategy	= &sysctl_jiffies,
3026 	},
3027 	{
3028 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3029 		.procname	= "min_pmtu",
3030 		.data		= &ip_rt_min_pmtu,
3031 		.maxlen		= sizeof(int),
3032 		.mode		= 0644,
3033 		.proc_handler	= &proc_dointvec,
3034 	},
3035 	{
3036 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3037 		.procname	= "min_adv_mss",
3038 		.data		= &ip_rt_min_advmss,
3039 		.maxlen		= sizeof(int),
3040 		.mode		= 0644,
3041 		.proc_handler	= &proc_dointvec,
3042 	},
3043 	{
3044 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3045 		.procname	= "secret_interval",
3046 		.data		= &ip_rt_secret_interval,
3047 		.maxlen		= sizeof(int),
3048 		.mode		= 0644,
3049 		.proc_handler	= &proc_dointvec_jiffies,
3050 		.strategy	= &sysctl_jiffies,
3051 	},
3052 	{ .ctl_name = 0 }
3053 };
3054 #endif
3055 
3056 #ifdef CONFIG_NET_CLS_ROUTE
3057 struct ip_rt_acct *ip_rt_acct;
3058 
3059 /* This code sucks.  But you should have seen it before! --RR */
3060 
3061 /* IP route accounting ptr for this logical cpu number. */
3062 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3063 
3064 #ifdef CONFIG_PROC_FS
3065 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3066 			   int length, int *eof, void *data)
3067 {
3068 	unsigned int i;
3069 
3070 	if ((offset & 3) || (length & 3))
3071 		return -EIO;
3072 
3073 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
3074 		*eof = 1;
3075 		return 0;
3076 	}
3077 
3078 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3079 		length = sizeof(struct ip_rt_acct) * 256 - offset;
3080 		*eof = 1;
3081 	}
3082 
3083 	offset /= sizeof(u32);
3084 
3085 	if (length > 0) {
3086 		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3087 		u32 *dst = (u32 *) buffer;
3088 
3089 		/* Copy first cpu. */
3090 		*start = buffer;
3091 		memcpy(dst, src, length);
3092 
3093 		/* Add the other cpus in, one int at a time */
3094 		for_each_possible_cpu(i) {
3095 			unsigned int j;
3096 
3097 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3098 
3099 			for (j = 0; j < length/4; j++)
3100 				dst[j] += src[j];
3101 		}
3102 	}
3103 	return length;
3104 }
3105 #endif /* CONFIG_PROC_FS */
3106 #endif /* CONFIG_NET_CLS_ROUTE */
3107 
3108 static __initdata unsigned long rhash_entries;
3109 static int __init set_rhash_entries(char *str)
3110 {
3111 	if (!str)
3112 		return 0;
3113 	rhash_entries = simple_strtoul(str, &str, 0);
3114 	return 1;
3115 }
3116 __setup("rhash_entries=", set_rhash_entries);
3117 
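/*
 * Boot-time initialization: seed the hash secret, allocate the dst slab
 * cache and the route cache hash table (optionally sized by the
 * "rhash_entries=" boot parameter), initialize devinet and the FIB,
 * arm the flush/expire/secret-rebuild timers, and register the proc
 * entries and the RTM_GETROUTE handler.
 */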
3118 int __init ip_rt_init(void)
3119 {
3120 	int rc = 0;
3121 
3122 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3123 			     (jiffies ^ (jiffies >> 7)));
3124 
3125 #ifdef CONFIG_NET_CLS_ROUTE
3126 	{
3127 	int order;
3128 	for (order = 0;
3129 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3130 		/* NOTHING */;
3131 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3132 	if (!ip_rt_acct)
3133 		panic("IP: failed to allocate ip_rt_acct\n");
3134 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
3135 	}
3136 #endif
3137 
3138 	ipv4_dst_ops.kmem_cachep =
3139 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3140 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3141 
3142 	rt_hash_table = (struct rt_hash_bucket *)
3143 		alloc_large_system_hash("IP route cache",
3144 					sizeof(struct rt_hash_bucket),
3145 					rhash_entries,
3146 					(num_physpages >= 128 * 1024) ?
3147 					15 : 17,
3148 					0,
3149 					&rt_hash_log,
3150 					&rt_hash_mask,
3151 					0);
3152 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3153 	rt_hash_lock_init();
3154 
3155 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3156 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3157 
3158 	devinet_init();
3159 	ip_fib_init();
3160 
3161 	init_timer(&rt_flush_timer);
3162 	rt_flush_timer.function = rt_run_flush;
3163 	init_timer(&rt_periodic_timer);
3164 	rt_periodic_timer.function = rt_check_expire;
3165 	init_timer(&rt_secret_timer);
3166 	rt_secret_timer.function = rt_secret_rebuild;
3167 
3168 	/* All the timers started at system startup tend
3169 	   to synchronize. Perturb them a bit.
3170 	 */
3171 	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3172 					ip_rt_gc_interval;
3173 	add_timer(&rt_periodic_timer);
3174 
3175 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3176 		ip_rt_secret_interval;
3177 	add_timer(&rt_secret_timer);
3178 
3179 #ifdef CONFIG_PROC_FS
3180 	{
3181 	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3182 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3183 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3184 					     proc_net_stat))) {
3185 		return -ENOMEM;
3186 	}
3187 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3188 	}
3189 #ifdef CONFIG_NET_CLS_ROUTE
3190 	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3191 #endif
3192 #endif
3193 #ifdef CONFIG_XFRM
3194 	xfrm_init();
3195 	xfrm4_init();
3196 #endif
3197 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3198 
3199 	return rc;
3200 }
3201 
3202 EXPORT_SYMBOL(__ip_select_ident);
3203 EXPORT_SYMBOL(ip_route_input);
3204 EXPORT_SYMBOL(ip_route_output_key);
3205