/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay		= 2 * HZ;
static int ip_rt_max_delay		= 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;
static unsigned long rt_deadline;

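/*
 * All of the intervals above are expressed in jiffies, so their wall-clock
 * value depends on HZ.  A worked example, assuming HZ = 1000 (with HZ = 100
 * all values scale accordingly):
 *
 *	ip_rt_gc_min_interval	= HZ / 2	  ->	0.5 sec
 *	ip_rt_redirect_load	= HZ / 50	  ->	20 msec
 *	ip_rt_redirect_silence	= (HZ / 50) << 10 ->	20480 jiffies ~ 20.5 sec
 *	ip_rt_mtu_expires	= 10 * 60 * HZ	  ->	10 min
 */
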
#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.entry_size =		sizeof(struct rtable),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the bucket lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    RCU lock held.
 */
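/*
 * A minimal sketch of the resulting access pattern, mirroring the reader
 * side of ip_route_input() and the writer side of rt_intern_hash() below
 * (illustration only, hence the #if 0):
 */
#if 0
	/* Reader: RCU protects the chain walk, no bucket lock is taken. */
	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.rt_next)) {
		if (compare_keys(&rth->fl, &fl)) {
			dst_hold(&rth->u.dst);	/* atomic refcount increment */
			break;
		}
	}
	rcu_read_unlock();

	/* Writer: the bucket spinlock serializes updates, while
	 * rcu_assign_pointer() publishes the new chain head to
	 * lockless readers.
	 */
	spin_lock_bh(&rt_hash_table[hash].lock);
	rt->u.rt_next = rt_hash_table[hash].chain;
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
	spin_unlock_bh(&rt_hash_table[hash].lock);
#endif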

struct rt_hash_bucket {
	struct rtable	*chain;
	spinlock_t	lock;
} __attribute__((__aligned__(8)));

static struct rt_hash_bucket 	*rt_hash_table;
static unsigned			rt_hash_mask;
static int			rt_hash_log;
static unsigned int		rt_hash_rnd;

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
		& rt_hash_mask);
}
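
/*
 * Callers fold the whole flow key into the three words above; for input
 * routes the incoming ifindex perturbs the source address, e.g. (from
 * ip_route_input() below):
 *
 *	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
 */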

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rtable *r = NULL;
	struct rt_cache_iter_state *st = seq->private;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
		if (r)
			break;
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
	struct rt_cache_iter_state *st = rcu_dereference(seq->private);

	r = r->u.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r = NULL;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}

static struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;
	rc = seq_open(file, &rt_cache_seq_ops);
	if (rc)
		goto out_kfree;
	seq          = file->private_data;
	seq->private = s;
	memset(s, 0, sizeof(*s));
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

static struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return per_cpu_ptr(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return per_cpu_ptr(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
	multipath_remove(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	multipath_remove(rt);
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in the hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
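
/*
 * Worked example of the layout above: an unreferenced output route
 * (fl.iif == 0) last used 100 jiffies ago scores
 *
 *	score = ~100 & ~(3 << 30)	(usage counter: younger => larger)
 *	score |= 1 << 30		(not quite useless: output route)
 *
 * so among equally-flagged entries the least recently used one ends up
 * with the lowest score and is the first eviction candidate in
 * rt_intern_hash().
 */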

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
	       fl1->oif     == fl2->oif &&
	       fl1->iif     == fl2->iif;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
						struct rtable *expentry,
						int *removed_count)
{
	int passedexpired = 0;
	struct rtable **nextstep = NULL;
	struct rtable **rthp = chain_head;
	struct rtable *rth;

	if (removed_count)
		*removed_count = 0;

	while ((rth = *rthp) != NULL) {
		if (rth == expentry)
			passedexpired = 1;

		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
			if (*rthp == expentry) {
				*rthp = rth->u.rt_next;
				continue;
			} else {
				*rthp = rth->u.rt_next;
				rt_free(rth);
				if (removed_count)
					++(*removed_count);
			}
		} else {
			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
			    passedexpired && !nextstep)
				nextstep = &rth->u.rt_next;

			rthp = &rth->u.rt_next;
		}
	}

	rt_free(expentry);
	if (removed_count)
		++(*removed_count);

	return nextstep;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */


/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
	static int rover;
	int i = rover, t;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;

	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
	     t -= ip_rt_gc_timeout) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		spin_lock(&rt_hash_table[i].lock);
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(now, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.rt_next;
				continue;
			}

			/* Clean up aged-off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
			/* remove all related balanced entries if necessary */
			if (rth->u.dst.flags & DST_BALANCED) {
				rthp = rt_remove_balanced_route(
					&rt_hash_table[i].chain,
					rth, NULL);
				if (!rthp)
					break;
			} else {
				*rthp = rth->u.rt_next;
				rt_free(rth);
			}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
			*rthp = rth->u.rt_next;
			rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
		}
		spin_unlock(&rt_hash_table[i].lock);

		/* Fallback loop breaker. */
		if (time_after(jiffies, now))
			break;
	}
	rover = i;
	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}
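
/*
 * A note on the scan budget above: "t" starts at
 * ip_rt_gc_interval << rt_hash_log and loses ip_rt_gc_timeout per bucket,
 * so one run visits about ip_rt_gc_interval/ip_rt_gc_timeout of the table
 * (one fifth with the defaults, 60*HZ vs 300*HZ) and resumes from "rover"
 * on the next tick.  Within a chain, tmo is halved for every entry that
 * survives, so entries near the head of a chain get proportionally longer
 * to live than those near the tail.
 */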

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	for (i = rt_hash_mask; i >= 0; i--) {
		spin_lock_bh(&rt_hash_table[i].lock);
		rth = rt_hash_table[i].chain;
		if (rth)
			rt_hash_table[i].chain = NULL;
		spin_unlock_bh(&rt_hash_table[i].lock);

		for (; rth; rth = next) {
			next = rth->u.rt_next;
			rt_free(rth);
		}
	}
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	/* flush existing multipath state */
	multipath_flush();

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If the flush timer is already running
		   and the flush request is not immediate (delay > 0):

		   if the deadline has not been reached, prolong the timer
		   to "delay", otherwise fire it at the deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now+delay);
	spin_unlock_bh(&rt_flush_lock);
}
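
/*
 * Usage sketch: rt_cache_flush(0) flushes synchronously via
 * rt_run_flush(); rt_cache_flush(-1) asks for a batched flush after
 * ip_rt_min_delay (2 sec by default); and however often the timer is
 * reshuffled by later requests, a pending flush fires no later than
 * rt_deadline = first request + ip_rt_max_delay (10 sec by default).
 */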

static void rt_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;

	rt_cache_flush(0);
	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if the network is idle,
   expire is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */
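
/*
 * Worked example of the goal computation below: with rt_hash_log = 15
 * (32768 buckets, an assumed configuration) and the default
 * ip_rt_gc_elasticity of 8, expiry work only starts once the cache holds
 * more than 8 << 15 = 262144 entries; "goal" is then the excess over the
 * equilibrium, and in the overflow branch it is bumped to at least
 * rt_hash_mask + 1 entries, i.e. one per bucket on average.
 */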

static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate the number of entries that we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively.
		 */
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(&rt_hash_table[k].lock);
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
				/* remove all related balanced entries
				 * if necessary; operate on the chain of
				 * the locked bucket k
				 */
				if (rth->u.dst.flags & DST_BALANCED) {
					int r;

					rthp = rt_remove_balanced_route(
						&rt_hash_table[k].chain,
						rth,
						&r);
					goal -= r;
					if (!rthp)
						break;
				} else {
					*rthp = rth->u.rt_next;
					rt_free(rth);
					goal--;
				}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
				*rthp = rth->u.rt_next;
				rt_free(rth);
				goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
			}
			spin_unlock_bh(&rt_hash_table[k].lock);
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire has been reduced to zero; otherwise expire is halved.
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker;
		     we will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(&rt_hash_table[hash].lock);
	while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
		if (!(rth->u.dst.flags & DST_BALANCED) &&
		    compare_keys(&rth->fl, &rt->fl)) {
#else
		if (compare_keys(&rth->fl, &rt->fl)) {
#endif
			/* Put it first */
			*rthp = rth->u.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			rth->u.dst.__use++;
			dst_hold(&rth->u.dst);
			rth->u.dst.lastuse = now;
			spin_unlock_bh(&rt_hash_table[hash].lock);

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * once it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind the route to arp only if it is an output
	   route or a unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(&rt_hash_table[hash].lock);

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* The neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(&rt_hash_table[hash].lock);
	*rp = rt;
	return 0;
}
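
/*
 * Usage sketch, mirroring the callers below: the new route is handed over
 * to the cache, and *rp receives either it or an equivalent pre-existing
 * entry, already refcounted for the caller:
 *
 *	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
 *	err = rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);
 */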

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

	ip_select_fb_ident(iph);
}
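
/*
 * In the normal case above, the IP ID thus comes from the long-lived
 * inet_peer attached to the route, giving each destination its own
 * monotonically advancing counter via inet_getid(); only when peer
 * allocation fails do we fall back to the single hashed global counter,
 * which may repeat IDs much sooner.
 */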

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(&rt_hash_table[hash].lock);
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(&rt_hash_table[hash].lock);
}

void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	u32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_RT_MASK;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr,
						     skeys[i] ^ (ikeys[k] << 5),
						     tos);

			rthp = &rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.fl4_tos != tos ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
			"tos %02x\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
						     rt->fl.fl4_src ^
							(rt->fl.oif << 5),
						     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ip_rt_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   has forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
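
/*
 * Worked example with the defaults above (assuming HZ = 1000): redirect
 * number n (0 <= n < ip_rt_redirect_number = 9) is sent only after
 * (HZ / 50) << n jiffies have passed since the previous one, so the gaps
 * grow 20 ms, 40 ms, ... up to ~5.1 sec.  After nine ignored redirects we
 * go silent until ip_rt_redirect_silence = (HZ / 50) << 10 ~ 20.5 sec of
 * quiet has elapsed, then rate_tokens resets and the backoff starts over.
 */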

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}

/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
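
/*
 * Worked example: guess_mtu() returns the largest plateau strictly below
 * the old MTU, so a bogus 1500-byte estimate steps down to 1492, then to
 * 576, 296, 216, 128, and finally to the IPv4 minimum of 68 once the
 * table is exhausted.
 */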

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	u32  skeys[2] = { iph->saddr, 0, };
	u32  daddr = iph->daddr;
	u8   tos = iph->tos & IPTOS_RT_MASK;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash_code(daddr, skeys[i], tos);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst  == daddr &&
			    rth->rt_src  == iph->saddr &&
			    rth->fl.fl4_tos == tos &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != &loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	u32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(&rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}

static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	u32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     u32 daddr,
				     u32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
			"%u.%u.%u.%u, on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len) {
			int i;
			unsigned char *p = skb->mac.raw;
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result* res,
				  struct in_device *in_dev,
				  u32 daddr, u32 saddr, u32 tos,
				  struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	u32 spec_dst, itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
			       "Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi->fib_nhs > 1)
		rth->u.dst.flags |= DST_BALANCED;
#endif
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
		rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}

static inline int ip_mkroute_input_def(struct sk_buff *skb,
				       struct fib_result* res,
				       const struct flowi *fl,
				       struct in_device *in_dev,
				       u32 daddr, u32 saddr, u32 tos)
{
	struct rtable* rth;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;
	atomic_set(&rth->u.dst.__refcnt, 1);

	/* put it into the cache */
	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}

static inline int ip_mkroute_input(struct sk_buff *skb,
				   struct fib_result* res,
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   u32 daddr, u32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	struct rtable* rth;
	unsigned char hop, hopcount, lasthop;
	int err = -EINVAL;
	unsigned int hash;

	if (res->fi)
		hopcount = res->fi->fib_nhs;
	else
		hopcount = 1;

	lasthop = hopcount - 1;

	/* distinguish between multipath and singlepath */
	if (hopcount < 2)
		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
					    saddr, tos);

	/* add all alternatives to the routing cache */
	for (hop = 0; hop < hopcount; hop++) {
		res->nh_sel = hop;

		/* create a routing cache entry */
		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
				      &rth);
		if (err)
			return err;

		/* put it into the cache */
		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
		err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
		if (err)
			return err;

		/* forward hop information to multipath impl. */
		multipath_set_nhinfo(rth,
				     FIB_RES_NETWORK(*res),
				     FIB_RES_NETMASK(*res),
				     res->prefixlen,
				     &FIB_RES_NH(*res));

		/* only for the last hop is the reference count handled
		 * outside
		 */
		if (hop == lasthop)
			atomic_set(&(skb->dst->__refcnt), 1);
	}
	return err;
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}


/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the
 *	output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */

static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = skb->nfmark
#endif
				      } },
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	u32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for limited broadcast;
	 * I do not even know whether to fix this or not. Waiting for complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_inval;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_inval;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
	if (err == -ENOBUFS)
		goto e_nobufs;
	if (err == -EINVAL)
		goto e_inval;

done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif
e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}

2040 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2041 		   u8 tos, struct net_device *dev)
2042 {
2043 	struct rtable * rth;
2044 	unsigned	hash;
2045 	int iif = dev->ifindex;
2046 
2047 	tos &= IPTOS_RT_MASK;
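	/* cache entries are keyed on (daddr, saddr, iif, tos); xoring the
	 * shifted input interface into the source address spreads entries
	 * that differ only in iif across the hash chains */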
2048 	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2049 
2050 	rcu_read_lock();
2051 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2052 	     rth = rcu_dereference(rth->u.rt_next)) {
2053 		if (rth->fl.fl4_dst == daddr &&
2054 		    rth->fl.fl4_src == saddr &&
2055 		    rth->fl.iif == iif &&
2056 		    rth->fl.oif == 0 &&
2057 #ifdef CONFIG_IP_ROUTE_FWMARK
2058 		    rth->fl.fl4_fwmark == skb->nfmark &&
2059 #endif
2060 		    rth->fl.fl4_tos == tos) {
2061 			rth->u.dst.lastuse = jiffies;
2062 			dst_hold(&rth->u.dst);
2063 			rth->u.dst.__use++;
2064 			RT_CACHE_STAT_INC(in_hit);
2065 			rcu_read_unlock();
2066 			skb->dst = (struct dst_entry*)rth;
2067 			return 0;
2068 		}
2069 		RT_CACHE_STAT_INC(in_hlist_search);
2070 	}
2071 	rcu_read_unlock();
2072 
2073 	/* Multicast recognition logic was moved from the route cache
2074 	   to here. The problem was that too many Ethernet cards have
2075 	   broken/missing hardware multicast filters :-( As a result, a
2076 	   host on a multicast network acquires a lot of useless route
2077 	   cache entries, e.g. for SDR messages from all over the world.
2078 	   Now we try to get rid of them. Really, provided the software
2079 	   IP multicast filter is organized reasonably (at least, hashed),
2080 	   it does not result in a slowdown compared with route cache
2081 	   reject entries. Note that multicast routers are not affected,
2082 	   because a route cache entry is created for them eventually.
2083 	 */
2084 	if (MULTICAST(daddr)) {
2085 		struct in_device *in_dev;
2086 
2087 		rcu_read_lock();
2088 		if ((in_dev = __in_dev_get(dev)) != NULL) {
2089 			int our = ip_check_mc(in_dev, daddr, saddr,
2090 				skb->nh.iph->protocol);
2091 			if (our
2092 #ifdef CONFIG_IP_MROUTE
2093 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2094 #endif
2095 			    ) {
2096 				rcu_read_unlock();
2097 				return ip_route_input_mc(skb, daddr, saddr,
2098 							 tos, dev, our);
2099 			}
2100 		}
2101 		rcu_read_unlock();
2102 		return -EINVAL;
2103 	}
2104 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2105 }
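
/* Illustrative caller (a sketch, not from this file): the receive path
 * routes an incoming packet with the addresses and TOS taken straight
 * from its IP header, much as ip_rcv_finish() in ip_input.c does:
 *
 *	struct iphdr *iph = skb->nh.iph;
 *	int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
 *				 skb->dev);
 *
 * On success err is 0 and skb->dst carries the route; on failure the
 * caller typically drops the packet.
 */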
2106 
2107 static inline int __mkroute_output(struct rtable **result,
2108 				   struct fib_result* res,
2109 				   const struct flowi *fl,
2110 				   const struct flowi *oldflp,
2111 				   struct net_device *dev_out,
2112 				   unsigned flags)
2113 {
2114 	struct rtable *rth;
2115 	struct in_device *in_dev;
2116 	u32 tos = RT_FL_TOS(oldflp);
2117 	int err = 0;
2118 
2119 	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2120 		return -EINVAL;
2121 
2122 	if (fl->fl4_dst == 0xFFFFFFFF)
2123 		res->type = RTN_BROADCAST;
2124 	else if (MULTICAST(fl->fl4_dst))
2125 		res->type = RTN_MULTICAST;
2126 	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2127 		return -EINVAL;
2128 
2129 	if (dev_out->flags & IFF_LOOPBACK)
2130 		flags |= RTCF_LOCAL;
2131 
2132 	/* get work reference to inet device */
2133 	in_dev = in_dev_get(dev_out);
2134 	if (!in_dev)
2135 		return -EINVAL;
2136 
2137 	if (res->type == RTN_BROADCAST) {
2138 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2139 		if (res->fi) {
2140 			fib_info_put(res->fi);
2141 			res->fi = NULL;
2142 		}
2143 	} else if (res->type == RTN_MULTICAST) {
2144 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2145 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2146 				 oldflp->proto))
2147 			flags &= ~RTCF_LOCAL;
2148 		/* If a multicast route does not exist, use the default
2149 		   one, but do not gateway in this case; a prefix shorter
2150 		   than 224.0.0.0/4 is a default route. Yes, it is a hack.
2151 		 */
2152 		if (res->fi && res->prefixlen < 4) {
2153 			fib_info_put(res->fi);
2154 			res->fi = NULL;
2155 		}
2156 	}
2157 
2158 
2159 	rth = dst_alloc(&ipv4_dst_ops);
2160 	if (!rth) {
2161 		err = -ENOBUFS;
2162 		goto cleanup;
2163 	}
2164 
2165 	rth->u.dst.flags= DST_HOST;
2166 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2167 	if (res->fi) {
2168 		rth->rt_multipath_alg = res->fi->fib_mp_alg;
2169 		if (res->fi->fib_nhs > 1)
2170 			rth->u.dst.flags |= DST_BALANCED;
2171 	}
2172 #endif
2173 	if (in_dev->cnf.no_xfrm)
2174 		rth->u.dst.flags |= DST_NOXFRM;
2175 	if (in_dev->cnf.no_policy)
2176 		rth->u.dst.flags |= DST_NOPOLICY;
2177 
2178 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2179 	rth->fl.fl4_tos	= tos;
2180 	rth->fl.fl4_src	= oldflp->fl4_src;
2181 	rth->fl.oif	= oldflp->oif;
2182 #ifdef CONFIG_IP_ROUTE_FWMARK
2183 	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2184 #endif
2185 	rth->rt_dst	= fl->fl4_dst;
2186 	rth->rt_src	= fl->fl4_src;
2187 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2188 	/* get references to the devices that are to be held by the routing
2189 	   cache entry */
2190 	rth->u.dst.dev	= dev_out;
2191 	dev_hold(dev_out);
2192 	rth->idev	= in_dev_get(dev_out);
2193 	rth->rt_gateway = fl->fl4_dst;
2194 	rth->rt_spec_dst= fl->fl4_src;
2195 
2196 	rth->u.dst.output=ip_output;
2197 
2198 	RT_CACHE_STAT_INC(out_slow_tot);
2199 
2200 	if (flags & RTCF_LOCAL) {
2201 		rth->u.dst.input = ip_local_deliver;
2202 		rth->rt_spec_dst = fl->fl4_dst;
2203 	}
2204 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2205 		rth->rt_spec_dst = fl->fl4_src;
2206 		if (flags & RTCF_LOCAL &&
2207 		    !(dev_out->flags & IFF_LOOPBACK)) {
2208 			rth->u.dst.output = ip_mc_output;
2209 			RT_CACHE_STAT_INC(out_slow_mc);
2210 		}
2211 #ifdef CONFIG_IP_MROUTE
2212 		if (res->type == RTN_MULTICAST) {
2213 			if (IN_DEV_MFORWARD(in_dev) &&
2214 			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2215 				rth->u.dst.input = ip_mr_input;
2216 				rth->u.dst.output = ip_mc_output;
2217 			}
2218 		}
2219 #endif
2220 	}
2221 
2222 	rt_set_nexthop(rth, res, 0);
2223 
2224 	rth->rt_flags = flags;
2225 
2226 	*result = rth;
2227  cleanup:
2228 	/* release work reference to inet device */
2229 	in_dev_put(in_dev);
2230 
2231 	return err;
2232 }
2233 
2234 static inline int ip_mkroute_output_def(struct rtable **rp,
2235 					struct fib_result* res,
2236 					const struct flowi *fl,
2237 					const struct flowi *oldflp,
2238 					struct net_device *dev_out,
2239 					unsigned flags)
2240 {
2241 	struct rtable *rth;
2242 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2243 	unsigned hash;
2244 	if (err == 0) {
2245 		u32 tos = RT_FL_TOS(oldflp);
2246 
2247 		atomic_set(&rth->u.dst.__refcnt, 1);
2248 
2249 		hash = rt_hash_code(oldflp->fl4_dst,
2250 				    oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2251 		err = rt_intern_hash(hash, rth, rp);
2252 	}
2253 
2254 	return err;
2255 }
2256 
2257 static inline int ip_mkroute_output(struct rtable** rp,
2258 				    struct fib_result* res,
2259 				    const struct flowi *fl,
2260 				    const struct flowi *oldflp,
2261 				    struct net_device *dev_out,
2262 				    unsigned flags)
2263 {
2264 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2265 	u32 tos = RT_FL_TOS(oldflp);
2266 	unsigned char hop;
2267 	unsigned hash;
2268 	int err = -EINVAL;
2269 	struct rtable *rth;
2270 
2271 	if (res->fi && res->fi->fib_nhs > 1) {
2272 		unsigned char hopcount = res->fi->fib_nhs;
2273 
2274 		for (hop = 0; hop < hopcount; hop++) {
2275 			struct net_device *dev2nexthop;
2276 
2277 			res->nh_sel = hop;
2278 
2279 			/* hold a work reference to the output device */
2280 			dev2nexthop = FIB_RES_DEV(*res);
2281 			dev_hold(dev2nexthop);
2282 
2283 			err = __mkroute_output(&rth, res, fl, oldflp,
2284 					       dev2nexthop, flags);
2285 
2286 			if (err != 0)
2287 				goto cleanup;
2288 
2289 			hash = rt_hash_code(oldflp->fl4_dst,
2290 					    oldflp->fl4_src ^
2291 					    (oldflp->oif << 5), tos);
2292 			err = rt_intern_hash(hash, rth, rp);
2293 
2294 			/* forward hop information to multipath impl. */
2295 			multipath_set_nhinfo(rth,
2296 					     FIB_RES_NETWORK(*res),
2297 					     FIB_RES_NETMASK(*res),
2298 					     res->prefixlen,
2299 					     &FIB_RES_NH(*res));
2300 		cleanup:
2301 			/* release work reference to output device */
2302 			dev_put(dev2nexthop);
2303 
2304 			if (err != 0)
2305 				return err;
2306 		}
2307 		atomic_set(&(*rp)->u.dst.__refcnt, 1);
2308 		return err;
2309 	} else {
2310 		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2311 					     flags);
2312 	}
2313 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2314 	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2315 #endif
2316 }
2317 
2318 /*
2319  * Major route resolver routine.
2320  */
2321 
2322 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2323 {
2324 	u32 tos	= RT_FL_TOS(oldflp);
2325 	struct flowi fl = { .nl_u = { .ip4_u =
2326 				      { .daddr = oldflp->fl4_dst,
2327 					.saddr = oldflp->fl4_src,
2328 					.tos = tos & IPTOS_RT_MASK,
2329 					.scope = ((tos & RTO_ONLINK) ?
2330 						  RT_SCOPE_LINK :
2331 						  RT_SCOPE_UNIVERSE),
2332 #ifdef CONFIG_IP_ROUTE_FWMARK
2333 					.fwmark = oldflp->fl4_fwmark
2334 #endif
2335 				      } },
2336 			    .iif = loopback_dev.ifindex,
2337 			    .oif = oldflp->oif };
2338 	struct fib_result res;
2339 	unsigned flags = 0;
2340 	struct net_device *dev_out = NULL;
2341 	int free_res = 0;
2342 	int err;
2343 
2344 
2345 	res.fi		= NULL;
2346 #ifdef CONFIG_IP_MULTIPLE_TABLES
2347 	res.r		= NULL;
2348 #endif
2349 
2350 	if (oldflp->fl4_src) {
2351 		err = -EINVAL;
2352 		if (MULTICAST(oldflp->fl4_src) ||
2353 		    BADCLASS(oldflp->fl4_src) ||
2354 		    ZERONET(oldflp->fl4_src))
2355 			goto out;
2356 
2357 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2358 		dev_out = ip_dev_find(oldflp->fl4_src);
2359 		if (dev_out == NULL)
2360 			goto out;
2361 
2362 		/* I removed check for oif == dev_out->oif here.
2363 		   It was wrong for two reasons:
2364 		   1. ip_dev_find(saddr) can return wrong iface, if saddr is
2365 		      assigned to multiple interfaces.
2366 		   2. Moreover, we are allowed to send packets with saddr
2367 		      of another iface. --ANK
2368 		 */
2369 
2370 		if (oldflp->oif == 0
2371 		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2372 			/* Special hack: the user can direct multicasts
2373 			   and limited broadcast via the necessary interface
2374 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2375 			   This hack is not just for fun, it allows
2376 			   vic, vat and friends to work.
2377 			   They bind a socket to loopback, set ttl to zero
2378 			   and expect that it will work.
2379 			   From the viewpoint of the routing cache they are
2380 			   broken, because we are not allowed to build a
2381 			   multicast path with a loopback source addr (the
2382 			   routing cache cannot know that ttl is zero, so the
2383 			   packet will not leave this host and the route
2384 			   is valid). Luckily, this hack is a good workaround.
2385 			 */
2386 
2387 			fl.oif = dev_out->ifindex;
2388 			goto make_route;
2389 		}
2390 		if (dev_out)
2391 			dev_put(dev_out);
2392 		dev_out = NULL;
2393 	}
2394 
2395 
2396 	if (oldflp->oif) {
2397 		dev_out = dev_get_by_index(oldflp->oif);
2398 		err = -ENODEV;
2399 		if (dev_out == NULL)
2400 			goto out;
2401 		if (__in_dev_get(dev_out) == NULL) {
2402 			dev_put(dev_out);
2403 			goto out;	/* Wrong error code */
2404 		}
2405 
2406 		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2407 			if (!fl.fl4_src)
2408 				fl.fl4_src = inet_select_addr(dev_out, 0,
2409 							      RT_SCOPE_LINK);
2410 			goto make_route;
2411 		}
2412 		if (!fl.fl4_src) {
2413 			if (MULTICAST(oldflp->fl4_dst))
2414 				fl.fl4_src = inet_select_addr(dev_out, 0,
2415 							      fl.fl4_scope);
2416 			else if (!oldflp->fl4_dst)
2417 				fl.fl4_src = inet_select_addr(dev_out, 0,
2418 							      RT_SCOPE_HOST);
2419 		}
2420 	}
2421 
2422 	if (!fl.fl4_dst) {
2423 		fl.fl4_dst = fl.fl4_src;
2424 		if (!fl.fl4_dst)
2425 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2426 		if (dev_out)
2427 			dev_put(dev_out);
2428 		dev_out = &loopback_dev;
2429 		dev_hold(dev_out);
2430 		fl.oif = loopback_dev.ifindex;
2431 		res.type = RTN_LOCAL;
2432 		flags |= RTCF_LOCAL;
2433 		goto make_route;
2434 	}
2435 
2436 	if (fib_lookup(&fl, &res)) {
2437 		res.fi = NULL;
2438 		if (oldflp->oif) {
2439 			/* Apparently, the routing tables are wrong. Assume
2440 			   that the destination is on-link.
2441 
2442 			   WHY? DW.
2443 			   Because we are allowed to send to an iface
2444 			   even if it has NO routes and NO assigned
2445 			   addresses. When oif is specified, routing
2446 			   tables are looked up with only one purpose:
2447 			   to detect whether the destination is gatewayed
2448 			   rather than direct. Moreover, if MSG_DONTROUTE
2449 			   is set, we send the packet, ignoring both routing
2450 			   tables and ifaddr state. --ANK
2451 
2452 
2453 			   We could do this even when oif is unknown
2454 			   (as IPv6 likely does), but we do not.
2455 			 */
2456 
2457 			if (fl.fl4_src == 0)
2458 				fl.fl4_src = inet_select_addr(dev_out, 0,
2459 							      RT_SCOPE_LINK);
2460 			res.type = RTN_UNICAST;
2461 			goto make_route;
2462 		}
2463 		if (dev_out)
2464 			dev_put(dev_out);
2465 		err = -ENETUNREACH;
2466 		goto out;
2467 	}
2468 	free_res = 1;
2469 
2470 	if (res.type == RTN_LOCAL) {
2471 		if (!fl.fl4_src)
2472 			fl.fl4_src = fl.fl4_dst;
2473 		if (dev_out)
2474 			dev_put(dev_out);
2475 		dev_out = &loopback_dev;
2476 		dev_hold(dev_out);
2477 		fl.oif = dev_out->ifindex;
2478 		if (res.fi)
2479 			fib_info_put(res.fi);
2480 		res.fi = NULL;
2481 		flags |= RTCF_LOCAL;
2482 		goto make_route;
2483 	}
2484 
2485 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2486 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2487 		fib_select_multipath(&fl, &res);
2488 	else
2489 #endif
2490 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2491 		fib_select_default(&fl, &res);
2492 
2493 	if (!fl.fl4_src)
2494 		fl.fl4_src = FIB_RES_PREFSRC(res);
2495 
2496 	if (dev_out)
2497 		dev_put(dev_out);
2498 	dev_out = FIB_RES_DEV(res);
2499 	dev_hold(dev_out);
2500 	fl.oif = dev_out->ifindex;
2501 
2502 
2503 make_route:
2504 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2505 
2506 
2507 	if (free_res)
2508 		fib_res_put(&res);
2509 	if (dev_out)
2510 		dev_put(dev_out);
2511 out:	return err;
2512 }
2513 
2514 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2515 {
2516 	unsigned hash;
2517 	struct rtable *rth;
2518 
2519 	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2520 
2521 	rcu_read_lock_bh();
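	/* an entry matches when daddr, saddr, oif and fwmark are equal,
	 * iif is unset, and the TOS bits differ in neither IPTOS_RT_MASK
	 * nor RTO_ONLINK */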
2522 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2523 		rth = rcu_dereference(rth->u.rt_next)) {
2524 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2525 		    rth->fl.fl4_src == flp->fl4_src &&
2526 		    rth->fl.iif == 0 &&
2527 		    rth->fl.oif == flp->oif &&
2528 #ifdef CONFIG_IP_ROUTE_FWMARK
2529 		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2530 #endif
2531 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2532 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2533 
2534 			/* check for multipath routes and choose one if
2535 			 * necessary
2536 			 */
2537 			if (multipath_select_route(flp, rth, rp)) {
2538 				dst_hold(&(*rp)->u.dst);
2539 				RT_CACHE_STAT_INC(out_hit);
2540 				rcu_read_unlock_bh();
2541 				return 0;
2542 			}
2543 
2544 			rth->u.dst.lastuse = jiffies;
2545 			dst_hold(&rth->u.dst);
2546 			rth->u.dst.__use++;
2547 			RT_CACHE_STAT_INC(out_hit);
2548 			rcu_read_unlock_bh();
2549 			*rp = rth;
2550 			return 0;
2551 		}
2552 		RT_CACHE_STAT_INC(out_hlist_search);
2553 	}
2554 	rcu_read_unlock_bh();
2555 
2556 	return ip_route_output_slow(rp, flp);
2557 }
2558 
2559 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2560 {
2561 	int err;
2562 
2563 	if ((err = __ip_route_output_key(rp, flp)) != 0)
2564 		return err;
2565 
2566 	if (flp->proto) {
2567 		if (!flp->fl4_src)
2568 			flp->fl4_src = (*rp)->rt_src;
2569 		if (!flp->fl4_dst)
2570 			flp->fl4_dst = (*rp)->rt_dst;
2571 		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2572 	}
2573 
2574 	return 0;
2575 }
2576 
2577 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2578 {
2579 	return ip_route_output_flow(rp, flp, NULL, 0);
2580 }
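
/* Illustrative caller (a sketch, not from this file): an output route
 * is resolved by filling a flowi key and handing it to the resolver;
 * zero fields act as wildcards.  "dst" and "tos" below are placeholders:
 *
 *	struct flowi fl = { .oif = 0,
 *			    .nl_u = { .ip4_u = { .daddr = dst,
 *						 .saddr = 0,
 *						 .tos = tos } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&rt, &fl) == 0) {
 *		... use rt, then drop the reference ...
 *		ip_rt_put(rt);
 *	}
 */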
2581 
2582 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2583 			int nowait)
2584 {
2585 	struct rtable *rt = (struct rtable*)skb->dst;
2586 	struct rtmsg *r;
2587 	struct nlmsghdr  *nlh;
2588 	unsigned char	 *b = skb->tail;
2589 	struct rta_cacheinfo ci;
2590 #ifdef CONFIG_IP_MROUTE
2591 	struct rtattr *eptr;
2592 #endif
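	/* NLMSG_PUT and the RTA_PUT calls below jump to nlmsg_failure /
	 * rtattr_failure when the skb runs out of room; the partial
	 * message is then trimmed away and -1 returned */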
2593 	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2594 	r = NLMSG_DATA(nlh);
2595 	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2596 	r->rtm_family	 = AF_INET;
2597 	r->rtm_dst_len	= 32;
2598 	r->rtm_src_len	= 0;
2599 	r->rtm_tos	= rt->fl.fl4_tos;
2600 	r->rtm_table	= RT_TABLE_MAIN;
2601 	r->rtm_type	= rt->rt_type;
2602 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2603 	r->rtm_protocol = RTPROT_UNSPEC;
2604 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2605 	if (rt->rt_flags & RTCF_NOTIFY)
2606 		r->rtm_flags |= RTM_F_NOTIFY;
2607 	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2608 	if (rt->fl.fl4_src) {
2609 		r->rtm_src_len = 32;
2610 		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2611 	}
2612 	if (rt->u.dst.dev)
2613 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2614 #ifdef CONFIG_NET_CLS_ROUTE
2615 	if (rt->u.dst.tclassid)
2616 		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2617 #endif
2618 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2619 	if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2620 		__u32 alg = rt->rt_multipath_alg;
2621 
2622 		RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2623 	}
2624 #endif
2625 	if (rt->fl.iif)
2626 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2627 	else if (rt->rt_src != rt->fl.fl4_src)
2628 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2629 	if (rt->rt_dst != rt->rt_gateway)
2630 		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2631 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2632 		goto rtattr_failure;
2633 	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2634 	ci.rta_used	= rt->u.dst.__use;
2635 	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
2636 	if (rt->u.dst.expires)
2637 		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2638 	else
2639 		ci.rta_expires = 0;
2640 	ci.rta_error	= rt->u.dst.error;
2641 	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
2642 	if (rt->peer) {
2643 		ci.rta_id = rt->peer->ip_id_count;
2644 		if (rt->peer->tcp_ts_stamp) {
2645 			ci.rta_ts = rt->peer->tcp_ts;
2646 			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2647 		}
2648 	}
2649 #ifdef CONFIG_IP_MROUTE
2650 	eptr = (struct rtattr*)skb->tail;
2651 #endif
2652 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2653 	if (rt->fl.iif) {
2654 #ifdef CONFIG_IP_MROUTE
2655 		u32 dst = rt->rt_dst;
2656 
2657 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2658 		    ipv4_devconf.mc_forwarding) {
2659 			int err = ipmr_get_route(skb, r, nowait);
2660 			if (err <= 0) {
2661 				if (!nowait) {
2662 					if (err == 0)
2663 						return 0;
2664 					goto nlmsg_failure;
2665 				} else {
2666 					if (err == -EMSGSIZE)
2667 						goto nlmsg_failure;
2668 					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2669 				}
2670 			}
2671 		} else
2672 #endif
2673 			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2674 	}
2675 
2676 	nlh->nlmsg_len = skb->tail - b;
2677 	return skb->len;
2678 
2679 nlmsg_failure:
2680 rtattr_failure:
2681 	skb_trim(skb, b - skb->data);
2682 	return -1;
2683 }
2684 
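/* RTM_GETROUTE handler: resolves a single route on behalf of a netlink
 * caller and echoes it back; illustratively, this is what serves an
 * "ip route get <addr>" request from iproute2.
 */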
2685 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2686 {
2687 	struct rtattr **rta = arg;
2688 	struct rtmsg *rtm = NLMSG_DATA(nlh);
2689 	struct rtable *rt = NULL;
2690 	u32 dst = 0;
2691 	u32 src = 0;
2692 	int iif = 0;
2693 	int err = -ENOBUFS;
2694 	struct sk_buff *skb;
2695 
2696 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2697 	if (!skb)
2698 		goto out;
2699 
2700 	/* Reserve room for dummy headers; this skb can pass
2701 	   through a good chunk of the routing engine.
2702 	 */
2703 	skb->mac.raw = skb->data;
2704 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2705 
2706 	if (rta[RTA_SRC - 1])
2707 		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2708 	if (rta[RTA_DST - 1])
2709 		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2710 	if (rta[RTA_IIF - 1])
2711 		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2712 
2713 	if (iif) {
2714 		struct net_device *dev = __dev_get_by_index(iif);
2715 		err = -ENODEV;
2716 		if (!dev)
2717 			goto out_free;
2718 		skb->protocol	= htons(ETH_P_IP);
2719 		skb->dev	= dev;
2720 		local_bh_disable();
2721 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2722 		local_bh_enable();
2723 		rt = (struct rtable*)skb->dst;
2724 		if (!err && rt->u.dst.error)
2725 			err = -rt->u.dst.error;
2726 	} else {
2727 		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2728 							 .saddr = src,
2729 							 .tos = rtm->rtm_tos } } };
2730 		int oif = 0;
2731 		if (rta[RTA_OIF - 1])
2732 			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2733 		fl.oif = oif;
2734 		err = ip_route_output_key(&rt, &fl);
2735 	}
2736 	if (err)
2737 		goto out_free;
2738 
2739 	skb->dst = &rt->u.dst;
2740 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2741 		rt->rt_flags |= RTCF_NOTIFY;
2742 
2743 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2744 
2745 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2746 				RTM_NEWROUTE, 0);
2747 	if (!err)
2748 		goto out_free;
2749 	if (err < 0) {
2750 		err = -EMSGSIZE;
2751 		goto out_free;
2752 	}
2753 
2754 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2755 	if (err > 0)
2756 		err = 0;
2757 out:	return err;
2758 
2759 out_free:
2760 	kfree_skb(skb);
2761 	goto out;
2762 }
2763 
2764 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2765 {
2766 	struct rtable *rt;
2767 	int h, s_h;
2768 	int idx, s_idx;
2769 
2770 	s_h = cb->args[0];
2771 	s_idx = idx = cb->args[1];
2772 	for (h = 0; h <= rt_hash_mask; h++) {
2773 		if (h < s_h) continue;
2774 		if (h > s_h)
2775 			s_idx = 0;
2776 		rcu_read_lock_bh();
2777 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2778 		     rt = rcu_dereference(rt->u.rt_next), idx++) {
2779 			if (idx < s_idx)
2780 				continue;
2781 			skb->dst = dst_clone(&rt->u.dst);
2782 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2783 					 cb->nlh->nlmsg_seq,
2784 					 RTM_NEWROUTE, 1) <= 0) {
2785 				dst_release(xchg(&skb->dst, NULL));
2786 				rcu_read_unlock_bh();
2787 				goto done;
2788 			}
2789 			dst_release(xchg(&skb->dst, NULL));
2790 		}
2791 		rcu_read_unlock_bh();
2792 	}
2793 
2794 done:
2795 	cb->args[0] = h;
2796 	cb->args[1] = idx;
2797 	return skb->len;
2798 }
2799 
2800 void ip_rt_multicast_event(struct in_device *in_dev)
2801 {
2802 	rt_cache_flush(0);
2803 }
2804 
2805 #ifdef CONFIG_SYSCTL
2806 static int flush_delay;
2807 
2808 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2809 					struct file *filp, void __user *buffer,
2810 					size_t *lenp, loff_t *ppos)
2811 {
2812 	if (write) {
2813 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2814 		rt_cache_flush(flush_delay);
2815 		return 0;
2816 	}
2817 
2818 	return -EINVAL;
2819 }
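
/* Only writes are meaningful here; e.g. (an illustrative shell usage)
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the routing cache using the written value as the delay.
 */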
2820 
2821 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2822 						int __user *name,
2823 						int nlen,
2824 						void __user *oldval,
2825 						size_t __user *oldlenp,
2826 						void __user *newval,
2827 						size_t newlen,
2828 						void **context)
2829 {
2830 	int delay;
2831 	if (newlen != sizeof(int))
2832 		return -EINVAL;
2833 	if (get_user(delay, (int __user *)newval))
2834 		return -EFAULT;
2835 	rt_cache_flush(delay);
2836 	return 0;
2837 }
2838 
2839 ctl_table ipv4_route_table[] = {
2840 	{
2841 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2842 		.procname	= "flush",
2843 		.data		= &flush_delay,
2844 		.maxlen		= sizeof(int),
2845 		.mode		= 0644,
2846 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2847 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2848 	},
2849 	{
2850 		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2851 		.procname	= "min_delay",
2852 		.data		= &ip_rt_min_delay,
2853 		.maxlen		= sizeof(int),
2854 		.mode		= 0644,
2855 		.proc_handler	= &proc_dointvec_jiffies,
2856 		.strategy	= &sysctl_jiffies,
2857 	},
2858 	{
2859 		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2860 		.procname	= "max_delay",
2861 		.data		= &ip_rt_max_delay,
2862 		.maxlen		= sizeof(int),
2863 		.mode		= 0644,
2864 		.proc_handler	= &proc_dointvec_jiffies,
2865 		.strategy	= &sysctl_jiffies,
2866 	},
2867 	{
2868 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2869 		.procname	= "gc_thresh",
2870 		.data		= &ipv4_dst_ops.gc_thresh,
2871 		.maxlen		= sizeof(int),
2872 		.mode		= 0644,
2873 		.proc_handler	= &proc_dointvec,
2874 	},
2875 	{
2876 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2877 		.procname	= "max_size",
2878 		.data		= &ip_rt_max_size,
2879 		.maxlen		= sizeof(int),
2880 		.mode		= 0644,
2881 		.proc_handler	= &proc_dointvec,
2882 	},
2883 	{
2884 		/*  Deprecated. Use gc_min_interval_ms */
2885 
2886 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2887 		.procname	= "gc_min_interval",
2888 		.data		= &ip_rt_gc_min_interval,
2889 		.maxlen		= sizeof(int),
2890 		.mode		= 0644,
2891 		.proc_handler	= &proc_dointvec_jiffies,
2892 		.strategy	= &sysctl_jiffies,
2893 	},
2894 	{
2895 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2896 		.procname	= "gc_min_interval_ms",
2897 		.data		= &ip_rt_gc_min_interval,
2898 		.maxlen		= sizeof(int),
2899 		.mode		= 0644,
2900 		.proc_handler	= &proc_dointvec_ms_jiffies,
2901 		.strategy	= &sysctl_ms_jiffies,
2902 	},
2903 	{
2904 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2905 		.procname	= "gc_timeout",
2906 		.data		= &ip_rt_gc_timeout,
2907 		.maxlen		= sizeof(int),
2908 		.mode		= 0644,
2909 		.proc_handler	= &proc_dointvec_jiffies,
2910 		.strategy	= &sysctl_jiffies,
2911 	},
2912 	{
2913 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2914 		.procname	= "gc_interval",
2915 		.data		= &ip_rt_gc_interval,
2916 		.maxlen		= sizeof(int),
2917 		.mode		= 0644,
2918 		.proc_handler	= &proc_dointvec_jiffies,
2919 		.strategy	= &sysctl_jiffies,
2920 	},
2921 	{
2922 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2923 		.procname	= "redirect_load",
2924 		.data		= &ip_rt_redirect_load,
2925 		.maxlen		= sizeof(int),
2926 		.mode		= 0644,
2927 		.proc_handler	= &proc_dointvec,
2928 	},
2929 	{
2930 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2931 		.procname	= "redirect_number",
2932 		.data		= &ip_rt_redirect_number,
2933 		.maxlen		= sizeof(int),
2934 		.mode		= 0644,
2935 		.proc_handler	= &proc_dointvec,
2936 	},
2937 	{
2938 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2939 		.procname	= "redirect_silence",
2940 		.data		= &ip_rt_redirect_silence,
2941 		.maxlen		= sizeof(int),
2942 		.mode		= 0644,
2943 		.proc_handler	= &proc_dointvec,
2944 	},
2945 	{
2946 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2947 		.procname	= "error_cost",
2948 		.data		= &ip_rt_error_cost,
2949 		.maxlen		= sizeof(int),
2950 		.mode		= 0644,
2951 		.proc_handler	= &proc_dointvec,
2952 	},
2953 	{
2954 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
2955 		.procname	= "error_burst",
2956 		.data		= &ip_rt_error_burst,
2957 		.maxlen		= sizeof(int),
2958 		.mode		= 0644,
2959 		.proc_handler	= &proc_dointvec,
2960 	},
2961 	{
2962 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
2963 		.procname	= "gc_elasticity",
2964 		.data		= &ip_rt_gc_elasticity,
2965 		.maxlen		= sizeof(int),
2966 		.mode		= 0644,
2967 		.proc_handler	= &proc_dointvec,
2968 	},
2969 	{
2970 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
2971 		.procname	= "mtu_expires",
2972 		.data		= &ip_rt_mtu_expires,
2973 		.maxlen		= sizeof(int),
2974 		.mode		= 0644,
2975 		.proc_handler	= &proc_dointvec_jiffies,
2976 		.strategy	= &sysctl_jiffies,
2977 	},
2978 	{
2979 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
2980 		.procname	= "min_pmtu",
2981 		.data		= &ip_rt_min_pmtu,
2982 		.maxlen		= sizeof(int),
2983 		.mode		= 0644,
2984 		.proc_handler	= &proc_dointvec,
2985 	},
2986 	{
2987 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
2988 		.procname	= "min_adv_mss",
2989 		.data		= &ip_rt_min_advmss,
2990 		.maxlen		= sizeof(int),
2991 		.mode		= 0644,
2992 		.proc_handler	= &proc_dointvec,
2993 	},
2994 	{
2995 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
2996 		.procname	= "secret_interval",
2997 		.data		= &ip_rt_secret_interval,
2998 		.maxlen		= sizeof(int),
2999 		.mode		= 0644,
3000 		.proc_handler	= &proc_dointvec_jiffies,
3001 		.strategy	= &sysctl_jiffies,
3002 	},
3003 	{ .ctl_name = 0 }
3004 };
3005 #endif
3006 
3007 #ifdef CONFIG_NET_CLS_ROUTE
3008 struct ip_rt_acct *ip_rt_acct;
3009 
3010 /* This code sucks.  But you should have seen it before! --RR */
3011 
3012 /* IP route accounting ptr for this logical cpu number. */
3013 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
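
/* ip_rt_acct is one contiguous allocation holding NR_CPUS consecutive
 * blocks of 256 counters each (see the sizing in ip_rt_init() below),
 * so per-cpu data is reached by plain pointer arithmetic. */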
3014 
3015 #ifdef CONFIG_PROC_FS
3016 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3017 			   int length, int *eof, void *data)
3018 {
3019 	unsigned int i;
3020 
3021 	if ((offset & 3) || (length & 3))
3022 		return -EIO;
3023 
3024 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
3025 		*eof = 1;
3026 		return 0;
3027 	}
3028 
3029 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3030 		length = sizeof(struct ip_rt_acct) * 256 - offset;
3031 		*eof = 1;
3032 	}
3033 
3034 	offset /= sizeof(u32);
3035 
3036 	if (length > 0) {
3037 		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3038 		u32 *dst = (u32 *) buffer;
3039 
3040 		/* Copy first cpu. */
3041 		*start = buffer;
3042 		memcpy(dst, src, length);
3043 
3044 		/* Add the other cpus in, one int at a time */
3045 		for_each_cpu(i) {
3046 			unsigned int j;
3047 
3048 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3049 
3050 			for (j = 0; j < length/4; j++)
3051 				dst[j] += src[j];
3052 		}
3053 	}
3054 	return length;
3055 }
3056 #endif /* CONFIG_PROC_FS */
3057 #endif /* CONFIG_NET_CLS_ROUTE */
3058 
3059 static __initdata unsigned long rhash_entries;
3060 static int __init set_rhash_entries(char *str)
3061 {
3062 	if (!str)
3063 		return 0;
3064 	rhash_entries = simple_strtoul(str, &str, 0);
3065 	return 1;
3066 }
3067 __setup("rhash_entries=", set_rhash_entries);
3068 
3069 int __init ip_rt_init(void)
3070 {
3071 	int i, order, goal, rc = 0;
3072 
3073 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3074 			     (jiffies ^ (jiffies >> 7)));
3075 
3076 #ifdef CONFIG_NET_CLS_ROUTE
3077 	for (order = 0;
3078 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3079 		/* NOTHING */;
3080 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3081 	if (!ip_rt_acct)
3082 		panic("IP: failed to allocate ip_rt_acct\n");
3083 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
3084 #endif
3085 
3086 	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3087 						     sizeof(struct rtable),
3088 						     0, SLAB_HWCACHE_ALIGN,
3089 						     NULL, NULL);
3090 
3091 	if (!ipv4_dst_ops.kmem_cachep)
3092 		panic("IP: failed to allocate ip_dst_cache\n");
3093 
3094 	goal = num_physpages >> (26 - PAGE_SHIFT);
3095 	if (rhash_entries)
3096 		goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
3097 	for (order = 0; (1UL << order) < goal; order++)
3098 		/* NOTHING */;
3099 
3100 	do {
3101 		rt_hash_mask = (1UL << order) * PAGE_SIZE /
3102 			sizeof(struct rt_hash_bucket);
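		/* round the bucket count down to a power of two; the
		 * later rt_hash_mask-- turns it into a bit mask */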
3103 		while (rt_hash_mask & (rt_hash_mask - 1))
3104 			rt_hash_mask--;
3105 		rt_hash_table = (struct rt_hash_bucket *)
3106 			__get_free_pages(GFP_ATOMIC, order);
3107 	} while (rt_hash_table == NULL && --order > 0);
3108 
3109 	if (!rt_hash_table)
3110 		panic("Failed to allocate IP route cache hash table\n");
3111 
3112 	printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3113 	       rt_hash_mask,
3114 	       (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3115 
3116 	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3117 		/* NOTHING */;
3118 
3119 	rt_hash_mask--;
3120 	for (i = 0; i <= rt_hash_mask; i++) {
3121 		spin_lock_init(&rt_hash_table[i].lock);
3122 		rt_hash_table[i].chain = NULL;
3123 	}
3124 
3125 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3126 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3127 
3128 	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3129 	if (!rt_cache_stat)
3130 		return -ENOMEM;
3131 
3132 	devinet_init();
3133 	ip_fib_init();
3134 
3135 	init_timer(&rt_flush_timer);
3136 	rt_flush_timer.function = rt_run_flush;
3137 	init_timer(&rt_periodic_timer);
3138 	rt_periodic_timer.function = rt_check_expire;
3139 	init_timer(&rt_secret_timer);
3140 	rt_secret_timer.function = rt_secret_rebuild;
3141 
3142 	/* All the timers started at system startup tend
3143 	   to synchronize. Perturb them a bit.
3144 	 */
3145 	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3146 					ip_rt_gc_interval;
3147 	add_timer(&rt_periodic_timer);
3148 
3149 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3150 		ip_rt_secret_interval;
3151 	add_timer(&rt_secret_timer);
3152 
3153 #ifdef CONFIG_PROC_FS
3154 	{
3155 	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3156 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3157 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3158 			    		     proc_net_stat))) {
3159 		free_percpu(rt_cache_stat);
3160 		return -ENOMEM;
3161 	}
3162 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3163 	}
3164 #ifdef CONFIG_NET_CLS_ROUTE
3165 	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3166 #endif
3167 #endif
3168 #ifdef CONFIG_XFRM
3169 	xfrm_init();
3170 	xfrm4_init();
3171 #endif
3172 	return rc;
3173 }
3174 
3175 EXPORT_SYMBOL(__ip_select_ident);
3176 EXPORT_SYMBOL(ip_route_input);
3177 EXPORT_SYMBOL(ip_route_output_key);
3178