xref: /openbmc/linux/net/ipv4/route.c (revision c21b37f6)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
59  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
60  *
61  *		This program is free software; you can redistribute it and/or
62  *		modify it under the terms of the GNU General Public License
63  *		as published by the Free Software Foundation; either version
64  *		2 of the License, or (at your option) any later version.
65  */
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 
110 #define RT_FL_TOS(oldflp) \
111     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
112 
113 #define IP_MAX_MTU	0xFFF0
114 
115 #define RT_GC_TIMEOUT (300*HZ)
116 
117 static int ip_rt_min_delay		= 2 * HZ;
118 static int ip_rt_max_delay		= 10 * HZ;
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval		= 60 * HZ;
122 static int ip_rt_gc_min_interval	= HZ / 2;
123 static int ip_rt_redirect_number	= 9;
124 static int ip_rt_redirect_load		= HZ / 50;
125 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost		= HZ;
127 static int ip_rt_error_burst		= 5 * HZ;
128 static int ip_rt_gc_elasticity		= 8;
129 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
130 static int ip_rt_min_pmtu		= 512 + 20 + 20;
131 static int ip_rt_min_advmss		= 256;
132 static int ip_rt_secret_interval	= 10 * 60 * HZ;
133 static unsigned long rt_deadline;
134 
135 #define RTprint(a...)	printk(KERN_DEBUG a)
136 
137 static struct timer_list rt_flush_timer;
138 static struct timer_list rt_periodic_timer;
139 static struct timer_list rt_secret_timer;
140 
141 /*
142  *	Interface to generic destination cache.
143  */
144 
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void		 ipv4_dst_destroy(struct dst_entry *dst);
147 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
148 					 struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void		 ipv4_link_failure(struct sk_buff *skb);
151 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(void);
153 
154 
155 static struct dst_ops ipv4_dst_ops = {
156 	.family =		AF_INET,
157 	.protocol =		__constant_htons(ETH_P_IP),
158 	.gc =			rt_garbage_collect,
159 	.check =		ipv4_dst_check,
160 	.destroy =		ipv4_dst_destroy,
161 	.ifdown =		ipv4_dst_ifdown,
162 	.negative_advice =	ipv4_negative_advice,
163 	.link_failure =		ipv4_link_failure,
164 	.update_pmtu =		ip_rt_update_pmtu,
165 	.entry_size =		sizeof(struct rtable),
166 };
167 
168 #define ECN_OR_COST(class)	TC_PRIO_##class
169 
170 const __u8 ip_tos2prio[16] = {
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(FILLER),
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(BESTEFFORT),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK)
187 };
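/*
 * For illustration: this table is indexed with the four TOS bits shifted
 * down by one, roughly as the helper in <net/route.h> does (a sketch;
 * the helper name and exact form are assumed here):
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * so e.g. IPTOS_LOWDELAY (0x10) maps to index 8, TC_PRIO_INTERACTIVE.
 */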
188 
189 
190 /*
191  * Route cache.
192  */
193 
194 /* The locking scheme is rather straightforward:
195  *
196  * 1) Read-Copy Update protects the buckets of the central route hash.
197  * 2) Only writers remove entries, and they hold the lock
198  *    as they look at rtable reference counts.
199  * 3) Only readers acquire references to rtable entries,
200  *    they do so with atomic increments and with the
201  *    lock held.
202  */
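/*
 * A minimal sketch of that pattern (the real paths are ip_route_input()
 * and rt_intern_hash() below; match() here is purely illustrative):
 *
 *	// reader: lockless RCU traversal of one hash chain
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next))
 *		if (match(rth)) {
 *			dst_hold(&rth->u.dst);	// atomic refcount grab
 *			break;
 *		}
 *	rcu_read_unlock();
 *
 *	// writer: unlink/insert only under the per-bucket spinlock,
 *	// freeing deferred through call_rcu_bh() (see rt_free())
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	...modify rt_hash_table[hash].chain...
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 */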
203 
204 struct rt_hash_bucket {
205 	struct rtable	*chain;
206 };
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
208 	defined(CONFIG_PROVE_LOCKING)
209 /*
210  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
211  * The size of this table is a power of two and depends on the number of CPUs.
212  * (with lockdep, spinlock_t is quite big, so keep the size down there)
213  */
214 #ifdef CONFIG_LOCKDEP
215 # define RT_HASH_LOCK_SZ	256
216 #else
217 # if NR_CPUS >= 32
218 #  define RT_HASH_LOCK_SZ	4096
219 # elif NR_CPUS >= 16
220 #  define RT_HASH_LOCK_SZ	2048
221 # elif NR_CPUS >= 8
222 #  define RT_HASH_LOCK_SZ	1024
223 # elif NR_CPUS >= 4
224 #  define RT_HASH_LOCK_SZ	512
225 # else
226 #  define RT_HASH_LOCK_SZ	256
227 # endif
228 #endif
229 
230 static spinlock_t	*rt_hash_locks;
231 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
232 # define rt_hash_lock_init()	{ \
233 		int i; \
234 		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
235 		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
236 		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
237 			spin_lock_init(&rt_hash_locks[i]); \
238 		}
239 #else
240 # define rt_hash_lock_addr(slot) NULL
241 # define rt_hash_lock_init()
242 #endif
243 
244 static struct rt_hash_bucket 	*rt_hash_table;
245 static unsigned			rt_hash_mask;
246 static int			rt_hash_log;
247 static unsigned int		rt_hash_rnd;
248 
249 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
250 #define RT_CACHE_STAT_INC(field) \
251 	(__raw_get_cpu_var(rt_cache_stat).field++)
252 
253 static int rt_intern_hash(unsigned hash, struct rtable *rth,
254 				struct rtable **res);
255 
256 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
257 {
258 	return (jhash_2words(daddr, saddr, rt_hash_rnd)
259 		& rt_hash_mask);
260 }
261 
262 #define rt_hash(daddr, saddr, idx) \
263 	rt_hash_code((__force u32)(__be32)(daddr),\
264 		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
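/*
 * Usage sketch: the third argument is an interface index folded into the
 * hash, e.g. the input path below computes
 *
 *	hash = rt_hash(daddr, saddr, dev->ifindex);
 *
 * while output lookups elsewhere in this file pass the flow's oif instead.
 */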
265 
266 #ifdef CONFIG_PROC_FS
267 struct rt_cache_iter_state {
268 	int bucket;
269 };
270 
271 static struct rtable *rt_cache_get_first(struct seq_file *seq)
272 {
273 	struct rtable *r = NULL;
274 	struct rt_cache_iter_state *st = seq->private;
275 
276 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
277 		rcu_read_lock_bh();
278 		r = rt_hash_table[st->bucket].chain;
279 		if (r)
280 			break;
281 		rcu_read_unlock_bh();
282 	}
283 	return r;
284 }
285 
286 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
287 {
288 	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
289 
290 	r = r->u.dst.rt_next;
291 	while (!r) {
292 		rcu_read_unlock_bh();
293 		if (--st->bucket < 0)
294 			break;
295 		rcu_read_lock_bh();
296 		r = rt_hash_table[st->bucket].chain;
297 	}
298 	return r;
299 }
300 
301 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
302 {
303 	struct rtable *r = rt_cache_get_first(seq);
304 
305 	if (r)
306 		while (pos && (r = rt_cache_get_next(seq, r)))
307 			--pos;
308 	return pos ? NULL : r;
309 }
310 
311 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
312 {
313 	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
314 }
315 
316 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
317 {
318 	struct rtable *r = NULL;
319 
320 	if (v == SEQ_START_TOKEN)
321 		r = rt_cache_get_first(seq);
322 	else
323 		r = rt_cache_get_next(seq, v);
324 	++*pos;
325 	return r;
326 }
327 
328 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
329 {
330 	if (v && v != SEQ_START_TOKEN)
331 		rcu_read_unlock_bh();
332 }
333 
334 static int rt_cache_seq_show(struct seq_file *seq, void *v)
335 {
336 	if (v == SEQ_START_TOKEN)
337 		seq_printf(seq, "%-127s\n",
338 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
339 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
340 			   "HHUptod\tSpecDst");
341 	else {
342 		struct rtable *r = v;
343 		char temp[256];
344 
345 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
346 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
347 			r->u.dst.dev ? r->u.dst.dev->name : "*",
348 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
349 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
350 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
351 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
352 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
353 			dst_metric(&r->u.dst, RTAX_WINDOW),
354 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
355 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
356 			r->fl.fl4_tos,
357 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
358 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
359 				       dev_queue_xmit) : 0,
360 			r->rt_spec_dst);
361 		seq_printf(seq, "%-127s\n", temp);
362 	}
363 	return 0;
364 }
365 
366 static const struct seq_operations rt_cache_seq_ops = {
367 	.start  = rt_cache_seq_start,
368 	.next   = rt_cache_seq_next,
369 	.stop   = rt_cache_seq_stop,
370 	.show   = rt_cache_seq_show,
371 };
372 
373 static int rt_cache_seq_open(struct inode *inode, struct file *file)
374 {
375 	struct seq_file *seq;
376 	int rc = -ENOMEM;
377 	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
378 
379 	if (!s)
380 		goto out;
381 	rc = seq_open(file, &rt_cache_seq_ops);
382 	if (rc)
383 		goto out_kfree;
384 	seq          = file->private_data;
385 	seq->private = s;
386 	memset(s, 0, sizeof(*s));
387 out:
388 	return rc;
389 out_kfree:
390 	kfree(s);
391 	goto out;
392 }
393 
394 static const struct file_operations rt_cache_seq_fops = {
395 	.owner	 = THIS_MODULE,
396 	.open	 = rt_cache_seq_open,
397 	.read	 = seq_read,
398 	.llseek	 = seq_lseek,
399 	.release = seq_release_private,
400 };
401 
402 
403 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
404 {
405 	int cpu;
406 
407 	if (*pos == 0)
408 		return SEQ_START_TOKEN;
409 
410 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
411 		if (!cpu_possible(cpu))
412 			continue;
413 		*pos = cpu+1;
414 		return &per_cpu(rt_cache_stat, cpu);
415 	}
416 	return NULL;
417 }
418 
419 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
420 {
421 	int cpu;
422 
423 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
424 		if (!cpu_possible(cpu))
425 			continue;
426 		*pos = cpu+1;
427 		return &per_cpu(rt_cache_stat, cpu);
428 	}
429 	return NULL;
430 
431 }
432 
433 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
434 {
435 
436 }
437 
438 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
439 {
440 	struct rt_cache_stat *st = v;
441 
442 	if (v == SEQ_START_TOKEN) {
443 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
444 		return 0;
445 	}
446 
447 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
448 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
449 		   atomic_read(&ipv4_dst_ops.entries),
450 		   st->in_hit,
451 		   st->in_slow_tot,
452 		   st->in_slow_mc,
453 		   st->in_no_route,
454 		   st->in_brd,
455 		   st->in_martian_dst,
456 		   st->in_martian_src,
457 
458 		   st->out_hit,
459 		   st->out_slow_tot,
460 		   st->out_slow_mc,
461 
462 		   st->gc_total,
463 		   st->gc_ignored,
464 		   st->gc_goal_miss,
465 		   st->gc_dst_overflow,
466 		   st->in_hlist_search,
467 		   st->out_hlist_search
468 		);
469 	return 0;
470 }
471 
472 static const struct seq_operations rt_cpu_seq_ops = {
473 	.start  = rt_cpu_seq_start,
474 	.next   = rt_cpu_seq_next,
475 	.stop   = rt_cpu_seq_stop,
476 	.show   = rt_cpu_seq_show,
477 };
478 
479 
480 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
481 {
482 	return seq_open(file, &rt_cpu_seq_ops);
483 }
484 
485 static const struct file_operations rt_cpu_seq_fops = {
486 	.owner	 = THIS_MODULE,
487 	.open	 = rt_cpu_seq_open,
488 	.read	 = seq_read,
489 	.llseek	 = seq_lseek,
490 	.release = seq_release,
491 };
492 
493 #endif /* CONFIG_PROC_FS */
494 
495 static __inline__ void rt_free(struct rtable *rt)
496 {
497 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
498 }
499 
500 static __inline__ void rt_drop(struct rtable *rt)
501 {
502 	ip_rt_put(rt);
503 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
504 }
505 
506 static __inline__ int rt_fast_clean(struct rtable *rth)
507 {
508 	/* Kill broadcast/multicast entries very aggressively, if they
509 	   collide in the hash table with more useful entries */
510 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
511 		rth->fl.iif && rth->u.dst.rt_next;
512 }
513 
514 static __inline__ int rt_valuable(struct rtable *rth)
515 {
516 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
517 		rth->u.dst.expires;
518 }
519 
520 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
521 {
522 	unsigned long age;
523 	int ret = 0;
524 
525 	if (atomic_read(&rth->u.dst.__refcnt))
526 		goto out;
527 
528 	ret = 1;
529 	if (rth->u.dst.expires &&
530 	    time_after_eq(jiffies, rth->u.dst.expires))
531 		goto out;
532 
533 	age = jiffies - rth->u.dst.lastuse;
534 	ret = 0;
535 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
536 	    (age <= tmo2 && rt_valuable(rth)))
537 		goto out;
538 	ret = 1;
539 out:	return ret;
540 }
541 
542 /* Bits of score are:
543  * 31: very valuable
544  * 30: not quite useless
545  * 29..0: usage counter
546  */
547 static inline u32 rt_score(struct rtable *rt)
548 {
549 	u32 score = jiffies - rt->u.dst.lastuse;
550 
551 	score = ~score & ~(3<<30);
552 
553 	if (rt_valuable(rt))
554 		score |= (1<<31);
555 
556 	if (!rt->fl.iif ||
557 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
558 		score |= (1<<30);
559 
560 	return score;
561 }
562 
563 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
564 {
565 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
566 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
567 		(fl1->mark ^ fl2->mark) |
568 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
569 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
570 		(fl1->oif ^ fl2->oif) |
571 		(fl1->iif ^ fl2->iif)) == 0;
572 }
573 
574 /* This runs via a timer and thus is always in BH context. */
575 static void rt_check_expire(unsigned long dummy)
576 {
577 	static unsigned int rover;
578 	unsigned int i = rover, goal;
579 	struct rtable *rth, **rthp;
580 	unsigned long now = jiffies;
581 	u64 mult;
582 
583 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
584 	if (ip_rt_gc_timeout > 1)
585 		do_div(mult, ip_rt_gc_timeout);
586 	goal = (unsigned int)mult;
587 	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
588 	for (; goal > 0; goal--) {
589 		unsigned long tmo = ip_rt_gc_timeout;
590 
591 		i = (i + 1) & rt_hash_mask;
592 		rthp = &rt_hash_table[i].chain;
593 
594 		if (*rthp == 0)
595 			continue;
596 		spin_lock(rt_hash_lock_addr(i));
597 		while ((rth = *rthp) != NULL) {
598 			if (rth->u.dst.expires) {
599 				/* Entry is expired even if it is in use */
600 				if (time_before_eq(now, rth->u.dst.expires)) {
601 					tmo >>= 1;
602 					rthp = &rth->u.dst.rt_next;
603 					continue;
604 				}
605 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
606 				tmo >>= 1;
607 				rthp = &rth->u.dst.rt_next;
608 				continue;
609 			}
610 
611 			/* Cleanup aged off entries. */
612 			*rthp = rth->u.dst.rt_next;
613 			rt_free(rth);
614 		}
615 		spin_unlock(rt_hash_lock_addr(i));
616 
617 		/* Fallback loop breaker. */
618 		if (time_after(jiffies, now))
619 			break;
620 	}
621 	rover = i;
622 	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
623 }
624 
625 /* This can run from both BH and non-BH contexts, the latter
626  * in the case of a forced flush event.
627  */
628 static void rt_run_flush(unsigned long dummy)
629 {
630 	int i;
631 	struct rtable *rth, *next;
632 
633 	rt_deadline = 0;
634 
635 	get_random_bytes(&rt_hash_rnd, 4);
636 
637 	for (i = rt_hash_mask; i >= 0; i--) {
638 		spin_lock_bh(rt_hash_lock_addr(i));
639 		rth = rt_hash_table[i].chain;
640 		if (rth)
641 			rt_hash_table[i].chain = NULL;
642 		spin_unlock_bh(rt_hash_lock_addr(i));
643 
644 		for (; rth; rth = next) {
645 			next = rth->u.dst.rt_next;
646 			rt_free(rth);
647 		}
648 	}
649 }
650 
651 static DEFINE_SPINLOCK(rt_flush_lock);
652 
653 void rt_cache_flush(int delay)
654 {
655 	unsigned long now = jiffies;
656 	int user_mode = !in_softirq();
657 
658 	if (delay < 0)
659 		delay = ip_rt_min_delay;
660 
661 	spin_lock_bh(&rt_flush_lock);
662 
663 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
664 		long tmo = (long)(rt_deadline - now);
665 
666 		/* If the flush timer is already running
667 		   and the flush request is not immediate (delay > 0):
668 
669 		   if the deadline has not been reached, extend the timer to "delay",
670 		   otherwise fire it at the deadline.
671 		 */
672 
673 		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
674 			tmo = 0;
675 
676 		if (delay > tmo)
677 			delay = tmo;
678 	}
679 
680 	if (delay <= 0) {
681 		spin_unlock_bh(&rt_flush_lock);
682 		rt_run_flush(0);
683 		return;
684 	}
685 
686 	if (rt_deadline == 0)
687 		rt_deadline = now + ip_rt_max_delay;
688 
689 	mod_timer(&rt_flush_timer, now+delay);
690 	spin_unlock_bh(&rt_flush_lock);
691 }
692 
693 static void rt_secret_rebuild(unsigned long dummy)
694 {
695 	unsigned long now = jiffies;
696 
697 	rt_cache_flush(0);
698 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
699 }
700 
701 /*
702    Short description of GC goals.
703 
704    We want an algorithm that keeps the routing cache at an
705    equilibrium point, where the number of aged-off entries stays
706    approximately equal to the number of newly generated ones.
707 
708    The current expiration strength is the variable "expire".
709    We try to adjust it dynamically: when the network is idle,
710    "expire" is large enough to keep plenty of warm entries, and
711    when load increases it shrinks to limit the cache size.
712  */
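/*
 * A worked example of the arithmetic used below (numbers purely
 * hypothetical): with rt_hash_log = 15 (32768 buckets) and
 * ip_rt_gc_elasticity = 8, a cache holding 300000 entries gives
 *
 *	goal = 300000 - (8 << 15) = 300000 - 262144 = 37856
 *
 * i.e. roughly the excess over an average chain length of
 * ip_rt_gc_elasticity entries per bucket.
 */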
713 
714 static int rt_garbage_collect(void)
715 {
716 	static unsigned long expire = RT_GC_TIMEOUT;
717 	static unsigned long last_gc;
718 	static int rover;
719 	static int equilibrium;
720 	struct rtable *rth, **rthp;
721 	unsigned long now = jiffies;
722 	int goal;
723 
724 	/*
725 	 * Garbage collection is pretty expensive,
726 	 * do not run it too frequently.
727 	 */
728 
729 	RT_CACHE_STAT_INC(gc_total);
730 
731 	if (now - last_gc < ip_rt_gc_min_interval &&
732 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
733 		RT_CACHE_STAT_INC(gc_ignored);
734 		goto out;
735 	}
736 
737 	/* Calculate the number of entries we want to expire now. */
738 	goal = atomic_read(&ipv4_dst_ops.entries) -
739 		(ip_rt_gc_elasticity << rt_hash_log);
740 	if (goal <= 0) {
741 		if (equilibrium < ipv4_dst_ops.gc_thresh)
742 			equilibrium = ipv4_dst_ops.gc_thresh;
743 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
744 		if (goal > 0) {
745 			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
746 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
747 		}
748 	} else {
749 		/* We are in a dangerous area. Try to reduce the cache really
750 		 * aggressively.
751 		 */
752 		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
753 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
754 	}
755 
756 	if (now - last_gc >= ip_rt_gc_min_interval)
757 		last_gc = now;
758 
759 	if (goal <= 0) {
760 		equilibrium += goal;
761 		goto work_done;
762 	}
763 
764 	do {
765 		int i, k;
766 
767 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
768 			unsigned long tmo = expire;
769 
770 			k = (k + 1) & rt_hash_mask;
771 			rthp = &rt_hash_table[k].chain;
772 			spin_lock_bh(rt_hash_lock_addr(k));
773 			while ((rth = *rthp) != NULL) {
774 				if (!rt_may_expire(rth, tmo, expire)) {
775 					tmo >>= 1;
776 					rthp = &rth->u.dst.rt_next;
777 					continue;
778 				}
779 				*rthp = rth->u.dst.rt_next;
780 				rt_free(rth);
781 				goal--;
782 			}
783 			spin_unlock_bh(rt_hash_lock_addr(k));
784 			if (goal <= 0)
785 				break;
786 		}
787 		rover = k;
788 
789 		if (goal <= 0)
790 			goto work_done;
791 
792 		/* Goal is not achieved. We stop the process if:
793 
794 		   - expire has been reduced to zero (otherwise, expire is halved).
795 		   - the table is not full.
796 		   - we are called from interrupt context.
797 		   - the jiffies check is just a fallback/debug loop breaker;
798 		     we will not spin here for a long time in any case.
799 		 */
800 
801 		RT_CACHE_STAT_INC(gc_goal_miss);
802 
803 		if (expire == 0)
804 			break;
805 
806 		expire >>= 1;
807 #if RT_CACHE_DEBUG >= 2
808 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
809 				atomic_read(&ipv4_dst_ops.entries), goal, i);
810 #endif
811 
812 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
813 			goto out;
814 	} while (!in_softirq() && time_before_eq(jiffies, now));
815 
816 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
817 		goto out;
818 	if (net_ratelimit())
819 		printk(KERN_WARNING "dst cache overflow\n");
820 	RT_CACHE_STAT_INC(gc_dst_overflow);
821 	return 1;
822 
823 work_done:
824 	expire += ip_rt_gc_min_interval;
825 	if (expire > ip_rt_gc_timeout ||
826 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
827 		expire = ip_rt_gc_timeout;
828 #if RT_CACHE_DEBUG >= 2
829 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
830 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
831 #endif
832 out:	return 0;
833 }
834 
835 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
836 {
837 	struct rtable	*rth, **rthp;
838 	unsigned long	now;
839 	struct rtable *cand, **candp;
840 	u32 		min_score;
841 	int		chain_length;
842 	int attempts = !in_softirq();
843 
844 restart:
845 	chain_length = 0;
846 	min_score = ~(u32)0;
847 	cand = NULL;
848 	candp = NULL;
849 	now = jiffies;
850 
851 	rthp = &rt_hash_table[hash].chain;
852 
853 	spin_lock_bh(rt_hash_lock_addr(hash));
854 	while ((rth = *rthp) != NULL) {
855 		if (compare_keys(&rth->fl, &rt->fl)) {
856 			/* Put it first */
857 			*rthp = rth->u.dst.rt_next;
858 			/*
859 			 * Since lookup is lockfree, the deletion
860 			 * must be visible to another weakly ordered CPU before
861 			 * the insertion at the start of the hash chain.
862 			 */
863 			rcu_assign_pointer(rth->u.dst.rt_next,
864 					   rt_hash_table[hash].chain);
865 			/*
866 			 * Since lookup is lockfree, the update writes
867 			 * must be ordered for consistency on SMP.
868 			 */
869 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
870 
871 			rth->u.dst.__use++;
872 			dst_hold(&rth->u.dst);
873 			rth->u.dst.lastuse = now;
874 			spin_unlock_bh(rt_hash_lock_addr(hash));
875 
876 			rt_drop(rt);
877 			*rp = rth;
878 			return 0;
879 		}
880 
881 		if (!atomic_read(&rth->u.dst.__refcnt)) {
882 			u32 score = rt_score(rth);
883 
884 			if (score <= min_score) {
885 				cand = rth;
886 				candp = rthp;
887 				min_score = score;
888 			}
889 		}
890 
891 		chain_length++;
892 
893 		rthp = &rth->u.dst.rt_next;
894 	}
895 
896 	if (cand) {
897 		/* ip_rt_gc_elasticity used to be the average chain length;
898 		 * when it is exceeded, gc becomes really aggressive.
899 		 *
900 		 * The second limit is less certain. At the moment it allows
901 		 * only 2 entries per bucket. We will see.
902 		 */
903 		if (chain_length > ip_rt_gc_elasticity) {
904 			*candp = cand->u.dst.rt_next;
905 			rt_free(cand);
906 		}
907 	}
908 
909 	/* Try to bind the route to an ARP neighbour only if it is an
910 	   output route or on the unicast forwarding path.
911 	 */
912 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
913 		int err = arp_bind_neighbour(&rt->u.dst);
914 		if (err) {
915 			spin_unlock_bh(rt_hash_lock_addr(hash));
916 
917 			if (err != -ENOBUFS) {
918 				rt_drop(rt);
919 				return err;
920 			}
921 
922 			/* Neighbour tables are full and nothing
923 			   can be released. Try to shrink the route cache,
924 			   as it most likely holds some neighbour records.
925 			 */
926 			if (attempts-- > 0) {
927 				int saved_elasticity = ip_rt_gc_elasticity;
928 				int saved_int = ip_rt_gc_min_interval;
929 				ip_rt_gc_elasticity	= 1;
930 				ip_rt_gc_min_interval	= 0;
931 				rt_garbage_collect();
932 				ip_rt_gc_min_interval	= saved_int;
933 				ip_rt_gc_elasticity	= saved_elasticity;
934 				goto restart;
935 			}
936 
937 			if (net_ratelimit())
938 				printk(KERN_WARNING "Neighbour table overflow.\n");
939 			rt_drop(rt);
940 			return -ENOBUFS;
941 		}
942 	}
943 
944 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
945 #if RT_CACHE_DEBUG >= 2
946 	if (rt->u.dst.rt_next) {
947 		struct rtable *trt;
948 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
949 		       NIPQUAD(rt->rt_dst));
950 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
951 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
952 		printk("\n");
953 	}
954 #endif
955 	rt_hash_table[hash].chain = rt;
956 	spin_unlock_bh(rt_hash_lock_addr(hash));
957 	*rp = rt;
958 	return 0;
959 }
960 
961 void rt_bind_peer(struct rtable *rt, int create)
962 {
963 	static DEFINE_SPINLOCK(rt_peer_lock);
964 	struct inet_peer *peer;
965 
966 	peer = inet_getpeer(rt->rt_dst, create);
967 
968 	spin_lock_bh(&rt_peer_lock);
969 	if (rt->peer == NULL) {
970 		rt->peer = peer;
971 		peer = NULL;
972 	}
973 	spin_unlock_bh(&rt_peer_lock);
974 	if (peer)
975 		inet_putpeer(peer);
976 }
977 
978 /*
979  * Peer allocation may fail only in serious out-of-memory conditions.  However,
980  * we can still generate some output.
981  * Random ID selection looks a bit dangerous because we have no chance of
982  * selecting an ID that stays unique for a reasonable period of time.
983  * But a broken packet identifier may be better than no packet at all.
984  */
985 static void ip_select_fb_ident(struct iphdr *iph)
986 {
987 	static DEFINE_SPINLOCK(ip_fb_id_lock);
988 	static u32 ip_fallback_id;
989 	u32 salt;
990 
991 	spin_lock_bh(&ip_fb_id_lock);
992 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
993 	iph->id = htons(salt & 0xFFFF);
994 	ip_fallback_id = salt;
995 	spin_unlock_bh(&ip_fb_id_lock);
996 }
997 
998 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
999 {
1000 	struct rtable *rt = (struct rtable *) dst;
1001 
1002 	if (rt) {
1003 		if (rt->peer == NULL)
1004 			rt_bind_peer(rt, 1);
1005 
1006 		/* If peer is attached to destination, it is never detached,
1007 		   so we do not need to grab a lock to dereference it.
1008 		 */
1009 		if (rt->peer) {
1010 			iph->id = htons(inet_getid(rt->peer, more));
1011 			return;
1012 		}
1013 	} else
1014 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1015 		       __builtin_return_address(0));
1016 
1017 	ip_select_fb_ident(iph);
1018 }
1019 
1020 static void rt_del(unsigned hash, struct rtable *rt)
1021 {
1022 	struct rtable **rthp;
1023 
1024 	spin_lock_bh(rt_hash_lock_addr(hash));
1025 	ip_rt_put(rt);
1026 	for (rthp = &rt_hash_table[hash].chain; *rthp;
1027 	     rthp = &(*rthp)->u.dst.rt_next)
1028 		if (*rthp == rt) {
1029 			*rthp = rt->u.dst.rt_next;
1030 			rt_free(rt);
1031 			break;
1032 		}
1033 	spin_unlock_bh(rt_hash_lock_addr(hash));
1034 }
1035 
1036 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1037 		    __be32 saddr, struct net_device *dev)
1038 {
1039 	int i, k;
1040 	struct in_device *in_dev = in_dev_get(dev);
1041 	struct rtable *rth, **rthp;
1042 	__be32  skeys[2] = { saddr, 0 };
1043 	int  ikeys[2] = { dev->ifindex, 0 };
1044 	struct netevent_redirect netevent;
1045 
1046 	if (!in_dev)
1047 		return;
1048 
1049 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1050 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1051 		goto reject_redirect;
1052 
1053 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1054 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1055 			goto reject_redirect;
1056 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1057 			goto reject_redirect;
1058 	} else {
1059 		if (inet_addr_type(new_gw) != RTN_UNICAST)
1060 			goto reject_redirect;
1061 	}
1062 
1063 	for (i = 0; i < 2; i++) {
1064 		for (k = 0; k < 2; k++) {
1065 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1066 
1067 			rthp=&rt_hash_table[hash].chain;
1068 
1069 			rcu_read_lock();
1070 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1071 				struct rtable *rt;
1072 
1073 				if (rth->fl.fl4_dst != daddr ||
1074 				    rth->fl.fl4_src != skeys[i] ||
1075 				    rth->fl.oif != ikeys[k] ||
1076 				    rth->fl.iif != 0) {
1077 					rthp = &rth->u.dst.rt_next;
1078 					continue;
1079 				}
1080 
1081 				if (rth->rt_dst != daddr ||
1082 				    rth->rt_src != saddr ||
1083 				    rth->u.dst.error ||
1084 				    rth->rt_gateway != old_gw ||
1085 				    rth->u.dst.dev != dev)
1086 					break;
1087 
1088 				dst_hold(&rth->u.dst);
1089 				rcu_read_unlock();
1090 
1091 				rt = dst_alloc(&ipv4_dst_ops);
1092 				if (rt == NULL) {
1093 					ip_rt_put(rth);
1094 					in_dev_put(in_dev);
1095 					return;
1096 				}
1097 
1098 				/* Copy all the information. */
1099 				*rt = *rth;
1100 				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1101 				rt->u.dst.__use		= 1;
1102 				atomic_set(&rt->u.dst.__refcnt, 1);
1103 				rt->u.dst.child		= NULL;
1104 				if (rt->u.dst.dev)
1105 					dev_hold(rt->u.dst.dev);
1106 				if (rt->idev)
1107 					in_dev_hold(rt->idev);
1108 				rt->u.dst.obsolete	= 0;
1109 				rt->u.dst.lastuse	= jiffies;
1110 				rt->u.dst.path		= &rt->u.dst;
1111 				rt->u.dst.neighbour	= NULL;
1112 				rt->u.dst.hh		= NULL;
1113 				rt->u.dst.xfrm		= NULL;
1114 
1115 				rt->rt_flags		|= RTCF_REDIRECTED;
1116 
1117 				/* Gateway is different ... */
1118 				rt->rt_gateway		= new_gw;
1119 
1120 				/* Redirect received -> path was valid */
1121 				dst_confirm(&rth->u.dst);
1122 
1123 				if (rt->peer)
1124 					atomic_inc(&rt->peer->refcnt);
1125 
1126 				if (arp_bind_neighbour(&rt->u.dst) ||
1127 				    !(rt->u.dst.neighbour->nud_state &
1128 					    NUD_VALID)) {
1129 					if (rt->u.dst.neighbour)
1130 						neigh_event_send(rt->u.dst.neighbour, NULL);
1131 					ip_rt_put(rth);
1132 					rt_drop(rt);
1133 					goto do_next;
1134 				}
1135 
1136 				netevent.old = &rth->u.dst;
1137 				netevent.new = &rt->u.dst;
1138 				call_netevent_notifiers(NETEVENT_REDIRECT,
1139 							&netevent);
1140 
1141 				rt_del(hash, rth);
1142 				if (!rt_intern_hash(hash, rt, &rt))
1143 					ip_rt_put(rt);
1144 				goto do_next;
1145 			}
1146 			rcu_read_unlock();
1147 		do_next:
1148 			;
1149 		}
1150 	}
1151 	in_dev_put(in_dev);
1152 	return;
1153 
1154 reject_redirect:
1155 #ifdef CONFIG_IP_ROUTE_VERBOSE
1156 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1157 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1158 			"%u.%u.%u.%u ignored.\n"
1159 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1160 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1161 		       NIPQUAD(saddr), NIPQUAD(daddr));
1162 #endif
1163 	in_dev_put(in_dev);
1164 }
1165 
1166 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1167 {
1168 	struct rtable *rt = (struct rtable*)dst;
1169 	struct dst_entry *ret = dst;
1170 
1171 	if (rt) {
1172 		if (dst->obsolete) {
1173 			ip_rt_put(rt);
1174 			ret = NULL;
1175 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1176 			   rt->u.dst.expires) {
1177 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1178 						rt->fl.oif);
1179 #if RT_CACHE_DEBUG >= 1
1180 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
1181 					  "%u.%u.%u.%u/%02x dropped\n",
1182 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1183 #endif
1184 			rt_del(hash, rt);
1185 			ret = NULL;
1186 		}
1187 	}
1188 	return ret;
1189 }
1190 
1191 /*
1192  * Algorithm:
1193  *	1. The first ip_rt_redirect_number redirects are sent
1194  *	   with exponential backoff, then we stop sending them at all,
1195  *	   assuming that the host ignores our redirects.
1196  *	2. If we did not see packets requiring redirects
1197  *	   during ip_rt_redirect_silence, we assume that the host
1198  *	   has forgotten the redirected route, and we start sending redirects again.
1199  *
1200  * This algorithm is much cheaper and more intelligent than dumb load limiting
1201  * in icmp.c.
1202  *
1203  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1204  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1205  */
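/*
 * With the defaults above (ip_rt_redirect_number = 9,
 * ip_rt_redirect_load = HZ/50, ip_rt_redirect_silence = (HZ/50) << 10)
 * this works out as follows: once rate_tokens redirects have been sent,
 * the next one is held back until
 *
 *	jiffies > rate_last + (ip_rt_redirect_load << rate_tokens)
 *
 * so the gap doubles from ~20ms up to ~5s; after nine ignored redirects
 * we stay silent until ~20s pass without traffic that needs redirecting.
 */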
1206 
1207 void ip_rt_send_redirect(struct sk_buff *skb)
1208 {
1209 	struct rtable *rt = (struct rtable*)skb->dst;
1210 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1211 
1212 	if (!in_dev)
1213 		return;
1214 
1215 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1216 		goto out;
1217 
1218 	/* No redirected packets during ip_rt_redirect_silence;
1219 	 * reset the algorithm.
1220 	 */
1221 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1222 		rt->u.dst.rate_tokens = 0;
1223 
1224 	/* Too many ignored redirects; do not send anything.
1225 	 * Set u.dst.rate_last to the last seen redirected packet.
1226 	 */
1227 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1228 		rt->u.dst.rate_last = jiffies;
1229 		goto out;
1230 	}
1231 
1232 	/* Check for load limit; set rate_last to the latest sent
1233 	 * redirect.
1234 	 */
1235 	if (rt->u.dst.rate_tokens == 0 ||
1236 	    time_after(jiffies,
1237 		       (rt->u.dst.rate_last +
1238 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1239 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1240 		rt->u.dst.rate_last = jiffies;
1241 		++rt->u.dst.rate_tokens;
1242 #ifdef CONFIG_IP_ROUTE_VERBOSE
1243 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1244 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1245 		    net_ratelimit())
1246 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1247 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1248 				NIPQUAD(rt->rt_src), rt->rt_iif,
1249 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1250 #endif
1251 	}
1252 out:
1253 	in_dev_put(in_dev);
1254 }
1255 
1256 static int ip_error(struct sk_buff *skb)
1257 {
1258 	struct rtable *rt = (struct rtable*)skb->dst;
1259 	unsigned long now;
1260 	int code;
1261 
1262 	switch (rt->u.dst.error) {
1263 		case EINVAL:
1264 		default:
1265 			goto out;
1266 		case EHOSTUNREACH:
1267 			code = ICMP_HOST_UNREACH;
1268 			break;
1269 		case ENETUNREACH:
1270 			code = ICMP_NET_UNREACH;
1271 			break;
1272 		case EACCES:
1273 			code = ICMP_PKT_FILTERED;
1274 			break;
1275 	}
1276 
1277 	now = jiffies;
1278 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1279 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1280 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1281 	rt->u.dst.rate_last = now;
1282 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1283 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1284 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1285 	}
1286 
1287 out:	kfree_skb(skb);
1288 	return 0;
1289 }
1290 
1291 /*
1292  *	The last two values are not from the RFC but
1293  *	are needed for AMPRnet AX.25 paths.
1294  */
1295 
1296 static const unsigned short mtu_plateau[] =
1297 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1298 
1299 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1300 {
1301 	int i;
1302 
1303 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1304 		if (old_mtu > mtu_plateau[i])
1305 			return mtu_plateau[i];
1306 	return 68;
1307 }
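/*
 * For example, guess_mtu(1500) returns 1492 (the first plateau below a
 * full Ethernet frame), guess_mtu(600) returns 576, and anything at or
 * below 128 falls through to the minimal IPv4 MTU of 68.
 */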
1308 
1309 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1310 {
1311 	int i;
1312 	unsigned short old_mtu = ntohs(iph->tot_len);
1313 	struct rtable *rth;
1314 	__be32  skeys[2] = { iph->saddr, 0, };
1315 	__be32  daddr = iph->daddr;
1316 	unsigned short est_mtu = 0;
1317 
1318 	if (ipv4_config.no_pmtu_disc)
1319 		return 0;
1320 
1321 	for (i = 0; i < 2; i++) {
1322 		unsigned hash = rt_hash(daddr, skeys[i], 0);
1323 
1324 		rcu_read_lock();
1325 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1326 		     rth = rcu_dereference(rth->u.dst.rt_next)) {
1327 			if (rth->fl.fl4_dst == daddr &&
1328 			    rth->fl.fl4_src == skeys[i] &&
1329 			    rth->rt_dst  == daddr &&
1330 			    rth->rt_src  == iph->saddr &&
1331 			    rth->fl.iif == 0 &&
1332 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1333 				unsigned short mtu = new_mtu;
1334 
1335 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1336 
1337 					/* BSD 4.2 compatibility hack :-( */
1338 					if (mtu == 0 &&
1339 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1340 					    old_mtu >= 68 + (iph->ihl << 2))
1341 						old_mtu -= iph->ihl << 2;
1342 
1343 					mtu = guess_mtu(old_mtu);
1344 				}
1345 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1346 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1347 						dst_confirm(&rth->u.dst);
1348 						if (mtu < ip_rt_min_pmtu) {
1349 							mtu = ip_rt_min_pmtu;
1350 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1351 								(1 << RTAX_MTU);
1352 						}
1353 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1354 						dst_set_expires(&rth->u.dst,
1355 							ip_rt_mtu_expires);
1356 					}
1357 					est_mtu = mtu;
1358 				}
1359 			}
1360 		}
1361 		rcu_read_unlock();
1362 	}
1363 	return est_mtu ? : new_mtu;
1364 }
1365 
1366 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1367 {
1368 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1369 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1370 		if (mtu < ip_rt_min_pmtu) {
1371 			mtu = ip_rt_min_pmtu;
1372 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1373 		}
1374 		dst->metrics[RTAX_MTU-1] = mtu;
1375 		dst_set_expires(dst, ip_rt_mtu_expires);
1376 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1377 	}
1378 }
1379 
1380 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1381 {
1382 	return NULL;
1383 }
1384 
1385 static void ipv4_dst_destroy(struct dst_entry *dst)
1386 {
1387 	struct rtable *rt = (struct rtable *) dst;
1388 	struct inet_peer *peer = rt->peer;
1389 	struct in_device *idev = rt->idev;
1390 
1391 	if (peer) {
1392 		rt->peer = NULL;
1393 		inet_putpeer(peer);
1394 	}
1395 
1396 	if (idev) {
1397 		rt->idev = NULL;
1398 		in_dev_put(idev);
1399 	}
1400 }
1401 
1402 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1403 			    int how)
1404 {
1405 	struct rtable *rt = (struct rtable *) dst;
1406 	struct in_device *idev = rt->idev;
1407 	if (dev != &loopback_dev && idev && idev->dev == dev) {
1408 		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1409 		if (loopback_idev) {
1410 			rt->idev = loopback_idev;
1411 			in_dev_put(idev);
1412 		}
1413 	}
1414 }
1415 
1416 static void ipv4_link_failure(struct sk_buff *skb)
1417 {
1418 	struct rtable *rt;
1419 
1420 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1421 
1422 	rt = (struct rtable *) skb->dst;
1423 	if (rt)
1424 		dst_set_expires(&rt->u.dst, 0);
1425 }
1426 
1427 static int ip_rt_bug(struct sk_buff *skb)
1428 {
1429 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1430 		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1431 		skb->dev ? skb->dev->name : "?");
1432 	kfree_skb(skb);
1433 	return 0;
1434 }
1435 
1436 /*
1437    We do not cache the source address of the outgoing interface,
1438    because it is used only by the IP RR, TS and SRR options,
1439    so it is out of the fast path.
1440 
1441    BTW remember: "addr" is allowed to be unaligned
1442    in IP options!
1443  */
1444 
1445 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1446 {
1447 	__be32 src;
1448 	struct fib_result res;
1449 
1450 	if (rt->fl.iif == 0)
1451 		src = rt->rt_src;
1452 	else if (fib_lookup(&rt->fl, &res) == 0) {
1453 		src = FIB_RES_PREFSRC(res);
1454 		fib_res_put(&res);
1455 	} else
1456 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1457 					RT_SCOPE_UNIVERSE);
1458 	memcpy(addr, &src, 4);
1459 }
1460 
1461 #ifdef CONFIG_NET_CLS_ROUTE
1462 static void set_class_tag(struct rtable *rt, u32 tag)
1463 {
1464 	if (!(rt->u.dst.tclassid & 0xFFFF))
1465 		rt->u.dst.tclassid |= tag & 0xFFFF;
1466 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1467 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1468 }
1469 #endif
1470 
1471 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1472 {
1473 	struct fib_info *fi = res->fi;
1474 
1475 	if (fi) {
1476 		if (FIB_RES_GW(*res) &&
1477 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1478 			rt->rt_gateway = FIB_RES_GW(*res);
1479 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1480 		       sizeof(rt->u.dst.metrics));
1481 		if (fi->fib_mtu == 0) {
1482 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1483 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1484 			    rt->rt_gateway != rt->rt_dst &&
1485 			    rt->u.dst.dev->mtu > 576)
1486 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1487 		}
1488 #ifdef CONFIG_NET_CLS_ROUTE
1489 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1490 #endif
1491 	} else
1492 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1493 
1494 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1495 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1496 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1497 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1498 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1499 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1500 				       ip_rt_min_advmss);
1501 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1502 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1503 
1504 #ifdef CONFIG_NET_CLS_ROUTE
1505 #ifdef CONFIG_IP_MULTIPLE_TABLES
1506 	set_class_tag(rt, fib_rules_tclass(res));
1507 #endif
1508 	set_class_tag(rt, itag);
1509 #endif
1510 	rt->rt_type = res->type;
1511 }
1512 
1513 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1514 				u8 tos, struct net_device *dev, int our)
1515 {
1516 	unsigned hash;
1517 	struct rtable *rth;
1518 	__be32 spec_dst;
1519 	struct in_device *in_dev = in_dev_get(dev);
1520 	u32 itag = 0;
1521 
1522 	/* Primary sanity checks. */
1523 
1524 	if (in_dev == NULL)
1525 		return -EINVAL;
1526 
1527 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1528 	    skb->protocol != htons(ETH_P_IP))
1529 		goto e_inval;
1530 
1531 	if (ZERONET(saddr)) {
1532 		if (!LOCAL_MCAST(daddr))
1533 			goto e_inval;
1534 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1535 	} else if (fib_validate_source(saddr, 0, tos, 0,
1536 					dev, &spec_dst, &itag) < 0)
1537 		goto e_inval;
1538 
1539 	rth = dst_alloc(&ipv4_dst_ops);
1540 	if (!rth)
1541 		goto e_nobufs;
1542 
1543 	rth->u.dst.output= ip_rt_bug;
1544 
1545 	atomic_set(&rth->u.dst.__refcnt, 1);
1546 	rth->u.dst.flags= DST_HOST;
1547 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1548 		rth->u.dst.flags |= DST_NOPOLICY;
1549 	rth->fl.fl4_dst	= daddr;
1550 	rth->rt_dst	= daddr;
1551 	rth->fl.fl4_tos	= tos;
1552 	rth->fl.mark    = skb->mark;
1553 	rth->fl.fl4_src	= saddr;
1554 	rth->rt_src	= saddr;
1555 #ifdef CONFIG_NET_CLS_ROUTE
1556 	rth->u.dst.tclassid = itag;
1557 #endif
1558 	rth->rt_iif	=
1559 	rth->fl.iif	= dev->ifindex;
1560 	rth->u.dst.dev	= &loopback_dev;
1561 	dev_hold(rth->u.dst.dev);
1562 	rth->idev	= in_dev_get(rth->u.dst.dev);
1563 	rth->fl.oif	= 0;
1564 	rth->rt_gateway	= daddr;
1565 	rth->rt_spec_dst= spec_dst;
1566 	rth->rt_type	= RTN_MULTICAST;
1567 	rth->rt_flags	= RTCF_MULTICAST;
1568 	if (our) {
1569 		rth->u.dst.input= ip_local_deliver;
1570 		rth->rt_flags |= RTCF_LOCAL;
1571 	}
1572 
1573 #ifdef CONFIG_IP_MROUTE
1574 	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1575 		rth->u.dst.input = ip_mr_input;
1576 #endif
1577 	RT_CACHE_STAT_INC(in_slow_mc);
1578 
1579 	in_dev_put(in_dev);
1580 	hash = rt_hash(daddr, saddr, dev->ifindex);
1581 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1582 
1583 e_nobufs:
1584 	in_dev_put(in_dev);
1585 	return -ENOBUFS;
1586 
1587 e_inval:
1588 	in_dev_put(in_dev);
1589 	return -EINVAL;
1590 }
1591 
1592 
1593 static void ip_handle_martian_source(struct net_device *dev,
1594 				     struct in_device *in_dev,
1595 				     struct sk_buff *skb,
1596 				     __be32 daddr,
1597 				     __be32 saddr)
1598 {
1599 	RT_CACHE_STAT_INC(in_martian_src);
1600 #ifdef CONFIG_IP_ROUTE_VERBOSE
1601 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1602 		/*
1603 		 *	RFC 1812 recommendation: if the source is martian,
1604 		 *	the only hint is the MAC header.
1605 		 */
1606 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1607 			"%u.%u.%u.%u, on dev %s\n",
1608 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1609 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1610 			int i;
1611 			const unsigned char *p = skb_mac_header(skb);
1612 			printk(KERN_WARNING "ll header: ");
1613 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1614 				printk("%02x", *p);
1615 				if (i < (dev->hard_header_len - 1))
1616 					printk(":");
1617 			}
1618 			printk("\n");
1619 		}
1620 	}
1621 #endif
1622 }
1623 
1624 static inline int __mkroute_input(struct sk_buff *skb,
1625 				  struct fib_result* res,
1626 				  struct in_device *in_dev,
1627 				  __be32 daddr, __be32 saddr, u32 tos,
1628 				  struct rtable **result)
1629 {
1630 
1631 	struct rtable *rth;
1632 	int err;
1633 	struct in_device *out_dev;
1634 	unsigned flags = 0;
1635 	__be32 spec_dst;
1636 	u32 itag;
1637 
1638 	/* get a working reference to the output device */
1639 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1640 	if (out_dev == NULL) {
1641 		if (net_ratelimit())
1642 			printk(KERN_CRIT "Bug in ip_route_input" \
1643 			       "_slow(). Please, report\n");
1644 		return -EINVAL;
1645 	}
1646 
1647 
1648 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1649 				  in_dev->dev, &spec_dst, &itag);
1650 	if (err < 0) {
1651 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1652 					 saddr);
1653 
1654 		err = -EINVAL;
1655 		goto cleanup;
1656 	}
1657 
1658 	if (err)
1659 		flags |= RTCF_DIRECTSRC;
1660 
1661 	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1662 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1663 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1664 		flags |= RTCF_DOREDIRECT;
1665 
1666 	if (skb->protocol != htons(ETH_P_IP)) {
1667 		/* Not IP (i.e. ARP). Do not create a route if it is
1668 		 * invalid for proxy ARP. DNAT routes are always valid.
1669 		 */
1670 		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1671 			err = -EINVAL;
1672 			goto cleanup;
1673 		}
1674 	}
1675 
1676 
1677 	rth = dst_alloc(&ipv4_dst_ops);
1678 	if (!rth) {
1679 		err = -ENOBUFS;
1680 		goto cleanup;
1681 	}
1682 
1683 	atomic_set(&rth->u.dst.__refcnt, 1);
1684 	rth->u.dst.flags= DST_HOST;
1685 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1686 		rth->u.dst.flags |= DST_NOPOLICY;
1687 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1688 		rth->u.dst.flags |= DST_NOXFRM;
1689 	rth->fl.fl4_dst	= daddr;
1690 	rth->rt_dst	= daddr;
1691 	rth->fl.fl4_tos	= tos;
1692 	rth->fl.mark    = skb->mark;
1693 	rth->fl.fl4_src	= saddr;
1694 	rth->rt_src	= saddr;
1695 	rth->rt_gateway	= daddr;
1696 	rth->rt_iif 	=
1697 		rth->fl.iif	= in_dev->dev->ifindex;
1698 	rth->u.dst.dev	= (out_dev)->dev;
1699 	dev_hold(rth->u.dst.dev);
1700 	rth->idev	= in_dev_get(rth->u.dst.dev);
1701 	rth->fl.oif 	= 0;
1702 	rth->rt_spec_dst= spec_dst;
1703 
1704 	rth->u.dst.input = ip_forward;
1705 	rth->u.dst.output = ip_output;
1706 
1707 	rt_set_nexthop(rth, res, itag);
1708 
1709 	rth->rt_flags = flags;
1710 
1711 	*result = rth;
1712 	err = 0;
1713  cleanup:
1714 	/* release the working reference to the output device */
1715 	in_dev_put(out_dev);
1716 	return err;
1717 }
1718 
1719 static inline int ip_mkroute_input(struct sk_buff *skb,
1720 				   struct fib_result* res,
1721 				   const struct flowi *fl,
1722 				   struct in_device *in_dev,
1723 				   __be32 daddr, __be32 saddr, u32 tos)
1724 {
1725 	struct rtable* rth = NULL;
1726 	int err;
1727 	unsigned hash;
1728 
1729 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1730 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1731 		fib_select_multipath(fl, res);
1732 #endif
1733 
1734 	/* create a routing cache entry */
1735 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1736 	if (err)
1737 		return err;
1738 
1739 	/* put it into the cache */
1740 	hash = rt_hash(daddr, saddr, fl->iif);
1741 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1742 }
1743 
1744 /*
1745  *	NOTE. We drop all packets that have a local source
1746  *	address, because every properly looped-back packet
1747  *	must have the correct destination already attached by the output routine.
1748  *
1749  *	This approach solves two big problems:
1750  *	1. Non-simplex devices are handled properly.
1751  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1752  */
1753 
1754 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1755 			       u8 tos, struct net_device *dev)
1756 {
1757 	struct fib_result res;
1758 	struct in_device *in_dev = in_dev_get(dev);
1759 	struct flowi fl = { .nl_u = { .ip4_u =
1760 				      { .daddr = daddr,
1761 					.saddr = saddr,
1762 					.tos = tos,
1763 					.scope = RT_SCOPE_UNIVERSE,
1764 				      } },
1765 			    .mark = skb->mark,
1766 			    .iif = dev->ifindex };
1767 	unsigned	flags = 0;
1768 	u32		itag = 0;
1769 	struct rtable * rth;
1770 	unsigned	hash;
1771 	__be32		spec_dst;
1772 	int		err = -EINVAL;
1773 	int		free_res = 0;
1774 
1775 	/* IP on this device is disabled. */
1776 
1777 	if (!in_dev)
1778 		goto out;
1779 
1780 	/* Check for the most weird martians, which cannot be detected
1781 	   by fib_lookup.
1782 	 */
1783 
1784 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1785 		goto martian_source;
1786 
1787 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1788 		goto brd_input;
1789 
1790 	/* Accept zero addresses only to limited broadcast;
1791 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1792 	 */
1793 	if (ZERONET(saddr))
1794 		goto martian_source;
1795 
1796 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1797 		goto martian_destination;
1798 
1799 	/*
1800 	 *	Now we are ready to route the packet.
1801 	 */
1802 	if ((err = fib_lookup(&fl, &res)) != 0) {
1803 		if (!IN_DEV_FORWARD(in_dev))
1804 			goto e_hostunreach;
1805 		goto no_route;
1806 	}
1807 	free_res = 1;
1808 
1809 	RT_CACHE_STAT_INC(in_slow_tot);
1810 
1811 	if (res.type == RTN_BROADCAST)
1812 		goto brd_input;
1813 
1814 	if (res.type == RTN_LOCAL) {
1815 		int result;
1816 		result = fib_validate_source(saddr, daddr, tos,
1817 					     loopback_dev.ifindex,
1818 					     dev, &spec_dst, &itag);
1819 		if (result < 0)
1820 			goto martian_source;
1821 		if (result)
1822 			flags |= RTCF_DIRECTSRC;
1823 		spec_dst = daddr;
1824 		goto local_input;
1825 	}
1826 
1827 	if (!IN_DEV_FORWARD(in_dev))
1828 		goto e_hostunreach;
1829 	if (res.type != RTN_UNICAST)
1830 		goto martian_destination;
1831 
1832 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1833 	if (err == -ENOBUFS)
1834 		goto e_nobufs;
1835 	if (err == -EINVAL)
1836 		goto e_inval;
1837 
1838 done:
1839 	in_dev_put(in_dev);
1840 	if (free_res)
1841 		fib_res_put(&res);
1842 out:	return err;
1843 
1844 brd_input:
1845 	if (skb->protocol != htons(ETH_P_IP))
1846 		goto e_inval;
1847 
1848 	if (ZERONET(saddr))
1849 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1850 	else {
1851 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1852 					  &itag);
1853 		if (err < 0)
1854 			goto martian_source;
1855 		if (err)
1856 			flags |= RTCF_DIRECTSRC;
1857 	}
1858 	flags |= RTCF_BROADCAST;
1859 	res.type = RTN_BROADCAST;
1860 	RT_CACHE_STAT_INC(in_brd);
1861 
1862 local_input:
1863 	rth = dst_alloc(&ipv4_dst_ops);
1864 	if (!rth)
1865 		goto e_nobufs;
1866 
1867 	rth->u.dst.output= ip_rt_bug;
1868 
1869 	atomic_set(&rth->u.dst.__refcnt, 1);
1870 	rth->u.dst.flags= DST_HOST;
1871 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1872 		rth->u.dst.flags |= DST_NOPOLICY;
1873 	rth->fl.fl4_dst	= daddr;
1874 	rth->rt_dst	= daddr;
1875 	rth->fl.fl4_tos	= tos;
1876 	rth->fl.mark    = skb->mark;
1877 	rth->fl.fl4_src	= saddr;
1878 	rth->rt_src	= saddr;
1879 #ifdef CONFIG_NET_CLS_ROUTE
1880 	rth->u.dst.tclassid = itag;
1881 #endif
1882 	rth->rt_iif	=
1883 	rth->fl.iif	= dev->ifindex;
1884 	rth->u.dst.dev	= &loopback_dev;
1885 	dev_hold(rth->u.dst.dev);
1886 	rth->idev	= in_dev_get(rth->u.dst.dev);
1887 	rth->rt_gateway	= daddr;
1888 	rth->rt_spec_dst= spec_dst;
1889 	rth->u.dst.input= ip_local_deliver;
1890 	rth->rt_flags 	= flags|RTCF_LOCAL;
1891 	if (res.type == RTN_UNREACHABLE) {
1892 		rth->u.dst.input= ip_error;
1893 		rth->u.dst.error= -err;
1894 		rth->rt_flags 	&= ~RTCF_LOCAL;
1895 	}
1896 	rth->rt_type	= res.type;
1897 	hash = rt_hash(daddr, saddr, fl.iif);
1898 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1899 	goto done;
1900 
1901 no_route:
1902 	RT_CACHE_STAT_INC(in_no_route);
1903 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1904 	res.type = RTN_UNREACHABLE;
1905 	goto local_input;
1906 
1907 	/*
1908 	 *	Do not cache martian addresses: they should be logged (RFC1812)
1909 	 */
1910 martian_destination:
1911 	RT_CACHE_STAT_INC(in_martian_dst);
1912 #ifdef CONFIG_IP_ROUTE_VERBOSE
1913 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1914 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1915 			"%u.%u.%u.%u, dev %s\n",
1916 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1917 #endif
1918 
1919 e_hostunreach:
1920 	err = -EHOSTUNREACH;
1921 	goto done;
1922 
1923 e_inval:
1924 	err = -EINVAL;
1925 	goto done;
1926 
1927 e_nobufs:
1928 	err = -ENOBUFS;
1929 	goto done;
1930 
1931 martian_source:
1932 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1933 	goto e_inval;
1934 }
1935 
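/*
 * ip_route_input - resolve the input route for a received packet.
 *
 * Probe the route cache bucket selected by rt_hash(daddr, saddr, iif)
 * first; on a hit the cached dst is attached to skb->dst.  Multicast
 * destinations are recognised here (see the comment below); everything
 * else falls through to ip_route_input_slow(), which consults the FIB.
 * Returns 0 and sets skb->dst on success, a negative errno otherwise.
 *
 * A minimal usage sketch from a hypothetical receive path (the "drop"
 * label is illustrative, not part of this file):
 *
 *	if (ip_route_input(skb, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr,
 *			   ip_hdr(skb)->tos, skb->dev))
 *		goto drop;
 *	return dst_input(skb);
 */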
1936 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1937 		   u8 tos, struct net_device *dev)
1938 {
1939 	struct rtable * rth;
1940 	unsigned	hash;
1941 	int iif = dev->ifindex;
1942 
1943 	tos &= IPTOS_RT_MASK;
1944 	hash = rt_hash(daddr, saddr, iif);
1945 
1946 	rcu_read_lock();
1947 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1948 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
1949 		if (rth->fl.fl4_dst == daddr &&
1950 		    rth->fl.fl4_src == saddr &&
1951 		    rth->fl.iif == iif &&
1952 		    rth->fl.oif == 0 &&
1953 		    rth->fl.mark == skb->mark &&
1954 		    rth->fl.fl4_tos == tos) {
1955 			rth->u.dst.lastuse = jiffies;
1956 			dst_hold(&rth->u.dst);
1957 			rth->u.dst.__use++;
1958 			RT_CACHE_STAT_INC(in_hit);
1959 			rcu_read_unlock();
1960 			skb->dst = (struct dst_entry*)rth;
1961 			return 0;
1962 		}
1963 		RT_CACHE_STAT_INC(in_hlist_search);
1964 	}
1965 	rcu_read_unlock();
1966 
1967 	/* Multicast recognition logic is moved from the route cache to here.
1968 	   The problem was that too many Ethernet cards have broken/missing
1969 	   hardware multicast filters :-( As a result, a host on a multicast
1970 	   network acquires a lot of useless route cache entries, e.g. from
1971 	   SDR messages from all over the world. Now we try to get rid of them.
1972 	   Really, provided the software IP multicast filter is organized
1973 	   reasonably (at least, hashed), it does not result in a slowdown
1974 	   compared with route cache reject entries.
1975 	   Note that multicast routers are not affected, because
1976 	   a route cache entry is created eventually.
1977 	 */
1978 	if (MULTICAST(daddr)) {
1979 		struct in_device *in_dev;
1980 
1981 		rcu_read_lock();
1982 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1983 			int our = ip_check_mc(in_dev, daddr, saddr,
1984 				ip_hdr(skb)->protocol);
1985 			if (our
1986 #ifdef CONFIG_IP_MROUTE
1987 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1988 #endif
1989 			    ) {
1990 				rcu_read_unlock();
1991 				return ip_route_input_mc(skb, daddr, saddr,
1992 							 tos, dev, our);
1993 			}
1994 		}
1995 		rcu_read_unlock();
1996 		return -EINVAL;
1997 	}
1998 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1999 }
2000 
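/*
 * __mkroute_output - build a route cache entry for the output path.
 *
 * Classify the destination (broadcast/multicast/unicast), sanity-check
 * the source address against the output device, then allocate an rtable
 * and fill in the flow keys, device references and the dst input/output
 * handlers (ip_output by default, ip_mc_output and ip_mr_input for the
 * multicast cases).  The new entry is returned through *result; it is
 * not yet inserted into the route cache hash.
 */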
2001 static inline int __mkroute_output(struct rtable **result,
2002 				   struct fib_result* res,
2003 				   const struct flowi *fl,
2004 				   const struct flowi *oldflp,
2005 				   struct net_device *dev_out,
2006 				   unsigned flags)
2007 {
2008 	struct rtable *rth;
2009 	struct in_device *in_dev;
2010 	u32 tos = RT_FL_TOS(oldflp);
2011 	int err = 0;
2012 
2013 	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2014 		return -EINVAL;
2015 
2016 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2017 		res->type = RTN_BROADCAST;
2018 	else if (MULTICAST(fl->fl4_dst))
2019 		res->type = RTN_MULTICAST;
2020 	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2021 		return -EINVAL;
2022 
2023 	if (dev_out->flags & IFF_LOOPBACK)
2024 		flags |= RTCF_LOCAL;
2025 
2026 	/* get a working reference to the inet device */
2027 	in_dev = in_dev_get(dev_out);
2028 	if (!in_dev)
2029 		return -EINVAL;
2030 
2031 	if (res->type == RTN_BROADCAST) {
2032 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2033 		if (res->fi) {
2034 			fib_info_put(res->fi);
2035 			res->fi = NULL;
2036 		}
2037 	} else if (res->type == RTN_MULTICAST) {
2038 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2039 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2040 				 oldflp->proto))
2041 			flags &= ~RTCF_LOCAL;
2042 		/* If a multicast route does not exist, use
2043 		   the default one, but do not gateway in this case.
2044 		   Yes, it is a hack.
2045 		 */
2046 		if (res->fi && res->prefixlen < 4) {
2047 			fib_info_put(res->fi);
2048 			res->fi = NULL;
2049 		}
2050 	}
2051 
2052 
2053 	rth = dst_alloc(&ipv4_dst_ops);
2054 	if (!rth) {
2055 		err = -ENOBUFS;
2056 		goto cleanup;
2057 	}
2058 
2059 	atomic_set(&rth->u.dst.__refcnt, 1);
2060 	rth->u.dst.flags= DST_HOST;
2061 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2062 		rth->u.dst.flags |= DST_NOXFRM;
2063 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2064 		rth->u.dst.flags |= DST_NOPOLICY;
2065 
2066 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2067 	rth->fl.fl4_tos	= tos;
2068 	rth->fl.fl4_src	= oldflp->fl4_src;
2069 	rth->fl.oif	= oldflp->oif;
2070 	rth->fl.mark    = oldflp->mark;
2071 	rth->rt_dst	= fl->fl4_dst;
2072 	rth->rt_src	= fl->fl4_src;
2073 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2074 	/* get references to the devices that are to be held by the routing
2075 	   cache entry */
2076 	rth->u.dst.dev	= dev_out;
2077 	dev_hold(dev_out);
2078 	rth->idev	= in_dev_get(dev_out);
2079 	rth->rt_gateway = fl->fl4_dst;
2080 	rth->rt_spec_dst= fl->fl4_src;
2081 
2082 	rth->u.dst.output=ip_output;
2083 
2084 	RT_CACHE_STAT_INC(out_slow_tot);
2085 
2086 	if (flags & RTCF_LOCAL) {
2087 		rth->u.dst.input = ip_local_deliver;
2088 		rth->rt_spec_dst = fl->fl4_dst;
2089 	}
2090 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2091 		rth->rt_spec_dst = fl->fl4_src;
2092 		if (flags & RTCF_LOCAL &&
2093 		    !(dev_out->flags & IFF_LOOPBACK)) {
2094 			rth->u.dst.output = ip_mc_output;
2095 			RT_CACHE_STAT_INC(out_slow_mc);
2096 		}
2097 #ifdef CONFIG_IP_MROUTE
2098 		if (res->type == RTN_MULTICAST) {
2099 			if (IN_DEV_MFORWARD(in_dev) &&
2100 			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2101 				rth->u.dst.input = ip_mr_input;
2102 				rth->u.dst.output = ip_mc_output;
2103 			}
2104 		}
2105 #endif
2106 	}
2107 
2108 	rt_set_nexthop(rth, res, 0);
2109 
2110 	rth->rt_flags = flags;
2111 
2112 	*result = rth;
2113  cleanup:
2114 	/* release the working reference to the inet device */
2115 	in_dev_put(in_dev);
2116 
2117 	return err;
2118 }
2119 
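/*
 * ip_mkroute_output - create an output route via __mkroute_output() and
 * insert it into the route cache, hashed on the original flow's
 * (dst, src, oif) triple.
 */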
2120 static inline int ip_mkroute_output(struct rtable **rp,
2121 				    struct fib_result* res,
2122 				    const struct flowi *fl,
2123 				    const struct flowi *oldflp,
2124 				    struct net_device *dev_out,
2125 				    unsigned flags)
2126 {
2127 	struct rtable *rth = NULL;
2128 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2129 	unsigned hash;
2130 	if (err == 0) {
2131 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2132 		err = rt_intern_hash(hash, rth, rp);
2133 	}
2134 
2135 	return err;
2136 }
2137 
2138 /*
2139  * Major route resolver routine.
2140  */
2141 
2142 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2143 {
2144 	u32 tos	= RT_FL_TOS(oldflp);
2145 	struct flowi fl = { .nl_u = { .ip4_u =
2146 				      { .daddr = oldflp->fl4_dst,
2147 					.saddr = oldflp->fl4_src,
2148 					.tos = tos & IPTOS_RT_MASK,
2149 					.scope = ((tos & RTO_ONLINK) ?
2150 						  RT_SCOPE_LINK :
2151 						  RT_SCOPE_UNIVERSE),
2152 				      } },
2153 			    .mark = oldflp->mark,
2154 			    .iif = loopback_dev.ifindex,
2155 			    .oif = oldflp->oif };
2156 	struct fib_result res;
2157 	unsigned flags = 0;
2158 	struct net_device *dev_out = NULL;
2159 	int free_res = 0;
2160 	int err;
2161 
2162 
2163 	res.fi		= NULL;
2164 #ifdef CONFIG_IP_MULTIPLE_TABLES
2165 	res.r		= NULL;
2166 #endif
2167 
2168 	if (oldflp->fl4_src) {
2169 		err = -EINVAL;
2170 		if (MULTICAST(oldflp->fl4_src) ||
2171 		    BADCLASS(oldflp->fl4_src) ||
2172 		    ZERONET(oldflp->fl4_src))
2173 			goto out;
2174 
2175 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2176 		dev_out = ip_dev_find(oldflp->fl4_src);
2177 		if (dev_out == NULL)
2178 			goto out;
2179 
2180 		/* I removed the check for oif == dev_out->oif here.
2181 		   It was wrong for two reasons:
2182 		   1. ip_dev_find(saddr) can return the wrong iface if saddr
2183 		      is assigned to multiple interfaces.
2184 		   2. Moreover, we are allowed to send packets with the saddr
2185 		      of another iface. --ANK
2186 		 */
2187 
2188 		if (oldflp->oif == 0
2189 		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2190 			/* Special hack: the user can direct multicasts
2191 			   and limited broadcast via the desired interface
2192 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2193 			   This hack is not just for fun, it allows
2194 			   vic, vat and friends to work.
2195 			   They bind a socket to loopback, set ttl to zero
2196 			   and expect that it will work.
2197 			   From the viewpoint of the routing cache they are broken,
2198 			   because we are not allowed to build a multicast path
2199 			   with a loopback source addr (the routing cache
2200 			   cannot know that ttl is zero, so that the packet
2201 			   will not leave this host and the route is valid).
2202 			   Luckily, this hack is a good workaround.
2203 			 */
2204 
2205 			fl.oif = dev_out->ifindex;
2206 			goto make_route;
2207 		}
2208 		if (dev_out)
2209 			dev_put(dev_out);
2210 		dev_out = NULL;
2211 	}
2212 
2213 
2214 	if (oldflp->oif) {
2215 		dev_out = dev_get_by_index(oldflp->oif);
2216 		err = -ENODEV;
2217 		if (dev_out == NULL)
2218 			goto out;
2219 
2220 		/* RACE: Check return value of inet_select_addr instead. */
2221 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2222 			dev_put(dev_out);
2223 			goto out;	/* Wrong error code */
2224 		}
2225 
2226 		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2227 			if (!fl.fl4_src)
2228 				fl.fl4_src = inet_select_addr(dev_out, 0,
2229 							      RT_SCOPE_LINK);
2230 			goto make_route;
2231 		}
2232 		if (!fl.fl4_src) {
2233 			if (MULTICAST(oldflp->fl4_dst))
2234 				fl.fl4_src = inet_select_addr(dev_out, 0,
2235 							      fl.fl4_scope);
2236 			else if (!oldflp->fl4_dst)
2237 				fl.fl4_src = inet_select_addr(dev_out, 0,
2238 							      RT_SCOPE_HOST);
2239 		}
2240 	}
2241 
2242 	if (!fl.fl4_dst) {
2243 		fl.fl4_dst = fl.fl4_src;
2244 		if (!fl.fl4_dst)
2245 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2246 		if (dev_out)
2247 			dev_put(dev_out);
2248 		dev_out = &loopback_dev;
2249 		dev_hold(dev_out);
2250 		fl.oif = loopback_dev.ifindex;
2251 		res.type = RTN_LOCAL;
2252 		flags |= RTCF_LOCAL;
2253 		goto make_route;
2254 	}
2255 
2256 	if (fib_lookup(&fl, &res)) {
2257 		res.fi = NULL;
2258 		if (oldflp->oif) {
2259 			/* Apparently, the routing tables are wrong. Assume
2260 			   that the destination is on-link.
2261 
2262 			   WHY? DW.
2263 			   Because we are allowed to send to an iface
2264 			   even if it has NO routes and NO assigned
2265 			   addresses. When oif is specified, routing
2266 			   tables are looked up with only one purpose:
2267 			   to catch whether the destination is gatewayed rather
2268 			   than direct. Moreover, if MSG_DONTROUTE is set,
2269 			   we send the packet, ignoring both routing tables
2270 			   and ifaddr state. --ANK
2271 
2272 
2273 			   We could do this even if oif is unknown
2274 			   (IPv6 likely does), but we do not.
2275 			 */
2276 
2277 			if (fl.fl4_src == 0)
2278 				fl.fl4_src = inet_select_addr(dev_out, 0,
2279 							      RT_SCOPE_LINK);
2280 			res.type = RTN_UNICAST;
2281 			goto make_route;
2282 		}
2283 		if (dev_out)
2284 			dev_put(dev_out);
2285 		err = -ENETUNREACH;
2286 		goto out;
2287 	}
2288 	free_res = 1;
2289 
2290 	if (res.type == RTN_LOCAL) {
2291 		if (!fl.fl4_src)
2292 			fl.fl4_src = fl.fl4_dst;
2293 		if (dev_out)
2294 			dev_put(dev_out);
2295 		dev_out = &loopback_dev;
2296 		dev_hold(dev_out);
2297 		fl.oif = dev_out->ifindex;
2298 		if (res.fi)
2299 			fib_info_put(res.fi);
2300 		res.fi = NULL;
2301 		flags |= RTCF_LOCAL;
2302 		goto make_route;
2303 	}
2304 
2305 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2306 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2307 		fib_select_multipath(&fl, &res);
2308 	else
2309 #endif
2310 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2311 		fib_select_default(&fl, &res);
2312 
2313 	if (!fl.fl4_src)
2314 		fl.fl4_src = FIB_RES_PREFSRC(res);
2315 
2316 	if (dev_out)
2317 		dev_put(dev_out);
2318 	dev_out = FIB_RES_DEV(res);
2319 	dev_hold(dev_out);
2320 	fl.oif = dev_out->ifindex;
2321 
2322 
2323 make_route:
2324 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2325 
2326 
2327 	if (free_res)
2328 		fib_res_put(&res);
2329 	if (dev_out)
2330 		dev_put(dev_out);
2331 out:	return err;
2332 }
2333 
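/*
 * __ip_route_output_key - route cache front end for output routes.
 *
 * Walk the hash chain selected by rt_hash(dst, src, oif) looking for an
 * entry whose flow keys (daddr, saddr, oif, mark, tos) match the
 * request; iif must be 0, so cached input routes are never returned
 * here.  On a miss, fall back to ip_route_output_slow().
 */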
2334 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2335 {
2336 	unsigned hash;
2337 	struct rtable *rth;
2338 
2339 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2340 
2341 	rcu_read_lock_bh();
2342 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2343 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2344 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2345 		    rth->fl.fl4_src == flp->fl4_src &&
2346 		    rth->fl.iif == 0 &&
2347 		    rth->fl.oif == flp->oif &&
2348 		    rth->fl.mark == flp->mark &&
2349 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2350 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2351 			rth->u.dst.lastuse = jiffies;
2352 			dst_hold(&rth->u.dst);
2353 			rth->u.dst.__use++;
2354 			RT_CACHE_STAT_INC(out_hit);
2355 			rcu_read_unlock_bh();
2356 			*rp = rth;
2357 			return 0;
2358 		}
2359 		RT_CACHE_STAT_INC(out_hlist_search);
2360 	}
2361 	rcu_read_unlock_bh();
2362 
2363 	return ip_route_output_slow(rp, flp);
2364 }
2365 
2366 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2367 
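/*
 * Black-hole dst_ops: a stripped-down copy of a real route whose
 * input/output handlers simply free the skb.  ipv4_dst_blackhole()
 * below clones the metrics and addressing information of an existing
 * rtable into such an entry, so callers keep a valid dst while the
 * packets themselves are silently discarded (used below when
 * __xfrm_lookup() returns -EREMOTE).
 */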
2368 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2369 {
2370 }
2371 
2372 static struct dst_ops ipv4_dst_blackhole_ops = {
2373 	.family			=	AF_INET,
2374 	.protocol		=	__constant_htons(ETH_P_IP),
2375 	.destroy		=	ipv4_dst_destroy,
2376 	.check			=	ipv4_dst_check,
2377 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2378 	.entry_size		=	sizeof(struct rtable),
2379 };
2380 
2381 
2382 static int ipv4_blackhole_output(struct sk_buff *skb)
2383 {
2384 	kfree_skb(skb);
2385 	return 0;
2386 }
2387 
2388 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2389 {
2390 	struct rtable *ort = *rp;
2391 	struct rtable *rt = (struct rtable *)
2392 		dst_alloc(&ipv4_dst_blackhole_ops);
2393 
2394 	if (rt) {
2395 		struct dst_entry *new = &rt->u.dst;
2396 
2397 		atomic_set(&new->__refcnt, 1);
2398 		new->__use = 1;
2399 		new->input = ipv4_blackhole_output;
2400 		new->output = ipv4_blackhole_output;
2401 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2402 
2403 		new->dev = ort->u.dst.dev;
2404 		if (new->dev)
2405 			dev_hold(new->dev);
2406 
2407 		rt->fl = ort->fl;
2408 
2409 		rt->idev = ort->idev;
2410 		if (rt->idev)
2411 			in_dev_hold(rt->idev);
2412 		rt->rt_flags = ort->rt_flags;
2413 		rt->rt_type = ort->rt_type;
2414 		rt->rt_dst = ort->rt_dst;
2415 		rt->rt_src = ort->rt_src;
2416 		rt->rt_iif = ort->rt_iif;
2417 		rt->rt_gateway = ort->rt_gateway;
2418 		rt->rt_spec_dst = ort->rt_spec_dst;
2419 		rt->peer = ort->peer;
2420 		if (rt->peer)
2421 			atomic_inc(&rt->peer->refcnt);
2422 
2423 		dst_free(new);
2424 	}
2425 
2426 	dst_release(&(*rp)->u.dst);
2427 	*rp = rt;
2428 	return (rt ? 0 : -ENOMEM);
2429 }
2430 
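/*
 * ip_route_output_flow - resolve an output route and, when a transport
 * protocol is given in flp->proto, run the result through the XFRM
 * (IPsec) lookup, filling in any still-unspecified saddr/daddr from the
 * resolved route first.
 *
 * A minimal sketch of a typical caller (illustrative only; the local
 * variable names are not from this file):
 *
 *	struct flowi fl = { .oif = 0,
 *			    .nl_u = { .ip4_u = { .daddr = dst,
 *						 .saddr = 0,
 *						 .tos = RT_TOS(tos) } },
 *			    .proto = IPPROTO_UDP };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_flow(&rt, &fl, sk, 0))
 *		return -EHOSTUNREACH;
 *	...
 *	ip_rt_put(rt);
 */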
2431 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2432 {
2433 	int err;
2434 
2435 	if ((err = __ip_route_output_key(rp, flp)) != 0)
2436 		return err;
2437 
2438 	if (flp->proto) {
2439 		if (!flp->fl4_src)
2440 			flp->fl4_src = (*rp)->rt_src;
2441 		if (!flp->fl4_dst)
2442 			flp->fl4_dst = (*rp)->rt_dst;
2443 		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2444 		if (err == -EREMOTE)
2445 			err = ipv4_dst_blackhole(rp, flp, sk);
2446 
2447 		return err;
2448 	}
2449 
2450 	return 0;
2451 }
2452 
2453 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2454 
2455 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2456 {
2457 	return ip_route_output_flow(rp, flp, NULL, 0);
2458 }
2459 
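/*
 * rt_fill_info - translate a cached rtable (taken from skb->dst) into an
 * RTM_NEWROUTE netlink message: address/oif/prefsrc/gateway attributes,
 * metrics, peer information and cache timing data.  Returns the result
 * of nlmsg_end() on success, or -EMSGSIZE if the message does not fit.
 */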
2460 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2461 			int nowait, unsigned int flags)
2462 {
2463 	struct rtable *rt = (struct rtable*)skb->dst;
2464 	struct rtmsg *r;
2465 	struct nlmsghdr *nlh;
2466 	long expires;
2467 	u32 id = 0, ts = 0, tsage = 0, error;
2468 
2469 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2470 	if (nlh == NULL)
2471 		return -EMSGSIZE;
2472 
2473 	r = nlmsg_data(nlh);
2474 	r->rtm_family	 = AF_INET;
2475 	r->rtm_dst_len	= 32;
2476 	r->rtm_src_len	= 0;
2477 	r->rtm_tos	= rt->fl.fl4_tos;
2478 	r->rtm_table	= RT_TABLE_MAIN;
2479 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2480 	r->rtm_type	= rt->rt_type;
2481 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2482 	r->rtm_protocol = RTPROT_UNSPEC;
2483 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2484 	if (rt->rt_flags & RTCF_NOTIFY)
2485 		r->rtm_flags |= RTM_F_NOTIFY;
2486 
2487 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2488 
2489 	if (rt->fl.fl4_src) {
2490 		r->rtm_src_len = 32;
2491 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2492 	}
2493 	if (rt->u.dst.dev)
2494 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2495 #ifdef CONFIG_NET_CLS_ROUTE
2496 	if (rt->u.dst.tclassid)
2497 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2498 #endif
2499 	if (rt->fl.iif)
2500 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2501 	else if (rt->rt_src != rt->fl.fl4_src)
2502 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2503 
2504 	if (rt->rt_dst != rt->rt_gateway)
2505 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2506 
2507 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2508 		goto nla_put_failure;
2509 
2510 	error = rt->u.dst.error;
2511 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2512 	if (rt->peer) {
2513 		id = rt->peer->ip_id_count;
2514 		if (rt->peer->tcp_ts_stamp) {
2515 			ts = rt->peer->tcp_ts;
2516 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2517 		}
2518 	}
2519 
2520 	if (rt->fl.iif) {
2521 #ifdef CONFIG_IP_MROUTE
2522 		__be32 dst = rt->rt_dst;
2523 
2524 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2525 		    IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2526 			int err = ipmr_get_route(skb, r, nowait);
2527 			if (err <= 0) {
2528 				if (!nowait) {
2529 					if (err == 0)
2530 						return 0;
2531 					goto nla_put_failure;
2532 				} else {
2533 					if (err == -EMSGSIZE)
2534 						goto nla_put_failure;
2535 					error = err;
2536 				}
2537 			}
2538 		} else
2539 #endif
2540 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2541 	}
2542 
2543 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2544 			       expires, error) < 0)
2545 		goto nla_put_failure;
2546 
2547 	return nlmsg_end(skb, nlh);
2548 
2549 nla_put_failure:
2550 	nlmsg_cancel(skb, nlh);
2551 	return -EMSGSIZE;
2552 }
2553 
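/*
 * inet_rtm_getroute - RTM_GETROUTE handler.  Builds a dummy skb, runs it
 * through ip_route_input() (when an input interface is given) or
 * ip_route_output_key() (otherwise), and reports the resulting cache
 * entry back to the requester via rt_fill_info()/rtnl_unicast().
 */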
2554 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2555 {
2556 	struct rtmsg *rtm;
2557 	struct nlattr *tb[RTA_MAX+1];
2558 	struct rtable *rt = NULL;
2559 	__be32 dst = 0;
2560 	__be32 src = 0;
2561 	u32 iif;
2562 	int err;
2563 	struct sk_buff *skb;
2564 
2565 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2566 	if (err < 0)
2567 		goto errout;
2568 
2569 	rtm = nlmsg_data(nlh);
2570 
2571 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2572 	if (skb == NULL) {
2573 		err = -ENOBUFS;
2574 		goto errout;
2575 	}
2576 
2577 	/* Reserve room for dummy headers; this skb can pass
2578 	   through a good chunk of the routing engine.
2579 	 */
2580 	skb_reset_mac_header(skb);
2581 	skb_reset_network_header(skb);
2582 
2583 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2584 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2585 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2586 
2587 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2588 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2589 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2590 
2591 	if (iif) {
2592 		struct net_device *dev;
2593 
2594 		dev = __dev_get_by_index(iif);
2595 		if (dev == NULL) {
2596 			err = -ENODEV;
2597 			goto errout_free;
2598 		}
2599 
2600 		skb->protocol	= htons(ETH_P_IP);
2601 		skb->dev	= dev;
2602 		local_bh_disable();
2603 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2604 		local_bh_enable();
2605 
2606 		rt = (struct rtable*) skb->dst;
2607 		if (err == 0 && rt->u.dst.error)
2608 			err = -rt->u.dst.error;
2609 	} else {
2610 		struct flowi fl = {
2611 			.nl_u = {
2612 				.ip4_u = {
2613 					.daddr = dst,
2614 					.saddr = src,
2615 					.tos = rtm->rtm_tos,
2616 				},
2617 			},
2618 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2619 		};
2620 		err = ip_route_output_key(&rt, &fl);
2621 	}
2622 
2623 	if (err)
2624 		goto errout_free;
2625 
2626 	skb->dst = &rt->u.dst;
2627 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2628 		rt->rt_flags |= RTCF_NOTIFY;
2629 
2630 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2631 				RTM_NEWROUTE, 0, 0);
2632 	if (err <= 0)
2633 		goto errout_free;
2634 
2635 	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2636 errout:
2637 	return err;
2638 
2639 errout_free:
2640 	kfree_skb(skb);
2641 	goto errout;
2642 }
2643 
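/*
 * ip_rt_dump - dump the whole route cache to netlink, one RTM_NEWROUTE
 * message per entry, restartable via cb->args[0]/args[1] (hash bucket
 * and index within the bucket).
 */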
2644 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2645 {
2646 	struct rtable *rt;
2647 	int h, s_h;
2648 	int idx, s_idx;
2649 
2650 	s_h = cb->args[0];
2651 	s_idx = idx = cb->args[1];
2652 	for (h = 0; h <= rt_hash_mask; h++) {
2653 		if (h < s_h) continue;
2654 		if (h > s_h)
2655 			s_idx = 0;
2656 		rcu_read_lock_bh();
2657 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2658 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2659 			if (idx < s_idx)
2660 				continue;
2661 			skb->dst = dst_clone(&rt->u.dst);
2662 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2663 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2664 					 1, NLM_F_MULTI) <= 0) {
2665 				dst_release(xchg(&skb->dst, NULL));
2666 				rcu_read_unlock_bh();
2667 				goto done;
2668 			}
2669 			dst_release(xchg(&skb->dst, NULL));
2670 		}
2671 		rcu_read_unlock_bh();
2672 	}
2673 
2674 done:
2675 	cb->args[0] = h;
2676 	cb->args[1] = idx;
2677 	return skb->len;
2678 }
2679 
2680 void ip_rt_multicast_event(struct in_device *in_dev)
2681 {
2682 	rt_cache_flush(0);
2683 }
2684 
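/*
 * /proc/sys/net/ipv4/route/ tunables.  Writing an integer to "flush"
 * invokes rt_cache_flush() with that value as the delay; the remaining
 * entries are plain knobs for the garbage collector, redirect rate
 * limiting, PMTU handling and the periodic secret rehash of the route
 * cache.
 */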
2685 #ifdef CONFIG_SYSCTL
2686 static int flush_delay;
2687 
2688 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2689 					struct file *filp, void __user *buffer,
2690 					size_t *lenp, loff_t *ppos)
2691 {
2692 	if (write) {
2693 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2694 		rt_cache_flush(flush_delay);
2695 		return 0;
2696 	}
2697 
2698 	return -EINVAL;
2699 }
2700 
2701 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2702 						int __user *name,
2703 						int nlen,
2704 						void __user *oldval,
2705 						size_t __user *oldlenp,
2706 						void __user *newval,
2707 						size_t newlen)
2708 {
2709 	int delay;
2710 	if (newlen != sizeof(int))
2711 		return -EINVAL;
2712 	if (get_user(delay, (int __user *)newval))
2713 		return -EFAULT;
2714 	rt_cache_flush(delay);
2715 	return 0;
2716 }
2717 
2718 ctl_table ipv4_route_table[] = {
2719 	{
2720 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2721 		.procname	= "flush",
2722 		.data		= &flush_delay,
2723 		.maxlen		= sizeof(int),
2724 		.mode		= 0200,
2725 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2726 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2727 	},
2728 	{
2729 		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2730 		.procname	= "min_delay",
2731 		.data		= &ip_rt_min_delay,
2732 		.maxlen		= sizeof(int),
2733 		.mode		= 0644,
2734 		.proc_handler	= &proc_dointvec_jiffies,
2735 		.strategy	= &sysctl_jiffies,
2736 	},
2737 	{
2738 		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2739 		.procname	= "max_delay",
2740 		.data		= &ip_rt_max_delay,
2741 		.maxlen		= sizeof(int),
2742 		.mode		= 0644,
2743 		.proc_handler	= &proc_dointvec_jiffies,
2744 		.strategy	= &sysctl_jiffies,
2745 	},
2746 	{
2747 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2748 		.procname	= "gc_thresh",
2749 		.data		= &ipv4_dst_ops.gc_thresh,
2750 		.maxlen		= sizeof(int),
2751 		.mode		= 0644,
2752 		.proc_handler	= &proc_dointvec,
2753 	},
2754 	{
2755 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2756 		.procname	= "max_size",
2757 		.data		= &ip_rt_max_size,
2758 		.maxlen		= sizeof(int),
2759 		.mode		= 0644,
2760 		.proc_handler	= &proc_dointvec,
2761 	},
2762 	{
2763 		/*  Deprecated. Use gc_min_interval_ms */
2764 
2765 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2766 		.procname	= "gc_min_interval",
2767 		.data		= &ip_rt_gc_min_interval,
2768 		.maxlen		= sizeof(int),
2769 		.mode		= 0644,
2770 		.proc_handler	= &proc_dointvec_jiffies,
2771 		.strategy	= &sysctl_jiffies,
2772 	},
2773 	{
2774 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2775 		.procname	= "gc_min_interval_ms",
2776 		.data		= &ip_rt_gc_min_interval,
2777 		.maxlen		= sizeof(int),
2778 		.mode		= 0644,
2779 		.proc_handler	= &proc_dointvec_ms_jiffies,
2780 		.strategy	= &sysctl_ms_jiffies,
2781 	},
2782 	{
2783 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2784 		.procname	= "gc_timeout",
2785 		.data		= &ip_rt_gc_timeout,
2786 		.maxlen		= sizeof(int),
2787 		.mode		= 0644,
2788 		.proc_handler	= &proc_dointvec_jiffies,
2789 		.strategy	= &sysctl_jiffies,
2790 	},
2791 	{
2792 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2793 		.procname	= "gc_interval",
2794 		.data		= &ip_rt_gc_interval,
2795 		.maxlen		= sizeof(int),
2796 		.mode		= 0644,
2797 		.proc_handler	= &proc_dointvec_jiffies,
2798 		.strategy	= &sysctl_jiffies,
2799 	},
2800 	{
2801 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2802 		.procname	= "redirect_load",
2803 		.data		= &ip_rt_redirect_load,
2804 		.maxlen		= sizeof(int),
2805 		.mode		= 0644,
2806 		.proc_handler	= &proc_dointvec,
2807 	},
2808 	{
2809 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2810 		.procname	= "redirect_number",
2811 		.data		= &ip_rt_redirect_number,
2812 		.maxlen		= sizeof(int),
2813 		.mode		= 0644,
2814 		.proc_handler	= &proc_dointvec,
2815 	},
2816 	{
2817 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2818 		.procname	= "redirect_silence",
2819 		.data		= &ip_rt_redirect_silence,
2820 		.maxlen		= sizeof(int),
2821 		.mode		= 0644,
2822 		.proc_handler	= &proc_dointvec,
2823 	},
2824 	{
2825 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2826 		.procname	= "error_cost",
2827 		.data		= &ip_rt_error_cost,
2828 		.maxlen		= sizeof(int),
2829 		.mode		= 0644,
2830 		.proc_handler	= &proc_dointvec,
2831 	},
2832 	{
2833 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
2834 		.procname	= "error_burst",
2835 		.data		= &ip_rt_error_burst,
2836 		.maxlen		= sizeof(int),
2837 		.mode		= 0644,
2838 		.proc_handler	= &proc_dointvec,
2839 	},
2840 	{
2841 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
2842 		.procname	= "gc_elasticity",
2843 		.data		= &ip_rt_gc_elasticity,
2844 		.maxlen		= sizeof(int),
2845 		.mode		= 0644,
2846 		.proc_handler	= &proc_dointvec,
2847 	},
2848 	{
2849 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
2850 		.procname	= "mtu_expires",
2851 		.data		= &ip_rt_mtu_expires,
2852 		.maxlen		= sizeof(int),
2853 		.mode		= 0644,
2854 		.proc_handler	= &proc_dointvec_jiffies,
2855 		.strategy	= &sysctl_jiffies,
2856 	},
2857 	{
2858 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
2859 		.procname	= "min_pmtu",
2860 		.data		= &ip_rt_min_pmtu,
2861 		.maxlen		= sizeof(int),
2862 		.mode		= 0644,
2863 		.proc_handler	= &proc_dointvec,
2864 	},
2865 	{
2866 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
2867 		.procname	= "min_adv_mss",
2868 		.data		= &ip_rt_min_advmss,
2869 		.maxlen		= sizeof(int),
2870 		.mode		= 0644,
2871 		.proc_handler	= &proc_dointvec,
2872 	},
2873 	{
2874 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
2875 		.procname	= "secret_interval",
2876 		.data		= &ip_rt_secret_interval,
2877 		.maxlen		= sizeof(int),
2878 		.mode		= 0644,
2879 		.proc_handler	= &proc_dointvec_jiffies,
2880 		.strategy	= &sysctl_jiffies,
2881 	},
2882 	{ .ctl_name = 0 }
2883 };
2884 #endif
2885 
2886 #ifdef CONFIG_NET_CLS_ROUTE
2887 struct ip_rt_acct *ip_rt_acct;
2888 
2889 /* This code sucks.  But you should have seen it before! --RR */
2890 
2891 /* IP route accounting ptr for this logical cpu number. */
2892 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
2893 
2894 #ifdef CONFIG_PROC_FS
2895 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2896 			   int length, int *eof, void *data)
2897 {
2898 	unsigned int i;
2899 
2900 	if ((offset & 3) || (length & 3))
2901 		return -EIO;
2902 
2903 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
2904 		*eof = 1;
2905 		return 0;
2906 	}
2907 
2908 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2909 		length = sizeof(struct ip_rt_acct) * 256 - offset;
2910 		*eof = 1;
2911 	}
2912 
2913 	offset /= sizeof(u32);
2914 
2915 	if (length > 0) {
2916 		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2917 		u32 *dst = (u32 *) buffer;
2918 
2919 		/* Copy first cpu. */
2920 		*start = buffer;
2921 		memcpy(dst, src, length);
2922 
2923 		/* Add the other cpus in, one int at a time */
2924 		for_each_possible_cpu(i) {
2925 			unsigned int j;
2926 
2927 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2928 
2929 			for (j = 0; j < length/4; j++)
2930 				dst[j] += src[j];
2931 		}
2932 	}
2933 	return length;
2934 }
2935 #endif /* CONFIG_PROC_FS */
2936 #endif /* CONFIG_NET_CLS_ROUTE */
2937 
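/*
 * "rhash_entries=N" on the kernel command line overrides the
 * automatically sized IP route cache hash table passed to
 * alloc_large_system_hash() in ip_rt_init() below.
 */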
2938 static __initdata unsigned long rhash_entries;
2939 static int __init set_rhash_entries(char *str)
2940 {
2941 	if (!str)
2942 		return 0;
2943 	rhash_entries = simple_strtoul(str, &str, 0);
2944 	return 1;
2945 }
2946 __setup("rhash_entries=", set_rhash_entries);
2947 
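/*
 * ip_rt_init - boot-time initialisation: seed the hash secret, set up
 * the dst slab caches and the route cache hash table, register the
 * flush/expire/secret timers, the /proc entries and the RTM_GETROUTE
 * handler.
 */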
2948 int __init ip_rt_init(void)
2949 {
2950 	int rc = 0;
2951 
2952 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2953 			     (jiffies ^ (jiffies >> 7)));
2954 
2955 #ifdef CONFIG_NET_CLS_ROUTE
2956 	{
2957 	int order;
2958 	for (order = 0;
2959 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2960 		/* NOTHING */;
2961 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2962 	if (!ip_rt_acct)
2963 		panic("IP: failed to allocate ip_rt_acct\n");
2964 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
2965 	}
2966 #endif
2967 
2968 	ipv4_dst_ops.kmem_cachep =
2969 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2970 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2971 
2972 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2973 
2974 	rt_hash_table = (struct rt_hash_bucket *)
2975 		alloc_large_system_hash("IP route cache",
2976 					sizeof(struct rt_hash_bucket),
2977 					rhash_entries,
2978 					(num_physpages >= 128 * 1024) ?
2979 					15 : 17,
2980 					0,
2981 					&rt_hash_log,
2982 					&rt_hash_mask,
2983 					0);
2984 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2985 	rt_hash_lock_init();
2986 
2987 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2988 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
2989 
2990 	devinet_init();
2991 	ip_fib_init();
2992 
2993 	init_timer(&rt_flush_timer);
2994 	rt_flush_timer.function = rt_run_flush;
2995 	init_timer(&rt_periodic_timer);
2996 	rt_periodic_timer.function = rt_check_expire;
2997 	init_timer(&rt_secret_timer);
2998 	rt_secret_timer.function = rt_secret_rebuild;
2999 
3000 	/* All the timers started at system startup tend
3001 	   to synchronize. Perturb them a bit.
3002 	 */
3003 	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3004 					ip_rt_gc_interval;
3005 	add_timer(&rt_periodic_timer);
3006 
3007 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3008 		ip_rt_secret_interval;
3009 	add_timer(&rt_secret_timer);
3010 
3011 #ifdef CONFIG_PROC_FS
3012 	{
3013 	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3014 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3015 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3016 					     proc_net_stat))) {
3017 		return -ENOMEM;
3018 	}
3019 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3020 	}
3021 #ifdef CONFIG_NET_CLS_ROUTE
3022 	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3023 #endif
3024 #endif
3025 #ifdef CONFIG_XFRM
3026 	xfrm_init();
3027 	xfrm4_init();
3028 #endif
3029 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3030 
3031 	return rc;
3032 }
3033 
3034 EXPORT_SYMBOL(__ip_select_ident);
3035 EXPORT_SYMBOL(ip_route_input);
3036 EXPORT_SYMBOL(ip_route_output_key);
3037