xref: /openbmc/linux/net/ipv4/route.c (revision 96de0e252cedffad61b3cb5e05662c591898e69a)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
59  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
60  *
61  *		This program is free software; you can redistribute it and/or
62  *		modify it under the terms of the GNU General Public License
63  *		as published by the Free Software Foundation; either version
64  *		2 of the License, or (at your option) any later version.
65  */
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 
115 #define IP_MAX_MTU	0xFFF0
116 
117 #define RT_GC_TIMEOUT (300*HZ)
118 
119 static int ip_rt_min_delay		= 2 * HZ;
120 static int ip_rt_max_delay		= 10 * HZ;
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval		= 60 * HZ;
124 static int ip_rt_gc_min_interval	= HZ / 2;
125 static int ip_rt_redirect_number	= 9;
126 static int ip_rt_redirect_load		= HZ / 50;
127 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost		= HZ;
129 static int ip_rt_error_burst		= 5 * HZ;
130 static int ip_rt_gc_elasticity		= 8;
131 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
132 static int ip_rt_min_pmtu		= 512 + 20 + 20;
133 static int ip_rt_min_advmss		= 256;
134 static int ip_rt_secret_interval	= 10 * 60 * HZ;
135 static unsigned long rt_deadline;
136 
137 #define RTprint(a...)	printk(KERN_DEBUG a)
138 
139 static struct timer_list rt_flush_timer;
140 static void rt_check_expire(struct work_struct *work);
141 static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
142 static struct timer_list rt_secret_timer;
143 
144 /*
145  *	Interface to generic destination cache.
146  */
147 
148 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
149 static void		 ipv4_dst_destroy(struct dst_entry *dst);
150 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
151 					 struct net_device *dev, int how);
152 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
153 static void		 ipv4_link_failure(struct sk_buff *skb);
154 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
155 static int rt_garbage_collect(void);
156 
157 
158 static struct dst_ops ipv4_dst_ops = {
159 	.family =		AF_INET,
160 	.protocol =		__constant_htons(ETH_P_IP),
161 	.gc =			rt_garbage_collect,
162 	.check =		ipv4_dst_check,
163 	.destroy =		ipv4_dst_destroy,
164 	.ifdown =		ipv4_dst_ifdown,
165 	.negative_advice =	ipv4_negative_advice,
166 	.link_failure =		ipv4_link_failure,
167 	.update_pmtu =		ip_rt_update_pmtu,
168 	.entry_size =		sizeof(struct rtable),
169 };
170 
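/*
 * ip_tos2prio[] maps the IPv4 TOS field to a packet scheduler priority
 * band (TC_PRIO_*).  ECN_OR_COST(class) simply expands to TC_PRIO_<class>,
 * so adjacent table entries that differ only in the lowest index bit
 * normally end up in the same traffic class.
 */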
171 #define ECN_OR_COST(class)	TC_PRIO_##class
172 
173 const __u8 ip_tos2prio[16] = {
174 	TC_PRIO_BESTEFFORT,
175 	ECN_OR_COST(FILLER),
176 	TC_PRIO_BESTEFFORT,
177 	ECN_OR_COST(BESTEFFORT),
178 	TC_PRIO_BULK,
179 	ECN_OR_COST(BULK),
180 	TC_PRIO_BULK,
181 	ECN_OR_COST(BULK),
182 	TC_PRIO_INTERACTIVE,
183 	ECN_OR_COST(INTERACTIVE),
184 	TC_PRIO_INTERACTIVE,
185 	ECN_OR_COST(INTERACTIVE),
186 	TC_PRIO_INTERACTIVE_BULK,
187 	ECN_OR_COST(INTERACTIVE_BULK),
188 	TC_PRIO_INTERACTIVE_BULK,
189 	ECN_OR_COST(INTERACTIVE_BULK)
190 };
191 
192 
193 /*
194  * Route cache.
195  */
196 
197 /* The locking scheme is rather straightforward:
198  *
199  * 1) Read-Copy Update protects the buckets of the central route hash.
200  * 2) Only writers remove entries, and they hold the lock
201  *    as they look at rtable reference counts.
202  * 3) Only readers acquire references to rtable entries,
203  *    they do so with atomic increments and with the
204  *    lock held.
205  */
206 
207 struct rt_hash_bucket {
208 	struct rtable	*chain;
209 };
210 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
211 	defined(CONFIG_PROVE_LOCKING)
212 /*
213  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
214  * The size of this table is a power of two and depends on the number of CPUs.
215  * (With lockdep enabled, spinlock_t is quite big, so keep the size down there.)
216  */
217 #ifdef CONFIG_LOCKDEP
218 # define RT_HASH_LOCK_SZ	256
219 #else
220 # if NR_CPUS >= 32
221 #  define RT_HASH_LOCK_SZ	4096
222 # elif NR_CPUS >= 16
223 #  define RT_HASH_LOCK_SZ	2048
224 # elif NR_CPUS >= 8
225 #  define RT_HASH_LOCK_SZ	1024
226 # elif NR_CPUS >= 4
227 #  define RT_HASH_LOCK_SZ	512
228 # else
229 #  define RT_HASH_LOCK_SZ	256
230 # endif
231 #endif
232 
233 static spinlock_t	*rt_hash_locks;
234 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
235 # define rt_hash_lock_init()	{ \
236 		int i; \
237 		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
238 		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
239 		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
240 			spin_lock_init(&rt_hash_locks[i]); \
241 		}
242 #else
243 # define rt_hash_lock_addr(slot) NULL
244 # define rt_hash_lock_init()
245 #endif
246 
247 static struct rt_hash_bucket 	*rt_hash_table;
248 static unsigned			rt_hash_mask;
249 static unsigned int		rt_hash_log;
250 static unsigned int		rt_hash_rnd;
251 
252 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
253 #define RT_CACHE_STAT_INC(field) \
254 	(__raw_get_cpu_var(rt_cache_stat).field++)
255 
256 static int rt_intern_hash(unsigned hash, struct rtable *rth,
257 				struct rtable **res);
258 
259 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
260 {
261 	return (jhash_2words(daddr, saddr, rt_hash_rnd)
262 		& rt_hash_mask);
263 }
264 
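/*
 * The cache hash mixes the destination and source addresses (the source
 * xored with the interface index shifted left by five bits) together with
 * the random seed rt_hash_rnd via jhash_2words(), then masks the result to
 * the table size.  The seed is re-randomized periodically (see
 * rt_secret_rebuild), so hash chains are hard to predict from outside.
 */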
265 #define rt_hash(daddr, saddr, idx) \
266 	rt_hash_code((__force u32)(__be32)(daddr),\
267 		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
268 
269 #ifdef CONFIG_PROC_FS
270 struct rt_cache_iter_state {
271 	int bucket;
272 };
273 
274 static struct rtable *rt_cache_get_first(struct seq_file *seq)
275 {
276 	struct rtable *r = NULL;
277 	struct rt_cache_iter_state *st = seq->private;
278 
279 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
280 		rcu_read_lock_bh();
281 		r = rt_hash_table[st->bucket].chain;
282 		if (r)
283 			break;
284 		rcu_read_unlock_bh();
285 	}
286 	return r;
287 }
288 
289 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
290 {
291 	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
292 
293 	r = r->u.dst.rt_next;
294 	while (!r) {
295 		rcu_read_unlock_bh();
296 		if (--st->bucket < 0)
297 			break;
298 		rcu_read_lock_bh();
299 		r = rt_hash_table[st->bucket].chain;
300 	}
301 	return r;
302 }
303 
304 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
305 {
306 	struct rtable *r = rt_cache_get_first(seq);
307 
308 	if (r)
309 		while (pos && (r = rt_cache_get_next(seq, r)))
310 			--pos;
311 	return pos ? NULL : r;
312 }
313 
314 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
315 {
316 	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
317 }
318 
319 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
320 {
321 	struct rtable *r = NULL;
322 
323 	if (v == SEQ_START_TOKEN)
324 		r = rt_cache_get_first(seq);
325 	else
326 		r = rt_cache_get_next(seq, v);
327 	++*pos;
328 	return r;
329 }
330 
331 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
332 {
333 	if (v && v != SEQ_START_TOKEN)
334 		rcu_read_unlock_bh();
335 }
336 
337 static int rt_cache_seq_show(struct seq_file *seq, void *v)
338 {
339 	if (v == SEQ_START_TOKEN)
340 		seq_printf(seq, "%-127s\n",
341 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
342 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
343 			   "HHUptod\tSpecDst");
344 	else {
345 		struct rtable *r = v;
346 		char temp[256];
347 
348 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
349 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
350 			r->u.dst.dev ? r->u.dst.dev->name : "*",
351 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
352 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
353 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
354 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
355 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
356 			dst_metric(&r->u.dst, RTAX_WINDOW),
357 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
358 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
359 			r->fl.fl4_tos,
360 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
361 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
362 				       dev_queue_xmit) : 0,
363 			r->rt_spec_dst);
364 		seq_printf(seq, "%-127s\n", temp);
365 	}
366 	return 0;
367 }
368 
369 static const struct seq_operations rt_cache_seq_ops = {
370 	.start  = rt_cache_seq_start,
371 	.next   = rt_cache_seq_next,
372 	.stop   = rt_cache_seq_stop,
373 	.show   = rt_cache_seq_show,
374 };
375 
376 static int rt_cache_seq_open(struct inode *inode, struct file *file)
377 {
378 	return seq_open_private(file, &rt_cache_seq_ops,
379 			sizeof(struct rt_cache_iter_state));
380 }
381 
382 static const struct file_operations rt_cache_seq_fops = {
383 	.owner	 = THIS_MODULE,
384 	.open	 = rt_cache_seq_open,
385 	.read	 = seq_read,
386 	.llseek	 = seq_lseek,
387 	.release = seq_release_private,
388 };
389 
390 
391 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
392 {
393 	int cpu;
394 
395 	if (*pos == 0)
396 		return SEQ_START_TOKEN;
397 
398 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
399 		if (!cpu_possible(cpu))
400 			continue;
401 		*pos = cpu+1;
402 		return &per_cpu(rt_cache_stat, cpu);
403 	}
404 	return NULL;
405 }
406 
407 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
408 {
409 	int cpu;
410 
411 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
412 		if (!cpu_possible(cpu))
413 			continue;
414 		*pos = cpu+1;
415 		return &per_cpu(rt_cache_stat, cpu);
416 	}
417 	return NULL;
418 
419 }
420 
421 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
422 {
423 
424 }
425 
426 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
427 {
428 	struct rt_cache_stat *st = v;
429 
430 	if (v == SEQ_START_TOKEN) {
431 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
432 		return 0;
433 	}
434 
435 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
436 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
437 		   atomic_read(&ipv4_dst_ops.entries),
438 		   st->in_hit,
439 		   st->in_slow_tot,
440 		   st->in_slow_mc,
441 		   st->in_no_route,
442 		   st->in_brd,
443 		   st->in_martian_dst,
444 		   st->in_martian_src,
445 
446 		   st->out_hit,
447 		   st->out_slow_tot,
448 		   st->out_slow_mc,
449 
450 		   st->gc_total,
451 		   st->gc_ignored,
452 		   st->gc_goal_miss,
453 		   st->gc_dst_overflow,
454 		   st->in_hlist_search,
455 		   st->out_hlist_search
456 		);
457 	return 0;
458 }
459 
460 static const struct seq_operations rt_cpu_seq_ops = {
461 	.start  = rt_cpu_seq_start,
462 	.next   = rt_cpu_seq_next,
463 	.stop   = rt_cpu_seq_stop,
464 	.show   = rt_cpu_seq_show,
465 };
466 
467 
468 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
469 {
470 	return seq_open(file, &rt_cpu_seq_ops);
471 }
472 
473 static const struct file_operations rt_cpu_seq_fops = {
474 	.owner	 = THIS_MODULE,
475 	.open	 = rt_cpu_seq_open,
476 	.read	 = seq_read,
477 	.llseek	 = seq_lseek,
478 	.release = seq_release,
479 };
480 
481 #endif /* CONFIG_PROC_FS */
482 
483 static __inline__ void rt_free(struct rtable *rt)
484 {
485 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
486 }
487 
488 static __inline__ void rt_drop(struct rtable *rt)
489 {
490 	ip_rt_put(rt);
491 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
492 }
493 
494 static __inline__ int rt_fast_clean(struct rtable *rth)
495 {
496 	/* Kill broadcast/multicast entries very aggressively, if they
497 	   collide in the hash table with more useful entries */
498 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
499 		rth->fl.iif && rth->u.dst.rt_next;
500 }
501 
502 static __inline__ int rt_valuable(struct rtable *rth)
503 {
504 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
505 		rth->u.dst.expires;
506 }
507 
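/*
 * An entry may expire only while it is unreferenced.  Entries whose hard
 * expiry time has passed are always candidates; otherwise an entry is kept
 * while it is younger than tmo1 (and not a rt_fast_clean() candidate), or
 * younger than tmo2 if it is "valuable" (redirected, notify, or carrying
 * an expiry).  Fast-clean candidates get no tmo1 grace period at all.
 */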
508 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
509 {
510 	unsigned long age;
511 	int ret = 0;
512 
513 	if (atomic_read(&rth->u.dst.__refcnt))
514 		goto out;
515 
516 	ret = 1;
517 	if (rth->u.dst.expires &&
518 	    time_after_eq(jiffies, rth->u.dst.expires))
519 		goto out;
520 
521 	age = jiffies - rth->u.dst.lastuse;
522 	ret = 0;
523 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
524 	    (age <= tmo2 && rt_valuable(rth)))
525 		goto out;
526 	ret = 1;
527 out:	return ret;
528 }
529 
530 /* Bits of score are:
531  * 31: very valuable
532  * 30: not quite useless
533  * 29..0: inverted age (more recently used entries score higher)
534  */
535 static inline u32 rt_score(struct rtable *rt)
536 {
537 	u32 score = jiffies - rt->u.dst.lastuse;
538 
539 	score = ~score & ~(3<<30);
540 
541 	if (rt_valuable(rt))
542 		score |= (1<<31);
543 
544 	if (!rt->fl.iif ||
545 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
546 		score |= (1<<30);
547 
548 	return score;
549 }
550 
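/*
 * compare_keys() xors each pair of flow key fields and ors the results
 * together, so a single comparison against zero tells whether all fields
 * match.  The 16-bit load at ->tos also covers the byte that immediately
 * follows the TOS in the flow key, comparing both in one go.
 */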
551 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
552 {
553 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
554 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
555 		(fl1->mark ^ fl2->mark) |
556 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
557 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
558 		(fl1->oif ^ fl2->oif) |
559 		(fl1->iif ^ fl2->iif)) == 0;
560 }
561 
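/*
 * Periodic expiry scan.  Each run walks roughly
 * (table size * ip_rt_gc_interval / ip_rt_gc_timeout) buckets, so the
 * whole table is covered about once per ip_rt_gc_timeout.  Within a chain
 * the grace period "tmo" is halved for every entry that is kept, which
 * makes long chains progressively easier to trim.
 */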
562 static void rt_check_expire(struct work_struct *work)
563 {
564 	static unsigned int rover;
565 	unsigned int i = rover, goal;
566 	struct rtable *rth, **rthp;
567 	u64 mult;
568 
569 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
570 	if (ip_rt_gc_timeout > 1)
571 		do_div(mult, ip_rt_gc_timeout);
572 	goal = (unsigned int)mult;
573 	if (goal > rt_hash_mask)
574 		goal = rt_hash_mask + 1;
575 	for (; goal > 0; goal--) {
576 		unsigned long tmo = ip_rt_gc_timeout;
577 
578 		i = (i + 1) & rt_hash_mask;
579 		rthp = &rt_hash_table[i].chain;
580 
581 		if (*rthp == NULL)
582 			continue;
583 		spin_lock_bh(rt_hash_lock_addr(i));
584 		while ((rth = *rthp) != NULL) {
585 			if (rth->u.dst.expires) {
586 				/* Entry is expired even if it is in use */
587 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
588 					tmo >>= 1;
589 					rthp = &rth->u.dst.rt_next;
590 					continue;
591 				}
592 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
593 				tmo >>= 1;
594 				rthp = &rth->u.dst.rt_next;
595 				continue;
596 			}
597 
598 			/* Clean up aged-off entries. */
599 			*rthp = rth->u.dst.rt_next;
600 			rt_free(rth);
601 		}
602 		spin_unlock_bh(rt_hash_lock_addr(i));
603 	}
604 	rover = i;
605 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
606 }
607 
608 /* This can run from both BH and non-BH contexts, the latter
609  * in the case of a forced flush event.
610  */
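/*
 * Flush the whole cache: each bucket's chain is detached under its
 * spinlock and then freed via call_rcu_bh(), and rt_hash_rnd is
 * re-randomized so the rebuilt cache uses a fresh hash distribution.
 */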
611 static void rt_run_flush(unsigned long dummy)
612 {
613 	int i;
614 	struct rtable *rth, *next;
615 
616 	rt_deadline = 0;
617 
618 	get_random_bytes(&rt_hash_rnd, 4);
619 
620 	for (i = rt_hash_mask; i >= 0; i--) {
621 		spin_lock_bh(rt_hash_lock_addr(i));
622 		rth = rt_hash_table[i].chain;
623 		if (rth)
624 			rt_hash_table[i].chain = NULL;
625 		spin_unlock_bh(rt_hash_lock_addr(i));
626 
627 		for (; rth; rth = next) {
628 			next = rth->u.dst.rt_next;
629 			rt_free(rth);
630 		}
631 	}
632 }
633 
634 static DEFINE_SPINLOCK(rt_flush_lock);
635 
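/*
 * rt_cache_flush(delay): delay < 0 means "use ip_rt_min_delay",
 * delay == 0 flushes immediately, and a positive delay arms the flush
 * timer.  Deferred flushes are never postponed beyond rt_deadline, which
 * is set ip_rt_max_delay jiffies after the first deferred request.
 */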
636 void rt_cache_flush(int delay)
637 {
638 	unsigned long now = jiffies;
639 	int user_mode = !in_softirq();
640 
641 	if (delay < 0)
642 		delay = ip_rt_min_delay;
643 
644 	spin_lock_bh(&rt_flush_lock);
645 
646 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
647 		long tmo = (long)(rt_deadline - now);
648 
649 		/* If the flush timer is already running
650 		   and the flush request is not immediate (delay > 0):
651 
652 		   if the deadline has not been reached yet, prolong the timer to "delay",
653 		   otherwise fire it at the deadline.
654 		 */
655 
656 		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
657 			tmo = 0;
658 
659 		if (delay > tmo)
660 			delay = tmo;
661 	}
662 
663 	if (delay <= 0) {
664 		spin_unlock_bh(&rt_flush_lock);
665 		rt_run_flush(0);
666 		return;
667 	}
668 
669 	if (rt_deadline == 0)
670 		rt_deadline = now + ip_rt_max_delay;
671 
672 	mod_timer(&rt_flush_timer, now+delay);
673 	spin_unlock_bh(&rt_flush_lock);
674 }
675 
676 static void rt_secret_rebuild(unsigned long dummy)
677 {
678 	unsigned long now = jiffies;
679 
680 	rt_cache_flush(0);
681 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
682 }
683 
684 /*
685    Short description of GC goals.
686 
687    We want an algorithm that keeps the routing cache at an
688    equilibrium point, where the number of aged-off entries is kept
689    approximately equal to the number of newly generated ones.
690 
691    The current expiration strength is the variable "expire".
692    We try to adjust it dynamically, so that when the network is idle
693    "expire" is large enough to keep plenty of warm entries, and when
694    load increases it shrinks to limit the cache size.
695  */
696 
697 static int rt_garbage_collect(void)
698 {
699 	static unsigned long expire = RT_GC_TIMEOUT;
700 	static unsigned long last_gc;
701 	static int rover;
702 	static int equilibrium;
703 	struct rtable *rth, **rthp;
704 	unsigned long now = jiffies;
705 	int goal;
706 
707 	/*
708 	 * Garbage collection is pretty expensive,
709 	 * do not run it too frequently.
710 	 */
711 
712 	RT_CACHE_STAT_INC(gc_total);
713 
714 	if (now - last_gc < ip_rt_gc_min_interval &&
715 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
716 		RT_CACHE_STAT_INC(gc_ignored);
717 		goto out;
718 	}
719 
720 	/* Calculate the number of entries we want to expire now. */
721 	goal = atomic_read(&ipv4_dst_ops.entries) -
722 		(ip_rt_gc_elasticity << rt_hash_log);
723 	if (goal <= 0) {
724 		if (equilibrium < ipv4_dst_ops.gc_thresh)
725 			equilibrium = ipv4_dst_ops.gc_thresh;
726 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
727 		if (goal > 0) {
728 			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
729 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
730 		}
731 	} else {
732 		/* We are in a dangerous area. Try to reduce the cache really
733 		 * aggressively.
734 		 */
735 		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
736 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
737 	}
738 
739 	if (now - last_gc >= ip_rt_gc_min_interval)
740 		last_gc = now;
741 
742 	if (goal <= 0) {
743 		equilibrium += goal;
744 		goto work_done;
745 	}
746 
747 	do {
748 		int i, k;
749 
750 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
751 			unsigned long tmo = expire;
752 
753 			k = (k + 1) & rt_hash_mask;
754 			rthp = &rt_hash_table[k].chain;
755 			spin_lock_bh(rt_hash_lock_addr(k));
756 			while ((rth = *rthp) != NULL) {
757 				if (!rt_may_expire(rth, tmo, expire)) {
758 					tmo >>= 1;
759 					rthp = &rth->u.dst.rt_next;
760 					continue;
761 				}
762 				*rthp = rth->u.dst.rt_next;
763 				rt_free(rth);
764 				goal--;
765 			}
766 			spin_unlock_bh(rt_hash_lock_addr(k));
767 			if (goal <= 0)
768 				break;
769 		}
770 		rover = k;
771 
772 		if (goal <= 0)
773 			goto work_done;
774 
775 		/* The goal was not achieved. We stop the process if:
776 
777 		   - "expire" was reduced to zero; otherwise "expire" is halved.
778 		   - the table is not full.
779 		   - we are called from interrupt context.
780 		   - the jiffies check is just a fallback/debug loop breaker;
781 		     we will not spin here for a long time in any case.
782 		 */
783 
784 		RT_CACHE_STAT_INC(gc_goal_miss);
785 
786 		if (expire == 0)
787 			break;
788 
789 		expire >>= 1;
790 #if RT_CACHE_DEBUG >= 2
791 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
792 				atomic_read(&ipv4_dst_ops.entries), goal, i);
793 #endif
794 
795 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
796 			goto out;
797 	} while (!in_softirq() && time_before_eq(jiffies, now));
798 
799 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
800 		goto out;
801 	if (net_ratelimit())
802 		printk(KERN_WARNING "dst cache overflow\n");
803 	RT_CACHE_STAT_INC(gc_dst_overflow);
804 	return 1;
805 
806 work_done:
807 	expire += ip_rt_gc_min_interval;
808 	if (expire > ip_rt_gc_timeout ||
809 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
810 		expire = ip_rt_gc_timeout;
811 #if RT_CACHE_DEBUG >= 2
812 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
813 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
814 #endif
815 out:	return 0;
816 }
817 
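/*
 * Insert "rt" into the hash chain for "hash".  Under the per-bucket
 * spinlock the chain is scanned first: if an entry with an identical flow
 * key already exists it is promoted to the head of the chain and returned
 * instead, and "rt" is dropped.  While scanning, the lowest scoring
 * unreferenced entry is remembered and evicted if the chain has grown past
 * ip_rt_gc_elasticity.  Output and unicast-forward routes are also bound
 * to an ARP neighbour; if the neighbour table is full, an emergency
 * garbage collection may be attempted before giving up.
 */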
818 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
819 {
820 	struct rtable	*rth, **rthp;
821 	unsigned long	now;
822 	struct rtable *cand, **candp;
823 	u32 		min_score;
824 	int		chain_length;
825 	int attempts = !in_softirq();
826 
827 restart:
828 	chain_length = 0;
829 	min_score = ~(u32)0;
830 	cand = NULL;
831 	candp = NULL;
832 	now = jiffies;
833 
834 	rthp = &rt_hash_table[hash].chain;
835 
836 	spin_lock_bh(rt_hash_lock_addr(hash));
837 	while ((rth = *rthp) != NULL) {
838 		if (compare_keys(&rth->fl, &rt->fl)) {
839 			/* Put it first */
840 			*rthp = rth->u.dst.rt_next;
841 			/*
842 			 * Since lookup is lockfree, the deletion
843 			 * must be visible to another weakly ordered CPU before
844 			 * the insertion at the start of the hash chain.
845 			 */
846 			rcu_assign_pointer(rth->u.dst.rt_next,
847 					   rt_hash_table[hash].chain);
848 			/*
849 			 * Since lookup is lockfree, the update writes
850 			 * must be ordered for consistency on SMP.
851 			 */
852 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
853 
854 			rth->u.dst.__use++;
855 			dst_hold(&rth->u.dst);
856 			rth->u.dst.lastuse = now;
857 			spin_unlock_bh(rt_hash_lock_addr(hash));
858 
859 			rt_drop(rt);
860 			*rp = rth;
861 			return 0;
862 		}
863 
864 		if (!atomic_read(&rth->u.dst.__refcnt)) {
865 			u32 score = rt_score(rth);
866 
867 			if (score <= min_score) {
868 				cand = rth;
869 				candp = rthp;
870 				min_score = score;
871 			}
872 		}
873 
874 		chain_length++;
875 
876 		rthp = &rth->u.dst.rt_next;
877 	}
878 
879 	if (cand) {
880 		/* ip_rt_gc_elasticity used to be the average chain length;
881 		 * when exceeded, gc becomes really aggressive.
882 		 *
883 		 * The second limit is less certain. At the moment it allows
884 		 * only 2 entries per bucket. We will see.
885 		 */
886 		if (chain_length > ip_rt_gc_elasticity) {
887 			*candp = cand->u.dst.rt_next;
888 			rt_free(cand);
889 		}
890 	}
891 
892 	/* Try to bind the route to an ARP neighbour only if it is an output
893 	   route or a unicast forwarding path.
894 	 */
895 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
896 		int err = arp_bind_neighbour(&rt->u.dst);
897 		if (err) {
898 			spin_unlock_bh(rt_hash_lock_addr(hash));
899 
900 			if (err != -ENOBUFS) {
901 				rt_drop(rt);
902 				return err;
903 			}
904 
905 			/* The neighbour tables are full and nothing
906 			   can be released. Try to shrink the route cache;
907 			   it most likely holds some neighbour records.
908 			 */
909 			if (attempts-- > 0) {
910 				int saved_elasticity = ip_rt_gc_elasticity;
911 				int saved_int = ip_rt_gc_min_interval;
912 				ip_rt_gc_elasticity	= 1;
913 				ip_rt_gc_min_interval	= 0;
914 				rt_garbage_collect();
915 				ip_rt_gc_min_interval	= saved_int;
916 				ip_rt_gc_elasticity	= saved_elasticity;
917 				goto restart;
918 			}
919 
920 			if (net_ratelimit())
921 				printk(KERN_WARNING "Neighbour table overflow.\n");
922 			rt_drop(rt);
923 			return -ENOBUFS;
924 		}
925 	}
926 
927 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
928 #if RT_CACHE_DEBUG >= 2
929 	if (rt->u.dst.rt_next) {
930 		struct rtable *trt;
931 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
932 		       NIPQUAD(rt->rt_dst));
933 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
934 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
935 		printk("\n");
936 	}
937 #endif
938 	rt_hash_table[hash].chain = rt;
939 	spin_unlock_bh(rt_hash_lock_addr(hash));
940 	*rp = rt;
941 	return 0;
942 }
943 
944 void rt_bind_peer(struct rtable *rt, int create)
945 {
946 	static DEFINE_SPINLOCK(rt_peer_lock);
947 	struct inet_peer *peer;
948 
949 	peer = inet_getpeer(rt->rt_dst, create);
950 
951 	spin_lock_bh(&rt_peer_lock);
952 	if (rt->peer == NULL) {
953 		rt->peer = peer;
954 		peer = NULL;
955 	}
956 	spin_unlock_bh(&rt_peer_lock);
957 	if (peer)
958 		inet_putpeer(peer);
959 }
960 
961 /*
962  * Peer allocation may fail only in serious out-of-memory conditions.  However,
963  * we can still generate some output.
964  * Random ID selection looks a bit dangerous because we have no chance of
965  * selecting an ID that stays unique over a reasonable period of time.
966  * But a broken packet identifier may be better than no packet at all.
967  */
968 static void ip_select_fb_ident(struct iphdr *iph)
969 {
970 	static DEFINE_SPINLOCK(ip_fb_id_lock);
971 	static u32 ip_fallback_id;
972 	u32 salt;
973 
974 	spin_lock_bh(&ip_fb_id_lock);
975 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
976 	iph->id = htons(salt & 0xFFFF);
977 	ip_fallback_id = salt;
978 	spin_unlock_bh(&ip_fb_id_lock);
979 }
980 
981 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
982 {
983 	struct rtable *rt = (struct rtable *) dst;
984 
985 	if (rt) {
986 		if (rt->peer == NULL)
987 			rt_bind_peer(rt, 1);
988 
989 		/* If a peer is attached to a destination, it is never detached,
990 		   so we need not grab a lock to dereference it.
991 		 */
992 		if (rt->peer) {
993 			iph->id = htons(inet_getid(rt->peer, more));
994 			return;
995 		}
996 	} else
997 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
998 		       __builtin_return_address(0));
999 
1000 	ip_select_fb_ident(iph);
1001 }
1002 
1003 static void rt_del(unsigned hash, struct rtable *rt)
1004 {
1005 	struct rtable **rthp;
1006 
1007 	spin_lock_bh(rt_hash_lock_addr(hash));
1008 	ip_rt_put(rt);
1009 	for (rthp = &rt_hash_table[hash].chain; *rthp;
1010 	     rthp = &(*rthp)->u.dst.rt_next)
1011 		if (*rthp == rt) {
1012 			*rthp = rt->u.dst.rt_next;
1013 			rt_free(rt);
1014 			break;
1015 		}
1016 	spin_unlock_bh(rt_hash_lock_addr(hash));
1017 }
1018 
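/*
 * Handle an ICMP redirect: after sanity-checking the advised gateway,
 * every matching cached route towards "daddr" (with and without the
 * source/interface keys) is cloned with rt_gateway set to the new
 * gateway, bound to its neighbour, and then swapped in for the old
 * cache entry.
 */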
1019 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1020 		    __be32 saddr, struct net_device *dev)
1021 {
1022 	int i, k;
1023 	struct in_device *in_dev = in_dev_get(dev);
1024 	struct rtable *rth, **rthp;
1025 	__be32  skeys[2] = { saddr, 0 };
1026 	int  ikeys[2] = { dev->ifindex, 0 };
1027 	struct netevent_redirect netevent;
1028 
1029 	if (!in_dev)
1030 		return;
1031 
1032 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1033 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1034 		goto reject_redirect;
1035 
1036 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1037 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1038 			goto reject_redirect;
1039 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1040 			goto reject_redirect;
1041 	} else {
1042 		if (inet_addr_type(new_gw) != RTN_UNICAST)
1043 			goto reject_redirect;
1044 	}
1045 
1046 	for (i = 0; i < 2; i++) {
1047 		for (k = 0; k < 2; k++) {
1048 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1049 
1050 			rthp=&rt_hash_table[hash].chain;
1051 
1052 			rcu_read_lock();
1053 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1054 				struct rtable *rt;
1055 
1056 				if (rth->fl.fl4_dst != daddr ||
1057 				    rth->fl.fl4_src != skeys[i] ||
1058 				    rth->fl.oif != ikeys[k] ||
1059 				    rth->fl.iif != 0) {
1060 					rthp = &rth->u.dst.rt_next;
1061 					continue;
1062 				}
1063 
1064 				if (rth->rt_dst != daddr ||
1065 				    rth->rt_src != saddr ||
1066 				    rth->u.dst.error ||
1067 				    rth->rt_gateway != old_gw ||
1068 				    rth->u.dst.dev != dev)
1069 					break;
1070 
1071 				dst_hold(&rth->u.dst);
1072 				rcu_read_unlock();
1073 
1074 				rt = dst_alloc(&ipv4_dst_ops);
1075 				if (rt == NULL) {
1076 					ip_rt_put(rth);
1077 					in_dev_put(in_dev);
1078 					return;
1079 				}
1080 
1081 				/* Copy all the information. */
1082 				*rt = *rth;
1083 				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1084 				rt->u.dst.__use		= 1;
1085 				atomic_set(&rt->u.dst.__refcnt, 1);
1086 				rt->u.dst.child		= NULL;
1087 				if (rt->u.dst.dev)
1088 					dev_hold(rt->u.dst.dev);
1089 				if (rt->idev)
1090 					in_dev_hold(rt->idev);
1091 				rt->u.dst.obsolete	= 0;
1092 				rt->u.dst.lastuse	= jiffies;
1093 				rt->u.dst.path		= &rt->u.dst;
1094 				rt->u.dst.neighbour	= NULL;
1095 				rt->u.dst.hh		= NULL;
1096 				rt->u.dst.xfrm		= NULL;
1097 
1098 				rt->rt_flags		|= RTCF_REDIRECTED;
1099 
1100 				/* Gateway is different ... */
1101 				rt->rt_gateway		= new_gw;
1102 
1103 				/* Redirect received -> path was valid */
1104 				dst_confirm(&rth->u.dst);
1105 
1106 				if (rt->peer)
1107 					atomic_inc(&rt->peer->refcnt);
1108 
1109 				if (arp_bind_neighbour(&rt->u.dst) ||
1110 				    !(rt->u.dst.neighbour->nud_state &
1111 					    NUD_VALID)) {
1112 					if (rt->u.dst.neighbour)
1113 						neigh_event_send(rt->u.dst.neighbour, NULL);
1114 					ip_rt_put(rth);
1115 					rt_drop(rt);
1116 					goto do_next;
1117 				}
1118 
1119 				netevent.old = &rth->u.dst;
1120 				netevent.new = &rt->u.dst;
1121 				call_netevent_notifiers(NETEVENT_REDIRECT,
1122 							&netevent);
1123 
1124 				rt_del(hash, rth);
1125 				if (!rt_intern_hash(hash, rt, &rt))
1126 					ip_rt_put(rt);
1127 				goto do_next;
1128 			}
1129 			rcu_read_unlock();
1130 		do_next:
1131 			;
1132 		}
1133 	}
1134 	in_dev_put(in_dev);
1135 	return;
1136 
1137 reject_redirect:
1138 #ifdef CONFIG_IP_ROUTE_VERBOSE
1139 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1140 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1141 			"%u.%u.%u.%u ignored.\n"
1142 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1143 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1144 		       NIPQUAD(saddr), NIPQUAD(daddr));
1145 #endif
1146 	in_dev_put(in_dev);
1147 }
1148 
1149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1150 {
1151 	struct rtable *rt = (struct rtable*)dst;
1152 	struct dst_entry *ret = dst;
1153 
1154 	if (rt) {
1155 		if (dst->obsolete) {
1156 			ip_rt_put(rt);
1157 			ret = NULL;
1158 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1159 			   rt->u.dst.expires) {
1160 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1161 						rt->fl.oif);
1162 #if RT_CACHE_DEBUG >= 1
1163 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
1164 					  "%u.%u.%u.%u/%02x dropped\n",
1165 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1166 #endif
1167 			rt_del(hash, rt);
1168 			ret = NULL;
1169 		}
1170 	}
1171 	return ret;
1172 }
1173 
1174 /*
1175  * Algorithm:
1176  *	1. The first ip_rt_redirect_number redirects are sent
1177  *	   with exponential backoff, then we stop sending them at all,
1178  *	   assuming that the host ignores our redirects.
1179  *	2. If we did not see packets requiring redirects
1180  *	   during ip_rt_redirect_silence, we assume that the host
1181  *	   has forgotten the redirected route and start sending redirects again.
1182  *
1183  * This algorithm is much cheaper and more intelligent than dumb load limiting
1184  * in icmp.c.
1185  *
1186  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1187  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1188  */
1189 
1190 void ip_rt_send_redirect(struct sk_buff *skb)
1191 {
1192 	struct rtable *rt = (struct rtable*)skb->dst;
1193 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1194 
1195 	if (!in_dev)
1196 		return;
1197 
1198 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1199 		goto out;
1200 
1201 	/* No redirected packets during ip_rt_redirect_silence;
1202 	 * reset the algorithm.
1203 	 */
1204 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1205 		rt->u.dst.rate_tokens = 0;
1206 
1207 	/* Too many ignored redirects; do not send anything.
1208 	 * Set u.dst.rate_last to the time of the last seen redirected packet.
1209 	 */
1210 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1211 		rt->u.dst.rate_last = jiffies;
1212 		goto out;
1213 	}
1214 
1215 	/* Check for load limit; set rate_last to the latest sent
1216 	 * redirect.
1217 	 */
1218 	if (rt->u.dst.rate_tokens == 0 ||
1219 	    time_after(jiffies,
1220 		       (rt->u.dst.rate_last +
1221 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1222 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1223 		rt->u.dst.rate_last = jiffies;
1224 		++rt->u.dst.rate_tokens;
1225 #ifdef CONFIG_IP_ROUTE_VERBOSE
1226 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1227 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1228 		    net_ratelimit())
1229 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1230 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1231 				NIPQUAD(rt->rt_src), rt->rt_iif,
1232 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1233 #endif
1234 	}
1235 out:
1236 	in_dev_put(in_dev);
1237 }
1238 
1239 static int ip_error(struct sk_buff *skb)
1240 {
1241 	struct rtable *rt = (struct rtable*)skb->dst;
1242 	unsigned long now;
1243 	int code;
1244 
1245 	switch (rt->u.dst.error) {
1246 		case EINVAL:
1247 		default:
1248 			goto out;
1249 		case EHOSTUNREACH:
1250 			code = ICMP_HOST_UNREACH;
1251 			break;
1252 		case ENETUNREACH:
1253 			code = ICMP_NET_UNREACH;
1254 			break;
1255 		case EACCES:
1256 			code = ICMP_PKT_FILTERED;
1257 			break;
1258 	}
1259 
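	/*
	 * Simple token bucket limiting the rate of ICMP errors: tokens
	 * accumulate with the jiffies elapsed since the last error, are
	 * capped at ip_rt_error_burst, and each ICMP_DEST_UNREACH sent
	 * costs ip_rt_error_cost tokens.
	 */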
1260 	now = jiffies;
1261 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1262 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1263 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1264 	rt->u.dst.rate_last = now;
1265 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1266 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1267 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1268 	}
1269 
1270 out:	kfree_skb(skb);
1271 	return 0;
1272 }
1273 
1274 /*
1275  *	The last two values are not from the RFC but
1276  *	are needed for AMPRnet AX.25 paths.
1277  */
1278 
1279 static const unsigned short mtu_plateau[] =
1280 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1281 
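/*
 * guess_mtu() implements an RFC 1191 style plateau search: given an MTU
 * that turned out to be too large, return the next lower "common" MTU
 * from the table above, falling back to 68, the minimum IPv4 MTU.
 */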
1282 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1283 {
1284 	int i;
1285 
1286 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1287 		if (old_mtu > mtu_plateau[i])
1288 			return mtu_plateau[i];
1289 	return 68;
1290 }
1291 
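/*
 * Process an incoming ICMP "fragmentation needed" message: find the cached
 * output routes that match the embedded header, lower their cached path
 * MTU (clamped to ip_rt_min_pmtu, in which case the metric is locked) and
 * give the new value an expiry of ip_rt_mtu_expires.  Returns the
 * estimated MTU, or new_mtu if nothing matched.
 */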
1292 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1293 {
1294 	int i;
1295 	unsigned short old_mtu = ntohs(iph->tot_len);
1296 	struct rtable *rth;
1297 	__be32  skeys[2] = { iph->saddr, 0, };
1298 	__be32  daddr = iph->daddr;
1299 	unsigned short est_mtu = 0;
1300 
1301 	if (ipv4_config.no_pmtu_disc)
1302 		return 0;
1303 
1304 	for (i = 0; i < 2; i++) {
1305 		unsigned hash = rt_hash(daddr, skeys[i], 0);
1306 
1307 		rcu_read_lock();
1308 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1309 		     rth = rcu_dereference(rth->u.dst.rt_next)) {
1310 			if (rth->fl.fl4_dst == daddr &&
1311 			    rth->fl.fl4_src == skeys[i] &&
1312 			    rth->rt_dst  == daddr &&
1313 			    rth->rt_src  == iph->saddr &&
1314 			    rth->fl.iif == 0 &&
1315 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1316 				unsigned short mtu = new_mtu;
1317 
1318 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1319 
1320 					/* BSD 4.2 compatibility hack :-( */
1321 					if (mtu == 0 &&
1322 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1323 					    old_mtu >= 68 + (iph->ihl << 2))
1324 						old_mtu -= iph->ihl << 2;
1325 
1326 					mtu = guess_mtu(old_mtu);
1327 				}
1328 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1329 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1330 						dst_confirm(&rth->u.dst);
1331 						if (mtu < ip_rt_min_pmtu) {
1332 							mtu = ip_rt_min_pmtu;
1333 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1334 								(1 << RTAX_MTU);
1335 						}
1336 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1337 						dst_set_expires(&rth->u.dst,
1338 							ip_rt_mtu_expires);
1339 					}
1340 					est_mtu = mtu;
1341 				}
1342 			}
1343 		}
1344 		rcu_read_unlock();
1345 	}
1346 	return est_mtu ? : new_mtu;
1347 }
1348 
1349 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1350 {
1351 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1352 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1353 		if (mtu < ip_rt_min_pmtu) {
1354 			mtu = ip_rt_min_pmtu;
1355 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1356 		}
1357 		dst->metrics[RTAX_MTU-1] = mtu;
1358 		dst_set_expires(dst, ip_rt_mtu_expires);
1359 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1360 	}
1361 }
1362 
1363 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1364 {
1365 	return NULL;
1366 }
1367 
1368 static void ipv4_dst_destroy(struct dst_entry *dst)
1369 {
1370 	struct rtable *rt = (struct rtable *) dst;
1371 	struct inet_peer *peer = rt->peer;
1372 	struct in_device *idev = rt->idev;
1373 
1374 	if (peer) {
1375 		rt->peer = NULL;
1376 		inet_putpeer(peer);
1377 	}
1378 
1379 	if (idev) {
1380 		rt->idev = NULL;
1381 		in_dev_put(idev);
1382 	}
1383 }
1384 
1385 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1386 			    int how)
1387 {
1388 	struct rtable *rt = (struct rtable *) dst;
1389 	struct in_device *idev = rt->idev;
1390 	if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1391 		struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1392 		if (loopback_idev) {
1393 			rt->idev = loopback_idev;
1394 			in_dev_put(idev);
1395 		}
1396 	}
1397 }
1398 
1399 static void ipv4_link_failure(struct sk_buff *skb)
1400 {
1401 	struct rtable *rt;
1402 
1403 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1404 
1405 	rt = (struct rtable *) skb->dst;
1406 	if (rt)
1407 		dst_set_expires(&rt->u.dst, 0);
1408 }
1409 
1410 static int ip_rt_bug(struct sk_buff *skb)
1411 {
1412 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1413 		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1414 		skb->dev ? skb->dev->name : "?");
1415 	kfree_skb(skb);
1416 	return 0;
1417 }
1418 
1419 /*
1420    We do not cache the source address of the outgoing interface,
1421    because it is used only by the IP RR, TS and SRR options,
1422    so it is out of the fast path.
1423 
1424    BTW remember: "addr" is allowed to be unaligned
1425    in IP options!
1426  */
1427 
1428 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1429 {
1430 	__be32 src;
1431 	struct fib_result res;
1432 
1433 	if (rt->fl.iif == 0)
1434 		src = rt->rt_src;
1435 	else if (fib_lookup(&rt->fl, &res) == 0) {
1436 		src = FIB_RES_PREFSRC(res);
1437 		fib_res_put(&res);
1438 	} else
1439 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1440 					RT_SCOPE_UNIVERSE);
1441 	memcpy(addr, &src, 4);
1442 }
1443 
1444 #ifdef CONFIG_NET_CLS_ROUTE
1445 static void set_class_tag(struct rtable *rt, u32 tag)
1446 {
1447 	if (!(rt->u.dst.tclassid & 0xFFFF))
1448 		rt->u.dst.tclassid |= tag & 0xFFFF;
1449 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1450 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1451 }
1452 #endif
1453 
1454 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1455 {
1456 	struct fib_info *fi = res->fi;
1457 
1458 	if (fi) {
1459 		if (FIB_RES_GW(*res) &&
1460 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1461 			rt->rt_gateway = FIB_RES_GW(*res);
1462 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1463 		       sizeof(rt->u.dst.metrics));
1464 		if (fi->fib_mtu == 0) {
1465 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1466 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1467 			    rt->rt_gateway != rt->rt_dst &&
1468 			    rt->u.dst.dev->mtu > 576)
1469 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1470 		}
1471 #ifdef CONFIG_NET_CLS_ROUTE
1472 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1473 #endif
1474 	} else
1475 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1476 
1477 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1478 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1479 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1480 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1481 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1482 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1483 				       ip_rt_min_advmss);
1484 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1485 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1486 
1487 #ifdef CONFIG_NET_CLS_ROUTE
1488 #ifdef CONFIG_IP_MULTIPLE_TABLES
1489 	set_class_tag(rt, fib_rules_tclass(res));
1490 #endif
1491 	set_class_tag(rt, itag);
1492 #endif
1493 	rt->rt_type = res->type;
1494 }
1495 
1496 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1497 				u8 tos, struct net_device *dev, int our)
1498 {
1499 	unsigned hash;
1500 	struct rtable *rth;
1501 	__be32 spec_dst;
1502 	struct in_device *in_dev = in_dev_get(dev);
1503 	u32 itag = 0;
1504 
1505 	/* Primary sanity checks. */
1506 
1507 	if (in_dev == NULL)
1508 		return -EINVAL;
1509 
1510 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1511 	    skb->protocol != htons(ETH_P_IP))
1512 		goto e_inval;
1513 
1514 	if (ZERONET(saddr)) {
1515 		if (!LOCAL_MCAST(daddr))
1516 			goto e_inval;
1517 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1518 	} else if (fib_validate_source(saddr, 0, tos, 0,
1519 					dev, &spec_dst, &itag) < 0)
1520 		goto e_inval;
1521 
1522 	rth = dst_alloc(&ipv4_dst_ops);
1523 	if (!rth)
1524 		goto e_nobufs;
1525 
1526 	rth->u.dst.output= ip_rt_bug;
1527 
1528 	atomic_set(&rth->u.dst.__refcnt, 1);
1529 	rth->u.dst.flags= DST_HOST;
1530 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1531 		rth->u.dst.flags |= DST_NOPOLICY;
1532 	rth->fl.fl4_dst	= daddr;
1533 	rth->rt_dst	= daddr;
1534 	rth->fl.fl4_tos	= tos;
1535 	rth->fl.mark    = skb->mark;
1536 	rth->fl.fl4_src	= saddr;
1537 	rth->rt_src	= saddr;
1538 #ifdef CONFIG_NET_CLS_ROUTE
1539 	rth->u.dst.tclassid = itag;
1540 #endif
1541 	rth->rt_iif	=
1542 	rth->fl.iif	= dev->ifindex;
1543 	rth->u.dst.dev	= init_net.loopback_dev;
1544 	dev_hold(rth->u.dst.dev);
1545 	rth->idev	= in_dev_get(rth->u.dst.dev);
1546 	rth->fl.oif	= 0;
1547 	rth->rt_gateway	= daddr;
1548 	rth->rt_spec_dst= spec_dst;
1549 	rth->rt_type	= RTN_MULTICAST;
1550 	rth->rt_flags	= RTCF_MULTICAST;
1551 	if (our) {
1552 		rth->u.dst.input= ip_local_deliver;
1553 		rth->rt_flags |= RTCF_LOCAL;
1554 	}
1555 
1556 #ifdef CONFIG_IP_MROUTE
1557 	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1558 		rth->u.dst.input = ip_mr_input;
1559 #endif
1560 	RT_CACHE_STAT_INC(in_slow_mc);
1561 
1562 	in_dev_put(in_dev);
1563 	hash = rt_hash(daddr, saddr, dev->ifindex);
1564 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1565 
1566 e_nobufs:
1567 	in_dev_put(in_dev);
1568 	return -ENOBUFS;
1569 
1570 e_inval:
1571 	in_dev_put(in_dev);
1572 	return -EINVAL;
1573 }
1574 
1575 
1576 static void ip_handle_martian_source(struct net_device *dev,
1577 				     struct in_device *in_dev,
1578 				     struct sk_buff *skb,
1579 				     __be32 daddr,
1580 				     __be32 saddr)
1581 {
1582 	RT_CACHE_STAT_INC(in_martian_src);
1583 #ifdef CONFIG_IP_ROUTE_VERBOSE
1584 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1585 		/*
1586 		 *	RFC1812 recommendation: if the source is martian,
1587 		 *	the only hint is the MAC header.
1588 		 */
1589 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1590 			"%u.%u.%u.%u, on dev %s\n",
1591 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1592 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1593 			int i;
1594 			const unsigned char *p = skb_mac_header(skb);
1595 			printk(KERN_WARNING "ll header: ");
1596 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1597 				printk("%02x", *p);
1598 				if (i < (dev->hard_header_len - 1))
1599 					printk(":");
1600 			}
1601 			printk("\n");
1602 		}
1603 	}
1604 #endif
1605 }
1606 
1607 static inline int __mkroute_input(struct sk_buff *skb,
1608 				  struct fib_result* res,
1609 				  struct in_device *in_dev,
1610 				  __be32 daddr, __be32 saddr, u32 tos,
1611 				  struct rtable **result)
1612 {
1613 
1614 	struct rtable *rth;
1615 	int err;
1616 	struct in_device *out_dev;
1617 	unsigned flags = 0;
1618 	__be32 spec_dst;
1619 	u32 itag;
1620 
1621 	/* get a working reference to the output device */
1622 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1623 	if (out_dev == NULL) {
1624 		if (net_ratelimit())
1625 			printk(KERN_CRIT "Bug in ip_route_input" \
1626 			       "_slow(). Please, report\n");
1627 		return -EINVAL;
1628 	}
1629 
1630 
1631 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1632 				  in_dev->dev, &spec_dst, &itag);
1633 	if (err < 0) {
1634 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1635 					 saddr);
1636 
1637 		err = -EINVAL;
1638 		goto cleanup;
1639 	}
1640 
1641 	if (err)
1642 		flags |= RTCF_DIRECTSRC;
1643 
1644 	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1645 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1646 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1647 		flags |= RTCF_DOREDIRECT;
1648 
1649 	if (skb->protocol != htons(ETH_P_IP)) {
1650 		/* Not IP (i.e. ARP). Do not create a route if it is
1651 		 * invalid for proxy ARP. DNAT routes are always valid.
1652 		 */
1653 		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1654 			err = -EINVAL;
1655 			goto cleanup;
1656 		}
1657 	}
1658 
1659 
1660 	rth = dst_alloc(&ipv4_dst_ops);
1661 	if (!rth) {
1662 		err = -ENOBUFS;
1663 		goto cleanup;
1664 	}
1665 
1666 	atomic_set(&rth->u.dst.__refcnt, 1);
1667 	rth->u.dst.flags= DST_HOST;
1668 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1669 		rth->u.dst.flags |= DST_NOPOLICY;
1670 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1671 		rth->u.dst.flags |= DST_NOXFRM;
1672 	rth->fl.fl4_dst	= daddr;
1673 	rth->rt_dst	= daddr;
1674 	rth->fl.fl4_tos	= tos;
1675 	rth->fl.mark    = skb->mark;
1676 	rth->fl.fl4_src	= saddr;
1677 	rth->rt_src	= saddr;
1678 	rth->rt_gateway	= daddr;
1679 	rth->rt_iif 	=
1680 		rth->fl.iif	= in_dev->dev->ifindex;
1681 	rth->u.dst.dev	= (out_dev)->dev;
1682 	dev_hold(rth->u.dst.dev);
1683 	rth->idev	= in_dev_get(rth->u.dst.dev);
1684 	rth->fl.oif 	= 0;
1685 	rth->rt_spec_dst= spec_dst;
1686 
1687 	rth->u.dst.input = ip_forward;
1688 	rth->u.dst.output = ip_output;
1689 
1690 	rt_set_nexthop(rth, res, itag);
1691 
1692 	rth->rt_flags = flags;
1693 
1694 	*result = rth;
1695 	err = 0;
1696  cleanup:
1697 	/* release the working reference to the output device */
1698 	in_dev_put(out_dev);
1699 	return err;
1700 }
1701 
1702 static inline int ip_mkroute_input(struct sk_buff *skb,
1703 				   struct fib_result* res,
1704 				   const struct flowi *fl,
1705 				   struct in_device *in_dev,
1706 				   __be32 daddr, __be32 saddr, u32 tos)
1707 {
1708 	struct rtable* rth = NULL;
1709 	int err;
1710 	unsigned hash;
1711 
1712 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1713 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1714 		fib_select_multipath(fl, res);
1715 #endif
1716 
1717 	/* create a routing cache entry */
1718 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1719 	if (err)
1720 		return err;
1721 
1722 	/* put it into the cache */
1723 	hash = rt_hash(daddr, saddr, fl->iif);
1724 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1725 }
1726 
1727 /*
1728  *	NOTE. We drop all packets that have a local source
1729  *	address, because every properly looped-back packet
1730  *	must already have the correct destination attached by the output routine.
1731  *
1732  *	This approach solves two big problems:
1733  *	1. Non-simplex devices are handled properly.
1734  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1735  */
1736 
1737 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1738 			       u8 tos, struct net_device *dev)
1739 {
1740 	struct fib_result res;
1741 	struct in_device *in_dev = in_dev_get(dev);
1742 	struct flowi fl = { .nl_u = { .ip4_u =
1743 				      { .daddr = daddr,
1744 					.saddr = saddr,
1745 					.tos = tos,
1746 					.scope = RT_SCOPE_UNIVERSE,
1747 				      } },
1748 			    .mark = skb->mark,
1749 			    .iif = dev->ifindex };
1750 	unsigned	flags = 0;
1751 	u32		itag = 0;
1752 	struct rtable * rth;
1753 	unsigned	hash;
1754 	__be32		spec_dst;
1755 	int		err = -EINVAL;
1756 	int		free_res = 0;
1757 
1758 	/* IP on this device is disabled. */
1759 
1760 	if (!in_dev)
1761 		goto out;
1762 
1763 	/* Check for the weirdest martians, which may not be detected
1764 	   by fib_lookup.
1765 	 */
1766 
1767 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1768 		goto martian_source;
1769 
1770 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1771 		goto brd_input;
1772 
1773 	/* Accept zero addresses only for limited broadcast;
1774 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1775 	 */
1776 	if (ZERONET(saddr))
1777 		goto martian_source;
1778 
1779 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1780 		goto martian_destination;
1781 
1782 	/*
1783 	 *	Now we are ready to route the packet.
1784 	 */
1785 	if ((err = fib_lookup(&fl, &res)) != 0) {
1786 		if (!IN_DEV_FORWARD(in_dev))
1787 			goto e_hostunreach;
1788 		goto no_route;
1789 	}
1790 	free_res = 1;
1791 
1792 	RT_CACHE_STAT_INC(in_slow_tot);
1793 
1794 	if (res.type == RTN_BROADCAST)
1795 		goto brd_input;
1796 
1797 	if (res.type == RTN_LOCAL) {
1798 		int result;
1799 		result = fib_validate_source(saddr, daddr, tos,
1800 					     init_net.loopback_dev->ifindex,
1801 					     dev, &spec_dst, &itag);
1802 		if (result < 0)
1803 			goto martian_source;
1804 		if (result)
1805 			flags |= RTCF_DIRECTSRC;
1806 		spec_dst = daddr;
1807 		goto local_input;
1808 	}
1809 
1810 	if (!IN_DEV_FORWARD(in_dev))
1811 		goto e_hostunreach;
1812 	if (res.type != RTN_UNICAST)
1813 		goto martian_destination;
1814 
1815 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1816 	if (err == -ENOBUFS)
1817 		goto e_nobufs;
1818 	if (err == -EINVAL)
1819 		goto e_inval;
1820 
1821 done:
1822 	in_dev_put(in_dev);
1823 	if (free_res)
1824 		fib_res_put(&res);
1825 out:	return err;
1826 
1827 brd_input:
1828 	if (skb->protocol != htons(ETH_P_IP))
1829 		goto e_inval;
1830 
1831 	if (ZERONET(saddr))
1832 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1833 	else {
1834 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1835 					  &itag);
1836 		if (err < 0)
1837 			goto martian_source;
1838 		if (err)
1839 			flags |= RTCF_DIRECTSRC;
1840 	}
1841 	flags |= RTCF_BROADCAST;
1842 	res.type = RTN_BROADCAST;
1843 	RT_CACHE_STAT_INC(in_brd);
1844 
1845 local_input:
1846 	rth = dst_alloc(&ipv4_dst_ops);
1847 	if (!rth)
1848 		goto e_nobufs;
1849 
1850 	rth->u.dst.output= ip_rt_bug;
1851 
1852 	atomic_set(&rth->u.dst.__refcnt, 1);
1853 	rth->u.dst.flags= DST_HOST;
1854 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1855 		rth->u.dst.flags |= DST_NOPOLICY;
1856 	rth->fl.fl4_dst	= daddr;
1857 	rth->rt_dst	= daddr;
1858 	rth->fl.fl4_tos	= tos;
1859 	rth->fl.mark    = skb->mark;
1860 	rth->fl.fl4_src	= saddr;
1861 	rth->rt_src	= saddr;
1862 #ifdef CONFIG_NET_CLS_ROUTE
1863 	rth->u.dst.tclassid = itag;
1864 #endif
1865 	rth->rt_iif	=
1866 	rth->fl.iif	= dev->ifindex;
1867 	rth->u.dst.dev	= init_net.loopback_dev;
1868 	dev_hold(rth->u.dst.dev);
1869 	rth->idev	= in_dev_get(rth->u.dst.dev);
1870 	rth->rt_gateway	= daddr;
1871 	rth->rt_spec_dst= spec_dst;
1872 	rth->u.dst.input= ip_local_deliver;
1873 	rth->rt_flags 	= flags|RTCF_LOCAL;
1874 	if (res.type == RTN_UNREACHABLE) {
1875 		rth->u.dst.input= ip_error;
1876 		rth->u.dst.error= -err;
1877 		rth->rt_flags 	&= ~RTCF_LOCAL;
1878 	}
1879 	rth->rt_type	= res.type;
1880 	hash = rt_hash(daddr, saddr, fl.iif);
1881 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1882 	goto done;
1883 
1884 no_route:
1885 	RT_CACHE_STAT_INC(in_no_route);
1886 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1887 	res.type = RTN_UNREACHABLE;
1888 	goto local_input;
1889 
1890 	/*
1891 	 *	Do not cache martian addresses: they should be logged (RFC1812)
1892 	 */
1893 martian_destination:
1894 	RT_CACHE_STAT_INC(in_martian_dst);
1895 #ifdef CONFIG_IP_ROUTE_VERBOSE
1896 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1897 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1898 			"%u.%u.%u.%u, dev %s\n",
1899 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1900 #endif
1901 
1902 e_hostunreach:
1903 	err = -EHOSTUNREACH;
1904 	goto done;
1905 
1906 e_inval:
1907 	err = -EINVAL;
1908 	goto done;
1909 
1910 e_nobufs:
1911 	err = -ENOBUFS;
1912 	goto done;
1913 
1914 martian_source:
1915 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1916 	goto e_inval;
1917 }
1918 
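/*
 * Fast path for input routing: look up the (daddr, saddr, iif, tos, mark)
 * key in the route cache under RCU.  On a hit, grab a reference and attach
 * the entry to the skb; on a miss, handle multicast specially and then
 * fall back to ip_route_input_slow().
 */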
1919 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1920 		   u8 tos, struct net_device *dev)
1921 {
1922 	struct rtable * rth;
1923 	unsigned	hash;
1924 	int iif = dev->ifindex;
1925 
1926 	tos &= IPTOS_RT_MASK;
1927 	hash = rt_hash(daddr, saddr, iif);
1928 
1929 	rcu_read_lock();
1930 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1931 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
1932 		if (rth->fl.fl4_dst == daddr &&
1933 		    rth->fl.fl4_src == saddr &&
1934 		    rth->fl.iif == iif &&
1935 		    rth->fl.oif == 0 &&
1936 		    rth->fl.mark == skb->mark &&
1937 		    rth->fl.fl4_tos == tos) {
1938 			rth->u.dst.lastuse = jiffies;
1939 			dst_hold(&rth->u.dst);
1940 			rth->u.dst.__use++;
1941 			RT_CACHE_STAT_INC(in_hit);
1942 			rcu_read_unlock();
1943 			skb->dst = (struct dst_entry*)rth;
1944 			return 0;
1945 		}
1946 		RT_CACHE_STAT_INC(in_hlist_search);
1947 	}
1948 	rcu_read_unlock();
1949 
1950 	/* Multicast recognition logic is moved from the route cache to here.
1951 	   The problem was that too many Ethernet cards have broken/missing
1952 	   hardware multicast filters :-( As a result, a host on a multicast
1953 	   network acquires a lot of useless route cache entries, such as for
1954 	   SDR messages from all over the world. Now we try to get rid of them.
1955 	   Really, provided the software IP multicast filter is organized
1956 	   reasonably (at least, hashed), it does not result in a slowdown
1957 	   compared with route cache reject entries.
1958 	   Note that multicast routers are not affected, because a
1959 	   route cache entry is created for them eventually.
1960 	 */
1961 	if (MULTICAST(daddr)) {
1962 		struct in_device *in_dev;
1963 
1964 		rcu_read_lock();
1965 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1966 			int our = ip_check_mc(in_dev, daddr, saddr,
1967 				ip_hdr(skb)->protocol);
1968 			if (our
1969 #ifdef CONFIG_IP_MROUTE
1970 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1971 #endif
1972 			    ) {
1973 				rcu_read_unlock();
1974 				return ip_route_input_mc(skb, daddr, saddr,
1975 							 tos, dev, our);
1976 			}
1977 		}
1978 		rcu_read_unlock();
1979 		return -EINVAL;
1980 	}
1981 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1982 }
1983 
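/*
 * Build a new output route cache entry from a FIB result: classify the
 * destination (broadcast/multicast/unicast), allocate the rtable, fill in
 * the flow key, device references and nexthop, and choose the input/output
 * handlers.  The entry is returned via *result; hashing it into the cache
 * is left to the caller.
 */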
1984 static inline int __mkroute_output(struct rtable **result,
1985 				   struct fib_result* res,
1986 				   const struct flowi *fl,
1987 				   const struct flowi *oldflp,
1988 				   struct net_device *dev_out,
1989 				   unsigned flags)
1990 {
1991 	struct rtable *rth;
1992 	struct in_device *in_dev;
1993 	u32 tos = RT_FL_TOS(oldflp);
1994 	int err = 0;
1995 
1996 	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
1997 		return -EINVAL;
1998 
1999 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2000 		res->type = RTN_BROADCAST;
2001 	else if (MULTICAST(fl->fl4_dst))
2002 		res->type = RTN_MULTICAST;
2003 	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2004 		return -EINVAL;
2005 
2006 	if (dev_out->flags & IFF_LOOPBACK)
2007 		flags |= RTCF_LOCAL;
2008 
2009 	/* get a working reference to the inet device */
2010 	in_dev = in_dev_get(dev_out);
2011 	if (!in_dev)
2012 		return -EINVAL;
2013 
2014 	if (res->type == RTN_BROADCAST) {
2015 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2016 		if (res->fi) {
2017 			fib_info_put(res->fi);
2018 			res->fi = NULL;
2019 		}
2020 	} else if (res->type == RTN_MULTICAST) {
2021 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2022 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2023 				 oldflp->proto))
2024 			flags &= ~RTCF_LOCAL;
2025 		/* If a multicast route does not exist, use the
2026 		   default one, but do not gateway in this case.
2027 		   Yes, it is a hack.
2028 		 */
2029 		if (res->fi && res->prefixlen < 4) {
2030 			fib_info_put(res->fi);
2031 			res->fi = NULL;
2032 		}
2033 	}
2034 
2035 
2036 	rth = dst_alloc(&ipv4_dst_ops);
2037 	if (!rth) {
2038 		err = -ENOBUFS;
2039 		goto cleanup;
2040 	}
2041 
2042 	atomic_set(&rth->u.dst.__refcnt, 1);
2043 	rth->u.dst.flags= DST_HOST;
2044 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2045 		rth->u.dst.flags |= DST_NOXFRM;
2046 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2047 		rth->u.dst.flags |= DST_NOPOLICY;
2048 
2049 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2050 	rth->fl.fl4_tos	= tos;
2051 	rth->fl.fl4_src	= oldflp->fl4_src;
2052 	rth->fl.oif	= oldflp->oif;
2053 	rth->fl.mark    = oldflp->mark;
2054 	rth->rt_dst	= fl->fl4_dst;
2055 	rth->rt_src	= fl->fl4_src;
2056 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2057 	/* get references to the devices that are to be held by the routing
2058 	   cache entry */
2059 	rth->u.dst.dev	= dev_out;
2060 	dev_hold(dev_out);
2061 	rth->idev	= in_dev_get(dev_out);
2062 	rth->rt_gateway = fl->fl4_dst;
2063 	rth->rt_spec_dst= fl->fl4_src;
2064 
2065 	rth->u.dst.output=ip_output;
2066 
2067 	RT_CACHE_STAT_INC(out_slow_tot);
2068 
2069 	if (flags & RTCF_LOCAL) {
2070 		rth->u.dst.input = ip_local_deliver;
2071 		rth->rt_spec_dst = fl->fl4_dst;
2072 	}
2073 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2074 		rth->rt_spec_dst = fl->fl4_src;
2075 		if (flags & RTCF_LOCAL &&
2076 		    !(dev_out->flags & IFF_LOOPBACK)) {
2077 			rth->u.dst.output = ip_mc_output;
2078 			RT_CACHE_STAT_INC(out_slow_mc);
2079 		}
2080 #ifdef CONFIG_IP_MROUTE
2081 		if (res->type == RTN_MULTICAST) {
2082 			if (IN_DEV_MFORWARD(in_dev) &&
2083 			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2084 				rth->u.dst.input = ip_mr_input;
2085 				rth->u.dst.output = ip_mc_output;
2086 			}
2087 		}
2088 #endif
2089 	}
2090 
2091 	rt_set_nexthop(rth, res, 0);
2092 
2093 	rth->rt_flags = flags;
2094 
2095 	*result = rth;
2096  cleanup:
2097 	/* release the working reference to the inet device */
2098 	in_dev_put(in_dev);
2099 
2100 	return err;
2101 }
2102 
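/*
 * Create an output route with __mkroute_output() and insert it into the
 * route cache hash keyed on the original flow (dst, src, oif).
 */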
2103 static inline int ip_mkroute_output(struct rtable **rp,
2104 				    struct fib_result* res,
2105 				    const struct flowi *fl,
2106 				    const struct flowi *oldflp,
2107 				    struct net_device *dev_out,
2108 				    unsigned flags)
2109 {
2110 	struct rtable *rth = NULL;
2111 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2112 	unsigned hash;
2113 	if (err == 0) {
2114 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2115 		err = rt_intern_hash(hash, rth, rp);
2116 	}
2117 
2118 	return err;
2119 }
2120 
2121 /*
2122  * Major route resolver routine.
2123  */
2124 
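/*
 * Slow path for output routing: validate the requested source address,
 * pick an output device from oif and/or saddr, consult the FIB (with
 * special cases for local, broadcast and multicast destinations), and
 * hand the result to ip_mkroute_output().
 */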
2125 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2126 {
2127 	u32 tos	= RT_FL_TOS(oldflp);
2128 	struct flowi fl = { .nl_u = { .ip4_u =
2129 				      { .daddr = oldflp->fl4_dst,
2130 					.saddr = oldflp->fl4_src,
2131 					.tos = tos & IPTOS_RT_MASK,
2132 					.scope = ((tos & RTO_ONLINK) ?
2133 						  RT_SCOPE_LINK :
2134 						  RT_SCOPE_UNIVERSE),
2135 				      } },
2136 			    .mark = oldflp->mark,
2137 			    .iif = init_net.loopback_dev->ifindex,
2138 			    .oif = oldflp->oif };
2139 	struct fib_result res;
2140 	unsigned flags = 0;
2141 	struct net_device *dev_out = NULL;
2142 	int free_res = 0;
2143 	int err;
2144 
2145 
2146 	res.fi		= NULL;
2147 #ifdef CONFIG_IP_MULTIPLE_TABLES
2148 	res.r		= NULL;
2149 #endif
2150 
2151 	if (oldflp->fl4_src) {
2152 		err = -EINVAL;
2153 		if (MULTICAST(oldflp->fl4_src) ||
2154 		    BADCLASS(oldflp->fl4_src) ||
2155 		    ZERONET(oldflp->fl4_src))
2156 			goto out;
2157 
2158 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2159 		dev_out = ip_dev_find(oldflp->fl4_src);
2160 		if (dev_out == NULL)
2161 			goto out;
2162 
2163 		/* I removed the check for oif == dev_out->oif here.
2164 		   It was wrong for two reasons:
2165 		   1. ip_dev_find(saddr) can return the wrong iface if saddr is
2166 		      assigned to multiple interfaces.
2167 		   2. Moreover, we are allowed to send packets with a saddr
2168 		      of another iface. --ANK
2169 		 */
2170 
2171 		if (oldflp->oif == 0
2172 		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2173 			/* Special hack: the user can direct multicasts
2174 			   and limited broadcast via the necessary interface
2175 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2176 			   This hack is not just for fun, it allows
2177 			   vic, vat and friends to work.
2178 			   They bind a socket to loopback, set the ttl to zero
2179 			   and expect that it will work.
2180 			   From the viewpoint of the routing cache they are broken,
2181 			   because we are not allowed to build a multicast path
2182 			   with a loopback source addr (look, the routing cache
2183 			   cannot know that the ttl is zero, so the packet
2184 			   will not leave this host and the route is valid).
2185 			   Luckily, this hack is a good workaround.
2186 			 */
2187 
2188 			fl.oif = dev_out->ifindex;
2189 			goto make_route;
2190 		}
2191 		if (dev_out)
2192 			dev_put(dev_out);
2193 		dev_out = NULL;
2194 	}
2195 
2196 
2197 	if (oldflp->oif) {
2198 		dev_out = dev_get_by_index(&init_net, oldflp->oif);
2199 		err = -ENODEV;
2200 		if (dev_out == NULL)
2201 			goto out;
2202 
2203 		/* RACE: Check return value of inet_select_addr instead. */
2204 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2205 			dev_put(dev_out);
2206 			goto out;	/* Wrong error code */
2207 		}
2208 
2209 		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2210 			if (!fl.fl4_src)
2211 				fl.fl4_src = inet_select_addr(dev_out, 0,
2212 							      RT_SCOPE_LINK);
2213 			goto make_route;
2214 		}
2215 		if (!fl.fl4_src) {
2216 			if (MULTICAST(oldflp->fl4_dst))
2217 				fl.fl4_src = inet_select_addr(dev_out, 0,
2218 							      fl.fl4_scope);
2219 			else if (!oldflp->fl4_dst)
2220 				fl.fl4_src = inet_select_addr(dev_out, 0,
2221 							      RT_SCOPE_HOST);
2222 		}
2223 	}
2224 
2225 	if (!fl.fl4_dst) {
2226 		fl.fl4_dst = fl.fl4_src;
2227 		if (!fl.fl4_dst)
2228 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2229 		if (dev_out)
2230 			dev_put(dev_out);
2231 		dev_out = init_net.loopback_dev;
2232 		dev_hold(dev_out);
2233 		fl.oif = init_net.loopback_dev->ifindex;
2234 		res.type = RTN_LOCAL;
2235 		flags |= RTCF_LOCAL;
2236 		goto make_route;
2237 	}
2238 
2239 	if (fib_lookup(&fl, &res)) {
2240 		res.fi = NULL;
2241 		if (oldflp->oif) {
2242 			/* Apparently, the routing tables are wrong. Assume
2243 			   that the destination is on link.
2244 
2245 			   WHY? DW.
2246 			   Because we are allowed to send to an iface
2247 			   even if it has NO routes and NO assigned
2248 			   addresses. When oif is specified, the routing
2249 			   tables are looked up with only one purpose:
2250 			   to catch if the destination is gatewayed, rather than
2251 			   direct. Moreover, if MSG_DONTROUTE is set,
2252 			   we send the packet, ignoring both routing tables
2253 			   and ifaddr state. --ANK
2254 
2255 
2256 			   We could make it even if oif is unknown,
2257 			   likely IPv6, but we do not.
2258 			 */
2259 
2260 			if (fl.fl4_src == 0)
2261 				fl.fl4_src = inet_select_addr(dev_out, 0,
2262 							      RT_SCOPE_LINK);
2263 			res.type = RTN_UNICAST;
2264 			goto make_route;
2265 		}
2266 		if (dev_out)
2267 			dev_put(dev_out);
2268 		err = -ENETUNREACH;
2269 		goto out;
2270 	}
2271 	free_res = 1;
2272 
2273 	if (res.type == RTN_LOCAL) {
2274 		if (!fl.fl4_src)
2275 			fl.fl4_src = fl.fl4_dst;
2276 		if (dev_out)
2277 			dev_put(dev_out);
2278 		dev_out = init_net.loopback_dev;
2279 		dev_hold(dev_out);
2280 		fl.oif = dev_out->ifindex;
2281 		if (res.fi)
2282 			fib_info_put(res.fi);
2283 		res.fi = NULL;
2284 		flags |= RTCF_LOCAL;
2285 		goto make_route;
2286 	}
2287 
2288 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2289 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2290 		fib_select_multipath(&fl, &res);
2291 	else
2292 #endif
2293 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2294 		fib_select_default(&fl, &res);
2295 
2296 	if (!fl.fl4_src)
2297 		fl.fl4_src = FIB_RES_PREFSRC(res);
2298 
2299 	if (dev_out)
2300 		dev_put(dev_out);
2301 	dev_out = FIB_RES_DEV(res);
2302 	dev_hold(dev_out);
2303 	fl.oif = dev_out->ifindex;
2304 
2305 
2306 make_route:
2307 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2308 
2309 
2310 	if (free_res)
2311 		fib_res_put(&res);
2312 	if (dev_out)
2313 		dev_put(dev_out);
2314 out:	return err;
2315 }
2316 
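/*
 * Fast path for output routing: scan the cache chain for an entry whose
 * flow key (dst, src, oif, mark, tos masked by IPTOS_RT_MASK|RTO_ONLINK)
 * matches; fall back to ip_route_output_slow() on a miss.
 */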
2317 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2318 {
2319 	unsigned hash;
2320 	struct rtable *rth;
2321 
2322 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2323 
2324 	rcu_read_lock_bh();
2325 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2326 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2327 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2328 		    rth->fl.fl4_src == flp->fl4_src &&
2329 		    rth->fl.iif == 0 &&
2330 		    rth->fl.oif == flp->oif &&
2331 		    rth->fl.mark == flp->mark &&
2332 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2333 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2334 			rth->u.dst.lastuse = jiffies;
2335 			dst_hold(&rth->u.dst);
2336 			rth->u.dst.__use++;
2337 			RT_CACHE_STAT_INC(out_hit);
2338 			rcu_read_unlock_bh();
2339 			*rp = rth;
2340 			return 0;
2341 		}
2342 		RT_CACHE_STAT_INC(out_hlist_search);
2343 	}
2344 	rcu_read_unlock_bh();
2345 
2346 	return ip_route_output_slow(rp, flp);
2347 }
2348 
2349 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2350 
2351 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2352 {
2353 }
2354 
2355 static struct dst_ops ipv4_dst_blackhole_ops = {
2356 	.family			=	AF_INET,
2357 	.protocol		=	__constant_htons(ETH_P_IP),
2358 	.destroy		=	ipv4_dst_destroy,
2359 	.check			=	ipv4_dst_check,
2360 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2361 	.entry_size		=	sizeof(struct rtable),
2362 };
2363 
2364 
2365 static int ipv4_blackhole_output(struct sk_buff *skb)
2366 {
2367 	kfree_skb(skb);
2368 	return 0;
2369 }
2370 
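/*
 * Clone a route into a "blackhole" entry whose input and output handlers
 * simply free the skb.  Used by ip_route_output_flow() below when
 * __xfrm_lookup() returns -EREMOTE, so the caller still gets a usable dst.
 */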
2371 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2372 {
2373 	struct rtable *ort = *rp;
2374 	struct rtable *rt = (struct rtable *)
2375 		dst_alloc(&ipv4_dst_blackhole_ops);
2376 
2377 	if (rt) {
2378 		struct dst_entry *new = &rt->u.dst;
2379 
2380 		atomic_set(&new->__refcnt, 1);
2381 		new->__use = 1;
2382 		new->input = ipv4_blackhole_output;
2383 		new->output = ipv4_blackhole_output;
2384 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2385 
2386 		new->dev = ort->u.dst.dev;
2387 		if (new->dev)
2388 			dev_hold(new->dev);
2389 
2390 		rt->fl = ort->fl;
2391 
2392 		rt->idev = ort->idev;
2393 		if (rt->idev)
2394 			in_dev_hold(rt->idev);
2395 		rt->rt_flags = ort->rt_flags;
2396 		rt->rt_type = ort->rt_type;
2397 		rt->rt_dst = ort->rt_dst;
2398 		rt->rt_src = ort->rt_src;
2399 		rt->rt_iif = ort->rt_iif;
2400 		rt->rt_gateway = ort->rt_gateway;
2401 		rt->rt_spec_dst = ort->rt_spec_dst;
2402 		rt->peer = ort->peer;
2403 		if (rt->peer)
2404 			atomic_inc(&rt->peer->refcnt);
2405 
2406 		dst_free(new);
2407 	}
2408 
2409 	dst_release(&(*rp)->u.dst);
2410 	*rp = rt;
2411 	return (rt ? 0 : -ENOMEM);
2412 }
2413 
2414 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2415 {
2416 	int err;
2417 
2418 	if ((err = __ip_route_output_key(rp, flp)) != 0)
2419 		return err;
2420 
2421 	if (flp->proto) {
2422 		if (!flp->fl4_src)
2423 			flp->fl4_src = (*rp)->rt_src;
2424 		if (!flp->fl4_dst)
2425 			flp->fl4_dst = (*rp)->rt_dst;
2426 		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2427 		if (err == -EREMOTE)
2428 			err = ipv4_dst_blackhole(rp, flp, sk);
2429 
2430 		return err;
2431 	}
2432 
2433 	return 0;
2434 }
2435 
2436 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2437 
2438 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2439 {
2440 	return ip_route_output_flow(rp, flp, NULL, 0);
2441 }
2442 
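/*
 * Fill a netlink RTM_NEWROUTE message describing the route cache entry
 * attached to the skb: rtmsg header, address and interface attributes,
 * metrics, and cache info (id, timestamps, expiry, error).
 */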
2443 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2444 			int nowait, unsigned int flags)
2445 {
2446 	struct rtable *rt = (struct rtable*)skb->dst;
2447 	struct rtmsg *r;
2448 	struct nlmsghdr *nlh;
2449 	long expires;
2450 	u32 id = 0, ts = 0, tsage = 0, error;
2451 
2452 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2453 	if (nlh == NULL)
2454 		return -EMSGSIZE;
2455 
2456 	r = nlmsg_data(nlh);
2457 	r->rtm_family	 = AF_INET;
2458 	r->rtm_dst_len	= 32;
2459 	r->rtm_src_len	= 0;
2460 	r->rtm_tos	= rt->fl.fl4_tos;
2461 	r->rtm_table	= RT_TABLE_MAIN;
2462 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2463 	r->rtm_type	= rt->rt_type;
2464 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2465 	r->rtm_protocol = RTPROT_UNSPEC;
2466 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2467 	if (rt->rt_flags & RTCF_NOTIFY)
2468 		r->rtm_flags |= RTM_F_NOTIFY;
2469 
2470 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2471 
2472 	if (rt->fl.fl4_src) {
2473 		r->rtm_src_len = 32;
2474 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2475 	}
2476 	if (rt->u.dst.dev)
2477 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2478 #ifdef CONFIG_NET_CLS_ROUTE
2479 	if (rt->u.dst.tclassid)
2480 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2481 #endif
2482 	if (rt->fl.iif)
2483 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2484 	else if (rt->rt_src != rt->fl.fl4_src)
2485 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2486 
2487 	if (rt->rt_dst != rt->rt_gateway)
2488 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2489 
2490 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2491 		goto nla_put_failure;
2492 
2493 	error = rt->u.dst.error;
2494 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2495 	if (rt->peer) {
2496 		id = rt->peer->ip_id_count;
2497 		if (rt->peer->tcp_ts_stamp) {
2498 			ts = rt->peer->tcp_ts;
2499 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2500 		}
2501 	}
2502 
2503 	if (rt->fl.iif) {
2504 #ifdef CONFIG_IP_MROUTE
2505 		__be32 dst = rt->rt_dst;
2506 
2507 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2508 		    IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2509 			int err = ipmr_get_route(skb, r, nowait);
2510 			if (err <= 0) {
2511 				if (!nowait) {
2512 					if (err == 0)
2513 						return 0;
2514 					goto nla_put_failure;
2515 				} else {
2516 					if (err == -EMSGSIZE)
2517 						goto nla_put_failure;
2518 					error = err;
2519 				}
2520 			}
2521 		} else
2522 #endif
2523 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2524 	}
2525 
2526 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2527 			       expires, error) < 0)
2528 		goto nla_put_failure;
2529 
2530 	return nlmsg_end(skb, nlh);
2531 
2532 nla_put_failure:
2533 	nlmsg_cancel(skb, nlh);
2534 	return -EMSGSIZE;
2535 }
2536 
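/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the requested route via
 * ip_route_input() (when an input interface is given) or
 * ip_route_output_key(), and unicast the result back using rt_fill_info().
 */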
2537 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2538 {
2539 	struct rtmsg *rtm;
2540 	struct nlattr *tb[RTA_MAX+1];
2541 	struct rtable *rt = NULL;
2542 	__be32 dst = 0;
2543 	__be32 src = 0;
2544 	u32 iif;
2545 	int err;
2546 	struct sk_buff *skb;
2547 
2548 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2549 	if (err < 0)
2550 		goto errout;
2551 
2552 	rtm = nlmsg_data(nlh);
2553 
2554 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2555 	if (skb == NULL) {
2556 		err = -ENOBUFS;
2557 		goto errout;
2558 	}
2559 
2560 	/* Reserve room for dummy headers; this skb can pass
2561 	   through a good chunk of the routing engine.
2562 	 */
2563 	skb_reset_mac_header(skb);
2564 	skb_reset_network_header(skb);
2565 
2566 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2567 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2568 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2569 
2570 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2571 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2572 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2573 
2574 	if (iif) {
2575 		struct net_device *dev;
2576 
2577 		dev = __dev_get_by_index(&init_net, iif);
2578 		if (dev == NULL) {
2579 			err = -ENODEV;
2580 			goto errout_free;
2581 		}
2582 
2583 		skb->protocol	= htons(ETH_P_IP);
2584 		skb->dev	= dev;
2585 		local_bh_disable();
2586 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2587 		local_bh_enable();
2588 
2589 		rt = (struct rtable*) skb->dst;
2590 		if (err == 0 && rt->u.dst.error)
2591 			err = -rt->u.dst.error;
2592 	} else {
2593 		struct flowi fl = {
2594 			.nl_u = {
2595 				.ip4_u = {
2596 					.daddr = dst,
2597 					.saddr = src,
2598 					.tos = rtm->rtm_tos,
2599 				},
2600 			},
2601 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2602 		};
2603 		err = ip_route_output_key(&rt, &fl);
2604 	}
2605 
2606 	if (err)
2607 		goto errout_free;
2608 
2609 	skb->dst = &rt->u.dst;
2610 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2611 		rt->rt_flags |= RTCF_NOTIFY;
2612 
2613 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2614 				RTM_NEWROUTE, 0, 0);
2615 	if (err <= 0)
2616 		goto errout_free;
2617 
2618 	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2619 errout:
2620 	return err;
2621 
2622 errout_free:
2623 	kfree_skb(skb);
2624 	goto errout;
2625 }
2626 
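/*
 * Dump the entire route cache over netlink.  cb->args[0]/args[1] record
 * the hash bucket and chain index so an interrupted dump can resume.
 */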
2627 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2628 {
2629 	struct rtable *rt;
2630 	int h, s_h;
2631 	int idx, s_idx;
2632 
2633 	s_h = cb->args[0];
2634 	s_idx = idx = cb->args[1];
2635 	for (h = 0; h <= rt_hash_mask; h++) {
2636 		if (h < s_h) continue;
2637 		if (h > s_h)
2638 			s_idx = 0;
2639 		rcu_read_lock_bh();
2640 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2641 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2642 			if (idx < s_idx)
2643 				continue;
2644 			skb->dst = dst_clone(&rt->u.dst);
2645 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2646 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2647 					 1, NLM_F_MULTI) <= 0) {
2648 				dst_release(xchg(&skb->dst, NULL));
2649 				rcu_read_unlock_bh();
2650 				goto done;
2651 			}
2652 			dst_release(xchg(&skb->dst, NULL));
2653 		}
2654 		rcu_read_unlock_bh();
2655 	}
2656 
2657 done:
2658 	cb->args[0] = h;
2659 	cb->args[1] = idx;
2660 	return skb->len;
2661 }
2662 
2663 void ip_rt_multicast_event(struct in_device *in_dev)
2664 {
2665 	rt_cache_flush(0);
2666 }
2667 
2668 #ifdef CONFIG_SYSCTL
2669 static int flush_delay;
2670 
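/*
 * /proc/sys/net/ipv4/route/flush is write-only: writing stores the value
 * in flush_delay and triggers rt_cache_flush(); reads return -EINVAL.
 */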
2671 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2672 					struct file *filp, void __user *buffer,
2673 					size_t *lenp, loff_t *ppos)
2674 {
2675 	if (write) {
2676 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2677 		rt_cache_flush(flush_delay);
2678 		return 0;
2679 	}
2680 
2681 	return -EINVAL;
2682 }
2683 
2684 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2685 						int __user *name,
2686 						int nlen,
2687 						void __user *oldval,
2688 						size_t __user *oldlenp,
2689 						void __user *newval,
2690 						size_t newlen)
2691 {
2692 	int delay;
2693 	if (newlen != sizeof(int))
2694 		return -EINVAL;
2695 	if (get_user(delay, (int __user *)newval))
2696 		return -EFAULT;
2697 	rt_cache_flush(delay);
2698 	return 0;
2699 }
2700 
2701 ctl_table ipv4_route_table[] = {
2702 	{
2703 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2704 		.procname	= "flush",
2705 		.data		= &flush_delay,
2706 		.maxlen		= sizeof(int),
2707 		.mode		= 0200,
2708 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2709 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2710 	},
2711 	{
2712 		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2713 		.procname	= "min_delay",
2714 		.data		= &ip_rt_min_delay,
2715 		.maxlen		= sizeof(int),
2716 		.mode		= 0644,
2717 		.proc_handler	= &proc_dointvec_jiffies,
2718 		.strategy	= &sysctl_jiffies,
2719 	},
2720 	{
2721 		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2722 		.procname	= "max_delay",
2723 		.data		= &ip_rt_max_delay,
2724 		.maxlen		= sizeof(int),
2725 		.mode		= 0644,
2726 		.proc_handler	= &proc_dointvec_jiffies,
2727 		.strategy	= &sysctl_jiffies,
2728 	},
2729 	{
2730 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2731 		.procname	= "gc_thresh",
2732 		.data		= &ipv4_dst_ops.gc_thresh,
2733 		.maxlen		= sizeof(int),
2734 		.mode		= 0644,
2735 		.proc_handler	= &proc_dointvec,
2736 	},
2737 	{
2738 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2739 		.procname	= "max_size",
2740 		.data		= &ip_rt_max_size,
2741 		.maxlen		= sizeof(int),
2742 		.mode		= 0644,
2743 		.proc_handler	= &proc_dointvec,
2744 	},
2745 	{
2746 		/*  Deprecated. Use gc_min_interval_ms */
2747 
2748 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2749 		.procname	= "gc_min_interval",
2750 		.data		= &ip_rt_gc_min_interval,
2751 		.maxlen		= sizeof(int),
2752 		.mode		= 0644,
2753 		.proc_handler	= &proc_dointvec_jiffies,
2754 		.strategy	= &sysctl_jiffies,
2755 	},
2756 	{
2757 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2758 		.procname	= "gc_min_interval_ms",
2759 		.data		= &ip_rt_gc_min_interval,
2760 		.maxlen		= sizeof(int),
2761 		.mode		= 0644,
2762 		.proc_handler	= &proc_dointvec_ms_jiffies,
2763 		.strategy	= &sysctl_ms_jiffies,
2764 	},
2765 	{
2766 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2767 		.procname	= "gc_timeout",
2768 		.data		= &ip_rt_gc_timeout,
2769 		.maxlen		= sizeof(int),
2770 		.mode		= 0644,
2771 		.proc_handler	= &proc_dointvec_jiffies,
2772 		.strategy	= &sysctl_jiffies,
2773 	},
2774 	{
2775 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2776 		.procname	= "gc_interval",
2777 		.data		= &ip_rt_gc_interval,
2778 		.maxlen		= sizeof(int),
2779 		.mode		= 0644,
2780 		.proc_handler	= &proc_dointvec_jiffies,
2781 		.strategy	= &sysctl_jiffies,
2782 	},
2783 	{
2784 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2785 		.procname	= "redirect_load",
2786 		.data		= &ip_rt_redirect_load,
2787 		.maxlen		= sizeof(int),
2788 		.mode		= 0644,
2789 		.proc_handler	= &proc_dointvec,
2790 	},
2791 	{
2792 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2793 		.procname	= "redirect_number",
2794 		.data		= &ip_rt_redirect_number,
2795 		.maxlen		= sizeof(int),
2796 		.mode		= 0644,
2797 		.proc_handler	= &proc_dointvec,
2798 	},
2799 	{
2800 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2801 		.procname	= "redirect_silence",
2802 		.data		= &ip_rt_redirect_silence,
2803 		.maxlen		= sizeof(int),
2804 		.mode		= 0644,
2805 		.proc_handler	= &proc_dointvec,
2806 	},
2807 	{
2808 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2809 		.procname	= "error_cost",
2810 		.data		= &ip_rt_error_cost,
2811 		.maxlen		= sizeof(int),
2812 		.mode		= 0644,
2813 		.proc_handler	= &proc_dointvec,
2814 	},
2815 	{
2816 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
2817 		.procname	= "error_burst",
2818 		.data		= &ip_rt_error_burst,
2819 		.maxlen		= sizeof(int),
2820 		.mode		= 0644,
2821 		.proc_handler	= &proc_dointvec,
2822 	},
2823 	{
2824 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
2825 		.procname	= "gc_elasticity",
2826 		.data		= &ip_rt_gc_elasticity,
2827 		.maxlen		= sizeof(int),
2828 		.mode		= 0644,
2829 		.proc_handler	= &proc_dointvec,
2830 	},
2831 	{
2832 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
2833 		.procname	= "mtu_expires",
2834 		.data		= &ip_rt_mtu_expires,
2835 		.maxlen		= sizeof(int),
2836 		.mode		= 0644,
2837 		.proc_handler	= &proc_dointvec_jiffies,
2838 		.strategy	= &sysctl_jiffies,
2839 	},
2840 	{
2841 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
2842 		.procname	= "min_pmtu",
2843 		.data		= &ip_rt_min_pmtu,
2844 		.maxlen		= sizeof(int),
2845 		.mode		= 0644,
2846 		.proc_handler	= &proc_dointvec,
2847 	},
2848 	{
2849 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
2850 		.procname	= "min_adv_mss",
2851 		.data		= &ip_rt_min_advmss,
2852 		.maxlen		= sizeof(int),
2853 		.mode		= 0644,
2854 		.proc_handler	= &proc_dointvec,
2855 	},
2856 	{
2857 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
2858 		.procname	= "secret_interval",
2859 		.data		= &ip_rt_secret_interval,
2860 		.maxlen		= sizeof(int),
2861 		.mode		= 0644,
2862 		.proc_handler	= &proc_dointvec_jiffies,
2863 		.strategy	= &sysctl_jiffies,
2864 	},
2865 	{ .ctl_name = 0 }
2866 };
2867 #endif
2868 
2869 #ifdef CONFIG_NET_CLS_ROUTE
2870 struct ip_rt_acct *ip_rt_acct;
2871 
2872 /* This code sucks.  But you should have seen it before! --RR */
2873 
2874 /* IP route accounting ptr for this logical cpu number. */
2875 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
2876 
2877 #ifdef CONFIG_PROC_FS
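/*
 * /proc read handler for rt_acct: copy a window of the CPU 0 accounting
 * table into the buffer, then walk each possible CPU's table and add its
 * counters in, one 32-bit word at a time.
 */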
2878 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2879 			   int length, int *eof, void *data)
2880 {
2881 	unsigned int i;
2882 
2883 	if ((offset & 3) || (length & 3))
2884 		return -EIO;
2885 
2886 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
2887 		*eof = 1;
2888 		return 0;
2889 	}
2890 
2891 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2892 		length = sizeof(struct ip_rt_acct) * 256 - offset;
2893 		*eof = 1;
2894 	}
2895 
2896 	offset /= sizeof(u32);
2897 
2898 	if (length > 0) {
2899 		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2900 		u32 *dst = (u32 *) buffer;
2901 
2902 		/* Copy first cpu. */
2903 		*start = buffer;
2904 		memcpy(dst, src, length);
2905 
2906 		/* Add the other cpus in, one int at a time */
2907 		for_each_possible_cpu(i) {
2908 			unsigned int j;
2909 
2910 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2911 
2912 			for (j = 0; j < length/4; j++)
2913 				dst[j] += src[j];
2914 		}
2915 	}
2916 	return length;
2917 }
2918 #endif /* CONFIG_PROC_FS */
2919 #endif /* CONFIG_NET_CLS_ROUTE */
2920 
2921 static __initdata unsigned long rhash_entries;
2922 static int __init set_rhash_entries(char *str)
2923 {
2924 	if (!str)
2925 		return 0;
2926 	rhash_entries = simple_strtoul(str, &str, 0);
2927 	return 1;
2928 }
2929 __setup("rhash_entries=", set_rhash_entries);
2930 
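/*
 * Boot-time initialization: seed the hash secret, create the dst slab
 * cache and the route cache hash table, initialize devinet and the FIB,
 * arm the flush and secret-rebuild timers, register the /proc entries,
 * and hook up the RTM_GETROUTE handler.
 */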
2931 int __init ip_rt_init(void)
2932 {
2933 	int rc = 0;
2934 
2935 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2936 			     (jiffies ^ (jiffies >> 7)));
2937 
2938 #ifdef CONFIG_NET_CLS_ROUTE
2939 	{
2940 	int order;
2941 	for (order = 0;
2942 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2943 		/* NOTHING */;
2944 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2945 	if (!ip_rt_acct)
2946 		panic("IP: failed to allocate ip_rt_acct\n");
2947 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
2948 	}
2949 #endif
2950 
2951 	ipv4_dst_ops.kmem_cachep =
2952 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2953 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2954 
2955 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2956 
2957 	rt_hash_table = (struct rt_hash_bucket *)
2958 		alloc_large_system_hash("IP route cache",
2959 					sizeof(struct rt_hash_bucket),
2960 					rhash_entries,
2961 					(num_physpages >= 128 * 1024) ?
2962 					15 : 17,
2963 					0,
2964 					&rt_hash_log,
2965 					&rt_hash_mask,
2966 					0);
2967 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2968 	rt_hash_lock_init();
2969 
2970 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2971 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
2972 
2973 	devinet_init();
2974 	ip_fib_init();
2975 
2976 	init_timer(&rt_flush_timer);
2977 	rt_flush_timer.function = rt_run_flush;
2978 	init_timer(&rt_secret_timer);
2979 	rt_secret_timer.function = rt_secret_rebuild;
2980 
2981 	/* All the timers started at system startup tend
2982 	   to synchronize. Perturb them a bit.
2983 	 */
2984 	schedule_delayed_work(&expires_work,
2985 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
2986 
2987 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2988 		ip_rt_secret_interval;
2989 	add_timer(&rt_secret_timer);
2990 
2991 #ifdef CONFIG_PROC_FS
2992 	{
2993 	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
2994 	if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2995 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
2996 					     init_net.proc_net_stat))) {
2997 		return -ENOMEM;
2998 	}
2999 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3000 	}
3001 #ifdef CONFIG_NET_CLS_ROUTE
3002 	create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
3003 #endif
3004 #endif
3005 #ifdef CONFIG_XFRM
3006 	xfrm_init();
3007 	xfrm4_init();
3008 #endif
3009 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3010 
3011 	return rc;
3012 }
3013 
3014 EXPORT_SYMBOL(__ip_select_ident);
3015 EXPORT_SYMBOL(ip_route_input);
3016 EXPORT_SYMBOL(ip_route_output_key);
3017