xref: /openbmc/linux/net/ipv4/route.c (revision fd589a8f)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 
111 #define RT_FL_TOS(oldflp) \
112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 
114 #define IP_MAX_MTU	0xFFF0
115 
116 #define RT_GC_TIMEOUT (300*HZ)
117 
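/*
 * Compiled-in defaults for the route cache tunables (gc timing, redirect
 * rate limiting, PMTU bounds, chain length).  Most of these are also
 * adjustable at run time through the ipv4/route sysctls.
 */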
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly	= 8;
128 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly	= 256;
131 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
132 static int rt_chain_length_max __read_mostly	= 20;
133 
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136 
137 /*
138  *	Interface to generic destination cache.
139  */
140 
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void		 ipv4_dst_destroy(struct dst_entry *dst);
143 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
144 					 struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void		 ipv4_link_failure(struct sk_buff *skb);
147 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149 static void rt_emergency_hash_rebuild(struct net *net);
150 
151 
152 static struct dst_ops ipv4_dst_ops = {
153 	.family =		AF_INET,
154 	.protocol =		cpu_to_be16(ETH_P_IP),
155 	.gc =			rt_garbage_collect,
156 	.check =		ipv4_dst_check,
157 	.destroy =		ipv4_dst_destroy,
158 	.ifdown =		ipv4_dst_ifdown,
159 	.negative_advice =	ipv4_negative_advice,
160 	.link_failure =		ipv4_link_failure,
161 	.update_pmtu =		ip_rt_update_pmtu,
162 	.local_out =		__ip_local_out,
163 	.entries =		ATOMIC_INIT(0),
164 };
165 
166 #define ECN_OR_COST(class)	TC_PRIO_##class
167 
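/*
 * Map the RFC 1349 TOS bits to a packet scheduler priority band.  The
 * table is indexed by the TOS field shifted right one bit, so consecutive
 * pairs cover the eight TOS classes; the ECN_OR_COST() entries are the
 * variants with the low bit (historically "minimise monetary cost") set.
 */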
168 const __u8 ip_tos2prio[16] = {
169 	TC_PRIO_BESTEFFORT,
170 	ECN_OR_COST(FILLER),
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(BESTEFFORT),
173 	TC_PRIO_BULK,
174 	ECN_OR_COST(BULK),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_INTERACTIVE,
178 	ECN_OR_COST(INTERACTIVE),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE_BULK,
182 	ECN_OR_COST(INTERACTIVE_BULK),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK)
185 };
186 
187 
188 /*
189  * Route cache.
190  */
191 
192 /* The locking scheme is rather straightforward:
193  *
194  * 1) Read-Copy Update protects the buckets of the central route hash.
195  * 2) Only writers remove entries, and they hold the lock
196  *    as they look at rtable reference counts.
197  * 3) Only readers acquire references to rtable entries,
198  *    they do so with atomic increments and with the
199  *    lock held.
200  */
201 
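/*
 * Each bucket heads a singly linked chain of rtable entries, linked via
 * u.dst.rt_next.  Readers walk the chain under rcu_read_lock_bh();
 * writers serialize on the hashed spinlock returned by rt_hash_lock_addr().
 */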
202 struct rt_hash_bucket {
203 	struct rtable	*chain;
204 };
205 
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 	defined(CONFIG_PROVE_LOCKING)
208 /*
209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210  * The size of this table is a power of two and depends on the number of CPUs.
211  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
212  */
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ	256
215 #else
216 # if NR_CPUS >= 32
217 #  define RT_HASH_LOCK_SZ	4096
218 # elif NR_CPUS >= 16
219 #  define RT_HASH_LOCK_SZ	2048
220 # elif NR_CPUS >= 8
221 #  define RT_HASH_LOCK_SZ	1024
222 # elif NR_CPUS >= 4
223 #  define RT_HASH_LOCK_SZ	512
224 # else
225 #  define RT_HASH_LOCK_SZ	256
226 # endif
227 #endif
228 
229 static spinlock_t	*rt_hash_locks;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
231 
232 static __init void rt_hash_lock_init(void)
233 {
234 	int i;
235 
236 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 			GFP_KERNEL);
238 	if (!rt_hash_locks)
239 		panic("IP: failed to allocate rt_hash_locks\n");
240 
241 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242 		spin_lock_init(&rt_hash_locks[i]);
243 }
244 #else
245 # define rt_hash_lock_addr(slot) NULL
246 
247 static inline void rt_hash_lock_init(void)
248 {
249 }
250 #endif
251 
252 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
253 static unsigned			rt_hash_mask __read_mostly;
254 static unsigned int		rt_hash_log  __read_mostly;
255 
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 	(__raw_get_cpu_var(rt_cache_stat).field++)
259 
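/*
 * Pick a hash bucket for a route: mix daddr, saddr and the interface index
 * with the per-namespace generation id, so bumping the genid effectively
 * invalidates the whole cache without walking it.
 */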
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 		int genid)
262 {
263 	return jhash_3words((__force u32)(__be32)(daddr),
264 			    (__force u32)(__be32)(saddr),
265 			    idx, genid)
266 		& rt_hash_mask;
267 }
268 
269 static inline int rt_genid(struct net *net)
270 {
271 	return atomic_read(&net->ipv4.rt_genid);
272 }
273 
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 	struct seq_net_private p;
277 	int bucket;
278 	int genid;
279 };
280 
281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 {
283 	struct rt_cache_iter_state *st = seq->private;
284 	struct rtable *r = NULL;
285 
286 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 		if (!rt_hash_table[st->bucket].chain)
288 			continue;
289 		rcu_read_lock_bh();
290 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
291 		while (r) {
292 			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 			    r->rt_genid == st->genid)
294 				return r;
295 			r = rcu_dereference(r->u.dst.rt_next);
296 		}
297 		rcu_read_unlock_bh();
298 	}
299 	return r;
300 }
301 
302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
303 					  struct rtable *r)
304 {
305 	struct rt_cache_iter_state *st = seq->private;
306 
307 	r = r->u.dst.rt_next;
308 	while (!r) {
309 		rcu_read_unlock_bh();
310 		do {
311 			if (--st->bucket < 0)
312 				return NULL;
313 		} while (!rt_hash_table[st->bucket].chain);
314 		rcu_read_lock_bh();
315 		r = rt_hash_table[st->bucket].chain;
316 	}
317 	return rcu_dereference(r);
318 }
319 
320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
321 					struct rtable *r)
322 {
323 	struct rt_cache_iter_state *st = seq->private;
324 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326 			continue;
327 		if (r->rt_genid == st->genid)
328 			break;
329 	}
330 	return r;
331 }
332 
333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
334 {
335 	struct rtable *r = rt_cache_get_first(seq);
336 
337 	if (r)
338 		while (pos && (r = rt_cache_get_next(seq, r)))
339 			--pos;
340 	return pos ? NULL : r;
341 }
342 
343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344 {
345 	struct rt_cache_iter_state *st = seq->private;
346 	if (*pos)
347 		return rt_cache_get_idx(seq, *pos - 1);
348 	st->genid = rt_genid(seq_file_net(seq));
349 	return SEQ_START_TOKEN;
350 }
351 
352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353 {
354 	struct rtable *r;
355 
356 	if (v == SEQ_START_TOKEN)
357 		r = rt_cache_get_first(seq);
358 	else
359 		r = rt_cache_get_next(seq, v);
360 	++*pos;
361 	return r;
362 }
363 
364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365 {
366 	if (v && v != SEQ_START_TOKEN)
367 		rcu_read_unlock_bh();
368 }
369 
370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
371 {
372 	if (v == SEQ_START_TOKEN)
373 		seq_printf(seq, "%-127s\n",
374 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 			   "HHUptod\tSpecDst");
377 	else {
378 		struct rtable *r = v;
379 		int len;
380 
381 		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 			r->u.dst.dev ? r->u.dst.dev->name : "*",
384 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 			dst_metric(&r->u.dst, RTAX_WINDOW),
390 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 			r->fl.fl4_tos,
393 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 				       dev_queue_xmit) : 0,
396 			r->rt_spec_dst, &len);
397 
398 		seq_printf(seq, "%*s\n", 127 - len, "");
399 	}
400 	return 0;
401 }
402 
403 static const struct seq_operations rt_cache_seq_ops = {
404 	.start  = rt_cache_seq_start,
405 	.next   = rt_cache_seq_next,
406 	.stop   = rt_cache_seq_stop,
407 	.show   = rt_cache_seq_show,
408 };
409 
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
411 {
412 	return seq_open_net(inode, file, &rt_cache_seq_ops,
413 			sizeof(struct rt_cache_iter_state));
414 }
415 
416 static const struct file_operations rt_cache_seq_fops = {
417 	.owner	 = THIS_MODULE,
418 	.open	 = rt_cache_seq_open,
419 	.read	 = seq_read,
420 	.llseek	 = seq_lseek,
421 	.release = seq_release_net,
422 };
423 
424 
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
426 {
427 	int cpu;
428 
429 	if (*pos == 0)
430 		return SEQ_START_TOKEN;
431 
432 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 		if (!cpu_possible(cpu))
434 			continue;
435 		*pos = cpu+1;
436 		return &per_cpu(rt_cache_stat, cpu);
437 	}
438 	return NULL;
439 }
440 
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
442 {
443 	int cpu;
444 
445 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 		if (!cpu_possible(cpu))
447 			continue;
448 		*pos = cpu+1;
449 		return &per_cpu(rt_cache_stat, cpu);
450 	}
451 	return NULL;
452 
453 }
454 
455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
456 {
457 
458 }
459 
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
461 {
462 	struct rt_cache_stat *st = v;
463 
464 	if (v == SEQ_START_TOKEN) {
465 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 		return 0;
467 	}
468 
469 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
470 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 		   atomic_read(&ipv4_dst_ops.entries),
472 		   st->in_hit,
473 		   st->in_slow_tot,
474 		   st->in_slow_mc,
475 		   st->in_no_route,
476 		   st->in_brd,
477 		   st->in_martian_dst,
478 		   st->in_martian_src,
479 
480 		   st->out_hit,
481 		   st->out_slow_tot,
482 		   st->out_slow_mc,
483 
484 		   st->gc_total,
485 		   st->gc_ignored,
486 		   st->gc_goal_miss,
487 		   st->gc_dst_overflow,
488 		   st->in_hlist_search,
489 		   st->out_hlist_search
490 		);
491 	return 0;
492 }
493 
494 static const struct seq_operations rt_cpu_seq_ops = {
495 	.start  = rt_cpu_seq_start,
496 	.next   = rt_cpu_seq_next,
497 	.stop   = rt_cpu_seq_stop,
498 	.show   = rt_cpu_seq_show,
499 };
500 
501 
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
503 {
504 	return seq_open(file, &rt_cpu_seq_ops);
505 }
506 
507 static const struct file_operations rt_cpu_seq_fops = {
508 	.owner	 = THIS_MODULE,
509 	.open	 = rt_cpu_seq_open,
510 	.read	 = seq_read,
511 	.llseek	 = seq_lseek,
512 	.release = seq_release,
513 };
514 
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
517 			   int length, int *eof, void *data)
518 {
519 	unsigned int i;
520 
521 	if ((offset & 3) || (length & 3))
522 		return -EIO;
523 
524 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
525 		*eof = 1;
526 		return 0;
527 	}
528 
529 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
530 		length = sizeof(struct ip_rt_acct) * 256 - offset;
531 		*eof = 1;
532 	}
533 
534 	offset /= sizeof(u32);
535 
536 	if (length > 0) {
537 		u32 *dst = (u32 *) buffer;
538 
539 		*start = buffer;
540 		memset(dst, 0, length);
541 
542 		for_each_possible_cpu(i) {
543 			unsigned int j;
544 			u32 *src;
545 
546 			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
547 			for (j = 0; j < length/4; j++)
548 				dst[j] += src[j];
549 		}
550 	}
551 	return length;
552 }
553 #endif
554 
555 static int __net_init ip_rt_do_proc_init(struct net *net)
556 {
557 	struct proc_dir_entry *pde;
558 
559 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
560 			&rt_cache_seq_fops);
561 	if (!pde)
562 		goto err1;
563 
564 	pde = proc_create("rt_cache", S_IRUGO,
565 			  net->proc_net_stat, &rt_cpu_seq_fops);
566 	if (!pde)
567 		goto err2;
568 
569 #ifdef CONFIG_NET_CLS_ROUTE
570 	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
571 			ip_rt_acct_read, NULL);
572 	if (!pde)
573 		goto err3;
574 #endif
575 	return 0;
576 
577 #ifdef CONFIG_NET_CLS_ROUTE
578 err3:
579 	remove_proc_entry("rt_cache", net->proc_net_stat);
580 #endif
581 err2:
582 	remove_proc_entry("rt_cache", net->proc_net);
583 err1:
584 	return -ENOMEM;
585 }
586 
587 static void __net_exit ip_rt_do_proc_exit(struct net *net)
588 {
589 	remove_proc_entry("rt_cache", net->proc_net_stat);
590 	remove_proc_entry("rt_cache", net->proc_net);
591 	remove_proc_entry("rt_acct", net->proc_net);
592 }
593 
594 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
595 	.init = ip_rt_do_proc_init,
596 	.exit = ip_rt_do_proc_exit,
597 };
598 
599 static int __init ip_rt_proc_init(void)
600 {
601 	return register_pernet_subsys(&ip_rt_proc_ops);
602 }
603 
604 #else
605 static inline int ip_rt_proc_init(void)
606 {
607 	return 0;
608 }
609 #endif /* CONFIG_PROC_FS */
610 
611 static inline void rt_free(struct rtable *rt)
612 {
613 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
614 }
615 
616 static inline void rt_drop(struct rtable *rt)
617 {
618 	ip_rt_put(rt);
619 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
620 }
621 
622 static inline int rt_fast_clean(struct rtable *rth)
623 {
624 	/* Kill broadcast/multicast entries very aggressively if they
625 	   collide in the hash table with more useful entries */
626 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 		rth->fl.iif && rth->u.dst.rt_next;
628 }
629 
630 static inline int rt_valuable(struct rtable *rth)
631 {
632 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 		rth->u.dst.expires;
634 }
635 
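/*
 * May an unreferenced cache entry be evicted?  Hard-expired entries always
 * may.  Otherwise an entry younger than tmo1 is kept unless it is a
 * fast-clean candidate (colliding broadcast/multicast), and a "valuable"
 * entry (redirected, notify, or carrying an expiry) is kept while younger
 * than tmo2.
 */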
636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
637 {
638 	unsigned long age;
639 	int ret = 0;
640 
641 	if (atomic_read(&rth->u.dst.__refcnt))
642 		goto out;
643 
644 	ret = 1;
645 	if (rth->u.dst.expires &&
646 	    time_after_eq(jiffies, rth->u.dst.expires))
647 		goto out;
648 
649 	age = jiffies - rth->u.dst.lastuse;
650 	ret = 0;
651 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 	    (age <= tmo2 && rt_valuable(rth)))
653 		goto out;
654 	ret = 1;
655 out:	return ret;
656 }
657 
658 /* Bits of score are:
659  * 31: very valuable
660  * 30: not quite useless
661  * 29..0: usage counter
662  */
663 static inline u32 rt_score(struct rtable *rt)
664 {
665 	u32 score = jiffies - rt->u.dst.lastuse;
666 
667 	score = ~score & ~(3<<30);
668 
669 	if (rt_valuable(rt))
670 		score |= (1<<31);
671 
672 	if (!rt->fl.iif ||
673 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 		score |= (1<<30);
675 
676 	return score;
677 }
678 
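/*
 * Route caching stays enabled for a namespace only while the number of
 * emergency hash rebuilds has not exceeded the configured
 * rt_cache_rebuild_count limit.
 */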
679 static inline bool rt_caching(const struct net *net)
680 {
681 	return net->ipv4.current_rt_cache_rebuild_count <=
682 		net->ipv4.sysctl_rt_cache_rebuild_count;
683 }
684 
685 static inline bool compare_hash_inputs(const struct flowi *fl1,
686 					const struct flowi *fl2)
687 {
688 	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
689 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
690 		(fl1->iif ^ fl2->iif)) == 0);
691 }
692 
693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
694 {
695 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
696 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
697 		(fl1->mark ^ fl2->mark) |
698 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
699 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
700 		(fl1->oif ^ fl2->oif) |
701 		(fl1->iif ^ fl2->iif)) == 0;
702 }
703 
704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705 {
706 	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
707 }
708 
709 static inline int rt_is_expired(struct rtable *rth)
710 {
711 	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
712 }
713 
714 /*
715  * Perform a full scan of the hash table and free all entries.
716  * Can be called by a softirq or a process.
717  * In the latter case, we want to reschedule if necessary.
718  */
719 static void rt_do_flush(int process_context)
720 {
721 	unsigned int i;
722 	struct rtable *rth, *next;
723 	struct rtable * tail;
724 
725 	for (i = 0; i <= rt_hash_mask; i++) {
726 		if (process_context && need_resched())
727 			cond_resched();
728 		rth = rt_hash_table[i].chain;
729 		if (!rth)
730 			continue;
731 
732 		spin_lock_bh(rt_hash_lock_addr(i));
733 #ifdef CONFIG_NET_NS
734 		{
735 		struct rtable ** prev, * p;
736 
737 		rth = rt_hash_table[i].chain;
738 
739 		/* defer releasing the head of the list until after spin_unlock */
740 		for (tail = rth; tail; tail = tail->u.dst.rt_next)
741 			if (!rt_is_expired(tail))
742 				break;
743 		if (rth != tail)
744 			rt_hash_table[i].chain = tail;
745 
746 		/* call rt_free on entries after the tail requiring flush */
747 		prev = &rt_hash_table[i].chain;
748 		for (p = *prev; p; p = next) {
749 			next = p->u.dst.rt_next;
750 			if (!rt_is_expired(p)) {
751 				prev = &p->u.dst.rt_next;
752 			} else {
753 				*prev = next;
754 				rt_free(p);
755 			}
756 		}
757 		}
758 #else
759 		rth = rt_hash_table[i].chain;
760 		rt_hash_table[i].chain = NULL;
761 		tail = NULL;
762 #endif
763 		spin_unlock_bh(rt_hash_lock_addr(i));
764 
765 		for (; rth != tail; rth = next) {
766 			next = rth->u.dst.rt_next;
767 			rt_free(rth);
768 		}
769 	}
770 }
771 
772 /*
773  * While freeing expired entries, we compute average chain length
774  * and standard deviation, using fixed-point arithmetic.
775  * This gives an estimate of rt_chain_length_max:
776  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
777  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
778  */
779 
780 #define FRACT_BITS 3
781 #define ONE (1UL << FRACT_BITS)
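/*
 * Example: an average chain length of 2.5 entries is stored as
 * 2.5 * ONE == 20; rt_check_expire() shifts the final result right by
 * FRACT_BITS to convert back to whole entries.
 */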
782 
783 static void rt_check_expire(void)
784 {
785 	static unsigned int rover;
786 	unsigned int i = rover, goal;
787 	struct rtable *rth, *aux, **rthp;
788 	unsigned long samples = 0;
789 	unsigned long sum = 0, sum2 = 0;
790 	unsigned long delta;
791 	u64 mult;
792 
793 	delta = jiffies - expires_ljiffies;
794 	expires_ljiffies = jiffies;
795 	mult = ((u64)delta) << rt_hash_log;
796 	if (ip_rt_gc_timeout > 1)
797 		do_div(mult, ip_rt_gc_timeout);
798 	goal = (unsigned int)mult;
799 	if (goal > rt_hash_mask)
800 		goal = rt_hash_mask + 1;
801 	for (; goal > 0; goal--) {
802 		unsigned long tmo = ip_rt_gc_timeout;
803 		unsigned long length;
804 
805 		i = (i + 1) & rt_hash_mask;
806 		rthp = &rt_hash_table[i].chain;
807 
808 		if (need_resched())
809 			cond_resched();
810 
811 		samples++;
812 
813 		if (*rthp == NULL)
814 			continue;
815 		length = 0;
816 		spin_lock_bh(rt_hash_lock_addr(i));
817 		while ((rth = *rthp) != NULL) {
818 			prefetch(rth->u.dst.rt_next);
819 			if (rt_is_expired(rth)) {
820 				*rthp = rth->u.dst.rt_next;
821 				rt_free(rth);
822 				continue;
823 			}
824 			if (rth->u.dst.expires) {
825 				/* Entry is expired even if it is in use */
826 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
827 nofree:
828 					tmo >>= 1;
829 					rthp = &rth->u.dst.rt_next;
830 					/*
831 					 * We only count entries on
832 					 * a chain with equal hash inputs once,
833 					 * so that entries for different QoS
834 					 * levels and other non-hash-input
835 					 * attributes don't unfairly skew
836 					 * the length computation
837 					 */
838 					for (aux = rt_hash_table[i].chain;;) {
839 						if (aux == rth) {
840 							length += ONE;
841 							break;
842 						}
843 						if (compare_hash_inputs(&aux->fl, &rth->fl))
844 							break;
845 						aux = aux->u.dst.rt_next;
846 					}
847 					continue;
848 				}
849 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
850 				goto nofree;
851 
852 			/* Cleanup aged off entries. */
853 			*rthp = rth->u.dst.rt_next;
854 			rt_free(rth);
855 		}
856 		spin_unlock_bh(rt_hash_lock_addr(i));
857 		sum += length;
858 		sum2 += length*length;
859 	}
860 	if (samples) {
861 		unsigned long avg = sum / samples;
862 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
863 		rt_chain_length_max = max_t(unsigned long,
864 					ip_rt_gc_elasticity,
865 					(avg + 4*sd) >> FRACT_BITS);
866 	}
867 	rover = i;
868 }
869 
870 /*
871  * rt_worker_func() is run in process context.
872  * We call rt_check_expire() to scan part of the hash table.
873  */
874 static void rt_worker_func(struct work_struct *work)
875 {
876 	rt_check_expire();
877 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
878 }
879 
880 /*
881  * Perturbation of rt_genid by a small quantity [1..256].
882  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
883  * many times (2^24) without reusing a recent rt_genid.
884  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
885  */
886 static void rt_cache_invalidate(struct net *net)
887 {
888 	unsigned char shuffle;
889 
890 	get_random_bytes(&shuffle, sizeof(shuffle));
891 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
892 }
893 
894 /*
895  * delay < 0  : invalidate cache (fast : entries will be deleted later)
896  * delay >= 0 : invalidate & flush cache (can be long)
897  */
898 void rt_cache_flush(struct net *net, int delay)
899 {
900 	rt_cache_invalidate(net);
901 	if (delay >= 0)
902 		rt_do_flush(!in_softirq());
903 }
904 
905 /*
906  * We change rt_genid and let gc do the cleanup
907  */
908 static void rt_secret_rebuild(unsigned long __net)
909 {
910 	struct net *net = (struct net *)__net;
911 	rt_cache_invalidate(net);
912 	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
913 }
914 
915 static void rt_secret_rebuild_oneshot(struct net *net)
916 {
917 	del_timer_sync(&net->ipv4.rt_secret_timer);
918 	rt_cache_invalidate(net);
919 	if (ip_rt_secret_interval) {
920 		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
921 		add_timer(&net->ipv4.rt_secret_timer);
922 	}
923 }
924 
925 static void rt_emergency_hash_rebuild(struct net *net)
926 {
927 	if (net_ratelimit()) {
928 		printk(KERN_WARNING "Route hash chain too long!\n");
929 		printk(KERN_WARNING "Adjust your secret_interval!\n");
930 	}
931 
932 	rt_secret_rebuild_oneshot(net);
933 }
934 
935 /*
936    Short description of GC goals.
937 
938    We want to build an algorithm which keeps the routing cache
939    at some equilibrium point, where the number of aged-off entries
940    is kept approximately equal to the newly generated ones.
941 
942    The current expiration strength is the variable "expire".
943    We try to adjust it dynamically, so that when networking
944    is idle, expire is large enough to keep enough warm entries,
945    and when load increases it shrinks to limit the cache size.
946  */
947 
948 static int rt_garbage_collect(struct dst_ops *ops)
949 {
950 	static unsigned long expire = RT_GC_TIMEOUT;
951 	static unsigned long last_gc;
952 	static int rover;
953 	static int equilibrium;
954 	struct rtable *rth, **rthp;
955 	unsigned long now = jiffies;
956 	int goal;
957 
958 	/*
959 	 * Garbage collection is pretty expensive,
960 	 * do not make it too frequently.
961 	 */
962 
963 	RT_CACHE_STAT_INC(gc_total);
964 
965 	if (now - last_gc < ip_rt_gc_min_interval &&
966 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
967 		RT_CACHE_STAT_INC(gc_ignored);
968 		goto out;
969 	}
970 
971 	/* Calculate number of entries, which we want to expire now. */
972 	goal = atomic_read(&ipv4_dst_ops.entries) -
973 		(ip_rt_gc_elasticity << rt_hash_log);
974 	if (goal <= 0) {
975 		if (equilibrium < ipv4_dst_ops.gc_thresh)
976 			equilibrium = ipv4_dst_ops.gc_thresh;
977 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
978 		if (goal > 0) {
979 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
980 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
981 		}
982 	} else {
983 		/* We are in dangerous area. Try to reduce cache really
984 		 * aggressively.
985 		 */
986 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
987 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
988 	}
989 
990 	if (now - last_gc >= ip_rt_gc_min_interval)
991 		last_gc = now;
992 
993 	if (goal <= 0) {
994 		equilibrium += goal;
995 		goto work_done;
996 	}
997 
998 	do {
999 		int i, k;
1000 
1001 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1002 			unsigned long tmo = expire;
1003 
1004 			k = (k + 1) & rt_hash_mask;
1005 			rthp = &rt_hash_table[k].chain;
1006 			spin_lock_bh(rt_hash_lock_addr(k));
1007 			while ((rth = *rthp) != NULL) {
1008 				if (!rt_is_expired(rth) &&
1009 					!rt_may_expire(rth, tmo, expire)) {
1010 					tmo >>= 1;
1011 					rthp = &rth->u.dst.rt_next;
1012 					continue;
1013 				}
1014 				*rthp = rth->u.dst.rt_next;
1015 				rt_free(rth);
1016 				goal--;
1017 			}
1018 			spin_unlock_bh(rt_hash_lock_addr(k));
1019 			if (goal <= 0)
1020 				break;
1021 		}
1022 		rover = k;
1023 
1024 		if (goal <= 0)
1025 			goto work_done;
1026 
1027 		/* The goal is not achieved. We stop the process if:
1028 
1029 		   - expire is reduced to zero; otherwise, expire is halved.
1030 		   - the table is not full.
1031 		   - we are called from interrupt context.
1032 		   - the jiffies check is just a fallback/debug loop breaker;
1033 		     we will not spin here for a long time in any case.
1034 		 */
1035 
1036 		RT_CACHE_STAT_INC(gc_goal_miss);
1037 
1038 		if (expire == 0)
1039 			break;
1040 
1041 		expire >>= 1;
1042 #if RT_CACHE_DEBUG >= 2
1043 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1044 				atomic_read(&ipv4_dst_ops.entries), goal, i);
1045 #endif
1046 
1047 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1048 			goto out;
1049 	} while (!in_softirq() && time_before_eq(jiffies, now));
1050 
1051 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1052 		goto out;
1053 	if (net_ratelimit())
1054 		printk(KERN_WARNING "dst cache overflow\n");
1055 	RT_CACHE_STAT_INC(gc_dst_overflow);
1056 	return 1;
1057 
1058 work_done:
1059 	expire += ip_rt_gc_min_interval;
1060 	if (expire > ip_rt_gc_timeout ||
1061 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1062 		expire = ip_rt_gc_timeout;
1063 #if RT_CACHE_DEBUG >= 2
1064 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1065 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
1066 #endif
1067 out:	return 0;
1068 }
1069 
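/*
 * Insert rt into hash chain 'hash'.  If an entry with identical keys is
 * already cached, it is moved to the front and reused instead.  When the
 * chain grows beyond ip_rt_gc_elasticity, the lowest-scoring unreferenced
 * entry is evicted; if there is none and the chain exceeds
 * rt_chain_length_max, an emergency hash rebuild is triggered.  With
 * caching disabled the route is not hashed at all and is handed back for
 * single use.  The resulting route is returned via *rp or attached to skb.
 */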
1070 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1071 			  struct rtable **rp, struct sk_buff *skb)
1072 {
1073 	struct rtable	*rth, **rthp;
1074 	unsigned long	now;
1075 	struct rtable *cand, **candp;
1076 	u32 		min_score;
1077 	int		chain_length;
1078 	int attempts = !in_softirq();
1079 
1080 restart:
1081 	chain_length = 0;
1082 	min_score = ~(u32)0;
1083 	cand = NULL;
1084 	candp = NULL;
1085 	now = jiffies;
1086 
1087 	if (!rt_caching(dev_net(rt->u.dst.dev))) {
1088 		/*
1089 		 * If we're not caching, just tell the caller we
1090 		 * were successful and don't touch the route.  The
1091 		 * caller holds the sole reference to the cache entry, and
1092 		 * it will be released when the caller is done with it.
1093 		 * If we drop it here, the callers have no way to resolve routes
1094 		 * when we're not caching.  Instead, just point *rp at rt, so
1095 		 * the caller gets a single use out of the route.
1096 		 * Note that we do rt_free on this new route entry, so that
1097 		 * once its refcount hits zero, we are still able to reap it
1098 		 * (Thanks Alexey)
1099 		 * Note also that rt_free uses call_rcu.  We don't actually
1100 		 * need rcu protection here, this is just our path to get
1101 		 * on the route gc list.
1102 		 */
1103 
1104 		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1105 			int err = arp_bind_neighbour(&rt->u.dst);
1106 			if (err) {
1107 				if (net_ratelimit())
1108 					printk(KERN_WARNING
1109 					    "Neighbour table failure & not caching routes.\n");
1110 				rt_drop(rt);
1111 				return err;
1112 			}
1113 		}
1114 
1115 		rt_free(rt);
1116 		goto skip_hashing;
1117 	}
1118 
1119 	rthp = &rt_hash_table[hash].chain;
1120 
1121 	spin_lock_bh(rt_hash_lock_addr(hash));
1122 	while ((rth = *rthp) != NULL) {
1123 		if (rt_is_expired(rth)) {
1124 			*rthp = rth->u.dst.rt_next;
1125 			rt_free(rth);
1126 			continue;
1127 		}
1128 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1129 			/* Put it first */
1130 			*rthp = rth->u.dst.rt_next;
1131 			/*
1132 			 * Since lookup is lockfree, the deletion
1133 			 * must be visible to another weakly ordered CPU before
1134 			 * the insertion at the start of the hash chain.
1135 			 */
1136 			rcu_assign_pointer(rth->u.dst.rt_next,
1137 					   rt_hash_table[hash].chain);
1138 			/*
1139 			 * Since lookup is lockfree, the update writes
1140 			 * must be ordered for consistency on SMP.
1141 			 */
1142 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1143 
1144 			dst_use(&rth->u.dst, now);
1145 			spin_unlock_bh(rt_hash_lock_addr(hash));
1146 
1147 			rt_drop(rt);
1148 			if (rp)
1149 				*rp = rth;
1150 			else
1151 				skb_dst_set(skb, &rth->u.dst);
1152 			return 0;
1153 		}
1154 
1155 		if (!atomic_read(&rth->u.dst.__refcnt)) {
1156 			u32 score = rt_score(rth);
1157 
1158 			if (score <= min_score) {
1159 				cand = rth;
1160 				candp = rthp;
1161 				min_score = score;
1162 			}
1163 		}
1164 
1165 		chain_length++;
1166 
1167 		rthp = &rth->u.dst.rt_next;
1168 	}
1169 
1170 	if (cand) {
1171 		/* ip_rt_gc_elasticity used to be the average chain
1172 		 * length; when exceeded, gc becomes really aggressive.
1173 		 *
1174 		 * The second limit is less certain. At the moment it allows
1175 		 * only 2 entries per bucket. We will see.
1176 		 */
1177 		if (chain_length > ip_rt_gc_elasticity) {
1178 			*candp = cand->u.dst.rt_next;
1179 			rt_free(cand);
1180 		}
1181 	} else {
1182 		if (chain_length > rt_chain_length_max) {
1183 			struct net *net = dev_net(rt->u.dst.dev);
1184 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1185 			if (!rt_caching(dev_net(rt->u.dst.dev))) {
1186 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1187 					rt->u.dst.dev->name, num);
1188 			}
1189 			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1190 		}
1191 	}
1192 
1193 	/* Try to bind the route to an ARP neighbour only if it is an
1194 	   output route or on the unicast forwarding path.
1195 	 */
1196 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1197 		int err = arp_bind_neighbour(&rt->u.dst);
1198 		if (err) {
1199 			spin_unlock_bh(rt_hash_lock_addr(hash));
1200 
1201 			if (err != -ENOBUFS) {
1202 				rt_drop(rt);
1203 				return err;
1204 			}
1205 
1206 			/* Neighbour tables are full and nothing
1207 			   can be released. Try to shrink the route cache;
1208 			   it most likely holds some neighbour records.
1209 			 */
1210 			if (attempts-- > 0) {
1211 				int saved_elasticity = ip_rt_gc_elasticity;
1212 				int saved_int = ip_rt_gc_min_interval;
1213 				ip_rt_gc_elasticity	= 1;
1214 				ip_rt_gc_min_interval	= 0;
1215 				rt_garbage_collect(&ipv4_dst_ops);
1216 				ip_rt_gc_min_interval	= saved_int;
1217 				ip_rt_gc_elasticity	= saved_elasticity;
1218 				goto restart;
1219 			}
1220 
1221 			if (net_ratelimit())
1222 				printk(KERN_WARNING "Neighbour table overflow.\n");
1223 			rt_drop(rt);
1224 			return -ENOBUFS;
1225 		}
1226 	}
1227 
1228 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1229 
1230 #if RT_CACHE_DEBUG >= 2
1231 	if (rt->u.dst.rt_next) {
1232 		struct rtable *trt;
1233 		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1234 		       hash, &rt->rt_dst);
1235 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1236 			printk(" . %pI4", &trt->rt_dst);
1237 		printk("\n");
1238 	}
1239 #endif
1240 	/*
1241 	 * Since lookup is lockfree, we must make sure
1242 	 * previous writes to rt are committed to memory
1243 	 * before making rt visible to other CPUs.
1244 	 */
1245 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1246 
1247 	spin_unlock_bh(rt_hash_lock_addr(hash));
1248 
1249 skip_hashing:
1250 	if (rp)
1251 		*rp = rt;
1252 	else
1253 		skb_dst_set(skb, &rt->u.dst);
1254 	return 0;
1255 }
1256 
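/*
 * Attach the shared inet_peer entry for rt_dst to this route (used for
 * IP ID generation).  A single spinlock resolves the race between two
 * CPUs binding the same route; the loser drops its extra peer reference.
 */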
1257 void rt_bind_peer(struct rtable *rt, int create)
1258 {
1259 	static DEFINE_SPINLOCK(rt_peer_lock);
1260 	struct inet_peer *peer;
1261 
1262 	peer = inet_getpeer(rt->rt_dst, create);
1263 
1264 	spin_lock_bh(&rt_peer_lock);
1265 	if (rt->peer == NULL) {
1266 		rt->peer = peer;
1267 		peer = NULL;
1268 	}
1269 	spin_unlock_bh(&rt_peer_lock);
1270 	if (peer)
1271 		inet_putpeer(peer);
1272 }
1273 
1274 /*
1275  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1276  * we can still generate some output.
1277  * Random ID selection looks a bit dangerous because we have no chance of
1278  * selecting an ID that is unique within a reasonable period of time.
1279  * But a broken packet identifier may be better than no packet at all.
1280  */
1281 static void ip_select_fb_ident(struct iphdr *iph)
1282 {
1283 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1284 	static u32 ip_fallback_id;
1285 	u32 salt;
1286 
1287 	spin_lock_bh(&ip_fb_id_lock);
1288 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1289 	iph->id = htons(salt & 0xFFFF);
1290 	ip_fallback_id = salt;
1291 	spin_unlock_bh(&ip_fb_id_lock);
1292 }
1293 
1294 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1295 {
1296 	struct rtable *rt = (struct rtable *) dst;
1297 
1298 	if (rt) {
1299 		if (rt->peer == NULL)
1300 			rt_bind_peer(rt, 1);
1301 
1302 		/* If a peer is attached to the destination, it is never detached,
1303 		   so we do not need to grab a lock to dereference it.
1304 		 */
1305 		if (rt->peer) {
1306 			iph->id = htons(inet_getid(rt->peer, more));
1307 			return;
1308 		}
1309 	} else
1310 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1311 		       __builtin_return_address(0));
1312 
1313 	ip_select_fb_ident(iph);
1314 }
1315 
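/*
 * Unlink rt from the given hash chain and drop the reference passed in;
 * expired entries found while walking the chain are freed as well.
 */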
1316 static void rt_del(unsigned hash, struct rtable *rt)
1317 {
1318 	struct rtable **rthp, *aux;
1319 
1320 	rthp = &rt_hash_table[hash].chain;
1321 	spin_lock_bh(rt_hash_lock_addr(hash));
1322 	ip_rt_put(rt);
1323 	while ((aux = *rthp) != NULL) {
1324 		if (aux == rt || rt_is_expired(aux)) {
1325 			*rthp = aux->u.dst.rt_next;
1326 			rt_free(aux);
1327 			continue;
1328 		}
1329 		rthp = &aux->u.dst.rt_next;
1330 	}
1331 	spin_unlock_bh(rt_hash_lock_addr(hash));
1332 }
1333 
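/*
 * Handle an ICMP redirect from old_gw proposing new_gw for daddr.  The new
 * gateway is sanity-checked (unicast, and on-link unless the interface is
 * shared media), then every matching cache entry is replaced by a copy
 * with rt_gateway set to new_gw and RTCF_REDIRECTED set.
 */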
1334 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1335 		    __be32 saddr, struct net_device *dev)
1336 {
1337 	int i, k;
1338 	struct in_device *in_dev = in_dev_get(dev);
1339 	struct rtable *rth, **rthp;
1340 	__be32  skeys[2] = { saddr, 0 };
1341 	int  ikeys[2] = { dev->ifindex, 0 };
1342 	struct netevent_redirect netevent;
1343 	struct net *net;
1344 
1345 	if (!in_dev)
1346 		return;
1347 
1348 	net = dev_net(dev);
1349 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1350 	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1351 	    || ipv4_is_zeronet(new_gw))
1352 		goto reject_redirect;
1353 
1354 	if (!rt_caching(net))
1355 		goto reject_redirect;
1356 
1357 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1358 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1359 			goto reject_redirect;
1360 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1361 			goto reject_redirect;
1362 	} else {
1363 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1364 			goto reject_redirect;
1365 	}
1366 
1367 	for (i = 0; i < 2; i++) {
1368 		for (k = 0; k < 2; k++) {
1369 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1370 						rt_genid(net));
1371 
1372 			rthp=&rt_hash_table[hash].chain;
1373 
1374 			rcu_read_lock();
1375 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1376 				struct rtable *rt;
1377 
1378 				if (rth->fl.fl4_dst != daddr ||
1379 				    rth->fl.fl4_src != skeys[i] ||
1380 				    rth->fl.oif != ikeys[k] ||
1381 				    rth->fl.iif != 0 ||
1382 				    rt_is_expired(rth) ||
1383 				    !net_eq(dev_net(rth->u.dst.dev), net)) {
1384 					rthp = &rth->u.dst.rt_next;
1385 					continue;
1386 				}
1387 
1388 				if (rth->rt_dst != daddr ||
1389 				    rth->rt_src != saddr ||
1390 				    rth->u.dst.error ||
1391 				    rth->rt_gateway != old_gw ||
1392 				    rth->u.dst.dev != dev)
1393 					break;
1394 
1395 				dst_hold(&rth->u.dst);
1396 				rcu_read_unlock();
1397 
1398 				rt = dst_alloc(&ipv4_dst_ops);
1399 				if (rt == NULL) {
1400 					ip_rt_put(rth);
1401 					in_dev_put(in_dev);
1402 					return;
1403 				}
1404 
1405 				/* Copy all the information. */
1406 				*rt = *rth;
1407 				rt->u.dst.__use		= 1;
1408 				atomic_set(&rt->u.dst.__refcnt, 1);
1409 				rt->u.dst.child		= NULL;
1410 				if (rt->u.dst.dev)
1411 					dev_hold(rt->u.dst.dev);
1412 				if (rt->idev)
1413 					in_dev_hold(rt->idev);
1414 				rt->u.dst.obsolete	= 0;
1415 				rt->u.dst.lastuse	= jiffies;
1416 				rt->u.dst.path		= &rt->u.dst;
1417 				rt->u.dst.neighbour	= NULL;
1418 				rt->u.dst.hh		= NULL;
1419 #ifdef CONFIG_XFRM
1420 				rt->u.dst.xfrm		= NULL;
1421 #endif
1422 				rt->rt_genid		= rt_genid(net);
1423 				rt->rt_flags		|= RTCF_REDIRECTED;
1424 
1425 				/* Gateway is different ... */
1426 				rt->rt_gateway		= new_gw;
1427 
1428 				/* Redirect received -> path was valid */
1429 				dst_confirm(&rth->u.dst);
1430 
1431 				if (rt->peer)
1432 					atomic_inc(&rt->peer->refcnt);
1433 
1434 				if (arp_bind_neighbour(&rt->u.dst) ||
1435 				    !(rt->u.dst.neighbour->nud_state &
1436 					    NUD_VALID)) {
1437 					if (rt->u.dst.neighbour)
1438 						neigh_event_send(rt->u.dst.neighbour, NULL);
1439 					ip_rt_put(rth);
1440 					rt_drop(rt);
1441 					goto do_next;
1442 				}
1443 
1444 				netevent.old = &rth->u.dst;
1445 				netevent.new = &rt->u.dst;
1446 				call_netevent_notifiers(NETEVENT_REDIRECT,
1447 							&netevent);
1448 
1449 				rt_del(hash, rth);
1450 				if (!rt_intern_hash(hash, rt, &rt, NULL))
1451 					ip_rt_put(rt);
1452 				goto do_next;
1453 			}
1454 			rcu_read_unlock();
1455 		do_next:
1456 			;
1457 		}
1458 	}
1459 	in_dev_put(in_dev);
1460 	return;
1461 
1462 reject_redirect:
1463 #ifdef CONFIG_IP_ROUTE_VERBOSE
1464 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1465 		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1466 			"  Advised path = %pI4 -> %pI4\n",
1467 		       &old_gw, dev->name, &new_gw,
1468 		       &saddr, &daddr);
1469 #endif
1470 	in_dev_put(in_dev);
1471 }
1472 
1473 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1474 {
1475 	struct rtable *rt = (struct rtable *)dst;
1476 	struct dst_entry *ret = dst;
1477 
1478 	if (rt) {
1479 		if (dst->obsolete) {
1480 			ip_rt_put(rt);
1481 			ret = NULL;
1482 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1483 			   rt->u.dst.expires) {
1484 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1485 						rt->fl.oif,
1486 						rt_genid(dev_net(dst->dev)));
1487 #if RT_CACHE_DEBUG >= 1
1488 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1489 				&rt->rt_dst, rt->fl.fl4_tos);
1490 #endif
1491 			rt_del(hash, rt);
1492 			ret = NULL;
1493 		}
1494 	}
1495 	return ret;
1496 }
1497 
1498 /*
1499  * Algorithm:
1500  *	1. The first ip_rt_redirect_number redirects are sent
1501  *	   with exponential backoff, then we stop sending them at all,
1502  *	   assuming that the host ignores our redirects.
1503  *	2. If we did not see packets requiring redirects
1504  *	   during ip_rt_redirect_silence, we assume that the host
1505  *	   forgot the redirected route and start sending redirects again.
1506  *
1507  * This algorithm is much cheaper and more intelligent than dumb load limiting
1508  * in icmp.c.
1509  *
1510  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1511  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1512  */
1513 
1514 void ip_rt_send_redirect(struct sk_buff *skb)
1515 {
1516 	struct rtable *rt = skb_rtable(skb);
1517 	struct in_device *in_dev;
1518 	int log_martians;
1519 
1520 	rcu_read_lock();
1521 	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1522 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1523 		rcu_read_unlock();
1524 		return;
1525 	}
1526 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1527 	rcu_read_unlock();
1528 
1529 	/* No redirected packets during ip_rt_redirect_silence;
1530 	 * reset the algorithm.
1531 	 */
1532 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1533 		rt->u.dst.rate_tokens = 0;
1534 
1535 	/* Too many ignored redirects; do not send anything.
1536 	 * Set u.dst.rate_last to the last seen redirected packet.
1537 	 */
1538 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1539 		rt->u.dst.rate_last = jiffies;
1540 		return;
1541 	}
1542 
1543 	/* Check for load limit; set rate_last to the latest sent
1544 	 * redirect.
1545 	 */
1546 	if (rt->u.dst.rate_tokens == 0 ||
1547 	    time_after(jiffies,
1548 		       (rt->u.dst.rate_last +
1549 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1550 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1551 		rt->u.dst.rate_last = jiffies;
1552 		++rt->u.dst.rate_tokens;
1553 #ifdef CONFIG_IP_ROUTE_VERBOSE
1554 		if (log_martians &&
1555 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1556 		    net_ratelimit())
1557 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1558 				&rt->rt_src, rt->rt_iif,
1559 				&rt->rt_dst, &rt->rt_gateway);
1560 #endif
1561 	}
1562 }
1563 
1564 static int ip_error(struct sk_buff *skb)
1565 {
1566 	struct rtable *rt = skb_rtable(skb);
1567 	unsigned long now;
1568 	int code;
1569 
1570 	switch (rt->u.dst.error) {
1571 		case EINVAL:
1572 		default:
1573 			goto out;
1574 		case EHOSTUNREACH:
1575 			code = ICMP_HOST_UNREACH;
1576 			break;
1577 		case ENETUNREACH:
1578 			code = ICMP_NET_UNREACH;
1579 			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1580 					IPSTATS_MIB_INNOROUTES);
1581 			break;
1582 		case EACCES:
1583 			code = ICMP_PKT_FILTERED;
1584 			break;
1585 	}
1586 
1587 	now = jiffies;
1588 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1589 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1590 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1591 	rt->u.dst.rate_last = now;
1592 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1593 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1594 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1595 	}
1596 
1597 out:	kfree_skb(skb);
1598 	return 0;
1599 }
1600 
1601 /*
1602  *	The last two values are not from the RFC but
1603  *	are needed for AMPRnet AX.25 paths.
1604  */
1605 
1606 static const unsigned short mtu_plateau[] =
1607 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1608 
1609 static inline unsigned short guess_mtu(unsigned short old_mtu)
1610 {
1611 	int i;
1612 
1613 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1614 		if (old_mtu > mtu_plateau[i])
1615 			return mtu_plateau[i];
1616 	return 68;
1617 }
1618 
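/*
 * Process an ICMP "fragmentation needed" for daddr: lower the cached path
 * MTU of matching routes (guessing the next plateau when new_mtu looks
 * bogus), never going below ip_rt_min_pmtu, and let the reduction expire
 * after ip_rt_mtu_expires.  Returns the MTU actually recorded, or new_mtu
 * if nothing matched.
 */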
1619 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1620 				 unsigned short new_mtu,
1621 				 struct net_device *dev)
1622 {
1623 	int i, k;
1624 	unsigned short old_mtu = ntohs(iph->tot_len);
1625 	struct rtable *rth;
1626 	int  ikeys[2] = { dev->ifindex, 0 };
1627 	__be32  skeys[2] = { iph->saddr, 0, };
1628 	__be32  daddr = iph->daddr;
1629 	unsigned short est_mtu = 0;
1630 
1631 	if (ipv4_config.no_pmtu_disc)
1632 		return 0;
1633 
1634 	for (k = 0; k < 2; k++) {
1635 		for (i = 0; i < 2; i++) {
1636 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1637 						rt_genid(net));
1638 
1639 			rcu_read_lock();
1640 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1641 			     rth = rcu_dereference(rth->u.dst.rt_next)) {
1642 				unsigned short mtu = new_mtu;
1643 
1644 				if (rth->fl.fl4_dst != daddr ||
1645 				    rth->fl.fl4_src != skeys[i] ||
1646 				    rth->rt_dst != daddr ||
1647 				    rth->rt_src != iph->saddr ||
1648 				    rth->fl.oif != ikeys[k] ||
1649 				    rth->fl.iif != 0 ||
1650 				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1651 				    !net_eq(dev_net(rth->u.dst.dev), net) ||
1652 				    rt_is_expired(rth))
1653 					continue;
1654 
1655 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1656 
1657 					/* BSD 4.2 compatibility hack :-( */
1658 					if (mtu == 0 &&
1659 					    old_mtu >= dst_mtu(&rth->u.dst) &&
1660 					    old_mtu >= 68 + (iph->ihl << 2))
1661 						old_mtu -= iph->ihl << 2;
1662 
1663 					mtu = guess_mtu(old_mtu);
1664 				}
1665 				if (mtu <= dst_mtu(&rth->u.dst)) {
1666 					if (mtu < dst_mtu(&rth->u.dst)) {
1667 						dst_confirm(&rth->u.dst);
1668 						if (mtu < ip_rt_min_pmtu) {
1669 							mtu = ip_rt_min_pmtu;
1670 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1671 								(1 << RTAX_MTU);
1672 						}
1673 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1674 						dst_set_expires(&rth->u.dst,
1675 							ip_rt_mtu_expires);
1676 					}
1677 					est_mtu = mtu;
1678 				}
1679 			}
1680 			rcu_read_unlock();
1681 		}
1682 	}
1683 	return est_mtu ? : new_mtu;
1684 }
1685 
1686 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1687 {
1688 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
1689 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1690 		if (mtu < ip_rt_min_pmtu) {
1691 			mtu = ip_rt_min_pmtu;
1692 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1693 		}
1694 		dst->metrics[RTAX_MTU-1] = mtu;
1695 		dst_set_expires(dst, ip_rt_mtu_expires);
1696 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1697 	}
1698 }
1699 
1700 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1701 {
1702 	return NULL;
1703 }
1704 
1705 static void ipv4_dst_destroy(struct dst_entry *dst)
1706 {
1707 	struct rtable *rt = (struct rtable *) dst;
1708 	struct inet_peer *peer = rt->peer;
1709 	struct in_device *idev = rt->idev;
1710 
1711 	if (peer) {
1712 		rt->peer = NULL;
1713 		inet_putpeer(peer);
1714 	}
1715 
1716 	if (idev) {
1717 		rt->idev = NULL;
1718 		in_dev_put(idev);
1719 	}
1720 }
1721 
1722 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1723 			    int how)
1724 {
1725 	struct rtable *rt = (struct rtable *) dst;
1726 	struct in_device *idev = rt->idev;
1727 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1728 		struct in_device *loopback_idev =
1729 			in_dev_get(dev_net(dev)->loopback_dev);
1730 		if (loopback_idev) {
1731 			rt->idev = loopback_idev;
1732 			in_dev_put(idev);
1733 		}
1734 	}
1735 }
1736 
1737 static void ipv4_link_failure(struct sk_buff *skb)
1738 {
1739 	struct rtable *rt;
1740 
1741 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1742 
1743 	rt = skb_rtable(skb);
1744 	if (rt)
1745 		dst_set_expires(&rt->u.dst, 0);
1746 }
1747 
1748 static int ip_rt_bug(struct sk_buff *skb)
1749 {
1750 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1751 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1752 		skb->dev ? skb->dev->name : "?");
1753 	kfree_skb(skb);
1754 	return 0;
1755 }
1756 
1757 /*
1758    We do not cache the source address of the outgoing interface,
1759    because it is used only by the IP RR, TS and SRR options,
1760    so it is out of the fast path.
1761 
1762    BTW remember: "addr" is allowed to be unaligned
1763    in IP options!
1764  */
1765 
1766 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1767 {
1768 	__be32 src;
1769 	struct fib_result res;
1770 
1771 	if (rt->fl.iif == 0)
1772 		src = rt->rt_src;
1773 	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1774 		src = FIB_RES_PREFSRC(res);
1775 		fib_res_put(&res);
1776 	} else
1777 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1778 					RT_SCOPE_UNIVERSE);
1779 	memcpy(addr, &src, 4);
1780 }
1781 
1782 #ifdef CONFIG_NET_CLS_ROUTE
1783 static void set_class_tag(struct rtable *rt, u32 tag)
1784 {
1785 	if (!(rt->u.dst.tclassid & 0xFFFF))
1786 		rt->u.dst.tclassid |= tag & 0xFFFF;
1787 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1788 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1789 }
1790 #endif
1791 
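/*
 * Fill in a new cache entry from the FIB result: copy the nexthop gateway
 * and metrics, supply defaults for MTU, hoplimit and advmss, propagate the
 * routing-class tag when CONFIG_NET_CLS_ROUTE is enabled, and record the
 * route type.
 */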
1792 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1793 {
1794 	struct fib_info *fi = res->fi;
1795 
1796 	if (fi) {
1797 		if (FIB_RES_GW(*res) &&
1798 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1799 			rt->rt_gateway = FIB_RES_GW(*res);
1800 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1801 		       sizeof(rt->u.dst.metrics));
1802 		if (fi->fib_mtu == 0) {
1803 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1804 			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1805 			    rt->rt_gateway != rt->rt_dst &&
1806 			    rt->u.dst.dev->mtu > 576)
1807 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1808 		}
1809 #ifdef CONFIG_NET_CLS_ROUTE
1810 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1811 #endif
1812 	} else
1813 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1814 
1815 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1816 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1817 	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1818 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1819 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1820 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1821 				       ip_rt_min_advmss);
1822 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1823 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1824 
1825 #ifdef CONFIG_NET_CLS_ROUTE
1826 #ifdef CONFIG_IP_MULTIPLE_TABLES
1827 	set_class_tag(rt, fib_rules_tclass(res));
1828 #endif
1829 	set_class_tag(rt, itag);
1830 #endif
1831 	rt->rt_type = res->type;
1832 }
1833 
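/*
 * Build a cache entry for an incoming multicast packet: validate the
 * source address, then create an RTN_MULTICAST route that is delivered
 * locally when we have joined the group ("our") and handed to the
 * multicast routing code when forwarding is enabled, and insert it into
 * the hash.
 */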
1834 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1835 				u8 tos, struct net_device *dev, int our)
1836 {
1837 	unsigned hash;
1838 	struct rtable *rth;
1839 	__be32 spec_dst;
1840 	struct in_device *in_dev = in_dev_get(dev);
1841 	u32 itag = 0;
1842 
1843 	/* Primary sanity checks. */
1844 
1845 	if (in_dev == NULL)
1846 		return -EINVAL;
1847 
1848 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1849 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1850 		goto e_inval;
1851 
1852 	if (ipv4_is_zeronet(saddr)) {
1853 		if (!ipv4_is_local_multicast(daddr))
1854 			goto e_inval;
1855 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1856 	} else if (fib_validate_source(saddr, 0, tos, 0,
1857 					dev, &spec_dst, &itag) < 0)
1858 		goto e_inval;
1859 
1860 	rth = dst_alloc(&ipv4_dst_ops);
1861 	if (!rth)
1862 		goto e_nobufs;
1863 
1864 	rth->u.dst.output= ip_rt_bug;
1865 
1866 	atomic_set(&rth->u.dst.__refcnt, 1);
1867 	rth->u.dst.flags= DST_HOST;
1868 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1869 		rth->u.dst.flags |= DST_NOPOLICY;
1870 	rth->fl.fl4_dst	= daddr;
1871 	rth->rt_dst	= daddr;
1872 	rth->fl.fl4_tos	= tos;
1873 	rth->fl.mark    = skb->mark;
1874 	rth->fl.fl4_src	= saddr;
1875 	rth->rt_src	= saddr;
1876 #ifdef CONFIG_NET_CLS_ROUTE
1877 	rth->u.dst.tclassid = itag;
1878 #endif
1879 	rth->rt_iif	=
1880 	rth->fl.iif	= dev->ifindex;
1881 	rth->u.dst.dev	= init_net.loopback_dev;
1882 	dev_hold(rth->u.dst.dev);
1883 	rth->idev	= in_dev_get(rth->u.dst.dev);
1884 	rth->fl.oif	= 0;
1885 	rth->rt_gateway	= daddr;
1886 	rth->rt_spec_dst= spec_dst;
1887 	rth->rt_genid	= rt_genid(dev_net(dev));
1888 	rth->rt_flags	= RTCF_MULTICAST;
1889 	rth->rt_type	= RTN_MULTICAST;
1890 	if (our) {
1891 		rth->u.dst.input= ip_local_deliver;
1892 		rth->rt_flags |= RTCF_LOCAL;
1893 	}
1894 
1895 #ifdef CONFIG_IP_MROUTE
1896 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1897 		rth->u.dst.input = ip_mr_input;
1898 #endif
1899 	RT_CACHE_STAT_INC(in_slow_mc);
1900 
1901 	in_dev_put(in_dev);
1902 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1903 	return rt_intern_hash(hash, rth, NULL, skb);
1904 
1905 e_nobufs:
1906 	in_dev_put(in_dev);
1907 	return -ENOBUFS;
1908 
1909 e_inval:
1910 	in_dev_put(in_dev);
1911 	return -EINVAL;
1912 }
1913 
1914 
1915 static void ip_handle_martian_source(struct net_device *dev,
1916 				     struct in_device *in_dev,
1917 				     struct sk_buff *skb,
1918 				     __be32 daddr,
1919 				     __be32 saddr)
1920 {
1921 	RT_CACHE_STAT_INC(in_martian_src);
1922 #ifdef CONFIG_IP_ROUTE_VERBOSE
1923 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1924 		/*
1925 		 *	RFC1812 recommendation: if the source is martian,
1926 		 *	the only hint we can give is the MAC header.
1927 		 */
1928 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1929 			&daddr, &saddr, dev->name);
1930 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1931 			int i;
1932 			const unsigned char *p = skb_mac_header(skb);
1933 			printk(KERN_WARNING "ll header: ");
1934 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1935 				printk("%02x", *p);
1936 				if (i < (dev->hard_header_len - 1))
1937 					printk(":");
1938 			}
1939 			printk("\n");
1940 		}
1941 	}
1942 #endif
1943 }
1944 
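/*
 * Build the routing cache entry for a forwarded packet: validate the
 * source address, decide whether a redirect should be suggested
 * (RTCF_DOREDIRECT), and wire the dst up with ip_forward as input and
 * ip_output as output before rt_set_nexthop() fills in the metrics.
 */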
1945 static int __mkroute_input(struct sk_buff *skb,
1946 			   struct fib_result *res,
1947 			   struct in_device *in_dev,
1948 			   __be32 daddr, __be32 saddr, u32 tos,
1949 			   struct rtable **result)
1950 {
1951 
1952 	struct rtable *rth;
1953 	int err;
1954 	struct in_device *out_dev;
1955 	unsigned flags = 0;
1956 	__be32 spec_dst;
1957 	u32 itag;
1958 
1959 	/* get a working reference to the output device */
1960 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1961 	if (out_dev == NULL) {
1962 		if (net_ratelimit())
1963 			printk(KERN_CRIT "Bug in ip_route_input" \
1964 			       "_slow(). Please, report\n");
1965 		return -EINVAL;
1966 	}
1967 
1968 
1969 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1970 				  in_dev->dev, &spec_dst, &itag);
1971 	if (err < 0) {
1972 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1973 					 saddr);
1974 
1975 		err = -EINVAL;
1976 		goto cleanup;
1977 	}
1978 
1979 	if (err)
1980 		flags |= RTCF_DIRECTSRC;
1981 
1982 	if (out_dev == in_dev && err &&
1983 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1984 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1985 		flags |= RTCF_DOREDIRECT;
1986 
1987 	if (skb->protocol != htons(ETH_P_IP)) {
1988 		/* Not IP (i.e. ARP). Do not create a route if it is
1989 		 * invalid for proxy arp. DNAT routes are always valid.
1990 		 */
1991 		if (out_dev == in_dev) {
1992 			err = -EINVAL;
1993 			goto cleanup;
1994 		}
1995 	}
1996 
1997 
1998 	rth = dst_alloc(&ipv4_dst_ops);
1999 	if (!rth) {
2000 		err = -ENOBUFS;
2001 		goto cleanup;
2002 	}
2003 
2004 	atomic_set(&rth->u.dst.__refcnt, 1);
2005 	rth->u.dst.flags= DST_HOST;
2006 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2007 		rth->u.dst.flags |= DST_NOPOLICY;
2008 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2009 		rth->u.dst.flags |= DST_NOXFRM;
2010 	rth->fl.fl4_dst	= daddr;
2011 	rth->rt_dst	= daddr;
2012 	rth->fl.fl4_tos	= tos;
2013 	rth->fl.mark    = skb->mark;
2014 	rth->fl.fl4_src	= saddr;
2015 	rth->rt_src	= saddr;
2016 	rth->rt_gateway	= daddr;
2017 	rth->rt_iif 	=
2018 		rth->fl.iif	= in_dev->dev->ifindex;
2019 	rth->u.dst.dev	= (out_dev)->dev;
2020 	dev_hold(rth->u.dst.dev);
2021 	rth->idev	= in_dev_get(rth->u.dst.dev);
2022 	rth->fl.oif 	= 0;
2023 	rth->rt_spec_dst= spec_dst;
2024 
2025 	rth->u.dst.input = ip_forward;
2026 	rth->u.dst.output = ip_output;
2027 	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2028 
2029 	rt_set_nexthop(rth, res, itag);
2030 
2031 	rth->rt_flags = flags;
2032 
2033 	*result = rth;
2034 	err = 0;
2035  cleanup:
2036 	/* release the working reference to the output device */
2037 	in_dev_put(out_dev);
2038 	return err;
2039 }
2040 
2041 static int ip_mkroute_input(struct sk_buff *skb,
2042 			    struct fib_result *res,
2043 			    const struct flowi *fl,
2044 			    struct in_device *in_dev,
2045 			    __be32 daddr, __be32 saddr, u32 tos)
2046 {
2047 	struct rtable* rth = NULL;
2048 	int err;
2049 	unsigned hash;
2050 
2051 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2052 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2053 		fib_select_multipath(fl, res);
2054 #endif
2055 
2056 	/* create a routing cache entry */
2057 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2058 	if (err)
2059 		return err;
2060 
2061 	/* put it into the cache */
2062 	hash = rt_hash(daddr, saddr, fl->iif,
2063 		       rt_genid(dev_net(rth->u.dst.dev)));
2064 	return rt_intern_hash(hash, rth, NULL, skb);
2065 }
2066 
2067 /*
2068  *	NOTE. We drop all packets that have a local source
2069  *	address, because every properly looped back packet
2070  *	must already have the correct destination attached by the output routine.
2071  *
2072  *	This approach solves two big problems:
2073  *	1. Non-simplex devices are handled properly.
2074  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2075  */
2076 
2077 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2078 			       u8 tos, struct net_device *dev)
2079 {
2080 	struct fib_result res;
2081 	struct in_device *in_dev = in_dev_get(dev);
2082 	struct flowi fl = { .nl_u = { .ip4_u =
2083 				      { .daddr = daddr,
2084 					.saddr = saddr,
2085 					.tos = tos,
2086 					.scope = RT_SCOPE_UNIVERSE,
2087 				      } },
2088 			    .mark = skb->mark,
2089 			    .iif = dev->ifindex };
2090 	unsigned	flags = 0;
2091 	u32		itag = 0;
2092 	struct rtable * rth;
2093 	unsigned	hash;
2094 	__be32		spec_dst;
2095 	int		err = -EINVAL;
2096 	int		free_res = 0;
2097 	struct net    * net = dev_net(dev);
2098 
2099 	/* IP on this device is disabled. */
2100 
2101 	if (!in_dev)
2102 		goto out;
2103 
2104 	/* Check for the most weird martians, which cannot be detected
2105 	   by fib_lookup.
2106 	 */
2107 
2108 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2109 	    ipv4_is_loopback(saddr))
2110 		goto martian_source;
2111 
2112 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2113 		goto brd_input;
2114 
2115 	/* Accept zero addresses only for limited broadcast;
2116 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2117 	 */
2118 	if (ipv4_is_zeronet(saddr))
2119 		goto martian_source;
2120 
2121 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2122 	    ipv4_is_loopback(daddr))
2123 		goto martian_destination;
2124 
2125 	/*
2126 	 *	Now we are ready to route the packet.
2127 	 */
2128 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
2129 		if (!IN_DEV_FORWARD(in_dev))
2130 			goto e_hostunreach;
2131 		goto no_route;
2132 	}
2133 	free_res = 1;
2134 
2135 	RT_CACHE_STAT_INC(in_slow_tot);
2136 
2137 	if (res.type == RTN_BROADCAST)
2138 		goto brd_input;
2139 
2140 	if (res.type == RTN_LOCAL) {
2141 		int result;
2142 		result = fib_validate_source(saddr, daddr, tos,
2143 					     net->loopback_dev->ifindex,
2144 					     dev, &spec_dst, &itag);
2145 		if (result < 0)
2146 			goto martian_source;
2147 		if (result)
2148 			flags |= RTCF_DIRECTSRC;
2149 		spec_dst = daddr;
2150 		goto local_input;
2151 	}
2152 
2153 	if (!IN_DEV_FORWARD(in_dev))
2154 		goto e_hostunreach;
2155 	if (res.type != RTN_UNICAST)
2156 		goto martian_destination;
2157 
2158 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2159 done:
2160 	in_dev_put(in_dev);
2161 	if (free_res)
2162 		fib_res_put(&res);
2163 out:	return err;
2164 
2165 brd_input:
2166 	if (skb->protocol != htons(ETH_P_IP))
2167 		goto e_inval;
2168 
2169 	if (ipv4_is_zeronet(saddr))
2170 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2171 	else {
2172 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2173 					  &itag);
2174 		if (err < 0)
2175 			goto martian_source;
2176 		if (err)
2177 			flags |= RTCF_DIRECTSRC;
2178 	}
2179 	flags |= RTCF_BROADCAST;
2180 	res.type = RTN_BROADCAST;
2181 	RT_CACHE_STAT_INC(in_brd);
2182 
2183 local_input:
2184 	rth = dst_alloc(&ipv4_dst_ops);
2185 	if (!rth)
2186 		goto e_nobufs;
2187 
2188 	rth->u.dst.output= ip_rt_bug;
2189 	rth->rt_genid = rt_genid(net);
2190 
2191 	atomic_set(&rth->u.dst.__refcnt, 1);
2192 	rth->u.dst.flags= DST_HOST;
2193 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2194 		rth->u.dst.flags |= DST_NOPOLICY;
2195 	rth->fl.fl4_dst	= daddr;
2196 	rth->rt_dst	= daddr;
2197 	rth->fl.fl4_tos	= tos;
2198 	rth->fl.mark    = skb->mark;
2199 	rth->fl.fl4_src	= saddr;
2200 	rth->rt_src	= saddr;
2201 #ifdef CONFIG_NET_CLS_ROUTE
2202 	rth->u.dst.tclassid = itag;
2203 #endif
2204 	rth->rt_iif	=
2205 	rth->fl.iif	= dev->ifindex;
2206 	rth->u.dst.dev	= net->loopback_dev;
2207 	dev_hold(rth->u.dst.dev);
2208 	rth->idev	= in_dev_get(rth->u.dst.dev);
2209 	rth->rt_gateway	= daddr;
2210 	rth->rt_spec_dst= spec_dst;
2211 	rth->u.dst.input= ip_local_deliver;
2212 	rth->rt_flags 	= flags|RTCF_LOCAL;
2213 	if (res.type == RTN_UNREACHABLE) {
2214 		rth->u.dst.input= ip_error;
2215 		rth->u.dst.error= -err;
2216 		rth->rt_flags 	&= ~RTCF_LOCAL;
2217 	}
2218 	rth->rt_type	= res.type;
2219 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2220 	err = rt_intern_hash(hash, rth, NULL, skb);
2221 	goto done;
2222 
2223 no_route:
2224 	RT_CACHE_STAT_INC(in_no_route);
2225 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2226 	res.type = RTN_UNREACHABLE;
2227 	if (err == -ESRCH)
2228 		err = -ENETUNREACH;
2229 	goto local_input;
2230 
2231 	/*
2232 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2233 	 */
2234 martian_destination:
2235 	RT_CACHE_STAT_INC(in_martian_dst);
2236 #ifdef CONFIG_IP_ROUTE_VERBOSE
2237 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2238 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2239 			&daddr, &saddr, dev->name);
2240 #endif
2241 
2242 e_hostunreach:
2243 	err = -EHOSTUNREACH;
2244 	goto done;
2245 
2246 e_inval:
2247 	err = -EINVAL;
2248 	goto done;
2249 
2250 e_nobufs:
2251 	err = -ENOBUFS;
2252 	goto done;
2253 
2254 martian_source:
2255 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2256 	goto e_inval;
2257 }
2258 
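/*
 * Main entry point for input route resolution.  The route cache is
 * consulted first under RCU; on a miss, multicast destinations are
 * checked against the device's group membership before taking the slow
 * path, and everything else goes through ip_route_input_slow().
 *
 * Callers pass the addresses straight from the IP header; the receive
 * path does roughly:
 *
 *	err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
 *			     skb->dev);
 */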
2259 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2260 		   u8 tos, struct net_device *dev)
2261 {
2262 	struct rtable * rth;
2263 	unsigned	hash;
2264 	int iif = dev->ifindex;
2265 	struct net *net;
2266 
2267 	net = dev_net(dev);
2268 
2269 	if (!rt_caching(net))
2270 		goto skip_cache;
2271 
2272 	tos &= IPTOS_RT_MASK;
2273 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2274 
2275 	rcu_read_lock();
2276 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2277 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2278 		if (((rth->fl.fl4_dst ^ daddr) |
2279 		     (rth->fl.fl4_src ^ saddr) |
2280 		     (rth->fl.iif ^ iif) |
2281 		     rth->fl.oif |
2282 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2283 		    rth->fl.mark == skb->mark &&
2284 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2285 		    !rt_is_expired(rth)) {
2286 			dst_use(&rth->u.dst, jiffies);
2287 			RT_CACHE_STAT_INC(in_hit);
2288 			rcu_read_unlock();
2289 			skb_dst_set(skb, &rth->u.dst);
2290 			return 0;
2291 		}
2292 		RT_CACHE_STAT_INC(in_hlist_search);
2293 	}
2294 	rcu_read_unlock();
2295 
2296 skip_cache:
2297 	/* Multicast recognition logic was moved from the route cache to here.
2298 	   The problem was that too many Ethernet cards have broken/missing
2299 	   hardware multicast filters :-( As a result, a host on a multicast
2300 	   network acquires a lot of useless route cache entries, e.g. for
2301 	   SDR messages from all over the world. Now we try to get rid of them.
2302 	   Really, provided the software IP multicast filter is organized
2303 	   reasonably (at least, hashed), this does not cause a slowdown
2304 	   compared with route cache reject entries.
2305 	   Note that multicast routers are not affected, because a
2306 	   route cache entry is created for them eventually.
2307 	 */
2308 	if (ipv4_is_multicast(daddr)) {
2309 		struct in_device *in_dev;
2310 
2311 		rcu_read_lock();
2312 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2313 			int our = ip_check_mc(in_dev, daddr, saddr,
2314 				ip_hdr(skb)->protocol);
2315 			if (our
2316 #ifdef CONFIG_IP_MROUTE
2317 			    || (!ipv4_is_local_multicast(daddr) &&
2318 				IN_DEV_MFORWARD(in_dev))
2319 #endif
2320 			    ) {
2321 				rcu_read_unlock();
2322 				return ip_route_input_mc(skb, daddr, saddr,
2323 							 tos, dev, our);
2324 			}
2325 		}
2326 		rcu_read_unlock();
2327 		return -EINVAL;
2328 	}
2329 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2330 }
2331 
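/*
 * Build the routing cache entry for an output route described by @res:
 * classify broadcast/multicast/local destinations, pick ip_output (or
 * ip_mc_output / ip_local_deliver) as the handlers, and let
 * rt_set_nexthop() fill in the gateway and metrics.
 */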
2332 static int __mkroute_output(struct rtable **result,
2333 			    struct fib_result *res,
2334 			    const struct flowi *fl,
2335 			    const struct flowi *oldflp,
2336 			    struct net_device *dev_out,
2337 			    unsigned flags)
2338 {
2339 	struct rtable *rth;
2340 	struct in_device *in_dev;
2341 	u32 tos = RT_FL_TOS(oldflp);
2342 	int err = 0;
2343 
2344 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2345 		return -EINVAL;
2346 
2347 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2348 		res->type = RTN_BROADCAST;
2349 	else if (ipv4_is_multicast(fl->fl4_dst))
2350 		res->type = RTN_MULTICAST;
2351 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2352 		return -EINVAL;
2353 
2354 	if (dev_out->flags & IFF_LOOPBACK)
2355 		flags |= RTCF_LOCAL;
2356 
2357 	/* get a working reference to the inet device */
2358 	in_dev = in_dev_get(dev_out);
2359 	if (!in_dev)
2360 		return -EINVAL;
2361 
2362 	if (res->type == RTN_BROADCAST) {
2363 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2364 		if (res->fi) {
2365 			fib_info_put(res->fi);
2366 			res->fi = NULL;
2367 		}
2368 	} else if (res->type == RTN_MULTICAST) {
2369 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2370 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2371 				 oldflp->proto))
2372 			flags &= ~RTCF_LOCAL;
2373 		/* If a multicast route does not exist, use
2374 		   the default one, but do not use a gateway in this case.
2375 		   Yes, it is a hack.
2376 		 */
2377 		if (res->fi && res->prefixlen < 4) {
2378 			fib_info_put(res->fi);
2379 			res->fi = NULL;
2380 		}
2381 	}
2382 
2383 
2384 	rth = dst_alloc(&ipv4_dst_ops);
2385 	if (!rth) {
2386 		err = -ENOBUFS;
2387 		goto cleanup;
2388 	}
2389 
2390 	atomic_set(&rth->u.dst.__refcnt, 1);
2391 	rth->u.dst.flags= DST_HOST;
2392 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2393 		rth->u.dst.flags |= DST_NOXFRM;
2394 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2395 		rth->u.dst.flags |= DST_NOPOLICY;
2396 
2397 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2398 	rth->fl.fl4_tos	= tos;
2399 	rth->fl.fl4_src	= oldflp->fl4_src;
2400 	rth->fl.oif	= oldflp->oif;
2401 	rth->fl.mark    = oldflp->mark;
2402 	rth->rt_dst	= fl->fl4_dst;
2403 	rth->rt_src	= fl->fl4_src;
2404 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2405 	/* get references to the devices that are to be held by the routing
2406 	   cache entry */
2407 	rth->u.dst.dev	= dev_out;
2408 	dev_hold(dev_out);
2409 	rth->idev	= in_dev_get(dev_out);
2410 	rth->rt_gateway = fl->fl4_dst;
2411 	rth->rt_spec_dst= fl->fl4_src;
2412 
2413 	rth->u.dst.output=ip_output;
2414 	rth->rt_genid = rt_genid(dev_net(dev_out));
2415 
2416 	RT_CACHE_STAT_INC(out_slow_tot);
2417 
2418 	if (flags & RTCF_LOCAL) {
2419 		rth->u.dst.input = ip_local_deliver;
2420 		rth->rt_spec_dst = fl->fl4_dst;
2421 	}
2422 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2423 		rth->rt_spec_dst = fl->fl4_src;
2424 		if (flags & RTCF_LOCAL &&
2425 		    !(dev_out->flags & IFF_LOOPBACK)) {
2426 			rth->u.dst.output = ip_mc_output;
2427 			RT_CACHE_STAT_INC(out_slow_mc);
2428 		}
2429 #ifdef CONFIG_IP_MROUTE
2430 		if (res->type == RTN_MULTICAST) {
2431 			if (IN_DEV_MFORWARD(in_dev) &&
2432 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2433 				rth->u.dst.input = ip_mr_input;
2434 				rth->u.dst.output = ip_mc_output;
2435 			}
2436 		}
2437 #endif
2438 	}
2439 
2440 	rt_set_nexthop(rth, res, 0);
2441 
2442 	rth->rt_flags = flags;
2443 
2444 	*result = rth;
2445  cleanup:
2446 	/* release the working reference to the inet device */
2447 	in_dev_put(in_dev);
2448 
2449 	return err;
2450 }
2451 
2452 static int ip_mkroute_output(struct rtable **rp,
2453 			     struct fib_result *res,
2454 			     const struct flowi *fl,
2455 			     const struct flowi *oldflp,
2456 			     struct net_device *dev_out,
2457 			     unsigned flags)
2458 {
2459 	struct rtable *rth = NULL;
2460 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2461 	unsigned hash;
2462 	if (err == 0) {
2463 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2464 			       rt_genid(dev_net(dev_out)));
2465 		err = rt_intern_hash(hash, rth, rp, NULL);
2466 	}
2467 
2468 	return err;
2469 }
2470 
2471 /*
2472  * Major route resolver routine.
2473  */
2474 
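/*
 * Resolution proceeds in stages: sanity-check any requested source
 * address, resolve an explicitly requested output interface, special-case
 * local, multicast and limited-broadcast destinations, then consult the
 * FIB.  Multipath/default-route selection happens before the cache entry
 * is finally built by ip_mkroute_output().
 */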
2475 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2476 				const struct flowi *oldflp)
2477 {
2478 	u32 tos	= RT_FL_TOS(oldflp);
2479 	struct flowi fl = { .nl_u = { .ip4_u =
2480 				      { .daddr = oldflp->fl4_dst,
2481 					.saddr = oldflp->fl4_src,
2482 					.tos = tos & IPTOS_RT_MASK,
2483 					.scope = ((tos & RTO_ONLINK) ?
2484 						  RT_SCOPE_LINK :
2485 						  RT_SCOPE_UNIVERSE),
2486 				      } },
2487 			    .mark = oldflp->mark,
2488 			    .iif = net->loopback_dev->ifindex,
2489 			    .oif = oldflp->oif };
2490 	struct fib_result res;
2491 	unsigned flags = 0;
2492 	struct net_device *dev_out = NULL;
2493 	int free_res = 0;
2494 	int err;
2495 
2496 
2497 	res.fi		= NULL;
2498 #ifdef CONFIG_IP_MULTIPLE_TABLES
2499 	res.r		= NULL;
2500 #endif
2501 
2502 	if (oldflp->fl4_src) {
2503 		err = -EINVAL;
2504 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2505 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2506 		    ipv4_is_zeronet(oldflp->fl4_src))
2507 			goto out;
2508 
2509 		/* I removed the check for oif == dev_out->oif here.
2510 		   It was wrong for two reasons:
2511 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2512 		      is assigned to multiple interfaces.
2513 		   2. Moreover, we are allowed to send packets with the saddr
2514 		      of another iface. --ANK
2515 		 */
2516 
2517 		if (oldflp->oif == 0
2518 		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2519 			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2520 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2521 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2522 			if (dev_out == NULL)
2523 				goto out;
2524 
2525 			/* Special hack: the user can direct multicasts
2526 			   and limited broadcast via the necessary interface
2527 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2528 			   This hack is not just for fun, it allows
2529 			   vic, vat and friends to work.
2530 			   They bind the socket to loopback, set ttl to zero
2531 			   and expect that it will work.
2532 			   From the viewpoint of the routing cache they are broken,
2533 			   because we are not allowed to build a multicast path
2534 			   with a loopback source addr (look, the routing cache
2535 			   cannot know that ttl is zero, so the packet
2536 			   will not leave this host and the route is valid).
2537 			   Luckily, this hack is a good workaround.
2538 			 */
2539 
2540 			fl.oif = dev_out->ifindex;
2541 			goto make_route;
2542 		}
2543 
2544 		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2545 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2546 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2547 			if (dev_out == NULL)
2548 				goto out;
2549 			dev_put(dev_out);
2550 			dev_out = NULL;
2551 		}
2552 	}
2553 
2554 
2555 	if (oldflp->oif) {
2556 		dev_out = dev_get_by_index(net, oldflp->oif);
2557 		err = -ENODEV;
2558 		if (dev_out == NULL)
2559 			goto out;
2560 
2561 		/* RACE: Check return value of inet_select_addr instead. */
2562 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2563 			dev_put(dev_out);
2564 			goto out;	/* Wrong error code */
2565 		}
2566 
2567 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2568 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2569 			if (!fl.fl4_src)
2570 				fl.fl4_src = inet_select_addr(dev_out, 0,
2571 							      RT_SCOPE_LINK);
2572 			goto make_route;
2573 		}
2574 		if (!fl.fl4_src) {
2575 			if (ipv4_is_multicast(oldflp->fl4_dst))
2576 				fl.fl4_src = inet_select_addr(dev_out, 0,
2577 							      fl.fl4_scope);
2578 			else if (!oldflp->fl4_dst)
2579 				fl.fl4_src = inet_select_addr(dev_out, 0,
2580 							      RT_SCOPE_HOST);
2581 		}
2582 	}
2583 
2584 	if (!fl.fl4_dst) {
2585 		fl.fl4_dst = fl.fl4_src;
2586 		if (!fl.fl4_dst)
2587 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2588 		if (dev_out)
2589 			dev_put(dev_out);
2590 		dev_out = net->loopback_dev;
2591 		dev_hold(dev_out);
2592 		fl.oif = net->loopback_dev->ifindex;
2593 		res.type = RTN_LOCAL;
2594 		flags |= RTCF_LOCAL;
2595 		goto make_route;
2596 	}
2597 
2598 	if (fib_lookup(net, &fl, &res)) {
2599 		res.fi = NULL;
2600 		if (oldflp->oif) {
2601 			/* Apparently, the routing tables are wrong. Assume
2602 			   that the destination is on link.
2603 
2604 			   WHY? DW.
2605 			   Because we are allowed to send to an iface
2606 			   even if it has NO routes and NO assigned
2607 			   addresses. When oif is specified, the routing
2608 			   tables are looked up with only one purpose:
2609 			   to catch whether the destination is gatewayed rather
2610 			   than direct. Moreover, if MSG_DONTROUTE is set,
2611 			   we send the packet, ignoring both the routing tables
2612 			   and the ifaddr state. --ANK
2613 
2614 
2615 			   We could do it even when oif is unknown,
2616 			   likely as in IPv6, but we do not.
2617 			 */
2618 
2619 			if (fl.fl4_src == 0)
2620 				fl.fl4_src = inet_select_addr(dev_out, 0,
2621 							      RT_SCOPE_LINK);
2622 			res.type = RTN_UNICAST;
2623 			goto make_route;
2624 		}
2625 		if (dev_out)
2626 			dev_put(dev_out);
2627 		err = -ENETUNREACH;
2628 		goto out;
2629 	}
2630 	free_res = 1;
2631 
2632 	if (res.type == RTN_LOCAL) {
2633 		if (!fl.fl4_src)
2634 			fl.fl4_src = fl.fl4_dst;
2635 		if (dev_out)
2636 			dev_put(dev_out);
2637 		dev_out = net->loopback_dev;
2638 		dev_hold(dev_out);
2639 		fl.oif = dev_out->ifindex;
2640 		if (res.fi)
2641 			fib_info_put(res.fi);
2642 		res.fi = NULL;
2643 		flags |= RTCF_LOCAL;
2644 		goto make_route;
2645 	}
2646 
2647 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2648 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2649 		fib_select_multipath(&fl, &res);
2650 	else
2651 #endif
2652 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2653 		fib_select_default(net, &fl, &res);
2654 
2655 	if (!fl.fl4_src)
2656 		fl.fl4_src = FIB_RES_PREFSRC(res);
2657 
2658 	if (dev_out)
2659 		dev_put(dev_out);
2660 	dev_out = FIB_RES_DEV(res);
2661 	dev_hold(dev_out);
2662 	fl.oif = dev_out->ifindex;
2663 
2664 
2665 make_route:
2666 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2667 
2668 
2669 	if (free_res)
2670 		fib_res_put(&res);
2671 	if (dev_out)
2672 		dev_put(dev_out);
2673 out:	return err;
2674 }
2675 
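/*
 * Output route lookup without any xfrm policy handling: check the route
 * cache first (under rcu_read_lock_bh), and fall back to the slow
 * resolver on a miss.  ip_route_output_flow() wraps this and applies
 * IPsec policy when flp->proto is set.
 */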
2676 int __ip_route_output_key(struct net *net, struct rtable **rp,
2677 			  const struct flowi *flp)
2678 {
2679 	unsigned hash;
2680 	struct rtable *rth;
2681 
2682 	if (!rt_caching(net))
2683 		goto slow_output;
2684 
2685 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2686 
2687 	rcu_read_lock_bh();
2688 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2689 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2690 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2691 		    rth->fl.fl4_src == flp->fl4_src &&
2692 		    rth->fl.iif == 0 &&
2693 		    rth->fl.oif == flp->oif &&
2694 		    rth->fl.mark == flp->mark &&
2695 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2696 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2697 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2698 		    !rt_is_expired(rth)) {
2699 			dst_use(&rth->u.dst, jiffies);
2700 			RT_CACHE_STAT_INC(out_hit);
2701 			rcu_read_unlock_bh();
2702 			*rp = rth;
2703 			return 0;
2704 		}
2705 		RT_CACHE_STAT_INC(out_hlist_search);
2706 	}
2707 	rcu_read_unlock_bh();
2708 
2709 slow_output:
2710 	return ip_route_output_slow(net, rp, flp);
2711 }
2712 
2713 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2714 
2715 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2716 {
2717 }
2718 
2719 static struct dst_ops ipv4_dst_blackhole_ops = {
2720 	.family			=	AF_INET,
2721 	.protocol		=	cpu_to_be16(ETH_P_IP),
2722 	.destroy		=	ipv4_dst_destroy,
2723 	.check			=	ipv4_dst_check,
2724 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2725 	.entries		=	ATOMIC_INIT(0),
2726 };
2727 
2728 
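/*
 * Replace *rp with a "blackhole" copy of the route whose input and output
 * handlers simply discard packets.  Used when __xfrm_lookup() returns
 * -EREMOTE, typically while IPsec key negotiation is still in progress.
 */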
2729 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2730 {
2731 	struct rtable *ort = *rp;
2732 	struct rtable *rt = (struct rtable *)
2733 		dst_alloc(&ipv4_dst_blackhole_ops);
2734 
2735 	if (rt) {
2736 		struct dst_entry *new = &rt->u.dst;
2737 
2738 		atomic_set(&new->__refcnt, 1);
2739 		new->__use = 1;
2740 		new->input = dst_discard;
2741 		new->output = dst_discard;
2742 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2743 
2744 		new->dev = ort->u.dst.dev;
2745 		if (new->dev)
2746 			dev_hold(new->dev);
2747 
2748 		rt->fl = ort->fl;
2749 
2750 		rt->idev = ort->idev;
2751 		if (rt->idev)
2752 			in_dev_hold(rt->idev);
2753 		rt->rt_genid = rt_genid(net);
2754 		rt->rt_flags = ort->rt_flags;
2755 		rt->rt_type = ort->rt_type;
2756 		rt->rt_dst = ort->rt_dst;
2757 		rt->rt_src = ort->rt_src;
2758 		rt->rt_iif = ort->rt_iif;
2759 		rt->rt_gateway = ort->rt_gateway;
2760 		rt->rt_spec_dst = ort->rt_spec_dst;
2761 		rt->peer = ort->peer;
2762 		if (rt->peer)
2763 			atomic_inc(&rt->peer->refcnt);
2764 
2765 		dst_free(new);
2766 	}
2767 
2768 	dst_release(&(*rp)->u.dst);
2769 	*rp = rt;
2770 	return (rt ? 0 : -ENOMEM);
2771 }
2772 
2773 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2774 			 struct sock *sk, int flags)
2775 {
2776 	int err;
2777 
2778 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2779 		return err;
2780 
2781 	if (flp->proto) {
2782 		if (!flp->fl4_src)
2783 			flp->fl4_src = (*rp)->rt_src;
2784 		if (!flp->fl4_dst)
2785 			flp->fl4_dst = (*rp)->rt_dst;
2786 		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2787 				    flags ? XFRM_LOOKUP_WAIT : 0);
2788 		if (err == -EREMOTE)
2789 			err = ipv4_dst_blackhole(net, rp, flp);
2790 
2791 		return err;
2792 	}
2793 
2794 	return 0;
2795 }
2796 
2797 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2798 
2799 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2800 {
2801 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2802 }
2803 
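/*
 * Fill an RTM netlink message describing the route attached to @skb:
 * destination, source, output interface, gateway, metrics and cache
 * info.  Multicast input routes may additionally be resolved through
 * ipmr_get_route().
 */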
2804 static int rt_fill_info(struct net *net,
2805 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2806 			int nowait, unsigned int flags)
2807 {
2808 	struct rtable *rt = skb_rtable(skb);
2809 	struct rtmsg *r;
2810 	struct nlmsghdr *nlh;
2811 	long expires;
2812 	u32 id = 0, ts = 0, tsage = 0, error;
2813 
2814 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2815 	if (nlh == NULL)
2816 		return -EMSGSIZE;
2817 
2818 	r = nlmsg_data(nlh);
2819 	r->rtm_family	 = AF_INET;
2820 	r->rtm_dst_len	= 32;
2821 	r->rtm_src_len	= 0;
2822 	r->rtm_tos	= rt->fl.fl4_tos;
2823 	r->rtm_table	= RT_TABLE_MAIN;
2824 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2825 	r->rtm_type	= rt->rt_type;
2826 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2827 	r->rtm_protocol = RTPROT_UNSPEC;
2828 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2829 	if (rt->rt_flags & RTCF_NOTIFY)
2830 		r->rtm_flags |= RTM_F_NOTIFY;
2831 
2832 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2833 
2834 	if (rt->fl.fl4_src) {
2835 		r->rtm_src_len = 32;
2836 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2837 	}
2838 	if (rt->u.dst.dev)
2839 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2840 #ifdef CONFIG_NET_CLS_ROUTE
2841 	if (rt->u.dst.tclassid)
2842 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2843 #endif
2844 	if (rt->fl.iif)
2845 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2846 	else if (rt->rt_src != rt->fl.fl4_src)
2847 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2848 
2849 	if (rt->rt_dst != rt->rt_gateway)
2850 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2851 
2852 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2853 		goto nla_put_failure;
2854 
2855 	error = rt->u.dst.error;
2856 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2857 	if (rt->peer) {
2858 		id = rt->peer->ip_id_count;
2859 		if (rt->peer->tcp_ts_stamp) {
2860 			ts = rt->peer->tcp_ts;
2861 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2862 		}
2863 	}
2864 
2865 	if (rt->fl.iif) {
2866 #ifdef CONFIG_IP_MROUTE
2867 		__be32 dst = rt->rt_dst;
2868 
2869 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2870 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2871 			int err = ipmr_get_route(net, skb, r, nowait);
2872 			if (err <= 0) {
2873 				if (!nowait) {
2874 					if (err == 0)
2875 						return 0;
2876 					goto nla_put_failure;
2877 				} else {
2878 					if (err == -EMSGSIZE)
2879 						goto nla_put_failure;
2880 					error = err;
2881 				}
2882 			}
2883 		} else
2884 #endif
2885 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2886 	}
2887 
2888 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2889 			       expires, error) < 0)
2890 		goto nla_put_failure;
2891 
2892 	return nlmsg_end(skb, nlh);
2893 
2894 nla_put_failure:
2895 	nlmsg_cancel(skb, nlh);
2896 	return -EMSGSIZE;
2897 }
2898 
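/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the requested
 * destination either through ip_route_input() (when RTA_IIF is given) or
 * ip_route_output_key(), and unicast the answer back via rt_fill_info().
 * This is what e.g. "ip route get" talks to.
 */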
2899 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2900 {
2901 	struct net *net = sock_net(in_skb->sk);
2902 	struct rtmsg *rtm;
2903 	struct nlattr *tb[RTA_MAX+1];
2904 	struct rtable *rt = NULL;
2905 	__be32 dst = 0;
2906 	__be32 src = 0;
2907 	u32 iif;
2908 	int err;
2909 	struct sk_buff *skb;
2910 
2911 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2912 	if (err < 0)
2913 		goto errout;
2914 
2915 	rtm = nlmsg_data(nlh);
2916 
2917 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2918 	if (skb == NULL) {
2919 		err = -ENOBUFS;
2920 		goto errout;
2921 	}
2922 
2923 	/* Reserve room for dummy headers; this skb can pass
2924 	   through a good chunk of the routing engine.
2925 	 */
2926 	skb_reset_mac_header(skb);
2927 	skb_reset_network_header(skb);
2928 
2929 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2930 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2931 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2932 
2933 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2934 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2935 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2936 
2937 	if (iif) {
2938 		struct net_device *dev;
2939 
2940 		dev = __dev_get_by_index(net, iif);
2941 		if (dev == NULL) {
2942 			err = -ENODEV;
2943 			goto errout_free;
2944 		}
2945 
2946 		skb->protocol	= htons(ETH_P_IP);
2947 		skb->dev	= dev;
2948 		local_bh_disable();
2949 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2950 		local_bh_enable();
2951 
2952 		rt = skb_rtable(skb);
2953 		if (err == 0 && rt->u.dst.error)
2954 			err = -rt->u.dst.error;
2955 	} else {
2956 		struct flowi fl = {
2957 			.nl_u = {
2958 				.ip4_u = {
2959 					.daddr = dst,
2960 					.saddr = src,
2961 					.tos = rtm->rtm_tos,
2962 				},
2963 			},
2964 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2965 		};
2966 		err = ip_route_output_key(net, &rt, &fl);
2967 	}
2968 
2969 	if (err)
2970 		goto errout_free;
2971 
2972 	skb_dst_set(skb, &rt->u.dst);
2973 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2974 		rt->rt_flags |= RTCF_NOTIFY;
2975 
2976 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2977 			   RTM_NEWROUTE, 0, 0);
2978 	if (err <= 0)
2979 		goto errout_free;
2980 
2981 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2982 errout:
2983 	return err;
2984 
2985 errout_free:
2986 	kfree_skb(skb);
2987 	goto errout;
2988 }
2989 
2990 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2991 {
2992 	struct rtable *rt;
2993 	int h, s_h;
2994 	int idx, s_idx;
2995 	struct net *net;
2996 
2997 	net = sock_net(skb->sk);
2998 
2999 	s_h = cb->args[0];
3000 	if (s_h < 0)
3001 		s_h = 0;
3002 	s_idx = idx = cb->args[1];
3003 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3004 		if (!rt_hash_table[h].chain)
3005 			continue;
3006 		rcu_read_lock_bh();
3007 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
3008 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
3009 			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3010 				continue;
3011 			if (rt_is_expired(rt))
3012 				continue;
3013 			skb_dst_set(skb, dst_clone(&rt->u.dst));
3014 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3015 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3016 					 1, NLM_F_MULTI) <= 0) {
3017 				skb_dst_drop(skb);
3018 				rcu_read_unlock_bh();
3019 				goto done;
3020 			}
3021 			skb_dst_drop(skb);
3022 		}
3023 		rcu_read_unlock_bh();
3024 	}
3025 
3026 done:
3027 	cb->args[0] = h;
3028 	cb->args[1] = idx;
3029 	return skb->len;
3030 }
3031 
3032 void ip_rt_multicast_event(struct in_device *in_dev)
3033 {
3034 	rt_cache_flush(dev_net(in_dev->dev), 0);
3035 }
3036 
3037 #ifdef CONFIG_SYSCTL
3038 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3039 					struct file *filp, void __user *buffer,
3040 					size_t *lenp, loff_t *ppos)
3041 {
3042 	if (write) {
3043 		int flush_delay;
3044 		ctl_table ctl;
3045 		struct net *net;
3046 
3047 		memcpy(&ctl, __ctl, sizeof(ctl));
3048 		ctl.data = &flush_delay;
3049 		proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
3050 
3051 		net = (struct net *)__ctl->extra1;
3052 		rt_cache_flush(net, flush_delay);
3053 		return 0;
3054 	}
3055 
3056 	return -EINVAL;
3057 }
3058 
3059 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3060 						void __user *oldval,
3061 						size_t __user *oldlenp,
3062 						void __user *newval,
3063 						size_t newlen)
3064 {
3065 	int delay;
3066 	struct net *net;
3067 	if (newlen != sizeof(int))
3068 		return -EINVAL;
3069 	if (get_user(delay, (int __user *)newval))
3070 		return -EFAULT;
3071 	net = (struct net *)table->extra1;
3072 	rt_cache_flush(net, delay);
3073 	return 0;
3074 }
3075 
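/*
 * Re-arm (or stop) every namespace's rt_secret_timer after
 * ip_rt_secret_interval has changed: a zero interval disables periodic
 * cache rebuilds, otherwise the remaining time is shifted by the
 * difference between the old and the new interval.
 */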
3076 static void rt_secret_reschedule(int old)
3077 {
3078 	struct net *net;
3079 	int new = ip_rt_secret_interval;
3080 	int diff = new - old;
3081 
3082 	if (!diff)
3083 		return;
3084 
3085 	rtnl_lock();
3086 	for_each_net(net) {
3087 		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3088 
3089 		if (!new)
3090 			continue;
3091 
3092 		if (deleted) {
3093 			long time = net->ipv4.rt_secret_timer.expires - jiffies;
3094 
3095 			if (time <= 0 || (time += diff) <= 0)
3096 				time = 0;
3097 
3098 			net->ipv4.rt_secret_timer.expires = time;
3099 		} else
3100 			net->ipv4.rt_secret_timer.expires = new;
3101 
3102 		net->ipv4.rt_secret_timer.expires += jiffies;
3103 		add_timer(&net->ipv4.rt_secret_timer);
3104 	}
3105 	rtnl_unlock();
3106 }
3107 
3108 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3109 					  struct file *filp,
3110 					  void __user *buffer, size_t *lenp,
3111 					  loff_t *ppos)
3112 {
3113 	int old = ip_rt_secret_interval;
3114 	int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
3115 
3116 	rt_secret_reschedule(old);
3117 
3118 	return ret;
3119 }
3120 
3121 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3122 						   void __user *oldval,
3123 						   size_t __user *oldlenp,
3124 						   void __user *newval,
3125 						   size_t newlen)
3126 {
3127 	int old = ip_rt_secret_interval;
3128 	int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3129 
3130 	rt_secret_reschedule(old);
3131 
3132 	return ret;
3133 }
3134 
3135 static ctl_table ipv4_route_table[] = {
3136 	{
3137 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
3138 		.procname	= "gc_thresh",
3139 		.data		= &ipv4_dst_ops.gc_thresh,
3140 		.maxlen		= sizeof(int),
3141 		.mode		= 0644,
3142 		.proc_handler	= proc_dointvec,
3143 	},
3144 	{
3145 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
3146 		.procname	= "max_size",
3147 		.data		= &ip_rt_max_size,
3148 		.maxlen		= sizeof(int),
3149 		.mode		= 0644,
3150 		.proc_handler	= proc_dointvec,
3151 	},
3152 	{
3153 		/*  Deprecated. Use gc_min_interval_ms */
3154 
3155 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3156 		.procname	= "gc_min_interval",
3157 		.data		= &ip_rt_gc_min_interval,
3158 		.maxlen		= sizeof(int),
3159 		.mode		= 0644,
3160 		.proc_handler	= proc_dointvec_jiffies,
3161 		.strategy	= sysctl_jiffies,
3162 	},
3163 	{
3164 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3165 		.procname	= "gc_min_interval_ms",
3166 		.data		= &ip_rt_gc_min_interval,
3167 		.maxlen		= sizeof(int),
3168 		.mode		= 0644,
3169 		.proc_handler	= proc_dointvec_ms_jiffies,
3170 		.strategy	= sysctl_ms_jiffies,
3171 	},
3172 	{
3173 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
3174 		.procname	= "gc_timeout",
3175 		.data		= &ip_rt_gc_timeout,
3176 		.maxlen		= sizeof(int),
3177 		.mode		= 0644,
3178 		.proc_handler	= proc_dointvec_jiffies,
3179 		.strategy	= sysctl_jiffies,
3180 	},
3181 	{
3182 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
3183 		.procname	= "gc_interval",
3184 		.data		= &ip_rt_gc_interval,
3185 		.maxlen		= sizeof(int),
3186 		.mode		= 0644,
3187 		.proc_handler	= proc_dointvec_jiffies,
3188 		.strategy	= sysctl_jiffies,
3189 	},
3190 	{
3191 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
3192 		.procname	= "redirect_load",
3193 		.data		= &ip_rt_redirect_load,
3194 		.maxlen		= sizeof(int),
3195 		.mode		= 0644,
3196 		.proc_handler	= proc_dointvec,
3197 	},
3198 	{
3199 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
3200 		.procname	= "redirect_number",
3201 		.data		= &ip_rt_redirect_number,
3202 		.maxlen		= sizeof(int),
3203 		.mode		= 0644,
3204 		.proc_handler	= proc_dointvec,
3205 	},
3206 	{
3207 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
3208 		.procname	= "redirect_silence",
3209 		.data		= &ip_rt_redirect_silence,
3210 		.maxlen		= sizeof(int),
3211 		.mode		= 0644,
3212 		.proc_handler	= proc_dointvec,
3213 	},
3214 	{
3215 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
3216 		.procname	= "error_cost",
3217 		.data		= &ip_rt_error_cost,
3218 		.maxlen		= sizeof(int),
3219 		.mode		= 0644,
3220 		.proc_handler	= proc_dointvec,
3221 	},
3222 	{
3223 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3224 		.procname	= "error_burst",
3225 		.data		= &ip_rt_error_burst,
3226 		.maxlen		= sizeof(int),
3227 		.mode		= 0644,
3228 		.proc_handler	= proc_dointvec,
3229 	},
3230 	{
3231 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3232 		.procname	= "gc_elasticity",
3233 		.data		= &ip_rt_gc_elasticity,
3234 		.maxlen		= sizeof(int),
3235 		.mode		= 0644,
3236 		.proc_handler	= proc_dointvec,
3237 	},
3238 	{
3239 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3240 		.procname	= "mtu_expires",
3241 		.data		= &ip_rt_mtu_expires,
3242 		.maxlen		= sizeof(int),
3243 		.mode		= 0644,
3244 		.proc_handler	= proc_dointvec_jiffies,
3245 		.strategy	= sysctl_jiffies,
3246 	},
3247 	{
3248 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3249 		.procname	= "min_pmtu",
3250 		.data		= &ip_rt_min_pmtu,
3251 		.maxlen		= sizeof(int),
3252 		.mode		= 0644,
3253 		.proc_handler	= proc_dointvec,
3254 	},
3255 	{
3256 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3257 		.procname	= "min_adv_mss",
3258 		.data		= &ip_rt_min_advmss,
3259 		.maxlen		= sizeof(int),
3260 		.mode		= 0644,
3261 		.proc_handler	= proc_dointvec,
3262 	},
3263 	{
3264 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3265 		.procname	= "secret_interval",
3266 		.data		= &ip_rt_secret_interval,
3267 		.maxlen		= sizeof(int),
3268 		.mode		= 0644,
3269 		.proc_handler	= ipv4_sysctl_rt_secret_interval,
3270 		.strategy	= ipv4_sysctl_rt_secret_interval_strategy,
3271 	},
3272 	{ .ctl_name = 0 }
3273 };
3274 
3275 static struct ctl_table empty[1];
3276 
3277 static struct ctl_table ipv4_skeleton[] =
3278 {
3279 	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3280 	  .mode = 0555, .child = ipv4_route_table},
3281 	{ .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3282 	  .mode = 0555, .child = empty},
3283 	{ }
3284 };
3285 
3286 static __net_initdata struct ctl_path ipv4_path[] = {
3287 	{ .procname = "net", .ctl_name = CTL_NET, },
3288 	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
3289 	{ },
3290 };
3291 
3292 static struct ctl_table ipv4_route_flush_table[] = {
3293 	{
3294 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
3295 		.procname	= "flush",
3296 		.maxlen		= sizeof(int),
3297 		.mode		= 0200,
3298 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3299 		.strategy	= ipv4_sysctl_rtcache_flush_strategy,
3300 	},
3301 	{ .ctl_name = 0 },
3302 };
3303 
3304 static __net_initdata struct ctl_path ipv4_route_path[] = {
3305 	{ .procname = "net", .ctl_name = CTL_NET, },
3306 	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
3307 	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3308 	{ },
3309 };
3310 
3311 static __net_init int sysctl_route_net_init(struct net *net)
3312 {
3313 	struct ctl_table *tbl;
3314 
3315 	tbl = ipv4_route_flush_table;
3316 	if (net != &init_net) {
3317 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3318 		if (tbl == NULL)
3319 			goto err_dup;
3320 	}
3321 	tbl[0].extra1 = net;
3322 
3323 	net->ipv4.route_hdr =
3324 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3325 	if (net->ipv4.route_hdr == NULL)
3326 		goto err_reg;
3327 	return 0;
3328 
3329 err_reg:
3330 	if (tbl != ipv4_route_flush_table)
3331 		kfree(tbl);
3332 err_dup:
3333 	return -ENOMEM;
3334 }
3335 
3336 static __net_exit void sysctl_route_net_exit(struct net *net)
3337 {
3338 	struct ctl_table *tbl;
3339 
3340 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3341 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3342 	BUG_ON(tbl == ipv4_route_flush_table);
3343 	kfree(tbl);
3344 }
3345 
3346 static __net_initdata struct pernet_operations sysctl_route_ops = {
3347 	.init = sysctl_route_net_init,
3348 	.exit = sysctl_route_net_exit,
3349 };
3350 #endif
3351 
3352 
3353 static __net_init int rt_secret_timer_init(struct net *net)
3354 {
3355 	atomic_set(&net->ipv4.rt_genid,
3356 			(int) ((num_physpages ^ (num_physpages>>8)) ^
3357 			(jiffies ^ (jiffies >> 7))));
3358 
3359 	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3360 	net->ipv4.rt_secret_timer.data = (unsigned long)net;
3361 	init_timer_deferrable(&net->ipv4.rt_secret_timer);
3362 
3363 	if (ip_rt_secret_interval) {
3364 		net->ipv4.rt_secret_timer.expires =
3365 			jiffies + net_random() % ip_rt_secret_interval +
3366 			ip_rt_secret_interval;
3367 		add_timer(&net->ipv4.rt_secret_timer);
3368 	}
3369 	return 0;
3370 }
3371 
3372 static __net_exit void rt_secret_timer_exit(struct net *net)
3373 {
3374 	del_timer_sync(&net->ipv4.rt_secret_timer);
3375 }
3376 
3377 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3378 	.init = rt_secret_timer_init,
3379 	.exit = rt_secret_timer_exit,
3380 };
3381 
3382 
3383 #ifdef CONFIG_NET_CLS_ROUTE
3384 struct ip_rt_acct *ip_rt_acct __read_mostly;
3385 #endif /* CONFIG_NET_CLS_ROUTE */
3386 
3387 static __initdata unsigned long rhash_entries;
3388 static int __init set_rhash_entries(char *str)
3389 {
3390 	if (!str)
3391 		return 0;
3392 	rhash_entries = simple_strtoul(str, &str, 0);
3393 	return 1;
3394 }
3395 __setup("rhash_entries=", set_rhash_entries);
3396 
3397 int __init ip_rt_init(void)
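/*
 * Boot-time initialization of the IPv4 routing subsystem: allocate the
 * dst slab cache and the route cache hash table (sized from memory or
 * the "rhash_entries=" boot parameter), derive gc_thresh/max_size from
 * the table size, start the cache expiry worker, and register the
 * per-netns secret timer, /proc files, xfrm hooks, the RTM_GETROUTE
 * handler and the sysctls.
 */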
3398 {
3399 	int rc = 0;
3400 
3401 #ifdef CONFIG_NET_CLS_ROUTE
3402 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3403 	if (!ip_rt_acct)
3404 		panic("IP: failed to allocate ip_rt_acct\n");
3405 #endif
3406 
3407 	ipv4_dst_ops.kmem_cachep =
3408 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3409 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3410 
3411 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3412 
3413 	rt_hash_table = (struct rt_hash_bucket *)
3414 		alloc_large_system_hash("IP route cache",
3415 					sizeof(struct rt_hash_bucket),
3416 					rhash_entries,
3417 					(num_physpages >= 128 * 1024) ?
3418 					15 : 17,
3419 					0,
3420 					&rt_hash_log,
3421 					&rt_hash_mask,
3422 					rhash_entries ? 0 : 512 * 1024);
3423 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3424 	rt_hash_lock_init();
3425 
3426 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3427 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3428 
3429 	devinet_init();
3430 	ip_fib_init();
3431 
3432 	/* All the timers started at system startup tend
3433 	   to synchronize. Perturb them a bit.
3434 	 */
3435 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3436 	expires_ljiffies = jiffies;
3437 	schedule_delayed_work(&expires_work,
3438 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3439 
3440 	if (register_pernet_subsys(&rt_secret_timer_ops))
3441 		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3442 
3443 	if (ip_rt_proc_init())
3444 		printk(KERN_ERR "Unable to create route proc files\n");
3445 #ifdef CONFIG_XFRM
3446 	xfrm_init();
3447 	xfrm4_init(ip_rt_max_size);
3448 #endif
3449 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3450 
3451 #ifdef CONFIG_SYSCTL
3452 	register_pernet_subsys(&sysctl_route_ops);
3453 #endif
3454 	return rc;
3455 }
3456 
3457 #ifdef CONFIG_SYSCTL
3458 /*
3459  * We really need to sanitize the damn ipv4 init order, then all
3460  * this nonsense will go away.
3461  */
3462 void __init ip_static_sysctl_init(void)
3463 {
3464 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3465 }
3466 #endif
3467 
3468 EXPORT_SYMBOL(__ip_select_ident);
3469 EXPORT_SYMBOL(ip_route_input);
3470 EXPORT_SYMBOL(ip_route_output_key);
3471