xref: /openbmc/linux/net/ipv4/route.c (revision 78c99ba1)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 
111 #define RT_FL_TOS(oldflp) \
112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 
114 #define IP_MAX_MTU	0xFFF0
115 
116 #define RT_GC_TIMEOUT (300*HZ)
117 
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly	= 8;
128 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly	= 256;
131 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
132 static int rt_chain_length_max __read_mostly	= 20;
133 
134 static void rt_worker_func(struct work_struct *work);
135 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
136 
137 /*
138  *	Interface to generic destination cache.
139  */
140 
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void		 ipv4_dst_destroy(struct dst_entry *dst);
143 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
144 					 struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void		 ipv4_link_failure(struct sk_buff *skb);
147 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149 static void rt_emergency_hash_rebuild(struct net *net);
150 
151 
152 static struct dst_ops ipv4_dst_ops = {
153 	.family =		AF_INET,
154 	.protocol =		cpu_to_be16(ETH_P_IP),
155 	.gc =			rt_garbage_collect,
156 	.check =		ipv4_dst_check,
157 	.destroy =		ipv4_dst_destroy,
158 	.ifdown =		ipv4_dst_ifdown,
159 	.negative_advice =	ipv4_negative_advice,
160 	.link_failure =		ipv4_link_failure,
161 	.update_pmtu =		ip_rt_update_pmtu,
162 	.local_out =		__ip_local_out,
163 	.entries =		ATOMIC_INIT(0),
164 };
165 
166 #define ECN_OR_COST(class)	TC_PRIO_##class
167 
168 const __u8 ip_tos2prio[16] = {
169 	TC_PRIO_BESTEFFORT,
170 	ECN_OR_COST(FILLER),
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(BESTEFFORT),
173 	TC_PRIO_BULK,
174 	ECN_OR_COST(BULK),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_INTERACTIVE,
178 	ECN_OR_COST(INTERACTIVE),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE_BULK,
182 	ECN_OR_COST(INTERACTIVE_BULK),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK)
185 };
186 
187 
188 /*
189  * Route cache.
190  */
191 
192 /* The locking scheme is rather straightforward:
193  *
194  * 1) Read-Copy Update protects the buckets of the central route hash.
195  * 2) Only writers remove entries, and they hold the lock
196  *    as they look at rtable reference counts.
197  * 3) Only readers acquire references to rtable entries,
198  *    they do so with atomic increments and with the
199  *    lock held.
200  */
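
/*
 * Illustrative sketch (not part of the original source): a lookup that
 * follows the scheme above walks a bucket chain under rcu_read_lock()
 * and takes a reference only on the entry it returns, while writers
 * serialize on the per-bucket spinlock.  Roughly:
 *
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next)) {
 *		if (compare_keys(&rth->fl, flp) && !rt_is_expired(rth)) {
 *			dst_use(&rth->u.dst, jiffies);
 *			rcu_read_unlock();
 *			return rth;
 *		}
 *	}
 *	rcu_read_unlock();
 *
 * (Names such as "flp" stand for the caller's flow key and are
 * illustrative only; the real lookup paths in this file follow this
 * pattern.)
 */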
201 
202 struct rt_hash_bucket {
203 	struct rtable	*chain;
204 };
205 
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 	defined(CONFIG_PROVE_LOCKING)
208 /*
209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210  * The size of this table is a power of two and depends on the number of CPUs.
211  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
212  */
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ	256
215 #else
216 # if NR_CPUS >= 32
217 #  define RT_HASH_LOCK_SZ	4096
218 # elif NR_CPUS >= 16
219 #  define RT_HASH_LOCK_SZ	2048
220 # elif NR_CPUS >= 8
221 #  define RT_HASH_LOCK_SZ	1024
222 # elif NR_CPUS >= 4
223 #  define RT_HASH_LOCK_SZ	512
224 # else
225 #  define RT_HASH_LOCK_SZ	256
226 # endif
227 #endif
228 
229 static spinlock_t	*rt_hash_locks;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
231 
232 static __init void rt_hash_lock_init(void)
233 {
234 	int i;
235 
236 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 			GFP_KERNEL);
238 	if (!rt_hash_locks)
239 		panic("IP: failed to allocate rt_hash_locks\n");
240 
241 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242 		spin_lock_init(&rt_hash_locks[i]);
243 }
244 #else
245 # define rt_hash_lock_addr(slot) NULL
246 
247 static inline void rt_hash_lock_init(void)
248 {
249 }
250 #endif
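
/*
 * Illustration (not in the original source): rt_hash_lock_addr() maps
 * many hash buckets onto one lock; with RT_HASH_LOCK_SZ = 256, for
 * example, slots 0, 256, 512, ... all serialize on rt_hash_locks[0].
 * This keeps the lock table small while still spreading writer
 * contention.
 */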
251 
252 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
253 static unsigned			rt_hash_mask __read_mostly;
254 static unsigned int		rt_hash_log  __read_mostly;
255 
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 	(__raw_get_cpu_var(rt_cache_stat).field++)
259 
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 		int genid)
262 {
263 	return jhash_3words((__force u32)(__be32)(daddr),
264 			    (__force u32)(__be32)(saddr),
265 			    idx, genid)
266 		& rt_hash_mask;
267 }
268 
269 static inline int rt_genid(struct net *net)
270 {
271 	return atomic_read(&net->ipv4.rt_genid);
272 }
273 
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 	struct seq_net_private p;
277 	int bucket;
278 	int genid;
279 };
280 
281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 {
283 	struct rt_cache_iter_state *st = seq->private;
284 	struct rtable *r = NULL;
285 
286 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 		if (!rt_hash_table[st->bucket].chain)
288 			continue;
289 		rcu_read_lock_bh();
290 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
291 		while (r) {
292 			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 			    r->rt_genid == st->genid)
294 				return r;
295 			r = rcu_dereference(r->u.dst.rt_next);
296 		}
297 		rcu_read_unlock_bh();
298 	}
299 	return r;
300 }
301 
302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
303 					  struct rtable *r)
304 {
305 	struct rt_cache_iter_state *st = seq->private;
306 
307 	r = r->u.dst.rt_next;
308 	while (!r) {
309 		rcu_read_unlock_bh();
310 		do {
311 			if (--st->bucket < 0)
312 				return NULL;
313 		} while (!rt_hash_table[st->bucket].chain);
314 		rcu_read_lock_bh();
315 		r = rt_hash_table[st->bucket].chain;
316 	}
317 	return rcu_dereference(r);
318 }
319 
320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
321 					struct rtable *r)
322 {
323 	struct rt_cache_iter_state *st = seq->private;
324 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326 			continue;
327 		if (r->rt_genid == st->genid)
328 			break;
329 	}
330 	return r;
331 }
332 
333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
334 {
335 	struct rtable *r = rt_cache_get_first(seq);
336 
337 	if (r)
338 		while (pos && (r = rt_cache_get_next(seq, r)))
339 			--pos;
340 	return pos ? NULL : r;
341 }
342 
343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344 {
345 	struct rt_cache_iter_state *st = seq->private;
346 	if (*pos)
347 		return rt_cache_get_idx(seq, *pos - 1);
348 	st->genid = rt_genid(seq_file_net(seq));
349 	return SEQ_START_TOKEN;
350 }
351 
352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353 {
354 	struct rtable *r;
355 
356 	if (v == SEQ_START_TOKEN)
357 		r = rt_cache_get_first(seq);
358 	else
359 		r = rt_cache_get_next(seq, v);
360 	++*pos;
361 	return r;
362 }
363 
364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365 {
366 	if (v && v != SEQ_START_TOKEN)
367 		rcu_read_unlock_bh();
368 }
369 
370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
371 {
372 	if (v == SEQ_START_TOKEN)
373 		seq_printf(seq, "%-127s\n",
374 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 			   "HHUptod\tSpecDst");
377 	else {
378 		struct rtable *r = v;
379 		int len;
380 
381 		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 			r->u.dst.dev ? r->u.dst.dev->name : "*",
384 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 			dst_metric(&r->u.dst, RTAX_WINDOW),
390 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 			r->fl.fl4_tos,
393 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 				       dev_queue_xmit) : 0,
396 			r->rt_spec_dst, &len);
397 
398 		seq_printf(seq, "%*s\n", 127 - len, "");
399 	}
400 	return 0;
401 }
402 
403 static const struct seq_operations rt_cache_seq_ops = {
404 	.start  = rt_cache_seq_start,
405 	.next   = rt_cache_seq_next,
406 	.stop   = rt_cache_seq_stop,
407 	.show   = rt_cache_seq_show,
408 };
409 
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
411 {
412 	return seq_open_net(inode, file, &rt_cache_seq_ops,
413 			sizeof(struct rt_cache_iter_state));
414 }
415 
416 static const struct file_operations rt_cache_seq_fops = {
417 	.owner	 = THIS_MODULE,
418 	.open	 = rt_cache_seq_open,
419 	.read	 = seq_read,
420 	.llseek	 = seq_lseek,
421 	.release = seq_release_net,
422 };
423 
424 
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
426 {
427 	int cpu;
428 
429 	if (*pos == 0)
430 		return SEQ_START_TOKEN;
431 
432 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 		if (!cpu_possible(cpu))
434 			continue;
435 		*pos = cpu+1;
436 		return &per_cpu(rt_cache_stat, cpu);
437 	}
438 	return NULL;
439 }
440 
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
442 {
443 	int cpu;
444 
445 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 		if (!cpu_possible(cpu))
447 			continue;
448 		*pos = cpu+1;
449 		return &per_cpu(rt_cache_stat, cpu);
450 	}
451 	return NULL;
452 
453 }
454 
455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
456 {
457 
458 }
459 
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
461 {
462 	struct rt_cache_stat *st = v;
463 
464 	if (v == SEQ_START_TOKEN) {
465 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 		return 0;
467 	}
468 
469 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
470 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 		   atomic_read(&ipv4_dst_ops.entries),
472 		   st->in_hit,
473 		   st->in_slow_tot,
474 		   st->in_slow_mc,
475 		   st->in_no_route,
476 		   st->in_brd,
477 		   st->in_martian_dst,
478 		   st->in_martian_src,
479 
480 		   st->out_hit,
481 		   st->out_slow_tot,
482 		   st->out_slow_mc,
483 
484 		   st->gc_total,
485 		   st->gc_ignored,
486 		   st->gc_goal_miss,
487 		   st->gc_dst_overflow,
488 		   st->in_hlist_search,
489 		   st->out_hlist_search
490 		);
491 	return 0;
492 }
493 
494 static const struct seq_operations rt_cpu_seq_ops = {
495 	.start  = rt_cpu_seq_start,
496 	.next   = rt_cpu_seq_next,
497 	.stop   = rt_cpu_seq_stop,
498 	.show   = rt_cpu_seq_show,
499 };
500 
501 
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
503 {
504 	return seq_open(file, &rt_cpu_seq_ops);
505 }
506 
507 static const struct file_operations rt_cpu_seq_fops = {
508 	.owner	 = THIS_MODULE,
509 	.open	 = rt_cpu_seq_open,
510 	.read	 = seq_read,
511 	.llseek	 = seq_lseek,
512 	.release = seq_release,
513 };
514 
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
517 			   int length, int *eof, void *data)
518 {
519 	unsigned int i;
520 
521 	if ((offset & 3) || (length & 3))
522 		return -EIO;
523 
524 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
525 		*eof = 1;
526 		return 0;
527 	}
528 
529 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
530 		length = sizeof(struct ip_rt_acct) * 256 - offset;
531 		*eof = 1;
532 	}
533 
534 	offset /= sizeof(u32);
535 
536 	if (length > 0) {
537 		u32 *dst = (u32 *) buffer;
538 
539 		*start = buffer;
540 		memset(dst, 0, length);
541 
542 		for_each_possible_cpu(i) {
543 			unsigned int j;
544 			u32 *src;
545 
546 			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
547 			for (j = 0; j < length/4; j++)
548 				dst[j] += src[j];
549 		}
550 	}
551 	return length;
552 }
553 #endif
554 
555 static int __net_init ip_rt_do_proc_init(struct net *net)
556 {
557 	struct proc_dir_entry *pde;
558 
559 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
560 			&rt_cache_seq_fops);
561 	if (!pde)
562 		goto err1;
563 
564 	pde = proc_create("rt_cache", S_IRUGO,
565 			  net->proc_net_stat, &rt_cpu_seq_fops);
566 	if (!pde)
567 		goto err2;
568 
569 #ifdef CONFIG_NET_CLS_ROUTE
570 	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
571 			ip_rt_acct_read, NULL);
572 	if (!pde)
573 		goto err3;
574 #endif
575 	return 0;
576 
577 #ifdef CONFIG_NET_CLS_ROUTE
578 err3:
579 	remove_proc_entry("rt_cache", net->proc_net_stat);
580 #endif
581 err2:
582 	remove_proc_entry("rt_cache", net->proc_net);
583 err1:
584 	return -ENOMEM;
585 }
586 
587 static void __net_exit ip_rt_do_proc_exit(struct net *net)
588 {
589 	remove_proc_entry("rt_cache", net->proc_net_stat);
590 	remove_proc_entry("rt_cache", net->proc_net);
591 	remove_proc_entry("rt_acct", net->proc_net);
592 }
593 
594 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
595 	.init = ip_rt_do_proc_init,
596 	.exit = ip_rt_do_proc_exit,
597 };
598 
599 static int __init ip_rt_proc_init(void)
600 {
601 	return register_pernet_subsys(&ip_rt_proc_ops);
602 }
603 
604 #else
605 static inline int ip_rt_proc_init(void)
606 {
607 	return 0;
608 }
609 #endif /* CONFIG_PROC_FS */
610 
611 static inline void rt_free(struct rtable *rt)
612 {
613 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
614 }
615 
616 static inline void rt_drop(struct rtable *rt)
617 {
618 	ip_rt_put(rt);
619 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
620 }
621 
622 static inline int rt_fast_clean(struct rtable *rth)
623 {
624 	/* Kill broadcast/multicast entries very aggressively, if they
625 	   collide in hash table with more useful entries */
626 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 		rth->fl.iif && rth->u.dst.rt_next;
628 }
629 
630 static inline int rt_valuable(struct rtable *rth)
631 {
632 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 		rth->u.dst.expires;
634 }
635 
636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
637 {
638 	unsigned long age;
639 	int ret = 0;
640 
641 	if (atomic_read(&rth->u.dst.__refcnt))
642 		goto out;
643 
644 	ret = 1;
645 	if (rth->u.dst.expires &&
646 	    time_after_eq(jiffies, rth->u.dst.expires))
647 		goto out;
648 
649 	age = jiffies - rth->u.dst.lastuse;
650 	ret = 0;
651 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 	    (age <= tmo2 && rt_valuable(rth)))
653 		goto out;
654 	ret = 1;
655 out:	return ret;
656 }
657 
658 /* Bits of score are:
659  * 31: very valuable
660  * 30: not quite useless
661  * 29..0: usage counter
662  */
663 static inline u32 rt_score(struct rtable *rt)
664 {
665 	u32 score = jiffies - rt->u.dst.lastuse;
666 
667 	score = ~score & ~(3<<30);
668 
669 	if (rt_valuable(rt))
670 		score |= (1<<31);
671 
672 	if (!rt->fl.iif ||
673 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 		score |= (1<<30);
675 
676 	return score;
677 }
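
/*
 * Worked example (added for illustration, not in the original source):
 * rt_intern_hash() below uses this score to pick an eviction candidate
 * when a chain grows beyond ip_rt_gc_elasticity; among unreferenced
 * entries, the one with the lowest score goes first.  An input
 * broadcast route last used 100 jiffies ago scores ~100 & ~(3<<30),
 * i.e. a high usage counter with bits 30 and 31 clear.  An output
 * route of the same age additionally gets bit 30, and a redirected or
 * expiring route gets bit 31, so valuable and output routes outlive
 * stale broadcast/multicast clutter in a crowded bucket.
 */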
678 
679 static inline bool rt_caching(const struct net *net)
680 {
681 	return net->ipv4.current_rt_cache_rebuild_count <=
682 		net->ipv4.sysctl_rt_cache_rebuild_count;
683 }
684 
685 static inline bool compare_hash_inputs(const struct flowi *fl1,
686 					const struct flowi *fl2)
687 {
688 	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
689 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
690 		(fl1->iif ^ fl2->iif)) == 0);
691 }
692 
693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
694 {
695 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
696 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
697 		(fl1->mark ^ fl2->mark) |
698 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
699 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
700 		(fl1->oif ^ fl2->oif) |
701 		(fl1->iif ^ fl2->iif)) == 0;
702 }
703 
704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705 {
706 	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
707 }
708 
709 static inline int rt_is_expired(struct rtable *rth)
710 {
711 	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
712 }
713 
714 /*
715  * Perform a full scan of the hash table and free all entries.
716  * Can be called by a softirq or a process.
717  * In the latter case, we want to reschedule if necessary.
718  */
719 static void rt_do_flush(int process_context)
720 {
721 	unsigned int i;
722 	struct rtable *rth, *next;
723 	struct rtable * tail;
724 
725 	for (i = 0; i <= rt_hash_mask; i++) {
726 		if (process_context && need_resched())
727 			cond_resched();
728 		rth = rt_hash_table[i].chain;
729 		if (!rth)
730 			continue;
731 
732 		spin_lock_bh(rt_hash_lock_addr(i));
733 #ifdef CONFIG_NET_NS
734 		{
735 		struct rtable ** prev, * p;
736 
737 		rth = rt_hash_table[i].chain;
738 
739 		/* defer releasing the head of the list until after spin_unlock */
740 		for (tail = rth; tail; tail = tail->u.dst.rt_next)
741 			if (!rt_is_expired(tail))
742 				break;
743 		if (rth != tail)
744 			rt_hash_table[i].chain = tail;
745 
746 		/* call rt_free on entries after the tail requiring flush */
747 		prev = &rt_hash_table[i].chain;
748 		for (p = *prev; p; p = next) {
749 			next = p->u.dst.rt_next;
750 			if (!rt_is_expired(p)) {
751 				prev = &p->u.dst.rt_next;
752 			} else {
753 				*prev = next;
754 				rt_free(p);
755 			}
756 		}
757 		}
758 #else
759 		rth = rt_hash_table[i].chain;
760 		rt_hash_table[i].chain = NULL;
761 		tail = NULL;
762 #endif
763 		spin_unlock_bh(rt_hash_lock_addr(i));
764 
765 		for (; rth != tail; rth = next) {
766 			next = rth->u.dst.rt_next;
767 			rt_free(rth);
768 		}
769 	}
770 }
771 
772 /*
773  * While freeing expired entries, we compute average chain length
774  * and standard deviation, using fixed-point arithmetic.
775  * This gives an estimate of rt_chain_length_max:
776  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
777  * We use 3 bits for the fractional part, and 29 (or 61) bits for the magnitude.
778  */
779 
780 #define FRACT_BITS 3
781 #define ONE (1UL << FRACT_BITS)
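
/*
 * Worked example (illustration only): with FRACT_BITS = 3, a chain
 * counted as 5 entries contributes length = 5 * ONE = 40 to "sum" and
 * 1600 to "sum2" in rt_check_expire() below.  If a scan samples four
 * buckets of lengths 5, 3, 3 and 1 (i.e. 40, 24, 24 and 8 in fixed
 * point), then
 *	avg = (40 + 24 + 24 + 8) / 4 = 24		(3.0 entries)
 *	sd  = int_sqrt(2816 / 4 - 24 * 24) = int_sqrt(128) = 11
 * and rt_chain_length_max = max(ip_rt_gc_elasticity, (24 + 4*11) >> 3)
 * = max(8, 8) = 8 with the default elasticity.
 */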
782 
783 static void rt_check_expire(void)
784 {
785 	static unsigned int rover;
786 	unsigned int i = rover, goal;
787 	struct rtable *rth, *aux, **rthp;
788 	unsigned long samples = 0;
789 	unsigned long sum = 0, sum2 = 0;
790 	u64 mult;
791 
792 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
793 	if (ip_rt_gc_timeout > 1)
794 		do_div(mult, ip_rt_gc_timeout);
795 	goal = (unsigned int)mult;
796 	if (goal > rt_hash_mask)
797 		goal = rt_hash_mask + 1;
798 	for (; goal > 0; goal--) {
799 		unsigned long tmo = ip_rt_gc_timeout;
800 		unsigned long length;
801 
802 		i = (i + 1) & rt_hash_mask;
803 		rthp = &rt_hash_table[i].chain;
804 
805 		if (need_resched())
806 			cond_resched();
807 
808 		samples++;
809 
810 		if (*rthp == NULL)
811 			continue;
812 		length = 0;
813 		spin_lock_bh(rt_hash_lock_addr(i));
814 		while ((rth = *rthp) != NULL) {
815 			prefetch(rth->u.dst.rt_next);
816 			if (rt_is_expired(rth)) {
817 				*rthp = rth->u.dst.rt_next;
818 				rt_free(rth);
819 				continue;
820 			}
821 			if (rth->u.dst.expires) {
822 				/* Entry is expired even if it is in use */
823 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
824 nofree:
825 					tmo >>= 1;
826 					rthp = &rth->u.dst.rt_next;
827 					/*
828 					 * We only count entries on
829 					 * a chain with equal hash inputs once,
830 					 * so that entries for different QoS
831 					 * levels and other non-hash-input
832 					 * attributes don't unfairly skew
833 					 * the length computation.
834 					 */
835 					for (aux = rt_hash_table[i].chain;;) {
836 						if (aux == rth) {
837 							length += ONE;
838 							break;
839 						}
840 						if (compare_hash_inputs(&aux->fl, &rth->fl))
841 							break;
842 						aux = aux->u.dst.rt_next;
843 					}
844 					continue;
845 				}
846 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
847 				goto nofree;
848 
849 			/* Cleanup aged off entries. */
850 			*rthp = rth->u.dst.rt_next;
851 			rt_free(rth);
852 		}
853 		spin_unlock_bh(rt_hash_lock_addr(i));
854 		sum += length;
855 		sum2 += length*length;
856 	}
857 	if (samples) {
858 		unsigned long avg = sum / samples;
859 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
860 		rt_chain_length_max = max_t(unsigned long,
861 					ip_rt_gc_elasticity,
862 					(avg + 4*sd) >> FRACT_BITS);
863 	}
864 	rover = i;
865 }
866 
867 /*
868  * rt_worker_func() is run in process context.
869  * we call rt_check_expire() to scan part of the hash table
870  */
871 static void rt_worker_func(struct work_struct *work)
872 {
873 	rt_check_expire();
874 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
875 }
876 
877 /*
878  * Perturbation of rt_genid by a small quantity [1..256].
879  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
880  * many times (2^24) without repeating a recent rt_genid.
881  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
882  */
883 static void rt_cache_invalidate(struct net *net)
884 {
885 	unsigned char shuffle;
886 
887 	get_random_bytes(&shuffle, sizeof(shuffle));
888 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
889 }
890 
891 /*
892  * delay < 0  : invalidate cache (fast : entries will be deleted later)
893  * delay >= 0 : invalidate & flush cache (can be long)
894  */
895 void rt_cache_flush(struct net *net, int delay)
896 {
897 	rt_cache_invalidate(net);
898 	if (delay >= 0)
899 		rt_do_flush(!in_softirq());
900 }
901 
902 /*
903  * We change rt_genid and let gc do the cleanup
904  */
905 static void rt_secret_rebuild(unsigned long __net)
906 {
907 	struct net *net = (struct net *)__net;
908 	rt_cache_invalidate(net);
909 	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
910 }
911 
912 static void rt_secret_rebuild_oneshot(struct net *net)
913 {
914 	del_timer_sync(&net->ipv4.rt_secret_timer);
915 	rt_cache_invalidate(net);
916 	if (ip_rt_secret_interval) {
917 		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
918 		add_timer(&net->ipv4.rt_secret_timer);
919 	}
920 }
921 
922 static void rt_emergency_hash_rebuild(struct net *net)
923 {
924 	if (net_ratelimit()) {
925 		printk(KERN_WARNING "Route hash chain too long!\n");
926 		printk(KERN_WARNING "Adjust your secret_interval!\n");
927 	}
928 
929 	rt_secret_rebuild_oneshot(net);
930 }
931 
932 /*
933    Short description of GC goals.
934 
935    We want to build an algorithm which keeps the routing cache
936    at an equilibrium point, where the number of aged-off entries
937    is approximately equal to the number of newly generated ones.
938 
939    The current expiration strength is the variable "expire".
940    We try to adjust it dynamically, so that when networking
941    is idle, expire is large enough to keep enough warm entries,
942    and when load increases it shrinks to limit the cache size.
943  */
944 
945 static int rt_garbage_collect(struct dst_ops *ops)
946 {
947 	static unsigned long expire = RT_GC_TIMEOUT;
948 	static unsigned long last_gc;
949 	static int rover;
950 	static int equilibrium;
951 	struct rtable *rth, **rthp;
952 	unsigned long now = jiffies;
953 	int goal;
954 
955 	/*
956 	 * Garbage collection is pretty expensive,
957 	 * so do not run it too frequently.
958 	 */
959 
960 	RT_CACHE_STAT_INC(gc_total);
961 
962 	if (now - last_gc < ip_rt_gc_min_interval &&
963 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
964 		RT_CACHE_STAT_INC(gc_ignored);
965 		goto out;
966 	}
967 
968 	/* Calculate number of entries, which we want to expire now. */
969 	goal = atomic_read(&ipv4_dst_ops.entries) -
970 		(ip_rt_gc_elasticity << rt_hash_log);
971 	if (goal <= 0) {
972 		if (equilibrium < ipv4_dst_ops.gc_thresh)
973 			equilibrium = ipv4_dst_ops.gc_thresh;
974 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
975 		if (goal > 0) {
976 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
977 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
978 		}
979 	} else {
980 		/* We are in dangerous area. Try to reduce cache really
981 		 * aggressively.
982 		 */
983 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
984 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
985 	}
986 
987 	if (now - last_gc >= ip_rt_gc_min_interval)
988 		last_gc = now;
989 
990 	if (goal <= 0) {
991 		equilibrium += goal;
992 		goto work_done;
993 	}
994 
995 	do {
996 		int i, k;
997 
998 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
999 			unsigned long tmo = expire;
1000 
1001 			k = (k + 1) & rt_hash_mask;
1002 			rthp = &rt_hash_table[k].chain;
1003 			spin_lock_bh(rt_hash_lock_addr(k));
1004 			while ((rth = *rthp) != NULL) {
1005 				if (!rt_is_expired(rth) &&
1006 					!rt_may_expire(rth, tmo, expire)) {
1007 					tmo >>= 1;
1008 					rthp = &rth->u.dst.rt_next;
1009 					continue;
1010 				}
1011 				*rthp = rth->u.dst.rt_next;
1012 				rt_free(rth);
1013 				goal--;
1014 			}
1015 			spin_unlock_bh(rt_hash_lock_addr(k));
1016 			if (goal <= 0)
1017 				break;
1018 		}
1019 		rover = k;
1020 
1021 		if (goal <= 0)
1022 			goto work_done;
1023 
1024 		/* Goal is not achieved. We stop the process if:
1025 
1026 		   - expire has been reduced to zero; otherwise, expire is halved.
1027 		   - the table is not full.
1028 		   - we are called from interrupt context.
1029 		   - the jiffies check is just a fallback/debug loop breaker;
1030 		     we will not spin here for a long time in any case.
1031 		 */
1032 
1033 		RT_CACHE_STAT_INC(gc_goal_miss);
1034 
1035 		if (expire == 0)
1036 			break;
1037 
1038 		expire >>= 1;
1039 #if RT_CACHE_DEBUG >= 2
1040 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1041 				atomic_read(&ipv4_dst_ops.entries), goal, i);
1042 #endif
1043 
1044 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1045 			goto out;
1046 	} while (!in_softirq() && time_before_eq(jiffies, now));
1047 
1048 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1049 		goto out;
1050 	if (net_ratelimit())
1051 		printk(KERN_WARNING "dst cache overflow\n");
1052 	RT_CACHE_STAT_INC(gc_dst_overflow);
1053 	return 1;
1054 
1055 work_done:
1056 	expire += ip_rt_gc_min_interval;
1057 	if (expire > ip_rt_gc_timeout ||
1058 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1059 		expire = ip_rt_gc_timeout;
1060 #if RT_CACHE_DEBUG >= 2
1061 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1062 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
1063 #endif
1064 out:	return 0;
1065 }
1066 
1067 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
1068 {
1069 	struct rtable	*rth, **rthp;
1070 	unsigned long	now;
1071 	struct rtable *cand, **candp;
1072 	u32 		min_score;
1073 	int		chain_length;
1074 	int attempts = !in_softirq();
1075 
1076 restart:
1077 	chain_length = 0;
1078 	min_score = ~(u32)0;
1079 	cand = NULL;
1080 	candp = NULL;
1081 	now = jiffies;
1082 
1083 	if (!rt_caching(dev_net(rt->u.dst.dev))) {
1084 		rt_drop(rt);
1085 		return 0;
1086 	}
1087 
1088 	rthp = &rt_hash_table[hash].chain;
1089 
1090 	spin_lock_bh(rt_hash_lock_addr(hash));
1091 	while ((rth = *rthp) != NULL) {
1092 		if (rt_is_expired(rth)) {
1093 			*rthp = rth->u.dst.rt_next;
1094 			rt_free(rth);
1095 			continue;
1096 		}
1097 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1098 			/* Put it first */
1099 			*rthp = rth->u.dst.rt_next;
1100 			/*
1101 			 * Since lookup is lockfree, the deletion
1102 			 * must be visible to another weakly ordered CPU before
1103 			 * the insertion at the start of the hash chain.
1104 			 */
1105 			rcu_assign_pointer(rth->u.dst.rt_next,
1106 					   rt_hash_table[hash].chain);
1107 			/*
1108 			 * Since lookup is lockfree, the update writes
1109 			 * must be ordered for consistency on SMP.
1110 			 */
1111 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1112 
1113 			dst_use(&rth->u.dst, now);
1114 			spin_unlock_bh(rt_hash_lock_addr(hash));
1115 
1116 			rt_drop(rt);
1117 			*rp = rth;
1118 			return 0;
1119 		}
1120 
1121 		if (!atomic_read(&rth->u.dst.__refcnt)) {
1122 			u32 score = rt_score(rth);
1123 
1124 			if (score <= min_score) {
1125 				cand = rth;
1126 				candp = rthp;
1127 				min_score = score;
1128 			}
1129 		}
1130 
1131 		chain_length++;
1132 
1133 		rthp = &rth->u.dst.rt_next;
1134 	}
1135 
1136 	if (cand) {
1137 		/* ip_rt_gc_elasticity used to be the average chain length;
1138 		 * when exceeded, gc becomes really aggressive.
1139 		 *
1140 		 * The second limit is less certain. At the moment it allows
1141 		 * only 2 entries per bucket. We will see.
1142 		 */
1143 		if (chain_length > ip_rt_gc_elasticity) {
1144 			*candp = cand->u.dst.rt_next;
1145 			rt_free(cand);
1146 		}
1147 	} else {
1148 		if (chain_length > rt_chain_length_max) {
1149 			struct net *net = dev_net(rt->u.dst.dev);
1150 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1151 			if (!rt_caching(dev_net(rt->u.dst.dev))) {
1152 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1153 					rt->u.dst.dev->name, num);
1154 			}
1155 			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1156 		}
1157 	}
1158 
1159 	/* Try to bind route to arp only if it is output
1160 	   route or unicast forwarding path.
1161 	 */
1162 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1163 		int err = arp_bind_neighbour(&rt->u.dst);
1164 		if (err) {
1165 			spin_unlock_bh(rt_hash_lock_addr(hash));
1166 
1167 			if (err != -ENOBUFS) {
1168 				rt_drop(rt);
1169 				return err;
1170 			}
1171 
1172 			/* Neighbour tables are full and nothing
1173 			   can be released. Try to shrink the route cache;
1174 			   it most likely holds some neighbour records.
1175 			 */
1176 			if (attempts-- > 0) {
1177 				int saved_elasticity = ip_rt_gc_elasticity;
1178 				int saved_int = ip_rt_gc_min_interval;
1179 				ip_rt_gc_elasticity	= 1;
1180 				ip_rt_gc_min_interval	= 0;
1181 				rt_garbage_collect(&ipv4_dst_ops);
1182 				ip_rt_gc_min_interval	= saved_int;
1183 				ip_rt_gc_elasticity	= saved_elasticity;
1184 				goto restart;
1185 			}
1186 
1187 			if (net_ratelimit())
1188 				printk(KERN_WARNING "Neighbour table overflow.\n");
1189 			rt_drop(rt);
1190 			return -ENOBUFS;
1191 		}
1192 	}
1193 
1194 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1195 
1196 #if RT_CACHE_DEBUG >= 2
1197 	if (rt->u.dst.rt_next) {
1198 		struct rtable *trt;
1199 		printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst);
1200 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1201 			printk(" . %pI4", &trt->rt_dst);
1202 		printk("\n");
1203 	}
1204 #endif
1205 	/*
1206 	 * Since lookup is lockfree, we must make sure
1207 	 * previous writes to rt are committed to memory
1208 	 * before making rt visible to other CPUS.
1209 	 */
1210 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1211 
1212 	spin_unlock_bh(rt_hash_lock_addr(hash));
1213 	*rp = rt;
1214 	return 0;
1215 }
1216 
1217 void rt_bind_peer(struct rtable *rt, int create)
1218 {
1219 	static DEFINE_SPINLOCK(rt_peer_lock);
1220 	struct inet_peer *peer;
1221 
1222 	peer = inet_getpeer(rt->rt_dst, create);
1223 
1224 	spin_lock_bh(&rt_peer_lock);
1225 	if (rt->peer == NULL) {
1226 		rt->peer = peer;
1227 		peer = NULL;
1228 	}
1229 	spin_unlock_bh(&rt_peer_lock);
1230 	if (peer)
1231 		inet_putpeer(peer);
1232 }
1233 
1234 /*
1235  * Peer allocation may fail only in serious out-of-memory conditions.  However
1236  * we can still generate some output.
1237  * Random ID selection looks a bit dangerous because we have no way to
1238  * select an ID that is unique in a reasonable period of time.
1239  * But a broken packet identifier may be better than no packet at all.
1240  */
1241 static void ip_select_fb_ident(struct iphdr *iph)
1242 {
1243 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1244 	static u32 ip_fallback_id;
1245 	u32 salt;
1246 
1247 	spin_lock_bh(&ip_fb_id_lock);
1248 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1249 	iph->id = htons(salt & 0xFFFF);
1250 	ip_fallback_id = salt;
1251 	spin_unlock_bh(&ip_fb_id_lock);
1252 }
1253 
1254 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1255 {
1256 	struct rtable *rt = (struct rtable *) dst;
1257 
1258 	if (rt) {
1259 		if (rt->peer == NULL)
1260 			rt_bind_peer(rt, 1);
1261 
1262 		/* If peer is attached to destination, it is never detached,
1263 		   so we need not grab a lock to dereference it.
1264 		 */
1265 		if (rt->peer) {
1266 			iph->id = htons(inet_getid(rt->peer, more));
1267 			return;
1268 		}
1269 	} else
1270 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1271 		       __builtin_return_address(0));
1272 
1273 	ip_select_fb_ident(iph);
1274 }
1275 
1276 static void rt_del(unsigned hash, struct rtable *rt)
1277 {
1278 	struct rtable **rthp, *aux;
1279 
1280 	rthp = &rt_hash_table[hash].chain;
1281 	spin_lock_bh(rt_hash_lock_addr(hash));
1282 	ip_rt_put(rt);
1283 	while ((aux = *rthp) != NULL) {
1284 		if (aux == rt || rt_is_expired(aux)) {
1285 			*rthp = aux->u.dst.rt_next;
1286 			rt_free(aux);
1287 			continue;
1288 		}
1289 		rthp = &aux->u.dst.rt_next;
1290 	}
1291 	spin_unlock_bh(rt_hash_lock_addr(hash));
1292 }
1293 
1294 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1295 		    __be32 saddr, struct net_device *dev)
1296 {
1297 	int i, k;
1298 	struct in_device *in_dev = in_dev_get(dev);
1299 	struct rtable *rth, **rthp;
1300 	__be32  skeys[2] = { saddr, 0 };
1301 	int  ikeys[2] = { dev->ifindex, 0 };
1302 	struct netevent_redirect netevent;
1303 	struct net *net;
1304 
1305 	if (!in_dev)
1306 		return;
1307 
1308 	net = dev_net(dev);
1309 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1310 	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1311 	    || ipv4_is_zeronet(new_gw))
1312 		goto reject_redirect;
1313 
1314 	if (!rt_caching(net))
1315 		goto reject_redirect;
1316 
1317 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1318 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1319 			goto reject_redirect;
1320 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1321 			goto reject_redirect;
1322 	} else {
1323 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1324 			goto reject_redirect;
1325 	}
1326 
1327 	for (i = 0; i < 2; i++) {
1328 		for (k = 0; k < 2; k++) {
1329 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1330 						rt_genid(net));
1331 
1332 			rthp=&rt_hash_table[hash].chain;
1333 
1334 			rcu_read_lock();
1335 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1336 				struct rtable *rt;
1337 
1338 				if (rth->fl.fl4_dst != daddr ||
1339 				    rth->fl.fl4_src != skeys[i] ||
1340 				    rth->fl.oif != ikeys[k] ||
1341 				    rth->fl.iif != 0 ||
1342 				    rt_is_expired(rth) ||
1343 				    !net_eq(dev_net(rth->u.dst.dev), net)) {
1344 					rthp = &rth->u.dst.rt_next;
1345 					continue;
1346 				}
1347 
1348 				if (rth->rt_dst != daddr ||
1349 				    rth->rt_src != saddr ||
1350 				    rth->u.dst.error ||
1351 				    rth->rt_gateway != old_gw ||
1352 				    rth->u.dst.dev != dev)
1353 					break;
1354 
1355 				dst_hold(&rth->u.dst);
1356 				rcu_read_unlock();
1357 
1358 				rt = dst_alloc(&ipv4_dst_ops);
1359 				if (rt == NULL) {
1360 					ip_rt_put(rth);
1361 					in_dev_put(in_dev);
1362 					return;
1363 				}
1364 
1365 				/* Copy all the information. */
1366 				*rt = *rth;
1367 				rt->u.dst.__use		= 1;
1368 				atomic_set(&rt->u.dst.__refcnt, 1);
1369 				rt->u.dst.child		= NULL;
1370 				if (rt->u.dst.dev)
1371 					dev_hold(rt->u.dst.dev);
1372 				if (rt->idev)
1373 					in_dev_hold(rt->idev);
1374 				rt->u.dst.obsolete	= 0;
1375 				rt->u.dst.lastuse	= jiffies;
1376 				rt->u.dst.path		= &rt->u.dst;
1377 				rt->u.dst.neighbour	= NULL;
1378 				rt->u.dst.hh		= NULL;
1379 #ifdef CONFIG_XFRM
1380 				rt->u.dst.xfrm		= NULL;
1381 #endif
1382 				rt->rt_genid		= rt_genid(net);
1383 				rt->rt_flags		|= RTCF_REDIRECTED;
1384 
1385 				/* Gateway is different ... */
1386 				rt->rt_gateway		= new_gw;
1387 
1388 				/* Redirect received -> path was valid */
1389 				dst_confirm(&rth->u.dst);
1390 
1391 				if (rt->peer)
1392 					atomic_inc(&rt->peer->refcnt);
1393 
1394 				if (arp_bind_neighbour(&rt->u.dst) ||
1395 				    !(rt->u.dst.neighbour->nud_state &
1396 					    NUD_VALID)) {
1397 					if (rt->u.dst.neighbour)
1398 						neigh_event_send(rt->u.dst.neighbour, NULL);
1399 					ip_rt_put(rth);
1400 					rt_drop(rt);
1401 					goto do_next;
1402 				}
1403 
1404 				netevent.old = &rth->u.dst;
1405 				netevent.new = &rt->u.dst;
1406 				call_netevent_notifiers(NETEVENT_REDIRECT,
1407 							&netevent);
1408 
1409 				rt_del(hash, rth);
1410 				if (!rt_intern_hash(hash, rt, &rt))
1411 					ip_rt_put(rt);
1412 				goto do_next;
1413 			}
1414 			rcu_read_unlock();
1415 		do_next:
1416 			;
1417 		}
1418 	}
1419 	in_dev_put(in_dev);
1420 	return;
1421 
1422 reject_redirect:
1423 #ifdef CONFIG_IP_ROUTE_VERBOSE
1424 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1425 		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1426 			"  Advised path = %pI4 -> %pI4\n",
1427 		       &old_gw, dev->name, &new_gw,
1428 		       &saddr, &daddr);
1429 #endif
1430 	in_dev_put(in_dev);
1431 }
1432 
1433 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1434 {
1435 	struct rtable *rt = (struct rtable *)dst;
1436 	struct dst_entry *ret = dst;
1437 
1438 	if (rt) {
1439 		if (dst->obsolete) {
1440 			ip_rt_put(rt);
1441 			ret = NULL;
1442 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1443 			   rt->u.dst.expires) {
1444 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1445 						rt->fl.oif,
1446 						rt_genid(dev_net(dst->dev)));
1447 #if RT_CACHE_DEBUG >= 1
1448 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1449 				&rt->rt_dst, rt->fl.fl4_tos);
1450 #endif
1451 			rt_del(hash, rt);
1452 			ret = NULL;
1453 		}
1454 	}
1455 	return ret;
1456 }
1457 
1458 /*
1459  * Algorithm:
1460  *	1. The first ip_rt_redirect_number redirects are sent
1461  *	   with exponential backoff, then we stop sending them at all,
1462  *	   assuming that the host ignores our redirects.
1463  *	2. If we did not see packets requiring redirects
1464  *	   during ip_rt_redirect_silence, we assume that the host
1465  *	   forgot the redirected route, and we start sending redirects again.
1466  *
1467  * This algorithm is much cheaper and more intelligent than dumb load limiting
1468  * in icmp.c.
1469  *
1470  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1471  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1472  */
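
/*
 * Worked example (illustration only, assuming HZ = 1000 and the default
 * sysctl values defined at the top of this file): ip_rt_redirect_load is
 * HZ/50 = 20 jiffies, so once a redirect has been sent the next one for
 * the same destination is held back for (20 << rate_tokens) jiffies,
 * i.e. roughly 40 ms, 80 ms, 160 ms, ... After ip_rt_redirect_number (9)
 * unheeded redirects we go silent, and we only start again once no
 * redirect-worthy packet has been seen for ip_rt_redirect_silence =
 * (HZ/50) << 10 = 20480 jiffies, about 20 seconds.
 */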
1473 
1474 void ip_rt_send_redirect(struct sk_buff *skb)
1475 {
1476 	struct rtable *rt = skb->rtable;
1477 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1478 
1479 	if (!in_dev)
1480 		return;
1481 
1482 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1483 		goto out;
1484 
1485 	/* No redirected packets during ip_rt_redirect_silence;
1486 	 * reset the algorithm.
1487 	 */
1488 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1489 		rt->u.dst.rate_tokens = 0;
1490 
1491 	/* Too many ignored redirects; do not send anything.
1492 	 * Set u.dst.rate_last to the time of the last seen redirected packet.
1493 	 */
1494 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1495 		rt->u.dst.rate_last = jiffies;
1496 		goto out;
1497 	}
1498 
1499 	/* Check for load limit; set rate_last to the latest sent
1500 	 * redirect.
1501 	 */
1502 	if (rt->u.dst.rate_tokens == 0 ||
1503 	    time_after(jiffies,
1504 		       (rt->u.dst.rate_last +
1505 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1506 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1507 		rt->u.dst.rate_last = jiffies;
1508 		++rt->u.dst.rate_tokens;
1509 #ifdef CONFIG_IP_ROUTE_VERBOSE
1510 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1511 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1512 		    net_ratelimit())
1513 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1514 				&rt->rt_src, rt->rt_iif,
1515 				&rt->rt_dst, &rt->rt_gateway);
1516 #endif
1517 	}
1518 out:
1519 	in_dev_put(in_dev);
1520 }
1521 
1522 static int ip_error(struct sk_buff *skb)
1523 {
1524 	struct rtable *rt = skb->rtable;
1525 	unsigned long now;
1526 	int code;
1527 
1528 	switch (rt->u.dst.error) {
1529 		case EINVAL:
1530 		default:
1531 			goto out;
1532 		case EHOSTUNREACH:
1533 			code = ICMP_HOST_UNREACH;
1534 			break;
1535 		case ENETUNREACH:
1536 			code = ICMP_NET_UNREACH;
1537 			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1538 					IPSTATS_MIB_INNOROUTES);
1539 			break;
1540 		case EACCES:
1541 			code = ICMP_PKT_FILTERED;
1542 			break;
1543 	}
1544 
1545 	now = jiffies;
1546 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1547 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1548 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1549 	rt->u.dst.rate_last = now;
1550 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1551 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1552 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1553 	}
1554 
1555 out:	kfree_skb(skb);
1556 	return 0;
1557 }
1558 
1559 /*
1560  *	The last two values are not from the RFC but
1561  *	are needed for AMPRnet AX.25 paths.
1562  */
1563 
1564 static const unsigned short mtu_plateau[] =
1565 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1566 
1567 static inline unsigned short guess_mtu(unsigned short old_mtu)
1568 {
1569 	int i;
1570 
1571 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1572 		if (old_mtu > mtu_plateau[i])
1573 			return mtu_plateau[i];
1574 	return 68;
1575 }
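
/*
 * Worked example (illustration only): guess_mtu() returns the largest
 * plateau strictly below the old MTU, e.g. guess_mtu(1500) = 1492 and
 * guess_mtu(576) = 296; anything at or below the smallest plateau (128)
 * falls through to the IPv4 minimum of 68.
 */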
1576 
1577 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1578 				 unsigned short new_mtu,
1579 				 struct net_device *dev)
1580 {
1581 	int i, k;
1582 	unsigned short old_mtu = ntohs(iph->tot_len);
1583 	struct rtable *rth;
1584 	int  ikeys[2] = { dev->ifindex, 0 };
1585 	__be32  skeys[2] = { iph->saddr, 0, };
1586 	__be32  daddr = iph->daddr;
1587 	unsigned short est_mtu = 0;
1588 
1589 	if (ipv4_config.no_pmtu_disc)
1590 		return 0;
1591 
1592 	for (k = 0; k < 2; k++) {
1593 		for (i = 0; i < 2; i++) {
1594 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1595 						rt_genid(net));
1596 
1597 			rcu_read_lock();
1598 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1599 			     rth = rcu_dereference(rth->u.dst.rt_next)) {
1600 				unsigned short mtu = new_mtu;
1601 
1602 				if (rth->fl.fl4_dst != daddr ||
1603 				    rth->fl.fl4_src != skeys[i] ||
1604 				    rth->rt_dst != daddr ||
1605 				    rth->rt_src != iph->saddr ||
1606 				    rth->fl.oif != ikeys[k] ||
1607 				    rth->fl.iif != 0 ||
1608 				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1609 				    !net_eq(dev_net(rth->u.dst.dev), net) ||
1610 				    rt_is_expired(rth))
1611 					continue;
1612 
1613 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1614 
1615 					/* BSD 4.2 compatibility hack :-( */
1616 					if (mtu == 0 &&
1617 					    old_mtu >= dst_mtu(&rth->u.dst) &&
1618 					    old_mtu >= 68 + (iph->ihl << 2))
1619 						old_mtu -= iph->ihl << 2;
1620 
1621 					mtu = guess_mtu(old_mtu);
1622 				}
1623 				if (mtu <= dst_mtu(&rth->u.dst)) {
1624 					if (mtu < dst_mtu(&rth->u.dst)) {
1625 						dst_confirm(&rth->u.dst);
1626 						if (mtu < ip_rt_min_pmtu) {
1627 							mtu = ip_rt_min_pmtu;
1628 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1629 								(1 << RTAX_MTU);
1630 						}
1631 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1632 						dst_set_expires(&rth->u.dst,
1633 							ip_rt_mtu_expires);
1634 					}
1635 					est_mtu = mtu;
1636 				}
1637 			}
1638 			rcu_read_unlock();
1639 		}
1640 	}
1641 	return est_mtu ? : new_mtu;
1642 }
1643 
1644 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1645 {
1646 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
1647 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1648 		if (mtu < ip_rt_min_pmtu) {
1649 			mtu = ip_rt_min_pmtu;
1650 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1651 		}
1652 		dst->metrics[RTAX_MTU-1] = mtu;
1653 		dst_set_expires(dst, ip_rt_mtu_expires);
1654 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1655 	}
1656 }
1657 
1658 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1659 {
1660 	return NULL;
1661 }
1662 
1663 static void ipv4_dst_destroy(struct dst_entry *dst)
1664 {
1665 	struct rtable *rt = (struct rtable *) dst;
1666 	struct inet_peer *peer = rt->peer;
1667 	struct in_device *idev = rt->idev;
1668 
1669 	if (peer) {
1670 		rt->peer = NULL;
1671 		inet_putpeer(peer);
1672 	}
1673 
1674 	if (idev) {
1675 		rt->idev = NULL;
1676 		in_dev_put(idev);
1677 	}
1678 }
1679 
1680 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1681 			    int how)
1682 {
1683 	struct rtable *rt = (struct rtable *) dst;
1684 	struct in_device *idev = rt->idev;
1685 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1686 		struct in_device *loopback_idev =
1687 			in_dev_get(dev_net(dev)->loopback_dev);
1688 		if (loopback_idev) {
1689 			rt->idev = loopback_idev;
1690 			in_dev_put(idev);
1691 		}
1692 	}
1693 }
1694 
1695 static void ipv4_link_failure(struct sk_buff *skb)
1696 {
1697 	struct rtable *rt;
1698 
1699 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1700 
1701 	rt = skb->rtable;
1702 	if (rt)
1703 		dst_set_expires(&rt->u.dst, 0);
1704 }
1705 
1706 static int ip_rt_bug(struct sk_buff *skb)
1707 {
1708 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1709 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1710 		skb->dev ? skb->dev->name : "?");
1711 	kfree_skb(skb);
1712 	return 0;
1713 }
1714 
1715 /*
1716    We do not cache the source address of the outgoing interface,
1717    because it is used only by the IP RR, TS and SRR options,
1718    so it is out of the fast path.
1719 
1720    BTW remember: "addr" may be unaligned
1721    in IP options!
1722  */
1723 
1724 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1725 {
1726 	__be32 src;
1727 	struct fib_result res;
1728 
1729 	if (rt->fl.iif == 0)
1730 		src = rt->rt_src;
1731 	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1732 		src = FIB_RES_PREFSRC(res);
1733 		fib_res_put(&res);
1734 	} else
1735 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1736 					RT_SCOPE_UNIVERSE);
1737 	memcpy(addr, &src, 4);
1738 }
1739 
1740 #ifdef CONFIG_NET_CLS_ROUTE
1741 static void set_class_tag(struct rtable *rt, u32 tag)
1742 {
1743 	if (!(rt->u.dst.tclassid & 0xFFFF))
1744 		rt->u.dst.tclassid |= tag & 0xFFFF;
1745 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1746 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1747 }
1748 #endif
1749 
1750 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1751 {
1752 	struct fib_info *fi = res->fi;
1753 
1754 	if (fi) {
1755 		if (FIB_RES_GW(*res) &&
1756 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1757 			rt->rt_gateway = FIB_RES_GW(*res);
1758 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1759 		       sizeof(rt->u.dst.metrics));
1760 		if (fi->fib_mtu == 0) {
1761 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1762 			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1763 			    rt->rt_gateway != rt->rt_dst &&
1764 			    rt->u.dst.dev->mtu > 576)
1765 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1766 		}
1767 #ifdef CONFIG_NET_CLS_ROUTE
1768 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1769 #endif
1770 	} else
1771 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1772 
1773 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1774 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1775 	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1776 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1777 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1778 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1779 				       ip_rt_min_advmss);
1780 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1781 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1782 
1783 #ifdef CONFIG_NET_CLS_ROUTE
1784 #ifdef CONFIG_IP_MULTIPLE_TABLES
1785 	set_class_tag(rt, fib_rules_tclass(res));
1786 #endif
1787 	set_class_tag(rt, itag);
1788 #endif
1789 	rt->rt_type = res->type;
1790 }
1791 
1792 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1793 				u8 tos, struct net_device *dev, int our)
1794 {
1795 	unsigned hash;
1796 	struct rtable *rth;
1797 	__be32 spec_dst;
1798 	struct in_device *in_dev = in_dev_get(dev);
1799 	u32 itag = 0;
1800 
1801 	/* Primary sanity checks. */
1802 
1803 	if (in_dev == NULL)
1804 		return -EINVAL;
1805 
1806 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1807 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1808 		goto e_inval;
1809 
1810 	if (ipv4_is_zeronet(saddr)) {
1811 		if (!ipv4_is_local_multicast(daddr))
1812 			goto e_inval;
1813 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1814 	} else if (fib_validate_source(saddr, 0, tos, 0,
1815 					dev, &spec_dst, &itag) < 0)
1816 		goto e_inval;
1817 
1818 	rth = dst_alloc(&ipv4_dst_ops);
1819 	if (!rth)
1820 		goto e_nobufs;
1821 
1822 	rth->u.dst.output= ip_rt_bug;
1823 
1824 	atomic_set(&rth->u.dst.__refcnt, 1);
1825 	rth->u.dst.flags= DST_HOST;
1826 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1827 		rth->u.dst.flags |= DST_NOPOLICY;
1828 	rth->fl.fl4_dst	= daddr;
1829 	rth->rt_dst	= daddr;
1830 	rth->fl.fl4_tos	= tos;
1831 	rth->fl.mark    = skb->mark;
1832 	rth->fl.fl4_src	= saddr;
1833 	rth->rt_src	= saddr;
1834 #ifdef CONFIG_NET_CLS_ROUTE
1835 	rth->u.dst.tclassid = itag;
1836 #endif
1837 	rth->rt_iif	=
1838 	rth->fl.iif	= dev->ifindex;
1839 	rth->u.dst.dev	= init_net.loopback_dev;
1840 	dev_hold(rth->u.dst.dev);
1841 	rth->idev	= in_dev_get(rth->u.dst.dev);
1842 	rth->fl.oif	= 0;
1843 	rth->rt_gateway	= daddr;
1844 	rth->rt_spec_dst= spec_dst;
1845 	rth->rt_genid	= rt_genid(dev_net(dev));
1846 	rth->rt_flags	= RTCF_MULTICAST;
1847 	rth->rt_type	= RTN_MULTICAST;
1848 	if (our) {
1849 		rth->u.dst.input= ip_local_deliver;
1850 		rth->rt_flags |= RTCF_LOCAL;
1851 	}
1852 
1853 #ifdef CONFIG_IP_MROUTE
1854 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1855 		rth->u.dst.input = ip_mr_input;
1856 #endif
1857 	RT_CACHE_STAT_INC(in_slow_mc);
1858 
1859 	in_dev_put(in_dev);
1860 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1861 	return rt_intern_hash(hash, rth, &skb->rtable);
1862 
1863 e_nobufs:
1864 	in_dev_put(in_dev);
1865 	return -ENOBUFS;
1866 
1867 e_inval:
1868 	in_dev_put(in_dev);
1869 	return -EINVAL;
1870 }
1871 
1872 
1873 static void ip_handle_martian_source(struct net_device *dev,
1874 				     struct in_device *in_dev,
1875 				     struct sk_buff *skb,
1876 				     __be32 daddr,
1877 				     __be32 saddr)
1878 {
1879 	RT_CACHE_STAT_INC(in_martian_src);
1880 #ifdef CONFIG_IP_ROUTE_VERBOSE
1881 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1882 		/*
1883 		 *	RFC1812 recommendation: if the source is martian,
1884 		 *	the only hint is the MAC header.
1885 		 */
1886 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1887 			&daddr, &saddr, dev->name);
1888 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1889 			int i;
1890 			const unsigned char *p = skb_mac_header(skb);
1891 			printk(KERN_WARNING "ll header: ");
1892 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1893 				printk("%02x", *p);
1894 				if (i < (dev->hard_header_len - 1))
1895 					printk(":");
1896 			}
1897 			printk("\n");
1898 		}
1899 	}
1900 #endif
1901 }
1902 
1903 static int __mkroute_input(struct sk_buff *skb,
1904 			   struct fib_result *res,
1905 			   struct in_device *in_dev,
1906 			   __be32 daddr, __be32 saddr, u32 tos,
1907 			   struct rtable **result)
1908 {
1909 
1910 	struct rtable *rth;
1911 	int err;
1912 	struct in_device *out_dev;
1913 	unsigned flags = 0;
1914 	__be32 spec_dst;
1915 	u32 itag;
1916 
1917 	/* get a working reference to the output device */
1918 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1919 	if (out_dev == NULL) {
1920 		if (net_ratelimit())
1921 			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1922 			       "Please report.\n");
1923 		return -EINVAL;
1924 	}
1925 
1926 
1927 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1928 				  in_dev->dev, &spec_dst, &itag);
1929 	if (err < 0) {
1930 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1931 					 saddr);
1932 
1933 		err = -EINVAL;
1934 		goto cleanup;
1935 	}
1936 
1937 	if (err)
1938 		flags |= RTCF_DIRECTSRC;
1939 
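	/*
	 * If the packet would leave on the same interface it arrived on and
	 * the source is on-link (or the medium is shared), the sender could
	 * have reached the next hop directly: mark the route with
	 * RTCF_DOREDIRECT so the forwarding path may emit an ICMP redirect.
	 */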
1940 	if (out_dev == in_dev && err &&
1941 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1942 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1943 		flags |= RTCF_DOREDIRECT;
1944 
1945 	if (skb->protocol != htons(ETH_P_IP)) {
1946 		/* Not IP (i.e. ARP). Do not create a route if it is
1947 		 * invalid for proxy ARP. DNAT routes are always valid.
1948 		 */
1949 		if (out_dev == in_dev) {
1950 			err = -EINVAL;
1951 			goto cleanup;
1952 		}
1953 	}
1954 
1955 
1956 	rth = dst_alloc(&ipv4_dst_ops);
1957 	if (!rth) {
1958 		err = -ENOBUFS;
1959 		goto cleanup;
1960 	}
1961 
1962 	atomic_set(&rth->u.dst.__refcnt, 1);
1963 	rth->u.dst.flags= DST_HOST;
1964 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1965 		rth->u.dst.flags |= DST_NOPOLICY;
1966 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1967 		rth->u.dst.flags |= DST_NOXFRM;
1968 	rth->fl.fl4_dst	= daddr;
1969 	rth->rt_dst	= daddr;
1970 	rth->fl.fl4_tos	= tos;
1971 	rth->fl.mark    = skb->mark;
1972 	rth->fl.fl4_src	= saddr;
1973 	rth->rt_src	= saddr;
1974 	rth->rt_gateway	= daddr;
1975 	rth->rt_iif 	=
1976 		rth->fl.iif	= in_dev->dev->ifindex;
1977 	rth->u.dst.dev	= (out_dev)->dev;
1978 	dev_hold(rth->u.dst.dev);
1979 	rth->idev	= in_dev_get(rth->u.dst.dev);
1980 	rth->fl.oif 	= 0;
1981 	rth->rt_spec_dst= spec_dst;
1982 
1983 	rth->u.dst.input = ip_forward;
1984 	rth->u.dst.output = ip_output;
1985 	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
1986 
1987 	rt_set_nexthop(rth, res, itag);
1988 
1989 	rth->rt_flags = flags;
1990 
1991 	*result = rth;
1992 	err = 0;
1993  cleanup:
1994 	/* release the working reference to the output device */
1995 	in_dev_put(out_dev);
1996 	return err;
1997 }
1998 
1999 static int ip_mkroute_input(struct sk_buff *skb,
2000 			    struct fib_result *res,
2001 			    const struct flowi *fl,
2002 			    struct in_device *in_dev,
2003 			    __be32 daddr, __be32 saddr, u32 tos)
2004 {
2005 	struct rtable* rth = NULL;
2006 	int err;
2007 	unsigned hash;
2008 
2009 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2010 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2011 		fib_select_multipath(fl, res);
2012 #endif
2013 
2014 	/* create a routing cache entry */
2015 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2016 	if (err)
2017 		return err;
2018 
2019 	/* put it into the cache */
2020 	hash = rt_hash(daddr, saddr, fl->iif,
2021 		       rt_genid(dev_net(rth->u.dst.dev)));
2022 	return rt_intern_hash(hash, rth, &skb->rtable);
2023 }
2024 
2025 /*
2026  *	NOTE. We drop all packets that have a local source
2027  *	address, because every properly looped-back packet
2028  *	must already have the correct destination attached by the output routine.
2029  *
2030  *	This approach solves two big problems:
2031  *	1. Non-simplex devices are handled properly.
2032  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2033  */
2034 
2035 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2036 			       u8 tos, struct net_device *dev)
2037 {
2038 	struct fib_result res;
2039 	struct in_device *in_dev = in_dev_get(dev);
2040 	struct flowi fl = { .nl_u = { .ip4_u =
2041 				      { .daddr = daddr,
2042 					.saddr = saddr,
2043 					.tos = tos,
2044 					.scope = RT_SCOPE_UNIVERSE,
2045 				      } },
2046 			    .mark = skb->mark,
2047 			    .iif = dev->ifindex };
2048 	unsigned	flags = 0;
2049 	u32		itag = 0;
2050 	struct rtable * rth;
2051 	unsigned	hash;
2052 	__be32		spec_dst;
2053 	int		err = -EINVAL;
2054 	int		free_res = 0;
2055 	struct net    * net = dev_net(dev);
2056 
2057 	/* IP on this device is disabled. */
2058 
2059 	if (!in_dev)
2060 		goto out;
2061 
2062 	/* Check for the weirdest martians, which cannot be detected
2063 	   by fib_lookup.
2064 	 */
2065 
2066 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2067 	    ipv4_is_loopback(saddr))
2068 		goto martian_source;
2069 
2070 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2071 		goto brd_input;
2072 
2073 	/* Accept zero addresses only for limited broadcast;
2074 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2075 	 */
2076 	if (ipv4_is_zeronet(saddr))
2077 		goto martian_source;
2078 
2079 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2080 	    ipv4_is_loopback(daddr))
2081 		goto martian_destination;
2082 
2083 	/*
2084 	 *	Now we are ready to route packet.
2085 	 */
2086 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
2087 		if (!IN_DEV_FORWARD(in_dev))
2088 			goto e_hostunreach;
2089 		goto no_route;
2090 	}
2091 	free_res = 1;
2092 
2093 	RT_CACHE_STAT_INC(in_slow_tot);
2094 
2095 	if (res.type == RTN_BROADCAST)
2096 		goto brd_input;
2097 
2098 	if (res.type == RTN_LOCAL) {
2099 		int result;
2100 		result = fib_validate_source(saddr, daddr, tos,
2101 					     net->loopback_dev->ifindex,
2102 					     dev, &spec_dst, &itag);
2103 		if (result < 0)
2104 			goto martian_source;
2105 		if (result)
2106 			flags |= RTCF_DIRECTSRC;
2107 		spec_dst = daddr;
2108 		goto local_input;
2109 	}
2110 
2111 	if (!IN_DEV_FORWARD(in_dev))
2112 		goto e_hostunreach;
2113 	if (res.type != RTN_UNICAST)
2114 		goto martian_destination;
2115 
2116 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2117 done:
2118 	in_dev_put(in_dev);
2119 	if (free_res)
2120 		fib_res_put(&res);
2121 out:	return err;
2122 
2123 brd_input:
2124 	if (skb->protocol != htons(ETH_P_IP))
2125 		goto e_inval;
2126 
2127 	if (ipv4_is_zeronet(saddr))
2128 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2129 	else {
2130 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2131 					  &itag);
2132 		if (err < 0)
2133 			goto martian_source;
2134 		if (err)
2135 			flags |= RTCF_DIRECTSRC;
2136 	}
2137 	flags |= RTCF_BROADCAST;
2138 	res.type = RTN_BROADCAST;
2139 	RT_CACHE_STAT_INC(in_brd);
2140 
2141 local_input:
2142 	rth = dst_alloc(&ipv4_dst_ops);
2143 	if (!rth)
2144 		goto e_nobufs;
2145 
2146 	rth->u.dst.output= ip_rt_bug;
2147 	rth->rt_genid = rt_genid(net);
2148 
2149 	atomic_set(&rth->u.dst.__refcnt, 1);
2150 	rth->u.dst.flags= DST_HOST;
2151 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2152 		rth->u.dst.flags |= DST_NOPOLICY;
2153 	rth->fl.fl4_dst	= daddr;
2154 	rth->rt_dst	= daddr;
2155 	rth->fl.fl4_tos	= tos;
2156 	rth->fl.mark    = skb->mark;
2157 	rth->fl.fl4_src	= saddr;
2158 	rth->rt_src	= saddr;
2159 #ifdef CONFIG_NET_CLS_ROUTE
2160 	rth->u.dst.tclassid = itag;
2161 #endif
2162 	rth->rt_iif	=
2163 	rth->fl.iif	= dev->ifindex;
2164 	rth->u.dst.dev	= net->loopback_dev;
2165 	dev_hold(rth->u.dst.dev);
2166 	rth->idev	= in_dev_get(rth->u.dst.dev);
2167 	rth->rt_gateway	= daddr;
2168 	rth->rt_spec_dst= spec_dst;
2169 	rth->u.dst.input= ip_local_deliver;
2170 	rth->rt_flags 	= flags|RTCF_LOCAL;
2171 	if (res.type == RTN_UNREACHABLE) {
2172 		rth->u.dst.input= ip_error;
2173 		rth->u.dst.error= -err;
2174 		rth->rt_flags 	&= ~RTCF_LOCAL;
2175 	}
2176 	rth->rt_type	= res.type;
2177 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2178 	err = rt_intern_hash(hash, rth, &skb->rtable);
2179 	goto done;
2180 
2181 no_route:
2182 	RT_CACHE_STAT_INC(in_no_route);
2183 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2184 	res.type = RTN_UNREACHABLE;
2185 	if (err == -ESRCH)
2186 		err = -ENETUNREACH;
2187 	goto local_input;
2188 
2189 	/*
2190 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2191 	 */
2192 martian_destination:
2193 	RT_CACHE_STAT_INC(in_martian_dst);
2194 #ifdef CONFIG_IP_ROUTE_VERBOSE
2195 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2196 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2197 			&daddr, &saddr, dev->name);
2198 #endif
2199 
2200 e_hostunreach:
2201 	err = -EHOSTUNREACH;
2202 	goto done;
2203 
2204 e_inval:
2205 	err = -EINVAL;
2206 	goto done;
2207 
2208 e_nobufs:
2209 	err = -ENOBUFS;
2210 	goto done;
2211 
2212 martian_source:
2213 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2214 	goto e_inval;
2215 }
2216 
2217 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2218 		   u8 tos, struct net_device *dev)
2219 {
2220 	struct rtable * rth;
2221 	unsigned	hash;
2222 	int iif = dev->ifindex;
2223 	struct net *net;
2224 
2225 	net = dev_net(dev);
2226 
2227 	if (!rt_caching(net))
2228 		goto skip_cache;
2229 
2230 	tos &= IPTOS_RT_MASK;
2231 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2232 
2233 	rcu_read_lock();
2234 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2235 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
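		/*
		 * Branchless key comparison: OR-ing the XOR of each field is
		 * zero iff every field matches, and the bare rth->fl.oif term
		 * additionally requires oif == 0 (always true for input
		 * routes).  Roughly equivalent to:
		 *
		 *	rth->fl.fl4_dst == daddr && rth->fl.fl4_src == saddr &&
		 *	rth->fl.iif == iif && rth->fl.oif == 0 &&
		 *	rth->fl.fl4_tos == tos
		 */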
2236 		if (((rth->fl.fl4_dst ^ daddr) |
2237 		     (rth->fl.fl4_src ^ saddr) |
2238 		     (rth->fl.iif ^ iif) |
2239 		     rth->fl.oif |
2240 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2241 		    rth->fl.mark == skb->mark &&
2242 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2243 		    !rt_is_expired(rth)) {
2244 			dst_use(&rth->u.dst, jiffies);
2245 			RT_CACHE_STAT_INC(in_hit);
2246 			rcu_read_unlock();
2247 			skb->rtable = rth;
2248 			return 0;
2249 		}
2250 		RT_CACHE_STAT_INC(in_hlist_search);
2251 	}
2252 	rcu_read_unlock();
2253 
2254 skip_cache:
2255 	/* Multicast recognition logic has been moved from the route cache
2256 	   to here. The problem was that too many Ethernet cards have
2257 	   broken/missing hardware multicast filters :-( As a result, a host
2258 	   on a multicast network acquires a lot of useless route cache
2259 	   entries, e.g. from SDR messages from all over the world. Now we
2260 	   try to get rid of them. Really, provided the software IP multicast
2261 	   filter is organized reasonably (at least, hashed), this does not
2262 	   result in a slowdown compared with route cache reject entries.
2263 	   Note that multicast routers are not affected, because a route
2264 	   cache entry is created eventually.
2265 	 */
2266 	if (ipv4_is_multicast(daddr)) {
2267 		struct in_device *in_dev;
2268 
2269 		rcu_read_lock();
2270 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2271 			int our = ip_check_mc(in_dev, daddr, saddr,
2272 				ip_hdr(skb)->protocol);
2273 			if (our
2274 #ifdef CONFIG_IP_MROUTE
2275 			    || (!ipv4_is_local_multicast(daddr) &&
2276 				IN_DEV_MFORWARD(in_dev))
2277 #endif
2278 			    ) {
2279 				rcu_read_unlock();
2280 				return ip_route_input_mc(skb, daddr, saddr,
2281 							 tos, dev, our);
2282 			}
2283 		}
2284 		rcu_read_unlock();
2285 		return -EINVAL;
2286 	}
2287 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2288 }
2289 
2290 static int __mkroute_output(struct rtable **result,
2291 			    struct fib_result *res,
2292 			    const struct flowi *fl,
2293 			    const struct flowi *oldflp,
2294 			    struct net_device *dev_out,
2295 			    unsigned flags)
2296 {
2297 	struct rtable *rth;
2298 	struct in_device *in_dev;
2299 	u32 tos = RT_FL_TOS(oldflp);
2300 	int err = 0;
2301 
2302 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2303 		return -EINVAL;
2304 
2305 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2306 		res->type = RTN_BROADCAST;
2307 	else if (ipv4_is_multicast(fl->fl4_dst))
2308 		res->type = RTN_MULTICAST;
2309 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2310 		return -EINVAL;
2311 
2312 	if (dev_out->flags & IFF_LOOPBACK)
2313 		flags |= RTCF_LOCAL;
2314 
2315 	/* get a working reference to the inet device */
2316 	in_dev = in_dev_get(dev_out);
2317 	if (!in_dev)
2318 		return -EINVAL;
2319 
2320 	if (res->type == RTN_BROADCAST) {
2321 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2322 		if (res->fi) {
2323 			fib_info_put(res->fi);
2324 			res->fi = NULL;
2325 		}
2326 	} else if (res->type == RTN_MULTICAST) {
2327 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2328 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2329 				 oldflp->proto))
2330 			flags &= ~RTCF_LOCAL;
2331 		/* If a multicast route does not exist, use the
2332 		   default one, but do not gateway in this case.
2333 		   Yes, it is a hack.
2334 		 */
2335 		if (res->fi && res->prefixlen < 4) {
2336 			fib_info_put(res->fi);
2337 			res->fi = NULL;
2338 		}
2339 	}
2340 
2341 
2342 	rth = dst_alloc(&ipv4_dst_ops);
2343 	if (!rth) {
2344 		err = -ENOBUFS;
2345 		goto cleanup;
2346 	}
2347 
2348 	atomic_set(&rth->u.dst.__refcnt, 1);
2349 	rth->u.dst.flags= DST_HOST;
2350 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2351 		rth->u.dst.flags |= DST_NOXFRM;
2352 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2353 		rth->u.dst.flags |= DST_NOPOLICY;
2354 
2355 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2356 	rth->fl.fl4_tos	= tos;
2357 	rth->fl.fl4_src	= oldflp->fl4_src;
2358 	rth->fl.oif	= oldflp->oif;
2359 	rth->fl.mark    = oldflp->mark;
2360 	rth->rt_dst	= fl->fl4_dst;
2361 	rth->rt_src	= fl->fl4_src;
2362 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2363 	/* get references to the devices that are to be held by the routing
2364 	   cache entry */
2365 	rth->u.dst.dev	= dev_out;
2366 	dev_hold(dev_out);
2367 	rth->idev	= in_dev_get(dev_out);
2368 	rth->rt_gateway = fl->fl4_dst;
2369 	rth->rt_spec_dst= fl->fl4_src;
2370 
2371 	rth->u.dst.output=ip_output;
2372 	rth->rt_genid = rt_genid(dev_net(dev_out));
2373 
2374 	RT_CACHE_STAT_INC(out_slow_tot);
2375 
2376 	if (flags & RTCF_LOCAL) {
2377 		rth->u.dst.input = ip_local_deliver;
2378 		rth->rt_spec_dst = fl->fl4_dst;
2379 	}
2380 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2381 		rth->rt_spec_dst = fl->fl4_src;
2382 		if (flags & RTCF_LOCAL &&
2383 		    !(dev_out->flags & IFF_LOOPBACK)) {
2384 			rth->u.dst.output = ip_mc_output;
2385 			RT_CACHE_STAT_INC(out_slow_mc);
2386 		}
2387 #ifdef CONFIG_IP_MROUTE
2388 		if (res->type == RTN_MULTICAST) {
2389 			if (IN_DEV_MFORWARD(in_dev) &&
2390 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2391 				rth->u.dst.input = ip_mr_input;
2392 				rth->u.dst.output = ip_mc_output;
2393 			}
2394 		}
2395 #endif
2396 	}
2397 
2398 	rt_set_nexthop(rth, res, 0);
2399 
2400 	rth->rt_flags = flags;
2401 
2402 	*result = rth;
2403  cleanup:
2404 	/* release the working reference to the inet device */
2405 	in_dev_put(in_dev);
2406 
2407 	return err;
2408 }
2409 
2410 static int ip_mkroute_output(struct rtable **rp,
2411 			     struct fib_result *res,
2412 			     const struct flowi *fl,
2413 			     const struct flowi *oldflp,
2414 			     struct net_device *dev_out,
2415 			     unsigned flags)
2416 {
2417 	struct rtable *rth = NULL;
2418 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2419 	unsigned hash;
2420 	if (err == 0) {
2421 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2422 			       rt_genid(dev_net(dev_out)));
2423 		err = rt_intern_hash(hash, rth, rp);
2424 	}
2425 
2426 	return err;
2427 }
2428 
2429 /*
2430  * Major route resolver routine.
2431  */
2432 
2433 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2434 				const struct flowi *oldflp)
2435 {
2436 	u32 tos	= RT_FL_TOS(oldflp);
2437 	struct flowi fl = { .nl_u = { .ip4_u =
2438 				      { .daddr = oldflp->fl4_dst,
2439 					.saddr = oldflp->fl4_src,
2440 					.tos = tos & IPTOS_RT_MASK,
2441 					.scope = ((tos & RTO_ONLINK) ?
2442 						  RT_SCOPE_LINK :
2443 						  RT_SCOPE_UNIVERSE),
2444 				      } },
2445 			    .mark = oldflp->mark,
2446 			    .iif = net->loopback_dev->ifindex,
2447 			    .oif = oldflp->oif };
2448 	struct fib_result res;
2449 	unsigned flags = 0;
2450 	struct net_device *dev_out = NULL;
2451 	int free_res = 0;
2452 	int err;
2453 
2454 
2455 	res.fi		= NULL;
2456 #ifdef CONFIG_IP_MULTIPLE_TABLES
2457 	res.r		= NULL;
2458 #endif
2459 
2460 	if (oldflp->fl4_src) {
2461 		err = -EINVAL;
2462 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2463 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2464 		    ipv4_is_zeronet(oldflp->fl4_src))
2465 			goto out;
2466 
2467 		/* I removed the check for oif == dev_out->oif here.
2468 		   It was wrong for two reasons:
2469 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2470 		      is assigned to multiple interfaces.
2471 		   2. Moreover, we are allowed to send packets with the saddr
2472 		      of another iface. --ANK
2473 		 */
2474 
2475 		if (oldflp->oif == 0
2476 		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2477 			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2478 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2479 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2480 			if (dev_out == NULL)
2481 				goto out;
2482 
2483 			/* Special hack: the user can direct multicasts
2484 			   and limited broadcast via the necessary interface
2485 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2486 			   This hack is not just for fun, it allows
2487 			   vic, vat and friends to work.
2488 			   They bind a socket to loopback, set the ttl to zero
2489 			   and expect that it will work.
2490 			   From the viewpoint of the routing cache they are broken,
2491 			   because we are not allowed to build a multicast path
2492 			   with a loopback source addr (look, the routing cache
2493 			   cannot know that the ttl is zero, so the packet
2494 			   will not leave this host and the route is valid).
2495 			   Luckily, this hack is a good workaround.
2496 			 */
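			/* Illustration (hypothetical userspace sketch): such
			   an application roughly does

				unsigned char ttl = 0;
				setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL,
					   &ttl, sizeof(ttl));
				sendto(fd, buf, len, 0,
				       (struct sockaddr *)&grp, sizeof(grp));

			   on a socket bound to a loopback address, and relies
			   on the code below deriving the output interface
			   from the source address.
			 */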
2497 
2498 			fl.oif = dev_out->ifindex;
2499 			goto make_route;
2500 		}
2501 
2502 		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2503 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2504 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2505 			if (dev_out == NULL)
2506 				goto out;
2507 			dev_put(dev_out);
2508 			dev_out = NULL;
2509 		}
2510 	}
2511 
2512 
2513 	if (oldflp->oif) {
2514 		dev_out = dev_get_by_index(net, oldflp->oif);
2515 		err = -ENODEV;
2516 		if (dev_out == NULL)
2517 			goto out;
2518 
2519 		/* RACE: Check return value of inet_select_addr instead. */
2520 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2521 			dev_put(dev_out);
2522 			goto out;	/* Wrong error code */
2523 		}
2524 
2525 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2526 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2527 			if (!fl.fl4_src)
2528 				fl.fl4_src = inet_select_addr(dev_out, 0,
2529 							      RT_SCOPE_LINK);
2530 			goto make_route;
2531 		}
2532 		if (!fl.fl4_src) {
2533 			if (ipv4_is_multicast(oldflp->fl4_dst))
2534 				fl.fl4_src = inet_select_addr(dev_out, 0,
2535 							      fl.fl4_scope);
2536 			else if (!oldflp->fl4_dst)
2537 				fl.fl4_src = inet_select_addr(dev_out, 0,
2538 							      RT_SCOPE_HOST);
2539 		}
2540 	}
2541 
2542 	if (!fl.fl4_dst) {
2543 		fl.fl4_dst = fl.fl4_src;
2544 		if (!fl.fl4_dst)
2545 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2546 		if (dev_out)
2547 			dev_put(dev_out);
2548 		dev_out = net->loopback_dev;
2549 		dev_hold(dev_out);
2550 		fl.oif = net->loopback_dev->ifindex;
2551 		res.type = RTN_LOCAL;
2552 		flags |= RTCF_LOCAL;
2553 		goto make_route;
2554 	}
2555 
2556 	if (fib_lookup(net, &fl, &res)) {
2557 		res.fi = NULL;
2558 		if (oldflp->oif) {
2559 			/* Apparently, the routing tables are wrong. Assume
2560 			   that the destination is on-link.
2561 
2562 			   WHY? DW.
2563 			   Because we are allowed to send to an iface
2564 			   even if it has NO routes and NO assigned
2565 			   addresses. When oif is specified, the routing
2566 			   tables are looked up with only one purpose:
2567 			   to catch whether the destination is gatewayed rather
2568 			   than direct. Moreover, if MSG_DONTROUTE is set,
2569 			   we send the packet, ignoring both routing tables
2570 			   and ifaddr state. --ANK
2571 
2572 
2573 			   We could do this even if oif is unknown,
2574 			   likely IPv6, but we do not.
2575 			 */
2576 
2577 			if (fl.fl4_src == 0)
2578 				fl.fl4_src = inet_select_addr(dev_out, 0,
2579 							      RT_SCOPE_LINK);
2580 			res.type = RTN_UNICAST;
2581 			goto make_route;
2582 		}
2583 		if (dev_out)
2584 			dev_put(dev_out);
2585 		err = -ENETUNREACH;
2586 		goto out;
2587 	}
2588 	free_res = 1;
2589 
2590 	if (res.type == RTN_LOCAL) {
2591 		if (!fl.fl4_src)
2592 			fl.fl4_src = fl.fl4_dst;
2593 		if (dev_out)
2594 			dev_put(dev_out);
2595 		dev_out = net->loopback_dev;
2596 		dev_hold(dev_out);
2597 		fl.oif = dev_out->ifindex;
2598 		if (res.fi)
2599 			fib_info_put(res.fi);
2600 		res.fi = NULL;
2601 		flags |= RTCF_LOCAL;
2602 		goto make_route;
2603 	}
2604 
2605 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2606 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2607 		fib_select_multipath(&fl, &res);
2608 	else
2609 #endif
2610 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2611 		fib_select_default(net, &fl, &res);
2612 
2613 	if (!fl.fl4_src)
2614 		fl.fl4_src = FIB_RES_PREFSRC(res);
2615 
2616 	if (dev_out)
2617 		dev_put(dev_out);
2618 	dev_out = FIB_RES_DEV(res);
2619 	dev_hold(dev_out);
2620 	fl.oif = dev_out->ifindex;
2621 
2622 
2623 make_route:
2624 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2625 
2626 
2627 	if (free_res)
2628 		fib_res_put(&res);
2629 	if (dev_out)
2630 		dev_put(dev_out);
2631 out:	return err;
2632 }
2633 
2634 int __ip_route_output_key(struct net *net, struct rtable **rp,
2635 			  const struct flowi *flp)
2636 {
2637 	unsigned hash;
2638 	struct rtable *rth;
2639 
2640 	if (!rt_caching(net))
2641 		goto slow_output;
2642 
2643 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2644 
2645 	rcu_read_lock_bh();
2646 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2647 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2648 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2649 		    rth->fl.fl4_src == flp->fl4_src &&
2650 		    rth->fl.iif == 0 &&
2651 		    rth->fl.oif == flp->oif &&
2652 		    rth->fl.mark == flp->mark &&
2653 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2654 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2655 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2656 		    !rt_is_expired(rth)) {
2657 			dst_use(&rth->u.dst, jiffies);
2658 			RT_CACHE_STAT_INC(out_hit);
2659 			rcu_read_unlock_bh();
2660 			*rp = rth;
2661 			return 0;
2662 		}
2663 		RT_CACHE_STAT_INC(out_hlist_search);
2664 	}
2665 	rcu_read_unlock_bh();
2666 
2667 slow_output:
2668 	return ip_route_output_slow(net, rp, flp);
2669 }
2670 
2671 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2672 
2673 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2674 {
2675 }
2676 
2677 static struct dst_ops ipv4_dst_blackhole_ops = {
2678 	.family			=	AF_INET,
2679 	.protocol		=	cpu_to_be16(ETH_P_IP),
2680 	.destroy		=	ipv4_dst_destroy,
2681 	.check			=	ipv4_dst_check,
2682 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2683 	.entries		=	ATOMIC_INIT(0),
2684 };
2685 
2686 
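/*
 * Build a "blackhole" clone of an existing route: the flow key, metrics and
 * device references are copied, but both input and output are set to
 * dst_discard, so packets using it are silently dropped.  It is used from
 * ip_route_output_flow() below when __xfrm_lookup() returns -EREMOTE
 * (typically while an IPsec SA is still being negotiated), so the caller
 * gets a usable dst instead of an error.
 */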
2687 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2688 {
2689 	struct rtable *ort = *rp;
2690 	struct rtable *rt = (struct rtable *)
2691 		dst_alloc(&ipv4_dst_blackhole_ops);
2692 
2693 	if (rt) {
2694 		struct dst_entry *new = &rt->u.dst;
2695 
2696 		atomic_set(&new->__refcnt, 1);
2697 		new->__use = 1;
2698 		new->input = dst_discard;
2699 		new->output = dst_discard;
2700 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2701 
2702 		new->dev = ort->u.dst.dev;
2703 		if (new->dev)
2704 			dev_hold(new->dev);
2705 
2706 		rt->fl = ort->fl;
2707 
2708 		rt->idev = ort->idev;
2709 		if (rt->idev)
2710 			in_dev_hold(rt->idev);
2711 		rt->rt_genid = rt_genid(net);
2712 		rt->rt_flags = ort->rt_flags;
2713 		rt->rt_type = ort->rt_type;
2714 		rt->rt_dst = ort->rt_dst;
2715 		rt->rt_src = ort->rt_src;
2716 		rt->rt_iif = ort->rt_iif;
2717 		rt->rt_gateway = ort->rt_gateway;
2718 		rt->rt_spec_dst = ort->rt_spec_dst;
2719 		rt->peer = ort->peer;
2720 		if (rt->peer)
2721 			atomic_inc(&rt->peer->refcnt);
2722 
2723 		dst_free(new);
2724 	}
2725 
2726 	dst_release(&(*rp)->u.dst);
2727 	*rp = rt;
2728 	return (rt ? 0 : -ENOMEM);
2729 }
2730 
2731 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2732 			 struct sock *sk, int flags)
2733 {
2734 	int err;
2735 
2736 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2737 		return err;
2738 
2739 	if (flp->proto) {
2740 		if (!flp->fl4_src)
2741 			flp->fl4_src = (*rp)->rt_src;
2742 		if (!flp->fl4_dst)
2743 			flp->fl4_dst = (*rp)->rt_dst;
2744 		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2745 				    flags ? XFRM_LOOKUP_WAIT : 0);
2746 		if (err == -EREMOTE)
2747 			err = ipv4_dst_blackhole(net, rp, flp);
2748 
2749 		return err;
2750 	}
2751 
2752 	return 0;
2753 }
2754 
2755 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2756 
2757 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2758 {
2759 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2760 }
2761 
2762 static int rt_fill_info(struct net *net,
2763 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2764 			int nowait, unsigned int flags)
2765 {
2766 	struct rtable *rt = skb->rtable;
2767 	struct rtmsg *r;
2768 	struct nlmsghdr *nlh;
2769 	long expires;
2770 	u32 id = 0, ts = 0, tsage = 0, error;
2771 
2772 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2773 	if (nlh == NULL)
2774 		return -EMSGSIZE;
2775 
2776 	r = nlmsg_data(nlh);
2777 	r->rtm_family	 = AF_INET;
2778 	r->rtm_dst_len	= 32;
2779 	r->rtm_src_len	= 0;
2780 	r->rtm_tos	= rt->fl.fl4_tos;
2781 	r->rtm_table	= RT_TABLE_MAIN;
2782 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2783 	r->rtm_type	= rt->rt_type;
2784 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2785 	r->rtm_protocol = RTPROT_UNSPEC;
2786 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2787 	if (rt->rt_flags & RTCF_NOTIFY)
2788 		r->rtm_flags |= RTM_F_NOTIFY;
2789 
2790 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2791 
2792 	if (rt->fl.fl4_src) {
2793 		r->rtm_src_len = 32;
2794 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2795 	}
2796 	if (rt->u.dst.dev)
2797 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2798 #ifdef CONFIG_NET_CLS_ROUTE
2799 	if (rt->u.dst.tclassid)
2800 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2801 #endif
2802 	if (rt->fl.iif)
2803 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2804 	else if (rt->rt_src != rt->fl.fl4_src)
2805 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2806 
2807 	if (rt->rt_dst != rt->rt_gateway)
2808 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2809 
2810 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2811 		goto nla_put_failure;
2812 
2813 	error = rt->u.dst.error;
2814 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2815 	if (rt->peer) {
2816 		id = rt->peer->ip_id_count;
2817 		if (rt->peer->tcp_ts_stamp) {
2818 			ts = rt->peer->tcp_ts;
2819 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2820 		}
2821 	}
2822 
2823 	if (rt->fl.iif) {
2824 #ifdef CONFIG_IP_MROUTE
2825 		__be32 dst = rt->rt_dst;
2826 
2827 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2828 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2829 			int err = ipmr_get_route(net, skb, r, nowait);
2830 			if (err <= 0) {
2831 				if (!nowait) {
2832 					if (err == 0)
2833 						return 0;
2834 					goto nla_put_failure;
2835 				} else {
2836 					if (err == -EMSGSIZE)
2837 						goto nla_put_failure;
2838 					error = err;
2839 				}
2840 			}
2841 		} else
2842 #endif
2843 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2844 	}
2845 
2846 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2847 			       expires, error) < 0)
2848 		goto nla_put_failure;
2849 
2850 	return nlmsg_end(skb, nlh);
2851 
2852 nla_put_failure:
2853 	nlmsg_cancel(skb, nlh);
2854 	return -EMSGSIZE;
2855 }
2856 
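/*
 * RTM_GETROUTE handler (this is what e.g. "ip route get" issues); it is
 * hooked up via rtnl_register() in ip_rt_init() below.  The request is
 * resolved through the same input/output paths a real packet would take,
 * and the answer is a single RTM_NEWROUTE message built by rt_fill_info().
 */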
2857 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2858 {
2859 	struct net *net = sock_net(in_skb->sk);
2860 	struct rtmsg *rtm;
2861 	struct nlattr *tb[RTA_MAX+1];
2862 	struct rtable *rt = NULL;
2863 	__be32 dst = 0;
2864 	__be32 src = 0;
2865 	u32 iif;
2866 	int err;
2867 	struct sk_buff *skb;
2868 
2869 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2870 	if (err < 0)
2871 		goto errout;
2872 
2873 	rtm = nlmsg_data(nlh);
2874 
2875 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2876 	if (skb == NULL) {
2877 		err = -ENOBUFS;
2878 		goto errout;
2879 	}
2880 
2881 	/* Reserve room for dummy headers; this skb can pass
2882 	   through a good chunk of the routing engine.
2883 	 */
2884 	skb_reset_mac_header(skb);
2885 	skb_reset_network_header(skb);
2886 
2887 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2888 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2889 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2890 
2891 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2892 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2893 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2894 
2895 	if (iif) {
2896 		struct net_device *dev;
2897 
2898 		dev = __dev_get_by_index(net, iif);
2899 		if (dev == NULL) {
2900 			err = -ENODEV;
2901 			goto errout_free;
2902 		}
2903 
2904 		skb->protocol	= htons(ETH_P_IP);
2905 		skb->dev	= dev;
2906 		local_bh_disable();
2907 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2908 		local_bh_enable();
2909 
2910 		rt = skb->rtable;
2911 		if (err == 0 && rt->u.dst.error)
2912 			err = -rt->u.dst.error;
2913 	} else {
2914 		struct flowi fl = {
2915 			.nl_u = {
2916 				.ip4_u = {
2917 					.daddr = dst,
2918 					.saddr = src,
2919 					.tos = rtm->rtm_tos,
2920 				},
2921 			},
2922 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2923 		};
2924 		err = ip_route_output_key(net, &rt, &fl);
2925 	}
2926 
2927 	if (err)
2928 		goto errout_free;
2929 
2930 	skb->rtable = rt;
2931 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2932 		rt->rt_flags |= RTCF_NOTIFY;
2933 
2934 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2935 			   RTM_NEWROUTE, 0, 0);
2936 	if (err <= 0)
2937 		goto errout_free;
2938 
2939 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2940 errout:
2941 	return err;
2942 
2943 errout_free:
2944 	kfree_skb(skb);
2945 	goto errout;
2946 }
2947 
2948 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2949 {
2950 	struct rtable *rt;
2951 	int h, s_h;
2952 	int idx, s_idx;
2953 	struct net *net;
2954 
2955 	net = sock_net(skb->sk);
2956 
2957 	s_h = cb->args[0];
2958 	if (s_h < 0)
2959 		s_h = 0;
2960 	s_idx = idx = cb->args[1];
2961 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2962 		if (!rt_hash_table[h].chain)
2963 			continue;
2964 		rcu_read_lock_bh();
2965 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2966 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2967 			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2968 				continue;
2969 			if (rt_is_expired(rt))
2970 				continue;
2971 			skb->dst = dst_clone(&rt->u.dst);
2972 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
2973 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2974 					 1, NLM_F_MULTI) <= 0) {
2975 				dst_release(xchg(&skb->dst, NULL));
2976 				rcu_read_unlock_bh();
2977 				goto done;
2978 			}
2979 			dst_release(xchg(&skb->dst, NULL));
2980 		}
2981 		rcu_read_unlock_bh();
2982 	}
2983 
2984 done:
2985 	cb->args[0] = h;
2986 	cb->args[1] = idx;
2987 	return skb->len;
2988 }
2989 
2990 void ip_rt_multicast_event(struct in_device *in_dev)
2991 {
2992 	rt_cache_flush(dev_net(in_dev->dev), 0);
2993 }
2994 
2995 #ifdef CONFIG_SYSCTL
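/*
 * net.ipv4.route.flush: a write-only sysctl (mode 0200 in the table below)
 * that flushes the routing cache; the written integer is passed to
 * rt_cache_flush() as the flush delay.  Typical use from userspace
 * (illustrative):
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */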
2996 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2997 					struct file *filp, void __user *buffer,
2998 					size_t *lenp, loff_t *ppos)
2999 {
3000 	if (write) {
3001 		int flush_delay;
3002 		ctl_table ctl;
3003 		struct net *net;
3004 
3005 		memcpy(&ctl, __ctl, sizeof(ctl));
3006 		ctl.data = &flush_delay;
3007 		proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
3008 
3009 		net = (struct net *)__ctl->extra1;
3010 		rt_cache_flush(net, flush_delay);
3011 		return 0;
3012 	}
3013 
3014 	return -EINVAL;
3015 }
3016 
3017 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3018 						void __user *oldval,
3019 						size_t __user *oldlenp,
3020 						void __user *newval,
3021 						size_t newlen)
3022 {
3023 	int delay;
3024 	struct net *net;
3025 	if (newlen != sizeof(int))
3026 		return -EINVAL;
3027 	if (get_user(delay, (int __user *)newval))
3028 		return -EFAULT;
3029 	net = (struct net *)table->extra1;
3030 	rt_cache_flush(net, delay);
3031 	return 0;
3032 }
3033 
3034 static void rt_secret_reschedule(int old)
3035 {
3036 	struct net *net;
3037 	int new = ip_rt_secret_interval;
3038 	int diff = new - old;
3039 
3040 	if (!diff)
3041 		return;
3042 
3043 	rtnl_lock();
3044 	for_each_net(net) {
3045 		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3046 
3047 		if (!new)
3048 			continue;
3049 
3050 		if (deleted) {
3051 			long time = net->ipv4.rt_secret_timer.expires - jiffies;
3052 
3053 			if (time <= 0 || (time += diff) <= 0)
3054 				time = 0;
3055 
3056 			net->ipv4.rt_secret_timer.expires = time;
3057 		} else
3058 			net->ipv4.rt_secret_timer.expires = new;
3059 
3060 		net->ipv4.rt_secret_timer.expires += jiffies;
3061 		add_timer(&net->ipv4.rt_secret_timer);
3062 	}
3063 	rtnl_unlock();
3064 }
3065 
3066 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3067 					  struct file *filp,
3068 					  void __user *buffer, size_t *lenp,
3069 					  loff_t *ppos)
3070 {
3071 	int old = ip_rt_secret_interval;
3072 	int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
3073 
3074 	rt_secret_reschedule(old);
3075 
3076 	return ret;
3077 }
3078 
3079 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3080 						   void __user *oldval,
3081 						   size_t __user *oldlenp,
3082 						   void __user *newval,
3083 						   size_t newlen)
3084 {
3085 	int old = ip_rt_secret_interval;
3086 	int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3087 
3088 	rt_secret_reschedule(old);
3089 
3090 	return ret;
3091 }
3092 
3093 static ctl_table ipv4_route_table[] = {
3094 	{
3095 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
3096 		.procname	= "gc_thresh",
3097 		.data		= &ipv4_dst_ops.gc_thresh,
3098 		.maxlen		= sizeof(int),
3099 		.mode		= 0644,
3100 		.proc_handler	= proc_dointvec,
3101 	},
3102 	{
3103 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
3104 		.procname	= "max_size",
3105 		.data		= &ip_rt_max_size,
3106 		.maxlen		= sizeof(int),
3107 		.mode		= 0644,
3108 		.proc_handler	= proc_dointvec,
3109 	},
3110 	{
3111 		/*  Deprecated. Use gc_min_interval_ms */
3112 
3113 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3114 		.procname	= "gc_min_interval",
3115 		.data		= &ip_rt_gc_min_interval,
3116 		.maxlen		= sizeof(int),
3117 		.mode		= 0644,
3118 		.proc_handler	= proc_dointvec_jiffies,
3119 		.strategy	= sysctl_jiffies,
3120 	},
3121 	{
3122 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3123 		.procname	= "gc_min_interval_ms",
3124 		.data		= &ip_rt_gc_min_interval,
3125 		.maxlen		= sizeof(int),
3126 		.mode		= 0644,
3127 		.proc_handler	= proc_dointvec_ms_jiffies,
3128 		.strategy	= sysctl_ms_jiffies,
3129 	},
3130 	{
3131 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
3132 		.procname	= "gc_timeout",
3133 		.data		= &ip_rt_gc_timeout,
3134 		.maxlen		= sizeof(int),
3135 		.mode		= 0644,
3136 		.proc_handler	= proc_dointvec_jiffies,
3137 		.strategy	= sysctl_jiffies,
3138 	},
3139 	{
3140 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
3141 		.procname	= "gc_interval",
3142 		.data		= &ip_rt_gc_interval,
3143 		.maxlen		= sizeof(int),
3144 		.mode		= 0644,
3145 		.proc_handler	= proc_dointvec_jiffies,
3146 		.strategy	= sysctl_jiffies,
3147 	},
3148 	{
3149 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
3150 		.procname	= "redirect_load",
3151 		.data		= &ip_rt_redirect_load,
3152 		.maxlen		= sizeof(int),
3153 		.mode		= 0644,
3154 		.proc_handler	= proc_dointvec,
3155 	},
3156 	{
3157 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
3158 		.procname	= "redirect_number",
3159 		.data		= &ip_rt_redirect_number,
3160 		.maxlen		= sizeof(int),
3161 		.mode		= 0644,
3162 		.proc_handler	= proc_dointvec,
3163 	},
3164 	{
3165 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
3166 		.procname	= "redirect_silence",
3167 		.data		= &ip_rt_redirect_silence,
3168 		.maxlen		= sizeof(int),
3169 		.mode		= 0644,
3170 		.proc_handler	= proc_dointvec,
3171 	},
3172 	{
3173 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
3174 		.procname	= "error_cost",
3175 		.data		= &ip_rt_error_cost,
3176 		.maxlen		= sizeof(int),
3177 		.mode		= 0644,
3178 		.proc_handler	= proc_dointvec,
3179 	},
3180 	{
3181 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3182 		.procname	= "error_burst",
3183 		.data		= &ip_rt_error_burst,
3184 		.maxlen		= sizeof(int),
3185 		.mode		= 0644,
3186 		.proc_handler	= proc_dointvec,
3187 	},
3188 	{
3189 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3190 		.procname	= "gc_elasticity",
3191 		.data		= &ip_rt_gc_elasticity,
3192 		.maxlen		= sizeof(int),
3193 		.mode		= 0644,
3194 		.proc_handler	= proc_dointvec,
3195 	},
3196 	{
3197 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3198 		.procname	= "mtu_expires",
3199 		.data		= &ip_rt_mtu_expires,
3200 		.maxlen		= sizeof(int),
3201 		.mode		= 0644,
3202 		.proc_handler	= proc_dointvec_jiffies,
3203 		.strategy	= sysctl_jiffies,
3204 	},
3205 	{
3206 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3207 		.procname	= "min_pmtu",
3208 		.data		= &ip_rt_min_pmtu,
3209 		.maxlen		= sizeof(int),
3210 		.mode		= 0644,
3211 		.proc_handler	= proc_dointvec,
3212 	},
3213 	{
3214 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3215 		.procname	= "min_adv_mss",
3216 		.data		= &ip_rt_min_advmss,
3217 		.maxlen		= sizeof(int),
3218 		.mode		= 0644,
3219 		.proc_handler	= proc_dointvec,
3220 	},
3221 	{
3222 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3223 		.procname	= "secret_interval",
3224 		.data		= &ip_rt_secret_interval,
3225 		.maxlen		= sizeof(int),
3226 		.mode		= 0644,
3227 		.proc_handler	= ipv4_sysctl_rt_secret_interval,
3228 		.strategy	= ipv4_sysctl_rt_secret_interval_strategy,
3229 	},
3230 	{ .ctl_name = 0 }
3231 };
3232 
3233 static struct ctl_table empty[1];
3234 
3235 static struct ctl_table ipv4_skeleton[] =
3236 {
3237 	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3238 	  .mode = 0555, .child = ipv4_route_table},
3239 	{ .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3240 	  .mode = 0555, .child = empty},
3241 	{ }
3242 };
3243 
3244 static __net_initdata struct ctl_path ipv4_path[] = {
3245 	{ .procname = "net", .ctl_name = CTL_NET, },
3246 	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
3247 	{ },
3248 };
3249 
3250 static struct ctl_table ipv4_route_flush_table[] = {
3251 	{
3252 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
3253 		.procname	= "flush",
3254 		.maxlen		= sizeof(int),
3255 		.mode		= 0200,
3256 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3257 		.strategy	= ipv4_sysctl_rtcache_flush_strategy,
3258 	},
3259 	{ .ctl_name = 0 },
3260 };
3261 
3262 static __net_initdata struct ctl_path ipv4_route_path[] = {
3263 	{ .procname = "net", .ctl_name = CTL_NET, },
3264 	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
3265 	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3266 	{ },
3267 };
3268 
3269 static __net_init int sysctl_route_net_init(struct net *net)
3270 {
3271 	struct ctl_table *tbl;
3272 
3273 	tbl = ipv4_route_flush_table;
3274 	if (net != &init_net) {
3275 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3276 		if (tbl == NULL)
3277 			goto err_dup;
3278 	}
3279 	tbl[0].extra1 = net;
3280 
3281 	net->ipv4.route_hdr =
3282 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3283 	if (net->ipv4.route_hdr == NULL)
3284 		goto err_reg;
3285 	return 0;
3286 
3287 err_reg:
3288 	if (tbl != ipv4_route_flush_table)
3289 		kfree(tbl);
3290 err_dup:
3291 	return -ENOMEM;
3292 }
3293 
3294 static __net_exit void sysctl_route_net_exit(struct net *net)
3295 {
3296 	struct ctl_table *tbl;
3297 
3298 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3299 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3300 	BUG_ON(tbl == ipv4_route_flush_table);
3301 	kfree(tbl);
3302 }
3303 
3304 static __net_initdata struct pernet_operations sysctl_route_ops = {
3305 	.init = sysctl_route_net_init,
3306 	.exit = sysctl_route_net_exit,
3307 };
3308 #endif
3309 
3310 
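/*
 * Per-namespace invalidation state: every cached route is stamped with
 * rt_genid at creation time, and rt_is_expired() compares that stamp with
 * the current generation.  Changing the generation therefore invalidates
 * the whole cache lazily, without walking the hash table; the secret
 * timer set up below does this periodically when ip_rt_secret_interval
 * is non-zero.
 */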
3311 static __net_init int rt_secret_timer_init(struct net *net)
3312 {
3313 	atomic_set(&net->ipv4.rt_genid,
3314 			(int) ((num_physpages ^ (num_physpages>>8)) ^
3315 			(jiffies ^ (jiffies >> 7))));
3316 
3317 	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3318 	net->ipv4.rt_secret_timer.data = (unsigned long)net;
3319 	init_timer_deferrable(&net->ipv4.rt_secret_timer);
3320 
3321 	if (ip_rt_secret_interval) {
3322 		net->ipv4.rt_secret_timer.expires =
3323 			jiffies + net_random() % ip_rt_secret_interval +
3324 			ip_rt_secret_interval;
3325 		add_timer(&net->ipv4.rt_secret_timer);
3326 	}
3327 	return 0;
3328 }
3329 
3330 static __net_exit void rt_secret_timer_exit(struct net *net)
3331 {
3332 	del_timer_sync(&net->ipv4.rt_secret_timer);
3333 }
3334 
3335 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3336 	.init = rt_secret_timer_init,
3337 	.exit = rt_secret_timer_exit,
3338 };
3339 
3340 
3341 #ifdef CONFIG_NET_CLS_ROUTE
3342 struct ip_rt_acct *ip_rt_acct __read_mostly;
3343 #endif /* CONFIG_NET_CLS_ROUTE */
3344 
3345 static __initdata unsigned long rhash_entries;
3346 static int __init set_rhash_entries(char *str)
3347 {
3348 	if (!str)
3349 		return 0;
3350 	rhash_entries = simple_strtoul(str, &str, 0);
3351 	return 1;
3352 }
3353 __setup("rhash_entries=", set_rhash_entries);
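/*
 * "rhash_entries=" forces the route cache hash table size from the kernel
 * command line instead of auto-sizing it from available memory, e.g.
 * booting with "rhash_entries=65536" (illustrative).  The value is handed
 * to alloc_large_system_hash() in ip_rt_init() below.
 */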
3354 
3355 int __init ip_rt_init(void)
3356 {
3357 	int rc = 0;
3358 
3359 #ifdef CONFIG_NET_CLS_ROUTE
3360 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3361 	if (!ip_rt_acct)
3362 		panic("IP: failed to allocate ip_rt_acct\n");
3363 #endif
3364 
3365 	ipv4_dst_ops.kmem_cachep =
3366 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3367 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3368 
3369 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3370 
3371 	rt_hash_table = (struct rt_hash_bucket *)
3372 		alloc_large_system_hash("IP route cache",
3373 					sizeof(struct rt_hash_bucket),
3374 					rhash_entries,
3375 					(num_physpages >= 128 * 1024) ?
3376 					15 : 17,
3377 					0,
3378 					&rt_hash_log,
3379 					&rt_hash_mask,
3380 					rhash_entries ? 0 : 512 * 1024);
3381 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3382 	rt_hash_lock_init();
3383 
3384 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3385 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
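	/*
	 * Worked example of the sizing above: with 2^17 buckets
	 * (rt_hash_mask + 1 == 131072), the dst GC threshold equals the
	 * bucket count and the hard cap on cached routes is
	 * 131072 * 16 == 2097152 entries.
	 */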
3386 
3387 	devinet_init();
3388 	ip_fib_init();
3389 
3390 	/* All the timers started at system startup tend
3391 	   to synchronize. Perturb them a bit.
3392 	 */
3393 	schedule_delayed_work(&expires_work,
3394 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3395 
3396 	if (register_pernet_subsys(&rt_secret_timer_ops))
3397 		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3398 
3399 	if (ip_rt_proc_init())
3400 		printk(KERN_ERR "Unable to create route proc files\n");
3401 #ifdef CONFIG_XFRM
3402 	xfrm_init();
3403 	xfrm4_init();
3404 #endif
3405 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3406 
3407 #ifdef CONFIG_SYSCTL
3408 	register_pernet_subsys(&sysctl_route_ops);
3409 #endif
3410 	return rc;
3411 }
3412 
3413 #ifdef CONFIG_SYSCTL
3414 /*
3415  * We really need to sanitize the damn ipv4 init order, then all
3416  * this nonsense will go away.
3417  */
3418 void __init ip_static_sysctl_init(void)
3419 {
3420 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3421 }
3422 #endif
3423 
3424 EXPORT_SYMBOL(__ip_select_ident);
3425 EXPORT_SYMBOL(ip_route_input);
3426 EXPORT_SYMBOL(ip_route_output_key);
3427