xref: /openbmc/linux/net/ipv4/route.c (revision 5a0e3ad6)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 
115 #define IP_MAX_MTU	0xFFF0
116 
117 #define RT_GC_TIMEOUT (300*HZ)
118 
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
123 static int ip_rt_redirect_number __read_mostly	= 9;
124 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly	= HZ;
127 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly	= 8;
129 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly	= 256;
132 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
133 static int rt_chain_length_max __read_mostly	= 20;
134 
135 static struct delayed_work expires_work;
136 static unsigned long expires_ljiffies;
137 
138 /*
139  *	Interface to generic destination cache.
140  */
141 
142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
143 static void		 ipv4_dst_destroy(struct dst_entry *dst);
144 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
145 					 struct net_device *dev, int how);
146 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
147 static void		 ipv4_link_failure(struct sk_buff *skb);
148 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
149 static int rt_garbage_collect(struct dst_ops *ops);
150 
151 
152 static struct dst_ops ipv4_dst_ops = {
153 	.family =		AF_INET,
154 	.protocol =		cpu_to_be16(ETH_P_IP),
155 	.gc =			rt_garbage_collect,
156 	.check =		ipv4_dst_check,
157 	.destroy =		ipv4_dst_destroy,
158 	.ifdown =		ipv4_dst_ifdown,
159 	.negative_advice =	ipv4_negative_advice,
160 	.link_failure =		ipv4_link_failure,
161 	.update_pmtu =		ip_rt_update_pmtu,
162 	.local_out =		__ip_local_out,
163 	.entries =		ATOMIC_INIT(0),
164 };
165 
166 #define ECN_OR_COST(class)	TC_PRIO_##class
167 
168 const __u8 ip_tos2prio[16] = {
169 	TC_PRIO_BESTEFFORT,
170 	ECN_OR_COST(FILLER),
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(BESTEFFORT),
173 	TC_PRIO_BULK,
174 	ECN_OR_COST(BULK),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_INTERACTIVE,
178 	ECN_OR_COST(INTERACTIVE),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE_BULK,
182 	ECN_OR_COST(INTERACTIVE_BULK),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK)
185 };
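/*
 * For illustration: this table is indexed by the TOS bits shifted right by
 * one (see rt_tos2priority() in <net/route.h>).  A packet with IPTOS_LOWDELAY
 * (0x10) yields index 0x10 >> 1 = 8, i.e. TC_PRIO_INTERACTIVE, while the
 * default TOS 0x00 maps to index 0, i.e. TC_PRIO_BESTEFFORT.
 */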
186 
187 
188 /*
189  * Route cache.
190  */
191 
192 /* The locking scheme is rather straightforward:
193  *
194  * 1) Read-Copy Update protects the buckets of the central route hash.
195  * 2) Only writers remove entries, and they hold the lock
196  *    as they look at rtable reference counts.
197  * 3) Only readers acquire references to rtable entries;
198  *    they do so with atomic increments and with the
199  *    lock held.
200  */
201 
202 struct rt_hash_bucket {
203 	struct rtable	*chain;
204 };
205 
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 	defined(CONFIG_PROVE_LOCKING)
208 /*
209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210  * The size of this table is a power of two and depends on the number of CPUs.
211  * (With lockdep, spinlock_t is quite big, so keep the table size down there.)
212  */
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ	256
215 #else
216 # if NR_CPUS >= 32
217 #  define RT_HASH_LOCK_SZ	4096
218 # elif NR_CPUS >= 16
219 #  define RT_HASH_LOCK_SZ	2048
220 # elif NR_CPUS >= 8
221 #  define RT_HASH_LOCK_SZ	1024
222 # elif NR_CPUS >= 4
223 #  define RT_HASH_LOCK_SZ	512
224 # else
225 #  define RT_HASH_LOCK_SZ	256
226 # endif
227 #endif
228 
229 static spinlock_t	*rt_hash_locks;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
231 
232 static __init void rt_hash_lock_init(void)
233 {
234 	int i;
235 
236 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 			GFP_KERNEL);
238 	if (!rt_hash_locks)
239 		panic("IP: failed to allocate rt_hash_locks\n");
240 
241 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242 		spin_lock_init(&rt_hash_locks[i]);
243 }
244 #else
245 # define rt_hash_lock_addr(slot) NULL
246 
247 static inline void rt_hash_lock_init(void)
248 {
249 }
250 #endif
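/*
 * A quick sketch of the bucket-to-lock mapping above, with illustrative
 * numbers only: with NR_CPUS >= 8 and lockdep disabled, RT_HASH_LOCK_SZ is
 * 1024, so hash bucket 5000 shares spinlock 5000 & 1023 = 904 with every
 * other bucket whose index has the same low 10 bits.
 */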
251 
252 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
253 static unsigned			rt_hash_mask __read_mostly;
254 static unsigned int		rt_hash_log  __read_mostly;
255 
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 	(__raw_get_cpu_var(rt_cache_stat).field++)
259 
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 		int genid)
262 {
263 	return jhash_3words((__force u32)(__be32)(daddr),
264 			    (__force u32)(__be32)(saddr),
265 			    idx, genid)
266 		& rt_hash_mask;
267 }
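/*
 * For illustration: rt_hash() mixes daddr, saddr and the interface index with
 * jhash_3words(), seeded by the per-namespace genid, and then masks the result
 * down to the table size.  With rt_hash_log == 17, for instance, rt_hash_mask
 * is 2^17 - 1 = 131071, so the return value is always a valid bucket index.
 */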
268 
269 static inline int rt_genid(struct net *net)
270 {
271 	return atomic_read(&net->ipv4.rt_genid);
272 }
273 
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 	struct seq_net_private p;
277 	int bucket;
278 	int genid;
279 };
280 
281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 {
283 	struct rt_cache_iter_state *st = seq->private;
284 	struct rtable *r = NULL;
285 
286 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 		if (!rt_hash_table[st->bucket].chain)
288 			continue;
289 		rcu_read_lock_bh();
290 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
291 		while (r) {
292 			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 			    r->rt_genid == st->genid)
294 				return r;
295 			r = rcu_dereference_bh(r->u.dst.rt_next);
296 		}
297 		rcu_read_unlock_bh();
298 	}
299 	return r;
300 }
301 
302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
303 					  struct rtable *r)
304 {
305 	struct rt_cache_iter_state *st = seq->private;
306 
307 	r = r->u.dst.rt_next;
308 	while (!r) {
309 		rcu_read_unlock_bh();
310 		do {
311 			if (--st->bucket < 0)
312 				return NULL;
313 		} while (!rt_hash_table[st->bucket].chain);
314 		rcu_read_lock_bh();
315 		r = rt_hash_table[st->bucket].chain;
316 	}
317 	return rcu_dereference_bh(r);
318 }
319 
320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
321 					struct rtable *r)
322 {
323 	struct rt_cache_iter_state *st = seq->private;
324 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326 			continue;
327 		if (r->rt_genid == st->genid)
328 			break;
329 	}
330 	return r;
331 }
332 
333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
334 {
335 	struct rtable *r = rt_cache_get_first(seq);
336 
337 	if (r)
338 		while (pos && (r = rt_cache_get_next(seq, r)))
339 			--pos;
340 	return pos ? NULL : r;
341 }
342 
343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344 {
345 	struct rt_cache_iter_state *st = seq->private;
346 	if (*pos)
347 		return rt_cache_get_idx(seq, *pos - 1);
348 	st->genid = rt_genid(seq_file_net(seq));
349 	return SEQ_START_TOKEN;
350 }
351 
352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353 {
354 	struct rtable *r;
355 
356 	if (v == SEQ_START_TOKEN)
357 		r = rt_cache_get_first(seq);
358 	else
359 		r = rt_cache_get_next(seq, v);
360 	++*pos;
361 	return r;
362 }
363 
364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365 {
366 	if (v && v != SEQ_START_TOKEN)
367 		rcu_read_unlock_bh();
368 }
369 
370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
371 {
372 	if (v == SEQ_START_TOKEN)
373 		seq_printf(seq, "%-127s\n",
374 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 			   "HHUptod\tSpecDst");
377 	else {
378 		struct rtable *r = v;
379 		int len;
380 
381 		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 			r->u.dst.dev ? r->u.dst.dev->name : "*",
384 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 			dst_metric(&r->u.dst, RTAX_WINDOW),
390 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 			r->fl.fl4_tos,
393 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 				       dev_queue_xmit) : 0,
396 			r->rt_spec_dst, &len);
397 
398 		seq_printf(seq, "%*s\n", 127 - len, "");
399 	}
400 	return 0;
401 }
402 
403 static const struct seq_operations rt_cache_seq_ops = {
404 	.start  = rt_cache_seq_start,
405 	.next   = rt_cache_seq_next,
406 	.stop   = rt_cache_seq_stop,
407 	.show   = rt_cache_seq_show,
408 };
409 
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
411 {
412 	return seq_open_net(inode, file, &rt_cache_seq_ops,
413 			sizeof(struct rt_cache_iter_state));
414 }
415 
416 static const struct file_operations rt_cache_seq_fops = {
417 	.owner	 = THIS_MODULE,
418 	.open	 = rt_cache_seq_open,
419 	.read	 = seq_read,
420 	.llseek	 = seq_lseek,
421 	.release = seq_release_net,
422 };
423 
424 
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
426 {
427 	int cpu;
428 
429 	if (*pos == 0)
430 		return SEQ_START_TOKEN;
431 
432 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 		if (!cpu_possible(cpu))
434 			continue;
435 		*pos = cpu+1;
436 		return &per_cpu(rt_cache_stat, cpu);
437 	}
438 	return NULL;
439 }
440 
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
442 {
443 	int cpu;
444 
445 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 		if (!cpu_possible(cpu))
447 			continue;
448 		*pos = cpu+1;
449 		return &per_cpu(rt_cache_stat, cpu);
450 	}
451 	return NULL;
452 
453 }
454 
455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
456 {
457 
458 }
459 
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
461 {
462 	struct rt_cache_stat *st = v;
463 
464 	if (v == SEQ_START_TOKEN) {
465 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 		return 0;
467 	}
468 
469 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
470 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 		   atomic_read(&ipv4_dst_ops.entries),
472 		   st->in_hit,
473 		   st->in_slow_tot,
474 		   st->in_slow_mc,
475 		   st->in_no_route,
476 		   st->in_brd,
477 		   st->in_martian_dst,
478 		   st->in_martian_src,
479 
480 		   st->out_hit,
481 		   st->out_slow_tot,
482 		   st->out_slow_mc,
483 
484 		   st->gc_total,
485 		   st->gc_ignored,
486 		   st->gc_goal_miss,
487 		   st->gc_dst_overflow,
488 		   st->in_hlist_search,
489 		   st->out_hlist_search
490 		);
491 	return 0;
492 }
493 
494 static const struct seq_operations rt_cpu_seq_ops = {
495 	.start  = rt_cpu_seq_start,
496 	.next   = rt_cpu_seq_next,
497 	.stop   = rt_cpu_seq_stop,
498 	.show   = rt_cpu_seq_show,
499 };
500 
501 
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
503 {
504 	return seq_open(file, &rt_cpu_seq_ops);
505 }
506 
507 static const struct file_operations rt_cpu_seq_fops = {
508 	.owner	 = THIS_MODULE,
509 	.open	 = rt_cpu_seq_open,
510 	.read	 = seq_read,
511 	.llseek	 = seq_lseek,
512 	.release = seq_release,
513 };
514 
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int rt_acct_proc_show(struct seq_file *m, void *v)
517 {
518 	struct ip_rt_acct *dst, *src;
519 	unsigned int i, j;
520 
521 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
522 	if (!dst)
523 		return -ENOMEM;
524 
525 	for_each_possible_cpu(i) {
526 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
527 		for (j = 0; j < 256; j++) {
528 			dst[j].o_bytes   += src[j].o_bytes;
529 			dst[j].o_packets += src[j].o_packets;
530 			dst[j].i_bytes   += src[j].i_bytes;
531 			dst[j].i_packets += src[j].i_packets;
532 		}
533 	}
534 
535 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
536 	kfree(dst);
537 	return 0;
538 }
539 
540 static int rt_acct_proc_open(struct inode *inode, struct file *file)
541 {
542 	return single_open(file, rt_acct_proc_show, NULL);
543 }
544 
545 static const struct file_operations rt_acct_proc_fops = {
546 	.owner		= THIS_MODULE,
547 	.open		= rt_acct_proc_open,
548 	.read		= seq_read,
549 	.llseek		= seq_lseek,
550 	.release	= single_release,
551 };
552 #endif
553 
554 static int __net_init ip_rt_do_proc_init(struct net *net)
555 {
556 	struct proc_dir_entry *pde;
557 
558 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
559 			&rt_cache_seq_fops);
560 	if (!pde)
561 		goto err1;
562 
563 	pde = proc_create("rt_cache", S_IRUGO,
564 			  net->proc_net_stat, &rt_cpu_seq_fops);
565 	if (!pde)
566 		goto err2;
567 
568 #ifdef CONFIG_NET_CLS_ROUTE
569 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
570 	if (!pde)
571 		goto err3;
572 #endif
573 	return 0;
574 
575 #ifdef CONFIG_NET_CLS_ROUTE
576 err3:
577 	remove_proc_entry("rt_cache", net->proc_net_stat);
578 #endif
579 err2:
580 	remove_proc_entry("rt_cache", net->proc_net);
581 err1:
582 	return -ENOMEM;
583 }
584 
585 static void __net_exit ip_rt_do_proc_exit(struct net *net)
586 {
587 	remove_proc_entry("rt_cache", net->proc_net_stat);
588 	remove_proc_entry("rt_cache", net->proc_net);
589 #ifdef CONFIG_NET_CLS_ROUTE
590 	remove_proc_entry("rt_acct", net->proc_net);
591 #endif
592 }
593 
594 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
595 	.init = ip_rt_do_proc_init,
596 	.exit = ip_rt_do_proc_exit,
597 };
598 
599 static int __init ip_rt_proc_init(void)
600 {
601 	return register_pernet_subsys(&ip_rt_proc_ops);
602 }
603 
604 #else
605 static inline int ip_rt_proc_init(void)
606 {
607 	return 0;
608 }
609 #endif /* CONFIG_PROC_FS */
610 
611 static inline void rt_free(struct rtable *rt)
612 {
613 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
614 }
615 
616 static inline void rt_drop(struct rtable *rt)
617 {
618 	ip_rt_put(rt);
619 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
620 }
621 
622 static inline int rt_fast_clean(struct rtable *rth)
623 {
624 	/* Kill broadcast/multicast entries very aggressively, if they
625 	   collide in the hash table with more useful entries */
626 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 		rth->fl.iif && rth->u.dst.rt_next;
628 }
629 
630 static inline int rt_valuable(struct rtable *rth)
631 {
632 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 		rth->u.dst.expires;
634 }
635 
636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
637 {
638 	unsigned long age;
639 	int ret = 0;
640 
641 	if (atomic_read(&rth->u.dst.__refcnt))
642 		goto out;
643 
644 	ret = 1;
645 	if (rth->u.dst.expires &&
646 	    time_after_eq(jiffies, rth->u.dst.expires))
647 		goto out;
648 
649 	age = jiffies - rth->u.dst.lastuse;
650 	ret = 0;
651 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 	    (age <= tmo2 && rt_valuable(rth)))
653 		goto out;
654 	ret = 1;
655 out:	return ret;
656 }
657 
658 /* Bits of score are:
659  * 31: very valuable
660  * 30: not quite useless
661  * 29..0: usage counter
662  */
663 static inline u32 rt_score(struct rtable *rt)
664 {
665 	u32 score = jiffies - rt->u.dst.lastuse;
666 
667 	score = ~score & ~(3<<30);
668 
669 	if (rt_valuable(rt))
670 		score |= (1<<31);
671 
672 	if (!rt->fl.iif ||
673 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 		score |= (1<<30);
675 
676 	return score;
677 }
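/*
 * A worked example (illustrative numbers only): an output route (fl.iif == 0)
 * that is neither redirected nor notified and has no expiry, last used 100
 * jiffies ago, scores (~100UL & ~(3 << 30)) | (1 << 30) = 0x7fffff9b.  The
 * inverted age in the low 30 bits means older idle entries score lower and
 * are therefore the preferred victims when rt_intern_hash() trims a long chain.
 */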
678 
679 static inline bool rt_caching(const struct net *net)
680 {
681 	return net->ipv4.current_rt_cache_rebuild_count <=
682 		net->ipv4.sysctl_rt_cache_rebuild_count;
683 }
684 
685 static inline bool compare_hash_inputs(const struct flowi *fl1,
686 					const struct flowi *fl2)
687 {
688 	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
689 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
690 		(fl1->iif ^ fl2->iif)) == 0);
691 }
692 
693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
694 {
695 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
696 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
697 		(fl1->mark ^ fl2->mark) |
698 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
699 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
700 		(fl1->oif ^ fl2->oif) |
701 		(fl1->iif ^ fl2->iif)) == 0;
702 }
703 
704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705 {
706 	return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
707 }
708 
709 static inline int rt_is_expired(struct rtable *rth)
710 {
711 	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
712 }
713 
714 /*
715  * Perform a full scan of the hash table and free all entries.
716  * Can be called by a softirq or a process.
717  * In the latter case, we want to reschedule if necessary.
718  */
719 static void rt_do_flush(int process_context)
720 {
721 	unsigned int i;
722 	struct rtable *rth, *next;
723 	struct rtable * tail;
724 
725 	for (i = 0; i <= rt_hash_mask; i++) {
726 		if (process_context && need_resched())
727 			cond_resched();
728 		rth = rt_hash_table[i].chain;
729 		if (!rth)
730 			continue;
731 
732 		spin_lock_bh(rt_hash_lock_addr(i));
733 #ifdef CONFIG_NET_NS
734 		{
735 		struct rtable ** prev, * p;
736 
737 		rth = rt_hash_table[i].chain;
738 
739 		/* defer releasing the head of the list until after spin_unlock */
740 		for (tail = rth; tail; tail = tail->u.dst.rt_next)
741 			if (!rt_is_expired(tail))
742 				break;
743 		if (rth != tail)
744 			rt_hash_table[i].chain = tail;
745 
746 		/* call rt_free on entries after the tail requiring flush */
747 		prev = &rt_hash_table[i].chain;
748 		for (p = *prev; p; p = next) {
749 			next = p->u.dst.rt_next;
750 			if (!rt_is_expired(p)) {
751 				prev = &p->u.dst.rt_next;
752 			} else {
753 				*prev = next;
754 				rt_free(p);
755 			}
756 		}
757 		}
758 #else
759 		rth = rt_hash_table[i].chain;
760 		rt_hash_table[i].chain = NULL;
761 		tail = NULL;
762 #endif
763 		spin_unlock_bh(rt_hash_lock_addr(i));
764 
765 		for (; rth != tail; rth = next) {
766 			next = rth->u.dst.rt_next;
767 			rt_free(rth);
768 		}
769 	}
770 }
771 
772 /*
773  * While freeing expired entries, we compute average chain length
774  * and standard deviation, using fixed-point arithmetic.
775  * This gives an estimate of rt_chain_length_max:
776  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
777  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
778  */
779 
780 #define FRACT_BITS 3
781 #define ONE (1UL << FRACT_BITS)
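/*
 * Worked example of the fixed-point accounting, with made-up numbers: each
 * distinct entry contributes ONE (8) to a chain's length, so avg == 20 and
 * sd == 4 stand for 2.5 and 0.5 entries.  rt_check_expire() then computes
 * (avg + 4 * sd) >> FRACT_BITS = 36 >> 3 = 4 and clamps rt_chain_length_max
 * to max(ip_rt_gc_elasticity, 4) = 8 with the default elasticity of 8.
 */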
782 
783 /*
784  * Given a hash chain and an item in this hash chain,
785  * find if a previous entry has the same hash_inputs
786  * (but differs on tos, mark or oif)
787  * Returns 0 if an alias is found.
788  * Returns ONE if rth has no alias before itself.
789  */
790 static int has_noalias(const struct rtable *head, const struct rtable *rth)
791 {
792 	const struct rtable *aux = head;
793 
794 	while (aux != rth) {
795 		if (compare_hash_inputs(&aux->fl, &rth->fl))
796 			return 0;
797 		aux = aux->u.dst.rt_next;
798 	}
799 	return ONE;
800 }
801 
802 static void rt_check_expire(void)
803 {
804 	static unsigned int rover;
805 	unsigned int i = rover, goal;
806 	struct rtable *rth, **rthp;
807 	unsigned long samples = 0;
808 	unsigned long sum = 0, sum2 = 0;
809 	unsigned long delta;
810 	u64 mult;
811 
812 	delta = jiffies - expires_ljiffies;
813 	expires_ljiffies = jiffies;
814 	mult = ((u64)delta) << rt_hash_log;
815 	if (ip_rt_gc_timeout > 1)
816 		do_div(mult, ip_rt_gc_timeout);
817 	goal = (unsigned int)mult;
818 	if (goal > rt_hash_mask)
819 		goal = rt_hash_mask + 1;
820 	for (; goal > 0; goal--) {
821 		unsigned long tmo = ip_rt_gc_timeout;
822 		unsigned long length;
823 
824 		i = (i + 1) & rt_hash_mask;
825 		rthp = &rt_hash_table[i].chain;
826 
827 		if (need_resched())
828 			cond_resched();
829 
830 		samples++;
831 
832 		if (*rthp == NULL)
833 			continue;
834 		length = 0;
835 		spin_lock_bh(rt_hash_lock_addr(i));
836 		while ((rth = *rthp) != NULL) {
837 			prefetch(rth->u.dst.rt_next);
838 			if (rt_is_expired(rth)) {
839 				*rthp = rth->u.dst.rt_next;
840 				rt_free(rth);
841 				continue;
842 			}
843 			if (rth->u.dst.expires) {
844 				/* Entry is expired even if it is in use */
845 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
846 nofree:
847 					tmo >>= 1;
848 					rthp = &rth->u.dst.rt_next;
849 					/*
850 					 * We only count entries on
851 					 * a chain with equal hash inputs once,
852 					 * so that entries for different QoS
853 					 * levels and other non-hash-input
854 					 * attributes don't unfairly skew
855 					 * the length computation.
856 					 */
857 					length += has_noalias(rt_hash_table[i].chain, rth);
858 					continue;
859 				}
860 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
861 				goto nofree;
862 
863 			/* Clean up aged-off entries. */
864 			*rthp = rth->u.dst.rt_next;
865 			rt_free(rth);
866 		}
867 		spin_unlock_bh(rt_hash_lock_addr(i));
868 		sum += length;
869 		sum2 += length*length;
870 	}
871 	if (samples) {
872 		unsigned long avg = sum / samples;
873 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
874 		rt_chain_length_max = max_t(unsigned long,
875 					ip_rt_gc_elasticity,
876 					(avg + 4*sd) >> FRACT_BITS);
877 	}
878 	rover = i;
879 }
880 
881 /*
882  * rt_worker_func() is run in process context.
883  * We call rt_check_expire() to scan part of the hash table.
884  */
885 static void rt_worker_func(struct work_struct *work)
886 {
887 	rt_check_expire();
888 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
889 }
890 
891 /*
892  * Perturbation of rt_genid by a small quantity [1..256].
893  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
894  * many times (2^24) without reusing a recent rt_genid.
895  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
896  */
897 static void rt_cache_invalidate(struct net *net)
898 {
899 	unsigned char shuffle;
900 
901 	get_random_bytes(&shuffle, sizeof(shuffle));
902 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
903 }
904 
905 /*
906  * delay < 0  : invalidate cache (fast : entries will be deleted later)
907  * delay >= 0 : invalidate & flush cache (can be long)
908  */
909 void rt_cache_flush(struct net *net, int delay)
910 {
911 	rt_cache_invalidate(net);
912 	if (delay >= 0)
913 		rt_do_flush(!in_softirq());
914 }
915 
916 /* Flush previously invalidated entries from the cache */
917 void rt_cache_flush_batch(void)
918 {
919 	rt_do_flush(!in_softirq());
920 }
921 
922 /*
923  * We change rt_genid and let gc do the cleanup
924  */
925 static void rt_secret_rebuild(unsigned long __net)
926 {
927 	struct net *net = (struct net *)__net;
928 	rt_cache_invalidate(net);
929 	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
930 }
931 
932 static void rt_secret_rebuild_oneshot(struct net *net)
933 {
934 	del_timer_sync(&net->ipv4.rt_secret_timer);
935 	rt_cache_invalidate(net);
936 	if (ip_rt_secret_interval)
937 		mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
938 }
939 
940 static void rt_emergency_hash_rebuild(struct net *net)
941 {
942 	if (net_ratelimit()) {
943 		printk(KERN_WARNING "Route hash chain too long!\n");
944 		printk(KERN_WARNING "Adjust your secret_interval!\n");
945 	}
946 
947 	rt_secret_rebuild_oneshot(net);
948 }
949 
950 /*
951    Short description of GC goals.
952 
953    We want to build an algorithm which keeps the routing cache
954    at some equilibrium point, where the number of aged-off entries
955    is kept approximately equal to the number of newly generated ones.
956 
957    The current expiration strength is the variable "expire".
958    We try to adjust it dynamically, so that when networking
959    is idle expire is large enough to keep enough warm entries,
960    and when load increases it shrinks to limit the cache size.
961  */
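/*
 * Illustrative numbers for the goal computation in rt_garbage_collect() below
 * (the real rt_hash_log is chosen at boot from available memory): with
 * rt_hash_log == 17 and the default ip_rt_gc_elasticity of 8, the aggressive
 * branch is taken only once the cache holds more than 8 << 17 = 1048576
 * entries; below that, the gc merely nudges "equilibrium" and trims towards it.
 */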
962 
963 static int rt_garbage_collect(struct dst_ops *ops)
964 {
965 	static unsigned long expire = RT_GC_TIMEOUT;
966 	static unsigned long last_gc;
967 	static int rover;
968 	static int equilibrium;
969 	struct rtable *rth, **rthp;
970 	unsigned long now = jiffies;
971 	int goal;
972 
973 	/*
974 	 * Garbage collection is pretty expensive,
975 	 * do not run it too frequently.
976 	 */
977 
978 	RT_CACHE_STAT_INC(gc_total);
979 
980 	if (now - last_gc < ip_rt_gc_min_interval &&
981 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
982 		RT_CACHE_STAT_INC(gc_ignored);
983 		goto out;
984 	}
985 
986 	/* Calculate the number of entries which we want to expire now. */
987 	goal = atomic_read(&ipv4_dst_ops.entries) -
988 		(ip_rt_gc_elasticity << rt_hash_log);
989 	if (goal <= 0) {
990 		if (equilibrium < ipv4_dst_ops.gc_thresh)
991 			equilibrium = ipv4_dst_ops.gc_thresh;
992 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
993 		if (goal > 0) {
994 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
995 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
996 		}
997 	} else {
998 		/* We are in a dangerous area. Try to reduce the cache really
999 		 * aggressively.
1000 		 */
1001 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1002 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
1003 	}
1004 
1005 	if (now - last_gc >= ip_rt_gc_min_interval)
1006 		last_gc = now;
1007 
1008 	if (goal <= 0) {
1009 		equilibrium += goal;
1010 		goto work_done;
1011 	}
1012 
1013 	do {
1014 		int i, k;
1015 
1016 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1017 			unsigned long tmo = expire;
1018 
1019 			k = (k + 1) & rt_hash_mask;
1020 			rthp = &rt_hash_table[k].chain;
1021 			spin_lock_bh(rt_hash_lock_addr(k));
1022 			while ((rth = *rthp) != NULL) {
1023 				if (!rt_is_expired(rth) &&
1024 					!rt_may_expire(rth, tmo, expire)) {
1025 					tmo >>= 1;
1026 					rthp = &rth->u.dst.rt_next;
1027 					continue;
1028 				}
1029 				*rthp = rth->u.dst.rt_next;
1030 				rt_free(rth);
1031 				goal--;
1032 			}
1033 			spin_unlock_bh(rt_hash_lock_addr(k));
1034 			if (goal <= 0)
1035 				break;
1036 		}
1037 		rover = k;
1038 
1039 		if (goal <= 0)
1040 			goto work_done;
1041 
1042 		/* Goal is not achieved. We stop the process if:
1043 
1044 		   - expire is reduced to zero; otherwise, expire is halved.
1045 		   - the table is not full.
1046 		   - we are called from softirq context.
1047 		   - the jiffies check is just a fallback/debug loop breaker.
1048 		     We will not spin here for a long time in any case.
1049 		 */
1050 
1051 		RT_CACHE_STAT_INC(gc_goal_miss);
1052 
1053 		if (expire == 0)
1054 			break;
1055 
1056 		expire >>= 1;
1057 #if RT_CACHE_DEBUG >= 2
1058 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1059 				atomic_read(&ipv4_dst_ops.entries), goal, i);
1060 #endif
1061 
1062 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1063 			goto out;
1064 	} while (!in_softirq() && time_before_eq(jiffies, now));
1065 
1066 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1067 		goto out;
1068 	if (net_ratelimit())
1069 		printk(KERN_WARNING "dst cache overflow\n");
1070 	RT_CACHE_STAT_INC(gc_dst_overflow);
1071 	return 1;
1072 
1073 work_done:
1074 	expire += ip_rt_gc_min_interval;
1075 	if (expire > ip_rt_gc_timeout ||
1076 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1077 		expire = ip_rt_gc_timeout;
1078 #if RT_CACHE_DEBUG >= 2
1079 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1080 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
1081 #endif
1082 out:	return 0;
1083 }
1084 
1085 /*
1086  * Returns the number of entries in a hash chain that have different hash_inputs
1087  */
1088 static int slow_chain_length(const struct rtable *head)
1089 {
1090 	int length = 0;
1091 	const struct rtable *rth = head;
1092 
1093 	while (rth) {
1094 		length += has_noalias(head, rth);
1095 		rth = rth->u.dst.rt_next;
1096 	}
1097 	return length >> FRACT_BITS;
1098 }
1099 
1100 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1101 			  struct rtable **rp, struct sk_buff *skb, int ifindex)
1102 {
1103 	struct rtable	*rth, **rthp;
1104 	unsigned long	now;
1105 	struct rtable *cand, **candp;
1106 	u32 		min_score;
1107 	int		chain_length;
1108 	int attempts = !in_softirq();
1109 
1110 restart:
1111 	chain_length = 0;
1112 	min_score = ~(u32)0;
1113 	cand = NULL;
1114 	candp = NULL;
1115 	now = jiffies;
1116 
1117 	if (!rt_caching(dev_net(rt->u.dst.dev))) {
1118 		/*
1119 		 * If we're not caching, just tell the caller we
1120 		 * were successful and don't touch the route.  The
1121 		 * caller holds the sole reference to the cache entry, and
1122 		 * it will be released when the caller is done with it.
1123 		 * If we drop it here, the callers have no way to resolve routes
1124 		 * when we're not caching.  Instead, just point *rp at rt, so
1125 		 * the caller gets a single use out of the route.
1126 		 * Note that we do rt_free on this new route entry, so that
1127 		 * once its refcount hits zero, we are still able to reap it
1128 		 * (Thanks Alexey).
1129 		 * Note also that rt_free uses call_rcu.  We don't actually
1130 		 * need RCU protection here; this is just our path to get
1131 		 * onto the route gc list.
1132 		 */
1133 
1134 		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1135 			int err = arp_bind_neighbour(&rt->u.dst);
1136 			if (err) {
1137 				if (net_ratelimit())
1138 					printk(KERN_WARNING
1139 					    "Neighbour table failure & not caching routes.\n");
1140 				rt_drop(rt);
1141 				return err;
1142 			}
1143 		}
1144 
1145 		rt_free(rt);
1146 		goto skip_hashing;
1147 	}
1148 
1149 	rthp = &rt_hash_table[hash].chain;
1150 
1151 	spin_lock_bh(rt_hash_lock_addr(hash));
1152 	while ((rth = *rthp) != NULL) {
1153 		if (rt_is_expired(rth)) {
1154 			*rthp = rth->u.dst.rt_next;
1155 			rt_free(rth);
1156 			continue;
1157 		}
1158 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1159 			/* Put it first */
1160 			*rthp = rth->u.dst.rt_next;
1161 			/*
1162 			 * Since lookup is lock-free, the deletion
1163 			 * must be visible to another weakly ordered CPU before
1164 			 * the insertion at the start of the hash chain.
1165 			 */
1166 			rcu_assign_pointer(rth->u.dst.rt_next,
1167 					   rt_hash_table[hash].chain);
1168 			/*
1169 			 * Since lookup is lock-free, the update writes
1170 			 * must be ordered for consistency on SMP.
1171 			 */
1172 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1173 
1174 			dst_use(&rth->u.dst, now);
1175 			spin_unlock_bh(rt_hash_lock_addr(hash));
1176 
1177 			rt_drop(rt);
1178 			if (rp)
1179 				*rp = rth;
1180 			else
1181 				skb_dst_set(skb, &rth->u.dst);
1182 			return 0;
1183 		}
1184 
1185 		if (!atomic_read(&rth->u.dst.__refcnt)) {
1186 			u32 score = rt_score(rth);
1187 
1188 			if (score <= min_score) {
1189 				cand = rth;
1190 				candp = rthp;
1191 				min_score = score;
1192 			}
1193 		}
1194 
1195 		chain_length++;
1196 
1197 		rthp = &rth->u.dst.rt_next;
1198 	}
1199 
1200 	if (cand) {
1201 		/* ip_rt_gc_elasticity used to be the average chain
1202 		 * length; when exceeded, gc becomes really aggressive.
1203 		 *
1204 		 * The second limit is less certain. At the moment it allows
1205 		 * only 2 entries per bucket. We will see.
1206 		 */
1207 		if (chain_length > ip_rt_gc_elasticity) {
1208 			*candp = cand->u.dst.rt_next;
1209 			rt_free(cand);
1210 		}
1211 	} else {
1212 		if (chain_length > rt_chain_length_max &&
1213 		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1214 			struct net *net = dev_net(rt->u.dst.dev);
1215 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1216 			if (!rt_caching(net)) {
1217 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1218 					rt->u.dst.dev->name, num);
1219 			}
1220 			rt_emergency_hash_rebuild(net);
1221 			spin_unlock_bh(rt_hash_lock_addr(hash));
1222 
1223 			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1224 					ifindex, rt_genid(net));
1225 			goto restart;
1226 		}
1227 	}
1228 
1229 	/* Try to bind the route to arp only if it is an output
1230 	   route or a unicast forwarding path.
1231 	 */
1232 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1233 		int err = arp_bind_neighbour(&rt->u.dst);
1234 		if (err) {
1235 			spin_unlock_bh(rt_hash_lock_addr(hash));
1236 
1237 			if (err != -ENOBUFS) {
1238 				rt_drop(rt);
1239 				return err;
1240 			}
1241 
1242 			   can be released. Try to shrink the route cache;
1243 			   it most likely holds some neighbour records.
1244 			   it is most likely it holds some neighbour records.
1245 			 */
1246 			if (attempts-- > 0) {
1247 				int saved_elasticity = ip_rt_gc_elasticity;
1248 				int saved_int = ip_rt_gc_min_interval;
1249 				ip_rt_gc_elasticity	= 1;
1250 				ip_rt_gc_min_interval	= 0;
1251 				rt_garbage_collect(&ipv4_dst_ops);
1252 				ip_rt_gc_min_interval	= saved_int;
1253 				ip_rt_gc_elasticity	= saved_elasticity;
1254 				goto restart;
1255 			}
1256 
1257 			if (net_ratelimit())
1258 				printk(KERN_WARNING "Neighbour table overflow.\n");
1259 			rt_drop(rt);
1260 			return -ENOBUFS;
1261 		}
1262 	}
1263 
1264 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1265 
1266 #if RT_CACHE_DEBUG >= 2
1267 	if (rt->u.dst.rt_next) {
1268 		struct rtable *trt;
1269 		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1270 		       hash, &rt->rt_dst);
1271 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1272 			printk(" . %pI4", &trt->rt_dst);
1273 		printk("\n");
1274 	}
1275 #endif
1276 	/*
1277 	 * Since lookup is lock-free, we must make sure
1278 	 * previous writes to rt are committed to memory
1279 	 * before making rt visible to other CPUs.
1280 	 */
1281 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1282 
1283 	spin_unlock_bh(rt_hash_lock_addr(hash));
1284 
1285 skip_hashing:
1286 	if (rp)
1287 		*rp = rt;
1288 	else
1289 		skb_dst_set(skb, &rt->u.dst);
1290 	return 0;
1291 }
1292 
1293 void rt_bind_peer(struct rtable *rt, int create)
1294 {
1295 	static DEFINE_SPINLOCK(rt_peer_lock);
1296 	struct inet_peer *peer;
1297 
1298 	peer = inet_getpeer(rt->rt_dst, create);
1299 
1300 	spin_lock_bh(&rt_peer_lock);
1301 	if (rt->peer == NULL) {
1302 		rt->peer = peer;
1303 		peer = NULL;
1304 	}
1305 	spin_unlock_bh(&rt_peer_lock);
1306 	if (peer)
1307 		inet_putpeer(peer);
1308 }
1309 
1310 /*
1311  * Peer allocation may fail only in serious out-of-memory conditions.  However
1312  * we can still generate some output.
1313  * Random ID selection looks a bit dangerous because we have no chance of
1314  * selecting an ID that is unique within a reasonable period of time.
1315  * But a broken packet identifier may be better than no packet at all.
1316  */
1317 static void ip_select_fb_ident(struct iphdr *iph)
1318 {
1319 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1320 	static u32 ip_fallback_id;
1321 	u32 salt;
1322 
1323 	spin_lock_bh(&ip_fb_id_lock);
1324 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1325 	iph->id = htons(salt & 0xFFFF);
1326 	ip_fallback_id = salt;
1327 	spin_unlock_bh(&ip_fb_id_lock);
1328 }
1329 
1330 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1331 {
1332 	struct rtable *rt = (struct rtable *) dst;
1333 
1334 	if (rt) {
1335 		if (rt->peer == NULL)
1336 			rt_bind_peer(rt, 1);
1337 
1338 		/* If peer is attached to destination, it is never detached,
1339 		   so we need not grab a lock to dereference it.
1340 		 */
1341 		if (rt->peer) {
1342 			iph->id = htons(inet_getid(rt->peer, more));
1343 			return;
1344 		}
1345 	} else
1346 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1347 		       __builtin_return_address(0));
1348 
1349 	ip_select_fb_ident(iph);
1350 }
1351 
1352 static void rt_del(unsigned hash, struct rtable *rt)
1353 {
1354 	struct rtable **rthp, *aux;
1355 
1356 	rthp = &rt_hash_table[hash].chain;
1357 	spin_lock_bh(rt_hash_lock_addr(hash));
1358 	ip_rt_put(rt);
1359 	while ((aux = *rthp) != NULL) {
1360 		if (aux == rt || rt_is_expired(aux)) {
1361 			*rthp = aux->u.dst.rt_next;
1362 			rt_free(aux);
1363 			continue;
1364 		}
1365 		rthp = &aux->u.dst.rt_next;
1366 	}
1367 	spin_unlock_bh(rt_hash_lock_addr(hash));
1368 }
1369 
1370 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1371 		    __be32 saddr, struct net_device *dev)
1372 {
1373 	int i, k;
1374 	struct in_device *in_dev = in_dev_get(dev);
1375 	struct rtable *rth, **rthp;
1376 	__be32  skeys[2] = { saddr, 0 };
1377 	int  ikeys[2] = { dev->ifindex, 0 };
1378 	struct netevent_redirect netevent;
1379 	struct net *net;
1380 
1381 	if (!in_dev)
1382 		return;
1383 
1384 	net = dev_net(dev);
1385 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1386 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1387 	    ipv4_is_zeronet(new_gw))
1388 		goto reject_redirect;
1389 
1390 	if (!rt_caching(net))
1391 		goto reject_redirect;
1392 
1393 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1394 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1395 			goto reject_redirect;
1396 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1397 			goto reject_redirect;
1398 	} else {
1399 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1400 			goto reject_redirect;
1401 	}
1402 
1403 	for (i = 0; i < 2; i++) {
1404 		for (k = 0; k < 2; k++) {
1405 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1406 						rt_genid(net));
1407 
1408 			rthp=&rt_hash_table[hash].chain;
1409 
1410 			rcu_read_lock();
1411 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1412 				struct rtable *rt;
1413 
1414 				if (rth->fl.fl4_dst != daddr ||
1415 				    rth->fl.fl4_src != skeys[i] ||
1416 				    rth->fl.oif != ikeys[k] ||
1417 				    rth->fl.iif != 0 ||
1418 				    rt_is_expired(rth) ||
1419 				    !net_eq(dev_net(rth->u.dst.dev), net)) {
1420 					rthp = &rth->u.dst.rt_next;
1421 					continue;
1422 				}
1423 
1424 				if (rth->rt_dst != daddr ||
1425 				    rth->rt_src != saddr ||
1426 				    rth->u.dst.error ||
1427 				    rth->rt_gateway != old_gw ||
1428 				    rth->u.dst.dev != dev)
1429 					break;
1430 
1431 				dst_hold(&rth->u.dst);
1432 				rcu_read_unlock();
1433 
1434 				rt = dst_alloc(&ipv4_dst_ops);
1435 				if (rt == NULL) {
1436 					ip_rt_put(rth);
1437 					in_dev_put(in_dev);
1438 					return;
1439 				}
1440 
1441 				/* Copy all the information. */
1442 				*rt = *rth;
1443 				rt->u.dst.__use		= 1;
1444 				atomic_set(&rt->u.dst.__refcnt, 1);
1445 				rt->u.dst.child		= NULL;
1446 				if (rt->u.dst.dev)
1447 					dev_hold(rt->u.dst.dev);
1448 				if (rt->idev)
1449 					in_dev_hold(rt->idev);
1450 				rt->u.dst.obsolete	= -1;
1451 				rt->u.dst.lastuse	= jiffies;
1452 				rt->u.dst.path		= &rt->u.dst;
1453 				rt->u.dst.neighbour	= NULL;
1454 				rt->u.dst.hh		= NULL;
1455 #ifdef CONFIG_XFRM
1456 				rt->u.dst.xfrm		= NULL;
1457 #endif
1458 				rt->rt_genid		= rt_genid(net);
1459 				rt->rt_flags		|= RTCF_REDIRECTED;
1460 
1461 				/* Gateway is different ... */
1462 				rt->rt_gateway		= new_gw;
1463 
1464 				/* Redirect received -> path was valid */
1465 				dst_confirm(&rth->u.dst);
1466 
1467 				if (rt->peer)
1468 					atomic_inc(&rt->peer->refcnt);
1469 
1470 				if (arp_bind_neighbour(&rt->u.dst) ||
1471 				    !(rt->u.dst.neighbour->nud_state &
1472 					    NUD_VALID)) {
1473 					if (rt->u.dst.neighbour)
1474 						neigh_event_send(rt->u.dst.neighbour, NULL);
1475 					ip_rt_put(rth);
1476 					rt_drop(rt);
1477 					goto do_next;
1478 				}
1479 
1480 				netevent.old = &rth->u.dst;
1481 				netevent.new = &rt->u.dst;
1482 				call_netevent_notifiers(NETEVENT_REDIRECT,
1483 							&netevent);
1484 
1485 				rt_del(hash, rth);
1486 				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1487 					ip_rt_put(rt);
1488 				goto do_next;
1489 			}
1490 			rcu_read_unlock();
1491 		do_next:
1492 			;
1493 		}
1494 	}
1495 	in_dev_put(in_dev);
1496 	return;
1497 
1498 reject_redirect:
1499 #ifdef CONFIG_IP_ROUTE_VERBOSE
1500 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1501 		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1502 			"  Advised path = %pI4 -> %pI4\n",
1503 		       &old_gw, dev->name, &new_gw,
1504 		       &saddr, &daddr);
1505 #endif
1506 	in_dev_put(in_dev);
1507 }
1508 
1509 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1510 {
1511 	struct rtable *rt = (struct rtable *)dst;
1512 	struct dst_entry *ret = dst;
1513 
1514 	if (rt) {
1515 		if (dst->obsolete > 0) {
1516 			ip_rt_put(rt);
1517 			ret = NULL;
1518 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1519 			   (rt->u.dst.expires &&
1520 			    time_after_eq(jiffies, rt->u.dst.expires))) {
1521 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1522 						rt->fl.oif,
1523 						rt_genid(dev_net(dst->dev)));
1524 #if RT_CACHE_DEBUG >= 1
1525 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1526 				&rt->rt_dst, rt->fl.fl4_tos);
1527 #endif
1528 			rt_del(hash, rt);
1529 			ret = NULL;
1530 		}
1531 	}
1532 	return ret;
1533 }
1534 
1535 /*
1536  * Algorithm:
1537  *	1. The first ip_rt_redirect_number redirects are sent
1538  *	   with exponential backoff, then we stop sending them at all,
1539  *	   assuming that the host ignores our redirects.
1540  *	2. If we did not see packets requiring redirects
1541  *	   during ip_rt_redirect_silence, we assume that the host
1542  *	   forgot the redirected route, and we start sending redirects again.
1543  *
1544  * This algorithm is much cheaper and more intelligent than dumb load limiting
1545  * in icmp.c.
1546  *
1547  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1548  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1549  */
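/*
 * Rough timing sketch with HZ == 1000 and the defaults above (illustration
 * only): ip_rt_redirect_load is HZ/50 = 20 jiffies, so the n-th redirect is
 * only sent once 20 << rate_tokens jiffies have passed since the previous
 * one -- about 5.1 s of backoff before the 9th.  After ip_rt_redirect_number
 * (9) redirects the host is assumed deaf, and only ip_rt_redirect_silence =
 * (HZ/50) << 10 = 20480 jiffies (~20.5 s) without redirect-worthy traffic
 * resets rate_tokens and re-enables sending.
 */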
1550 
1551 void ip_rt_send_redirect(struct sk_buff *skb)
1552 {
1553 	struct rtable *rt = skb_rtable(skb);
1554 	struct in_device *in_dev;
1555 	int log_martians;
1556 
1557 	rcu_read_lock();
1558 	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1559 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1560 		rcu_read_unlock();
1561 		return;
1562 	}
1563 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1564 	rcu_read_unlock();
1565 
1566 	/* No redirected packets during ip_rt_redirect_silence;
1567 	 * reset the algorithm.
1568 	 */
1569 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1570 		rt->u.dst.rate_tokens = 0;
1571 
1572 	/* Too many ignored redirects; do not send anything and
1573 	 * set u.dst.rate_last to the last seen redirected packet.
1574 	 */
1575 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1576 		rt->u.dst.rate_last = jiffies;
1577 		return;
1578 	}
1579 
1580 	/* Check for load limit; set rate_last to the latest sent
1581 	 * redirect.
1582 	 */
1583 	if (rt->u.dst.rate_tokens == 0 ||
1584 	    time_after(jiffies,
1585 		       (rt->u.dst.rate_last +
1586 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1587 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1588 		rt->u.dst.rate_last = jiffies;
1589 		++rt->u.dst.rate_tokens;
1590 #ifdef CONFIG_IP_ROUTE_VERBOSE
1591 		if (log_martians &&
1592 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1593 		    net_ratelimit())
1594 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1595 				&rt->rt_src, rt->rt_iif,
1596 				&rt->rt_dst, &rt->rt_gateway);
1597 #endif
1598 	}
1599 }
1600 
1601 static int ip_error(struct sk_buff *skb)
1602 {
1603 	struct rtable *rt = skb_rtable(skb);
1604 	unsigned long now;
1605 	int code;
1606 
1607 	switch (rt->u.dst.error) {
1608 		case EINVAL:
1609 		default:
1610 			goto out;
1611 		case EHOSTUNREACH:
1612 			code = ICMP_HOST_UNREACH;
1613 			break;
1614 		case ENETUNREACH:
1615 			code = ICMP_NET_UNREACH;
1616 			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1617 					IPSTATS_MIB_INNOROUTES);
1618 			break;
1619 		case EACCES:
1620 			code = ICMP_PKT_FILTERED;
1621 			break;
1622 	}
1623 
1624 	now = jiffies;
1625 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1626 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1627 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1628 	rt->u.dst.rate_last = now;
1629 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1630 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1631 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1632 	}
1633 
1634 out:	kfree_skb(skb);
1635 	return 0;
1636 }
1637 
1638 /*
1639  *	The last two values are not from the RFC but
1640  *	are needed for AMPRnet AX.25 paths.
1641  */
1642 
1643 static const unsigned short mtu_plateau[] =
1644 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1645 
1646 static inline unsigned short guess_mtu(unsigned short old_mtu)
1647 {
1648 	int i;
1649 
1650 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1651 		if (old_mtu > mtu_plateau[i])
1652 			return mtu_plateau[i];
1653 	return 68;
1654 }
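/*
 * For illustration: an ICMP "frag needed" quoting an original datagram of
 * tot_len 1500 with a bogus new_mtu walks the plateau table until
 * 1500 > 1492 and guesses 1492; an old_mtu at or below the last plateau
 * (128) falls through to the 68-byte IPv4 minimum.
 */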
1655 
1656 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1657 				 unsigned short new_mtu,
1658 				 struct net_device *dev)
1659 {
1660 	int i, k;
1661 	unsigned short old_mtu = ntohs(iph->tot_len);
1662 	struct rtable *rth;
1663 	int  ikeys[2] = { dev->ifindex, 0 };
1664 	__be32  skeys[2] = { iph->saddr, 0, };
1665 	__be32  daddr = iph->daddr;
1666 	unsigned short est_mtu = 0;
1667 
1668 	for (k = 0; k < 2; k++) {
1669 		for (i = 0; i < 2; i++) {
1670 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1671 						rt_genid(net));
1672 
1673 			rcu_read_lock();
1674 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1675 			     rth = rcu_dereference(rth->u.dst.rt_next)) {
1676 				unsigned short mtu = new_mtu;
1677 
1678 				if (rth->fl.fl4_dst != daddr ||
1679 				    rth->fl.fl4_src != skeys[i] ||
1680 				    rth->rt_dst != daddr ||
1681 				    rth->rt_src != iph->saddr ||
1682 				    rth->fl.oif != ikeys[k] ||
1683 				    rth->fl.iif != 0 ||
1684 				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1685 				    !net_eq(dev_net(rth->u.dst.dev), net) ||
1686 				    rt_is_expired(rth))
1687 					continue;
1688 
1689 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1690 
1691 					/* BSD 4.2 compatibility hack :-( */
1692 					if (mtu == 0 &&
1693 					    old_mtu >= dst_mtu(&rth->u.dst) &&
1694 					    old_mtu >= 68 + (iph->ihl << 2))
1695 						old_mtu -= iph->ihl << 2;
1696 
1697 					mtu = guess_mtu(old_mtu);
1698 				}
1699 				if (mtu <= dst_mtu(&rth->u.dst)) {
1700 					if (mtu < dst_mtu(&rth->u.dst)) {
1701 						dst_confirm(&rth->u.dst);
1702 						if (mtu < ip_rt_min_pmtu) {
1703 							mtu = ip_rt_min_pmtu;
1704 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1705 								(1 << RTAX_MTU);
1706 						}
1707 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1708 						dst_set_expires(&rth->u.dst,
1709 							ip_rt_mtu_expires);
1710 					}
1711 					est_mtu = mtu;
1712 				}
1713 			}
1714 			rcu_read_unlock();
1715 		}
1716 	}
1717 	return est_mtu ? : new_mtu;
1718 }
1719 
1720 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1721 {
1722 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
1723 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1724 		if (mtu < ip_rt_min_pmtu) {
1725 			mtu = ip_rt_min_pmtu;
1726 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1727 		}
1728 		dst->metrics[RTAX_MTU-1] = mtu;
1729 		dst_set_expires(dst, ip_rt_mtu_expires);
1730 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1731 	}
1732 }
1733 
1734 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1735 {
1736 	if (rt_is_expired((struct rtable *)dst))
1737 		return NULL;
1738 	return dst;
1739 }
1740 
1741 static void ipv4_dst_destroy(struct dst_entry *dst)
1742 {
1743 	struct rtable *rt = (struct rtable *) dst;
1744 	struct inet_peer *peer = rt->peer;
1745 	struct in_device *idev = rt->idev;
1746 
1747 	if (peer) {
1748 		rt->peer = NULL;
1749 		inet_putpeer(peer);
1750 	}
1751 
1752 	if (idev) {
1753 		rt->idev = NULL;
1754 		in_dev_put(idev);
1755 	}
1756 }
1757 
1758 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1759 			    int how)
1760 {
1761 	struct rtable *rt = (struct rtable *) dst;
1762 	struct in_device *idev = rt->idev;
1763 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1764 		struct in_device *loopback_idev =
1765 			in_dev_get(dev_net(dev)->loopback_dev);
1766 		if (loopback_idev) {
1767 			rt->idev = loopback_idev;
1768 			in_dev_put(idev);
1769 		}
1770 	}
1771 }
1772 
1773 static void ipv4_link_failure(struct sk_buff *skb)
1774 {
1775 	struct rtable *rt;
1776 
1777 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1778 
1779 	rt = skb_rtable(skb);
1780 	if (rt)
1781 		dst_set_expires(&rt->u.dst, 0);
1782 }
1783 
1784 static int ip_rt_bug(struct sk_buff *skb)
1785 {
1786 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1787 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1788 		skb->dev ? skb->dev->name : "?");
1789 	kfree_skb(skb);
1790 	return 0;
1791 }
1792 
1793 /*
1794    We do not cache the source address of the outgoing interface,
1795    because it is used only by the IP RR, TS and SRR options,
1796    so it is out of the fast path.
1797 
1798    BTW remember: "addr" is allowed to be unaligned
1799    in IP options!
1800  */
1801 
1802 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1803 {
1804 	__be32 src;
1805 	struct fib_result res;
1806 
1807 	if (rt->fl.iif == 0)
1808 		src = rt->rt_src;
1809 	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1810 		src = FIB_RES_PREFSRC(res);
1811 		fib_res_put(&res);
1812 	} else
1813 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1814 					RT_SCOPE_UNIVERSE);
1815 	memcpy(addr, &src, 4);
1816 }
1817 
1818 #ifdef CONFIG_NET_CLS_ROUTE
1819 static void set_class_tag(struct rtable *rt, u32 tag)
1820 {
1821 	if (!(rt->u.dst.tclassid & 0xFFFF))
1822 		rt->u.dst.tclassid |= tag & 0xFFFF;
1823 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1824 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1825 }
1826 #endif
1827 
1828 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1829 {
1830 	struct fib_info *fi = res->fi;
1831 
1832 	if (fi) {
1833 		if (FIB_RES_GW(*res) &&
1834 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1835 			rt->rt_gateway = FIB_RES_GW(*res);
1836 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1837 		       sizeof(rt->u.dst.metrics));
1838 		if (fi->fib_mtu == 0) {
1839 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1840 			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1841 			    rt->rt_gateway != rt->rt_dst &&
1842 			    rt->u.dst.dev->mtu > 576)
1843 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1844 		}
1845 #ifdef CONFIG_NET_CLS_ROUTE
1846 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1847 #endif
1848 	} else
1849 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1850 
1851 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1852 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1853 	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1854 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1855 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1856 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1857 				       ip_rt_min_advmss);
1858 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1859 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1860 
1861 #ifdef CONFIG_NET_CLS_ROUTE
1862 #ifdef CONFIG_IP_MULTIPLE_TABLES
1863 	set_class_tag(rt, fib_rules_tclass(res));
1864 #endif
1865 	set_class_tag(rt, itag);
1866 #endif
1867 	rt->rt_type = res->type;
1868 }
1869 
1870 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1871 				u8 tos, struct net_device *dev, int our)
1872 {
1873 	unsigned hash;
1874 	struct rtable *rth;
1875 	__be32 spec_dst;
1876 	struct in_device *in_dev = in_dev_get(dev);
1877 	u32 itag = 0;
1878 
1879 	/* Primary sanity checks. */
1880 
1881 	if (in_dev == NULL)
1882 		return -EINVAL;
1883 
1884 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1885 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1886 		goto e_inval;
1887 
1888 	if (ipv4_is_zeronet(saddr)) {
1889 		if (!ipv4_is_local_multicast(daddr))
1890 			goto e_inval;
1891 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1892 	} else if (fib_validate_source(saddr, 0, tos, 0,
1893 					dev, &spec_dst, &itag, 0) < 0)
1894 		goto e_inval;
1895 
1896 	rth = dst_alloc(&ipv4_dst_ops);
1897 	if (!rth)
1898 		goto e_nobufs;
1899 
1900 	rth->u.dst.output = ip_rt_bug;
1901 	rth->u.dst.obsolete = -1;
1902 
1903 	atomic_set(&rth->u.dst.__refcnt, 1);
1904 	rth->u.dst.flags= DST_HOST;
1905 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1906 		rth->u.dst.flags |= DST_NOPOLICY;
1907 	rth->fl.fl4_dst	= daddr;
1908 	rth->rt_dst	= daddr;
1909 	rth->fl.fl4_tos	= tos;
1910 	rth->fl.mark    = skb->mark;
1911 	rth->fl.fl4_src	= saddr;
1912 	rth->rt_src	= saddr;
1913 #ifdef CONFIG_NET_CLS_ROUTE
1914 	rth->u.dst.tclassid = itag;
1915 #endif
1916 	rth->rt_iif	=
1917 	rth->fl.iif	= dev->ifindex;
1918 	rth->u.dst.dev	= init_net.loopback_dev;
1919 	dev_hold(rth->u.dst.dev);
1920 	rth->idev	= in_dev_get(rth->u.dst.dev);
1921 	rth->fl.oif	= 0;
1922 	rth->rt_gateway	= daddr;
1923 	rth->rt_spec_dst= spec_dst;
1924 	rth->rt_genid	= rt_genid(dev_net(dev));
1925 	rth->rt_flags	= RTCF_MULTICAST;
1926 	rth->rt_type	= RTN_MULTICAST;
1927 	if (our) {
1928 		rth->u.dst.input= ip_local_deliver;
1929 		rth->rt_flags |= RTCF_LOCAL;
1930 	}
1931 
1932 #ifdef CONFIG_IP_MROUTE
1933 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1934 		rth->u.dst.input = ip_mr_input;
1935 #endif
1936 	RT_CACHE_STAT_INC(in_slow_mc);
1937 
1938 	in_dev_put(in_dev);
1939 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1940 	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1941 
1942 e_nobufs:
1943 	in_dev_put(in_dev);
1944 	return -ENOBUFS;
1945 
1946 e_inval:
1947 	in_dev_put(in_dev);
1948 	return -EINVAL;
1949 }
1950 
1951 
1952 static void ip_handle_martian_source(struct net_device *dev,
1953 				     struct in_device *in_dev,
1954 				     struct sk_buff *skb,
1955 				     __be32 daddr,
1956 				     __be32 saddr)
1957 {
1958 	RT_CACHE_STAT_INC(in_martian_src);
1959 #ifdef CONFIG_IP_ROUTE_VERBOSE
1960 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1961 		/*
1962 		 *	RFC 1812 recommendation: if the source is martian,
1963 		 *	the only hint we can give is the MAC header.
1964 		 */
1965 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1966 			&daddr, &saddr, dev->name);
1967 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1968 			int i;
1969 			const unsigned char *p = skb_mac_header(skb);
1970 			printk(KERN_WARNING "ll header: ");
1971 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1972 				printk("%02x", *p);
1973 				if (i < (dev->hard_header_len - 1))
1974 					printk(":");
1975 			}
1976 			printk("\n");
1977 		}
1978 	}
1979 #endif
1980 }
1981 
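/*
 * Create the forwarding cache entry for an input route: validate the
 * source address against the FIB, decide whether a redirect should be
 * suggested (RTCF_DOREDIRECT) and set the dst up for ip_forward().
 */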
1982 static int __mkroute_input(struct sk_buff *skb,
1983 			   struct fib_result *res,
1984 			   struct in_device *in_dev,
1985 			   __be32 daddr, __be32 saddr, u32 tos,
1986 			   struct rtable **result)
1987 {
1988 
1989 	struct rtable *rth;
1990 	int err;
1991 	struct in_device *out_dev;
1992 	unsigned flags = 0;
1993 	__be32 spec_dst;
1994 	u32 itag;
1995 
1996 	/* get a working reference to the output device */
1997 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1998 	if (out_dev == NULL) {
1999 		if (net_ratelimit())
2000 			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
2001 			       "Please, report\n");
2002 		return -EINVAL;
2003 	}
2004 
2005 
2006 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
2007 				  in_dev->dev, &spec_dst, &itag, skb->mark);
2008 	if (err < 0) {
2009 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2010 					 saddr);
2011 
2012 		err = -EINVAL;
2013 		goto cleanup;
2014 	}
2015 
2016 	if (err)
2017 		flags |= RTCF_DIRECTSRC;
2018 
2019 	if (out_dev == in_dev && err &&
2020 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
2021 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2022 		flags |= RTCF_DOREDIRECT;
2023 
2024 	if (skb->protocol != htons(ETH_P_IP)) {
2025 		/* Not IP (i.e. ARP). Do not create a route if it is
2026 		 * invalid for proxy ARP. DNAT routes are always valid.
2027 		 *
2028 		 * The proxy ARP feature has been extended to allow ARP
2029 		 * replies back out the same interface, to support
2030 		 * private VLAN switch technologies. See arp.c.
2031 		 */
2032 		if (out_dev == in_dev &&
2033 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2034 			err = -EINVAL;
2035 			goto cleanup;
2036 		}
2037 	}
2038 
2039 
2040 	rth = dst_alloc(&ipv4_dst_ops);
2041 	if (!rth) {
2042 		err = -ENOBUFS;
2043 		goto cleanup;
2044 	}
2045 
2046 	atomic_set(&rth->u.dst.__refcnt, 1);
2047 	rth->u.dst.flags= DST_HOST;
2048 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2049 		rth->u.dst.flags |= DST_NOPOLICY;
2050 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2051 		rth->u.dst.flags |= DST_NOXFRM;
2052 	rth->fl.fl4_dst	= daddr;
2053 	rth->rt_dst	= daddr;
2054 	rth->fl.fl4_tos	= tos;
2055 	rth->fl.mark    = skb->mark;
2056 	rth->fl.fl4_src	= saddr;
2057 	rth->rt_src	= saddr;
2058 	rth->rt_gateway	= daddr;
2059 	rth->rt_iif 	=
2060 		rth->fl.iif	= in_dev->dev->ifindex;
2061 	rth->u.dst.dev	= (out_dev)->dev;
2062 	dev_hold(rth->u.dst.dev);
2063 	rth->idev	= in_dev_get(rth->u.dst.dev);
2064 	rth->fl.oif 	= 0;
2065 	rth->rt_spec_dst= spec_dst;
2066 
2067 	rth->u.dst.obsolete = -1;
2068 	rth->u.dst.input = ip_forward;
2069 	rth->u.dst.output = ip_output;
2070 	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2071 
2072 	rt_set_nexthop(rth, res, itag);
2073 
2074 	rth->rt_flags = flags;
2075 
2076 	*result = rth;
2077 	err = 0;
2078  cleanup:
2079 	/* release the working reference to the output device */
2080 	in_dev_put(out_dev);
2081 	return err;
2082 }
2083 
2084 static int ip_mkroute_input(struct sk_buff *skb,
2085 			    struct fib_result *res,
2086 			    const struct flowi *fl,
2087 			    struct in_device *in_dev,
2088 			    __be32 daddr, __be32 saddr, u32 tos)
2089 {
2090 	struct rtable* rth = NULL;
2091 	int err;
2092 	unsigned hash;
2093 
2094 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2095 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2096 		fib_select_multipath(fl, res);
2097 #endif
2098 
2099 	/* create a routing cache entry */
2100 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2101 	if (err)
2102 		return err;
2103 
2104 	/* put it into the cache */
2105 	hash = rt_hash(daddr, saddr, fl->iif,
2106 		       rt_genid(dev_net(rth->u.dst.dev)));
2107 	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2108 }
2109 
2110 /*
2111  *	NOTE. We drop all packets that have local source
2112  *	addresses, because every properly looped-back packet
2113  *	must already have the correct destination attached by the output routine.
2114  *
2115  *	This approach solves two big problems:
2116  *	1. Non-simplex devices are handled properly.
2117  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2118  */
2119 
2120 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2121 			       u8 tos, struct net_device *dev)
2122 {
2123 	struct fib_result res;
2124 	struct in_device *in_dev = in_dev_get(dev);
2125 	struct flowi fl = { .nl_u = { .ip4_u =
2126 				      { .daddr = daddr,
2127 					.saddr = saddr,
2128 					.tos = tos,
2129 					.scope = RT_SCOPE_UNIVERSE,
2130 				      } },
2131 			    .mark = skb->mark,
2132 			    .iif = dev->ifindex };
2133 	unsigned	flags = 0;
2134 	u32		itag = 0;
2135 	struct rtable * rth;
2136 	unsigned	hash;
2137 	__be32		spec_dst;
2138 	int		err = -EINVAL;
2139 	int		free_res = 0;
2140 	struct net    * net = dev_net(dev);
2141 
2142 	/* IP on this device is disabled. */
2143 
2144 	if (!in_dev)
2145 		goto out;
2146 
2147 	/* Check for the weirdest martians, which cannot be detected
2148 	   by fib_lookup.
2149 	 */
2150 
2151 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2152 	    ipv4_is_loopback(saddr))
2153 		goto martian_source;
2154 
2155 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2156 		goto brd_input;
2157 
2158 	/* Accept zero addresses only for limited broadcast;
2159 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2160 	 */
2161 	if (ipv4_is_zeronet(saddr))
2162 		goto martian_source;
2163 
2164 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2165 	    ipv4_is_loopback(daddr))
2166 		goto martian_destination;
2167 
2168 	/*
2169 	 *	Now we are ready to route the packet.
2170 	 */
2171 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
2172 		if (!IN_DEV_FORWARD(in_dev))
2173 			goto e_hostunreach;
2174 		goto no_route;
2175 	}
2176 	free_res = 1;
2177 
2178 	RT_CACHE_STAT_INC(in_slow_tot);
2179 
2180 	if (res.type == RTN_BROADCAST)
2181 		goto brd_input;
2182 
2183 	if (res.type == RTN_LOCAL) {
2184 		int result;
2185 		result = fib_validate_source(saddr, daddr, tos,
2186 					     net->loopback_dev->ifindex,
2187 					     dev, &spec_dst, &itag, skb->mark);
2188 		if (result < 0)
2189 			goto martian_source;
2190 		if (result)
2191 			flags |= RTCF_DIRECTSRC;
2192 		spec_dst = daddr;
2193 		goto local_input;
2194 	}
2195 
2196 	if (!IN_DEV_FORWARD(in_dev))
2197 		goto e_hostunreach;
2198 	if (res.type != RTN_UNICAST)
2199 		goto martian_destination;
2200 
2201 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2202 done:
2203 	in_dev_put(in_dev);
2204 	if (free_res)
2205 		fib_res_put(&res);
2206 out:	return err;
2207 
2208 brd_input:
2209 	if (skb->protocol != htons(ETH_P_IP))
2210 		goto e_inval;
2211 
2212 	if (ipv4_is_zeronet(saddr))
2213 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2214 	else {
2215 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2216 					  &itag, skb->mark);
2217 		if (err < 0)
2218 			goto martian_source;
2219 		if (err)
2220 			flags |= RTCF_DIRECTSRC;
2221 	}
2222 	flags |= RTCF_BROADCAST;
2223 	res.type = RTN_BROADCAST;
2224 	RT_CACHE_STAT_INC(in_brd);
2225 
2226 local_input:
2227 	rth = dst_alloc(&ipv4_dst_ops);
2228 	if (!rth)
2229 		goto e_nobufs;
2230 
2231 	rth->u.dst.output= ip_rt_bug;
2232 	rth->u.dst.obsolete = -1;
2233 	rth->rt_genid = rt_genid(net);
2234 
2235 	atomic_set(&rth->u.dst.__refcnt, 1);
2236 	rth->u.dst.flags= DST_HOST;
2237 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2238 		rth->u.dst.flags |= DST_NOPOLICY;
2239 	rth->fl.fl4_dst	= daddr;
2240 	rth->rt_dst	= daddr;
2241 	rth->fl.fl4_tos	= tos;
2242 	rth->fl.mark    = skb->mark;
2243 	rth->fl.fl4_src	= saddr;
2244 	rth->rt_src	= saddr;
2245 #ifdef CONFIG_NET_CLS_ROUTE
2246 	rth->u.dst.tclassid = itag;
2247 #endif
2248 	rth->rt_iif	=
2249 	rth->fl.iif	= dev->ifindex;
2250 	rth->u.dst.dev	= net->loopback_dev;
2251 	dev_hold(rth->u.dst.dev);
2252 	rth->idev	= in_dev_get(rth->u.dst.dev);
2253 	rth->rt_gateway	= daddr;
2254 	rth->rt_spec_dst= spec_dst;
2255 	rth->u.dst.input= ip_local_deliver;
2256 	rth->rt_flags 	= flags|RTCF_LOCAL;
2257 	if (res.type == RTN_UNREACHABLE) {
2258 		rth->u.dst.input= ip_error;
2259 		rth->u.dst.error= -err;
2260 		rth->rt_flags 	&= ~RTCF_LOCAL;
2261 	}
2262 	rth->rt_type	= res.type;
2263 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2264 	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2265 	goto done;
2266 
2267 no_route:
2268 	RT_CACHE_STAT_INC(in_no_route);
2269 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2270 	res.type = RTN_UNREACHABLE;
2271 	if (err == -ESRCH)
2272 		err = -ENETUNREACH;
2273 	goto local_input;
2274 
2275 	/*
2276 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2277 	 */
2278 martian_destination:
2279 	RT_CACHE_STAT_INC(in_martian_dst);
2280 #ifdef CONFIG_IP_ROUTE_VERBOSE
2281 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2282 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2283 			&daddr, &saddr, dev->name);
2284 #endif
2285 
2286 e_hostunreach:
2287 	err = -EHOSTUNREACH;
2288 	goto done;
2289 
2290 e_inval:
2291 	err = -EINVAL;
2292 	goto done;
2293 
2294 e_nobufs:
2295 	err = -ENOBUFS;
2296 	goto done;
2297 
2298 martian_source:
2299 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2300 	goto e_inval;
2301 }
2302 
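/*
 * Input route resolver: look the flow up in the route cache first and,
 * on a miss, fall back to the multicast handling here or to
 * ip_route_input_slow() for a full FIB lookup.
 */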
2303 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2304 		   u8 tos, struct net_device *dev)
2305 {
2306 	struct rtable * rth;
2307 	unsigned	hash;
2308 	int iif = dev->ifindex;
2309 	struct net *net;
2310 
2311 	net = dev_net(dev);
2312 
2313 	if (!rt_caching(net))
2314 		goto skip_cache;
2315 
2316 	tos &= IPTOS_RT_MASK;
2317 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2318 
2319 	rcu_read_lock();
2320 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2321 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2322 		if (((rth->fl.fl4_dst ^ daddr) |
2323 		     (rth->fl.fl4_src ^ saddr) |
2324 		     (rth->fl.iif ^ iif) |
2325 		     rth->fl.oif |
2326 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2327 		    rth->fl.mark == skb->mark &&
2328 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2329 		    !rt_is_expired(rth)) {
2330 			dst_use(&rth->u.dst, jiffies);
2331 			RT_CACHE_STAT_INC(in_hit);
2332 			rcu_read_unlock();
2333 			skb_dst_set(skb, &rth->u.dst);
2334 			return 0;
2335 		}
2336 		RT_CACHE_STAT_INC(in_hlist_search);
2337 	}
2338 	rcu_read_unlock();
2339 
2340 skip_cache:
2341 	/* Multicast recognition logic has been moved from the route cache to here.
2342 	   The problem was that too many Ethernet cards have broken/missing
2343 	   hardware multicast filters :-( As a result, a host on a multicast
2344 	   network acquires a lot of useless route cache entries, e.g. for
2345 	   SDR messages from all over the world. Now we try to get rid of them.
2346 	   Really, provided the software IP multicast filter is organized
2347 	   reasonably (at least, hashed), it does not result in a slowdown
2348 	   compared with route cache reject entries.
2349 	   Note that multicast routers are not affected, because a
2350 	   route cache entry is created eventually.
2351 	 */
2352 	if (ipv4_is_multicast(daddr)) {
2353 		struct in_device *in_dev;
2354 
2355 		rcu_read_lock();
2356 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2357 			int our = ip_check_mc(in_dev, daddr, saddr,
2358 				ip_hdr(skb)->protocol);
2359 			if (our
2360 #ifdef CONFIG_IP_MROUTE
2361 				||
2362 			    (!ipv4_is_local_multicast(daddr) &&
2363 			     IN_DEV_MFORWARD(in_dev))
2364 #endif
2365 			   ) {
2366 				rcu_read_unlock();
2367 				return ip_route_input_mc(skb, daddr, saddr,
2368 							 tos, dev, our);
2369 			}
2370 		}
2371 		rcu_read_unlock();
2372 		return -EINVAL;
2373 	}
2374 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2375 }
2376 
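/*
 * Create an output routing cache entry for the resolved flow: classify
 * broadcast/multicast destinations, pick the dst input/output handlers
 * and fill in the next hop via rt_set_nexthop().
 */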
2377 static int __mkroute_output(struct rtable **result,
2378 			    struct fib_result *res,
2379 			    const struct flowi *fl,
2380 			    const struct flowi *oldflp,
2381 			    struct net_device *dev_out,
2382 			    unsigned flags)
2383 {
2384 	struct rtable *rth;
2385 	struct in_device *in_dev;
2386 	u32 tos = RT_FL_TOS(oldflp);
2387 	int err = 0;
2388 
2389 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2390 		return -EINVAL;
2391 
2392 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2393 		res->type = RTN_BROADCAST;
2394 	else if (ipv4_is_multicast(fl->fl4_dst))
2395 		res->type = RTN_MULTICAST;
2396 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2397 		return -EINVAL;
2398 
2399 	if (dev_out->flags & IFF_LOOPBACK)
2400 		flags |= RTCF_LOCAL;
2401 
2402 	/* get a working reference to the inet device */
2403 	in_dev = in_dev_get(dev_out);
2404 	if (!in_dev)
2405 		return -EINVAL;
2406 
2407 	if (res->type == RTN_BROADCAST) {
2408 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2409 		if (res->fi) {
2410 			fib_info_put(res->fi);
2411 			res->fi = NULL;
2412 		}
2413 	} else if (res->type == RTN_MULTICAST) {
2414 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2415 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2416 				 oldflp->proto))
2417 			flags &= ~RTCF_LOCAL;
2418 		/* If a multicast route does not exist, use the
2419 		   default one, but do not use a gateway in this case.
2420 		   Yes, it is a hack.
2421 		 */
2422 		if (res->fi && res->prefixlen < 4) {
2423 			fib_info_put(res->fi);
2424 			res->fi = NULL;
2425 		}
2426 	}
2427 
2428 
2429 	rth = dst_alloc(&ipv4_dst_ops);
2430 	if (!rth) {
2431 		err = -ENOBUFS;
2432 		goto cleanup;
2433 	}
2434 
2435 	atomic_set(&rth->u.dst.__refcnt, 1);
2436 	rth->u.dst.flags= DST_HOST;
2437 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2438 		rth->u.dst.flags |= DST_NOXFRM;
2439 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2440 		rth->u.dst.flags |= DST_NOPOLICY;
2441 
2442 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2443 	rth->fl.fl4_tos	= tos;
2444 	rth->fl.fl4_src	= oldflp->fl4_src;
2445 	rth->fl.oif	= oldflp->oif;
2446 	rth->fl.mark    = oldflp->mark;
2447 	rth->rt_dst	= fl->fl4_dst;
2448 	rth->rt_src	= fl->fl4_src;
2449 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2450 	/* get references to the devices that are to be held by the routing
2451 	   cache entry */
2452 	rth->u.dst.dev	= dev_out;
2453 	dev_hold(dev_out);
2454 	rth->idev	= in_dev_get(dev_out);
2455 	rth->rt_gateway = fl->fl4_dst;
2456 	rth->rt_spec_dst= fl->fl4_src;
2457 
2458 	rth->u.dst.output=ip_output;
2459 	rth->u.dst.obsolete = -1;
2460 	rth->rt_genid = rt_genid(dev_net(dev_out));
2461 
2462 	RT_CACHE_STAT_INC(out_slow_tot);
2463 
2464 	if (flags & RTCF_LOCAL) {
2465 		rth->u.dst.input = ip_local_deliver;
2466 		rth->rt_spec_dst = fl->fl4_dst;
2467 	}
2468 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2469 		rth->rt_spec_dst = fl->fl4_src;
2470 		if (flags & RTCF_LOCAL &&
2471 		    !(dev_out->flags & IFF_LOOPBACK)) {
2472 			rth->u.dst.output = ip_mc_output;
2473 			RT_CACHE_STAT_INC(out_slow_mc);
2474 		}
2475 #ifdef CONFIG_IP_MROUTE
2476 		if (res->type == RTN_MULTICAST) {
2477 			if (IN_DEV_MFORWARD(in_dev) &&
2478 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2479 				rth->u.dst.input = ip_mr_input;
2480 				rth->u.dst.output = ip_mc_output;
2481 			}
2482 		}
2483 #endif
2484 	}
2485 
2486 	rt_set_nexthop(rth, res, 0);
2487 
2488 	rth->rt_flags = flags;
2489 
2490 	*result = rth;
2491  cleanup:
2492 	/* release the working reference to the inet device */
2493 	in_dev_put(in_dev);
2494 
2495 	return err;
2496 }
2497 
2498 static int ip_mkroute_output(struct rtable **rp,
2499 			     struct fib_result *res,
2500 			     const struct flowi *fl,
2501 			     const struct flowi *oldflp,
2502 			     struct net_device *dev_out,
2503 			     unsigned flags)
2504 {
2505 	struct rtable *rth = NULL;
2506 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2507 	unsigned hash;
2508 	if (err == 0) {
2509 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2510 			       rt_genid(dev_net(dev_out)));
2511 		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2512 	}
2513 
2514 	return err;
2515 }
2516 
2517 /*
2518  * Major route resolver routine.
2519  */
2520 
2521 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2522 				const struct flowi *oldflp)
2523 {
2524 	u32 tos	= RT_FL_TOS(oldflp);
2525 	struct flowi fl = { .nl_u = { .ip4_u =
2526 				      { .daddr = oldflp->fl4_dst,
2527 					.saddr = oldflp->fl4_src,
2528 					.tos = tos & IPTOS_RT_MASK,
2529 					.scope = ((tos & RTO_ONLINK) ?
2530 						  RT_SCOPE_LINK :
2531 						  RT_SCOPE_UNIVERSE),
2532 				      } },
2533 			    .mark = oldflp->mark,
2534 			    .iif = net->loopback_dev->ifindex,
2535 			    .oif = oldflp->oif };
2536 	struct fib_result res;
2537 	unsigned flags = 0;
2538 	struct net_device *dev_out = NULL;
2539 	int free_res = 0;
2540 	int err;
2541 
2542 
2543 	res.fi		= NULL;
2544 #ifdef CONFIG_IP_MULTIPLE_TABLES
2545 	res.r		= NULL;
2546 #endif
2547 
2548 	if (oldflp->fl4_src) {
2549 		err = -EINVAL;
2550 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2551 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2552 		    ipv4_is_zeronet(oldflp->fl4_src))
2553 			goto out;
2554 
2555 		/* I removed the check for oif == dev_out->oif here.
2556 		   It was wrong for two reasons:
2557 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2558 		      is assigned to multiple interfaces.
2559 		   2. Moreover, we are allowed to send packets with the saddr
2560 		      of another iface. --ANK
2561 		 */
2562 
2563 		if (oldflp->oif == 0 &&
2564 		    (ipv4_is_multicast(oldflp->fl4_dst) ||
2565 		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2566 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2567 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2568 			if (dev_out == NULL)
2569 				goto out;
2570 
2571 			/* Special hack: the user can direct multicasts
2572 			   and limited broadcast via the necessary interface
2573 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2574 			   This hack is not just for fun, it allows
2575 			   vic, vat and friends to work.
2576 			   They bind a socket to loopback, set the ttl to zero
2577 			   and expect that it will work.
2578 			   From the viewpoint of the routing cache they are broken,
2579 			   because we are not allowed to build a multicast path
2580 			   with a loopback source addr (look, the routing cache
2581 			   cannot know that the ttl is zero, so the packet
2582 			   will not leave this host and the route is valid).
2583 			   Luckily, this hack is a good workaround.
2584 			 */
2585 
2586 			fl.oif = dev_out->ifindex;
2587 			goto make_route;
2588 		}
2589 
2590 		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2591 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2592 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2593 			if (dev_out == NULL)
2594 				goto out;
2595 			dev_put(dev_out);
2596 			dev_out = NULL;
2597 		}
2598 	}
2599 
2600 
2601 	if (oldflp->oif) {
2602 		dev_out = dev_get_by_index(net, oldflp->oif);
2603 		err = -ENODEV;
2604 		if (dev_out == NULL)
2605 			goto out;
2606 
2607 		/* RACE: Check return value of inet_select_addr instead. */
2608 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2609 			dev_put(dev_out);
2610 			goto out;	/* Wrong error code */
2611 		}
2612 
2613 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2614 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2615 			if (!fl.fl4_src)
2616 				fl.fl4_src = inet_select_addr(dev_out, 0,
2617 							      RT_SCOPE_LINK);
2618 			goto make_route;
2619 		}
2620 		if (!fl.fl4_src) {
2621 			if (ipv4_is_multicast(oldflp->fl4_dst))
2622 				fl.fl4_src = inet_select_addr(dev_out, 0,
2623 							      fl.fl4_scope);
2624 			else if (!oldflp->fl4_dst)
2625 				fl.fl4_src = inet_select_addr(dev_out, 0,
2626 							      RT_SCOPE_HOST);
2627 		}
2628 	}
2629 
2630 	if (!fl.fl4_dst) {
2631 		fl.fl4_dst = fl.fl4_src;
2632 		if (!fl.fl4_dst)
2633 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2634 		if (dev_out)
2635 			dev_put(dev_out);
2636 		dev_out = net->loopback_dev;
2637 		dev_hold(dev_out);
2638 		fl.oif = net->loopback_dev->ifindex;
2639 		res.type = RTN_LOCAL;
2640 		flags |= RTCF_LOCAL;
2641 		goto make_route;
2642 	}
2643 
2644 	if (fib_lookup(net, &fl, &res)) {
2645 		res.fi = NULL;
2646 		if (oldflp->oif) {
2647 			/* Apparently, the routing tables are wrong. Assume
2648 			   that the destination is on-link.
2649 
2650 			   WHY? DW.
2651 			   Because we are allowed to send to an iface
2652 			   even if it has NO routes and NO assigned
2653 			   addresses. When oif is specified, the routing
2654 			   tables are looked up with only one purpose:
2655 			   to check whether the destination is gatewayed, rather
2656 			   than direct. Moreover, if MSG_DONTROUTE is set,
2657 			   we send the packet, ignoring both routing tables
2658 			   and ifaddr state. --ANK
2659 
2660 
2661 			   We could do this even if oif is unknown,
2662 			   likely IPv6, but we do not.
2663 			 */
2664 
2665 			if (fl.fl4_src == 0)
2666 				fl.fl4_src = inet_select_addr(dev_out, 0,
2667 							      RT_SCOPE_LINK);
2668 			res.type = RTN_UNICAST;
2669 			goto make_route;
2670 		}
2671 		if (dev_out)
2672 			dev_put(dev_out);
2673 		err = -ENETUNREACH;
2674 		goto out;
2675 	}
2676 	free_res = 1;
2677 
2678 	if (res.type == RTN_LOCAL) {
2679 		if (!fl.fl4_src)
2680 			fl.fl4_src = fl.fl4_dst;
2681 		if (dev_out)
2682 			dev_put(dev_out);
2683 		dev_out = net->loopback_dev;
2684 		dev_hold(dev_out);
2685 		fl.oif = dev_out->ifindex;
2686 		if (res.fi)
2687 			fib_info_put(res.fi);
2688 		res.fi = NULL;
2689 		flags |= RTCF_LOCAL;
2690 		goto make_route;
2691 	}
2692 
2693 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2694 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2695 		fib_select_multipath(&fl, &res);
2696 	else
2697 #endif
2698 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2699 		fib_select_default(net, &fl, &res);
2700 
2701 	if (!fl.fl4_src)
2702 		fl.fl4_src = FIB_RES_PREFSRC(res);
2703 
2704 	if (dev_out)
2705 		dev_put(dev_out);
2706 	dev_out = FIB_RES_DEV(res);
2707 	dev_hold(dev_out);
2708 	fl.oif = dev_out->ifindex;
2709 
2710 
2711 make_route:
2712 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2713 
2714 
2715 	if (free_res)
2716 		fib_res_put(&res);
2717 	if (dev_out)
2718 		dev_put(dev_out);
2719 out:	return err;
2720 }
2721 
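/*
 * Output route lookup by flow key: search the route cache under
 * rcu_read_lock_bh() and fall back to ip_route_output_slow() on a miss
 * or when caching is disabled.
 */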
2722 int __ip_route_output_key(struct net *net, struct rtable **rp,
2723 			  const struct flowi *flp)
2724 {
2725 	unsigned hash;
2726 	struct rtable *rth;
2727 
2728 	if (!rt_caching(net))
2729 		goto slow_output;
2730 
2731 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2732 
2733 	rcu_read_lock_bh();
2734 	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2735 		rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
2736 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2737 		    rth->fl.fl4_src == flp->fl4_src &&
2738 		    rth->fl.iif == 0 &&
2739 		    rth->fl.oif == flp->oif &&
2740 		    rth->fl.mark == flp->mark &&
2741 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2742 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2743 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2744 		    !rt_is_expired(rth)) {
2745 			dst_use(&rth->u.dst, jiffies);
2746 			RT_CACHE_STAT_INC(out_hit);
2747 			rcu_read_unlock_bh();
2748 			*rp = rth;
2749 			return 0;
2750 		}
2751 		RT_CACHE_STAT_INC(out_hlist_search);
2752 	}
2753 	rcu_read_unlock_bh();
2754 
2755 slow_output:
2756 	return ip_route_output_slow(net, rp, flp);
2757 }
2758 
2759 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2760 
2761 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2762 {
2763 }
2764 
2765 static struct dst_ops ipv4_dst_blackhole_ops = {
2766 	.family			=	AF_INET,
2767 	.protocol		=	cpu_to_be16(ETH_P_IP),
2768 	.destroy		=	ipv4_dst_destroy,
2769 	.check			=	ipv4_dst_check,
2770 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2771 	.entries		=	ATOMIC_INIT(0),
2772 };
2773 
2774 
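/*
 * Replace *rp with a "blackhole" copy of the route whose input and
 * output handlers simply discard packets; used by ip_route_output_flow()
 * when the xfrm lookup returns -EREMOTE.
 */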
2775 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2776 {
2777 	struct rtable *ort = *rp;
2778 	struct rtable *rt = (struct rtable *)
2779 		dst_alloc(&ipv4_dst_blackhole_ops);
2780 
2781 	if (rt) {
2782 		struct dst_entry *new = &rt->u.dst;
2783 
2784 		atomic_set(&new->__refcnt, 1);
2785 		new->__use = 1;
2786 		new->input = dst_discard;
2787 		new->output = dst_discard;
2788 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2789 
2790 		new->dev = ort->u.dst.dev;
2791 		if (new->dev)
2792 			dev_hold(new->dev);
2793 
2794 		rt->fl = ort->fl;
2795 
2796 		rt->idev = ort->idev;
2797 		if (rt->idev)
2798 			in_dev_hold(rt->idev);
2799 		rt->rt_genid = rt_genid(net);
2800 		rt->rt_flags = ort->rt_flags;
2801 		rt->rt_type = ort->rt_type;
2802 		rt->rt_dst = ort->rt_dst;
2803 		rt->rt_src = ort->rt_src;
2804 		rt->rt_iif = ort->rt_iif;
2805 		rt->rt_gateway = ort->rt_gateway;
2806 		rt->rt_spec_dst = ort->rt_spec_dst;
2807 		rt->peer = ort->peer;
2808 		if (rt->peer)
2809 			atomic_inc(&rt->peer->refcnt);
2810 
2811 		dst_free(new);
2812 	}
2813 
2814 	dst_release(&(*rp)->u.dst);
2815 	*rp = rt;
2816 	return (rt ? 0 : -ENOMEM);
2817 }
2818 
2819 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2820 			 struct sock *sk, int flags)
2821 {
2822 	int err;
2823 
2824 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2825 		return err;
2826 
2827 	if (flp->proto) {
2828 		if (!flp->fl4_src)
2829 			flp->fl4_src = (*rp)->rt_src;
2830 		if (!flp->fl4_dst)
2831 			flp->fl4_dst = (*rp)->rt_dst;
2832 		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2833 				    flags ? XFRM_LOOKUP_WAIT : 0);
2834 		if (err == -EREMOTE)
2835 			err = ipv4_dst_blackhole(net, rp, flp);
2836 
2837 		return err;
2838 	}
2839 
2840 	return 0;
2841 }
2842 
2843 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2844 
2845 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2846 {
2847 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2848 }
2849 
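/*
 * Fill a netlink routing message describing a cached route, including
 * metrics, peer information and, for input routes, multicast forwarding
 * state via ipmr_get_route().
 */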
2850 static int rt_fill_info(struct net *net,
2851 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2852 			int nowait, unsigned int flags)
2853 {
2854 	struct rtable *rt = skb_rtable(skb);
2855 	struct rtmsg *r;
2856 	struct nlmsghdr *nlh;
2857 	long expires;
2858 	u32 id = 0, ts = 0, tsage = 0, error;
2859 
2860 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2861 	if (nlh == NULL)
2862 		return -EMSGSIZE;
2863 
2864 	r = nlmsg_data(nlh);
2865 	r->rtm_family	 = AF_INET;
2866 	r->rtm_dst_len	= 32;
2867 	r->rtm_src_len	= 0;
2868 	r->rtm_tos	= rt->fl.fl4_tos;
2869 	r->rtm_table	= RT_TABLE_MAIN;
2870 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2871 	r->rtm_type	= rt->rt_type;
2872 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2873 	r->rtm_protocol = RTPROT_UNSPEC;
2874 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2875 	if (rt->rt_flags & RTCF_NOTIFY)
2876 		r->rtm_flags |= RTM_F_NOTIFY;
2877 
2878 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2879 
2880 	if (rt->fl.fl4_src) {
2881 		r->rtm_src_len = 32;
2882 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2883 	}
2884 	if (rt->u.dst.dev)
2885 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2886 #ifdef CONFIG_NET_CLS_ROUTE
2887 	if (rt->u.dst.tclassid)
2888 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2889 #endif
2890 	if (rt->fl.iif)
2891 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2892 	else if (rt->rt_src != rt->fl.fl4_src)
2893 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2894 
2895 	if (rt->rt_dst != rt->rt_gateway)
2896 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2897 
2898 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2899 		goto nla_put_failure;
2900 
2901 	error = rt->u.dst.error;
2902 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2903 	if (rt->peer) {
2904 		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2905 		if (rt->peer->tcp_ts_stamp) {
2906 			ts = rt->peer->tcp_ts;
2907 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2908 		}
2909 	}
2910 
2911 	if (rt->fl.iif) {
2912 #ifdef CONFIG_IP_MROUTE
2913 		__be32 dst = rt->rt_dst;
2914 
2915 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2916 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2917 			int err = ipmr_get_route(net, skb, r, nowait);
2918 			if (err <= 0) {
2919 				if (!nowait) {
2920 					if (err == 0)
2921 						return 0;
2922 					goto nla_put_failure;
2923 				} else {
2924 					if (err == -EMSGSIZE)
2925 						goto nla_put_failure;
2926 					error = err;
2927 				}
2928 			}
2929 		} else
2930 #endif
2931 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2932 	}
2933 
2934 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2935 			       expires, error) < 0)
2936 		goto nla_put_failure;
2937 
2938 	return nlmsg_end(skb, nlh);
2939 
2940 nla_put_failure:
2941 	nlmsg_cancel(skb, nlh);
2942 	return -EMSGSIZE;
2943 }
2944 
2945 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2946 {
2947 	struct net *net = sock_net(in_skb->sk);
2948 	struct rtmsg *rtm;
2949 	struct nlattr *tb[RTA_MAX+1];
2950 	struct rtable *rt = NULL;
2951 	__be32 dst = 0;
2952 	__be32 src = 0;
2953 	u32 iif;
2954 	int err;
2955 	struct sk_buff *skb;
2956 
2957 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2958 	if (err < 0)
2959 		goto errout;
2960 
2961 	rtm = nlmsg_data(nlh);
2962 
2963 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2964 	if (skb == NULL) {
2965 		err = -ENOBUFS;
2966 		goto errout;
2967 	}
2968 
2969 	/* Reserve room for dummy headers; this skb can pass
2970 	   through a good chunk of the routing engine.
2971 	 */
2972 	skb_reset_mac_header(skb);
2973 	skb_reset_network_header(skb);
2974 
2975 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2976 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2977 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2978 
2979 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2980 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2981 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2982 
2983 	if (iif) {
2984 		struct net_device *dev;
2985 
2986 		dev = __dev_get_by_index(net, iif);
2987 		if (dev == NULL) {
2988 			err = -ENODEV;
2989 			goto errout_free;
2990 		}
2991 
2992 		skb->protocol	= htons(ETH_P_IP);
2993 		skb->dev	= dev;
2994 		local_bh_disable();
2995 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2996 		local_bh_enable();
2997 
2998 		rt = skb_rtable(skb);
2999 		if (err == 0 && rt->u.dst.error)
3000 			err = -rt->u.dst.error;
3001 	} else {
3002 		struct flowi fl = {
3003 			.nl_u = {
3004 				.ip4_u = {
3005 					.daddr = dst,
3006 					.saddr = src,
3007 					.tos = rtm->rtm_tos,
3008 				},
3009 			},
3010 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3011 		};
3012 		err = ip_route_output_key(net, &rt, &fl);
3013 	}
3014 
3015 	if (err)
3016 		goto errout_free;
3017 
3018 	skb_dst_set(skb, &rt->u.dst);
3019 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3020 		rt->rt_flags |= RTCF_NOTIFY;
3021 
3022 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3023 			   RTM_NEWROUTE, 0, 0);
3024 	if (err <= 0)
3025 		goto errout_free;
3026 
3027 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3028 errout:
3029 	return err;
3030 
3031 errout_free:
3032 	kfree_skb(skb);
3033 	goto errout;
3034 }
3035 
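/*
 * Dump the route cache over netlink. The walk is resumable: the current
 * hash bucket and chain index are kept in cb->args[0] and cb->args[1].
 */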
3036 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3037 {
3038 	struct rtable *rt;
3039 	int h, s_h;
3040 	int idx, s_idx;
3041 	struct net *net;
3042 
3043 	net = sock_net(skb->sk);
3044 
3045 	s_h = cb->args[0];
3046 	if (s_h < 0)
3047 		s_h = 0;
3048 	s_idx = idx = cb->args[1];
3049 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3050 		if (!rt_hash_table[h].chain)
3051 			continue;
3052 		rcu_read_lock_bh();
3053 		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3054 		     rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
3055 			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3056 				continue;
3057 			if (rt_is_expired(rt))
3058 				continue;
3059 			skb_dst_set(skb, dst_clone(&rt->u.dst));
3060 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3061 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3062 					 1, NLM_F_MULTI) <= 0) {
3063 				skb_dst_drop(skb);
3064 				rcu_read_unlock_bh();
3065 				goto done;
3066 			}
3067 			skb_dst_drop(skb);
3068 		}
3069 		rcu_read_unlock_bh();
3070 	}
3071 
3072 done:
3073 	cb->args[0] = h;
3074 	cb->args[1] = idx;
3075 	return skb->len;
3076 }
3077 
3078 void ip_rt_multicast_event(struct in_device *in_dev)
3079 {
3080 	rt_cache_flush(dev_net(in_dev->dev), 0);
3081 }
3082 
3083 #ifdef CONFIG_SYSCTL
3084 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3085 					void __user *buffer,
3086 					size_t *lenp, loff_t *ppos)
3087 {
3088 	if (write) {
3089 		int flush_delay;
3090 		ctl_table ctl;
3091 		struct net *net;
3092 
3093 		memcpy(&ctl, __ctl, sizeof(ctl));
3094 		ctl.data = &flush_delay;
3095 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3096 
3097 		net = (struct net *)__ctl->extra1;
3098 		rt_cache_flush(net, flush_delay);
3099 		return 0;
3100 	}
3101 
3102 	return -EINVAL;
3103 }
3104 
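/*
 * Reschedule the per-namespace secret rebuild timers after
 * ip_rt_secret_interval has changed, adjusting the remaining time of
 * any timer that was already running.
 */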
3105 static void rt_secret_reschedule(int old)
3106 {
3107 	struct net *net;
3108 	int new = ip_rt_secret_interval;
3109 	int diff = new - old;
3110 
3111 	if (!diff)
3112 		return;
3113 
3114 	rtnl_lock();
3115 	for_each_net(net) {
3116 		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3117 		long time;
3118 
3119 		if (!new)
3120 			continue;
3121 
3122 		if (deleted) {
3123 			time = net->ipv4.rt_secret_timer.expires - jiffies;
3124 
3125 			if (time <= 0 || (time += diff) <= 0)
3126 				time = 0;
3127 		} else
3128 			time = new;
3129 
3130 		mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
3131 	}
3132 	rtnl_unlock();
3133 }
3134 
3135 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3136 					  void __user *buffer, size_t *lenp,
3137 					  loff_t *ppos)
3138 {
3139 	int old = ip_rt_secret_interval;
3140 	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
3141 
3142 	rt_secret_reschedule(old);
3143 
3144 	return ret;
3145 }
3146 
3147 static ctl_table ipv4_route_table[] = {
3148 	{
3149 		.procname	= "gc_thresh",
3150 		.data		= &ipv4_dst_ops.gc_thresh,
3151 		.maxlen		= sizeof(int),
3152 		.mode		= 0644,
3153 		.proc_handler	= proc_dointvec,
3154 	},
3155 	{
3156 		.procname	= "max_size",
3157 		.data		= &ip_rt_max_size,
3158 		.maxlen		= sizeof(int),
3159 		.mode		= 0644,
3160 		.proc_handler	= proc_dointvec,
3161 	},
3162 	{
3163 		/*  Deprecated. Use gc_min_interval_ms */
3164 
3165 		.procname	= "gc_min_interval",
3166 		.data		= &ip_rt_gc_min_interval,
3167 		.maxlen		= sizeof(int),
3168 		.mode		= 0644,
3169 		.proc_handler	= proc_dointvec_jiffies,
3170 	},
3171 	{
3172 		.procname	= "gc_min_interval_ms",
3173 		.data		= &ip_rt_gc_min_interval,
3174 		.maxlen		= sizeof(int),
3175 		.mode		= 0644,
3176 		.proc_handler	= proc_dointvec_ms_jiffies,
3177 	},
3178 	{
3179 		.procname	= "gc_timeout",
3180 		.data		= &ip_rt_gc_timeout,
3181 		.maxlen		= sizeof(int),
3182 		.mode		= 0644,
3183 		.proc_handler	= proc_dointvec_jiffies,
3184 	},
3185 	{
3186 		.procname	= "gc_interval",
3187 		.data		= &ip_rt_gc_interval,
3188 		.maxlen		= sizeof(int),
3189 		.mode		= 0644,
3190 		.proc_handler	= proc_dointvec_jiffies,
3191 	},
3192 	{
3193 		.procname	= "redirect_load",
3194 		.data		= &ip_rt_redirect_load,
3195 		.maxlen		= sizeof(int),
3196 		.mode		= 0644,
3197 		.proc_handler	= proc_dointvec,
3198 	},
3199 	{
3200 		.procname	= "redirect_number",
3201 		.data		= &ip_rt_redirect_number,
3202 		.maxlen		= sizeof(int),
3203 		.mode		= 0644,
3204 		.proc_handler	= proc_dointvec,
3205 	},
3206 	{
3207 		.procname	= "redirect_silence",
3208 		.data		= &ip_rt_redirect_silence,
3209 		.maxlen		= sizeof(int),
3210 		.mode		= 0644,
3211 		.proc_handler	= proc_dointvec,
3212 	},
3213 	{
3214 		.procname	= "error_cost",
3215 		.data		= &ip_rt_error_cost,
3216 		.maxlen		= sizeof(int),
3217 		.mode		= 0644,
3218 		.proc_handler	= proc_dointvec,
3219 	},
3220 	{
3221 		.procname	= "error_burst",
3222 		.data		= &ip_rt_error_burst,
3223 		.maxlen		= sizeof(int),
3224 		.mode		= 0644,
3225 		.proc_handler	= proc_dointvec,
3226 	},
3227 	{
3228 		.procname	= "gc_elasticity",
3229 		.data		= &ip_rt_gc_elasticity,
3230 		.maxlen		= sizeof(int),
3231 		.mode		= 0644,
3232 		.proc_handler	= proc_dointvec,
3233 	},
3234 	{
3235 		.procname	= "mtu_expires",
3236 		.data		= &ip_rt_mtu_expires,
3237 		.maxlen		= sizeof(int),
3238 		.mode		= 0644,
3239 		.proc_handler	= proc_dointvec_jiffies,
3240 	},
3241 	{
3242 		.procname	= "min_pmtu",
3243 		.data		= &ip_rt_min_pmtu,
3244 		.maxlen		= sizeof(int),
3245 		.mode		= 0644,
3246 		.proc_handler	= proc_dointvec,
3247 	},
3248 	{
3249 		.procname	= "min_adv_mss",
3250 		.data		= &ip_rt_min_advmss,
3251 		.maxlen		= sizeof(int),
3252 		.mode		= 0644,
3253 		.proc_handler	= proc_dointvec,
3254 	},
3255 	{
3256 		.procname	= "secret_interval",
3257 		.data		= &ip_rt_secret_interval,
3258 		.maxlen		= sizeof(int),
3259 		.mode		= 0644,
3260 		.proc_handler	= ipv4_sysctl_rt_secret_interval,
3261 	},
3262 	{ }
3263 };
3264 
3265 static struct ctl_table empty[1];
3266 
3267 static struct ctl_table ipv4_skeleton[] =
3268 {
3269 	{ .procname = "route",
3270 	  .mode = 0555, .child = ipv4_route_table},
3271 	{ .procname = "neigh",
3272 	  .mode = 0555, .child = empty},
3273 	{ }
3274 };
3275 
3276 static __net_initdata struct ctl_path ipv4_path[] = {
3277 	{ .procname = "net", },
3278 	{ .procname = "ipv4", },
3279 	{ },
3280 };
3281 
3282 static struct ctl_table ipv4_route_flush_table[] = {
3283 	{
3284 		.procname	= "flush",
3285 		.maxlen		= sizeof(int),
3286 		.mode		= 0200,
3287 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3288 	},
3289 	{ },
3290 };
3291 
3292 static __net_initdata struct ctl_path ipv4_route_path[] = {
3293 	{ .procname = "net", },
3294 	{ .procname = "ipv4", },
3295 	{ .procname = "route", },
3296 	{ },
3297 };
3298 
3299 static __net_init int sysctl_route_net_init(struct net *net)
3300 {
3301 	struct ctl_table *tbl;
3302 
3303 	tbl = ipv4_route_flush_table;
3304 	if (!net_eq(net, &init_net)) {
3305 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3306 		if (tbl == NULL)
3307 			goto err_dup;
3308 	}
3309 	tbl[0].extra1 = net;
3310 
3311 	net->ipv4.route_hdr =
3312 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3313 	if (net->ipv4.route_hdr == NULL)
3314 		goto err_reg;
3315 	return 0;
3316 
3317 err_reg:
3318 	if (tbl != ipv4_route_flush_table)
3319 		kfree(tbl);
3320 err_dup:
3321 	return -ENOMEM;
3322 }
3323 
3324 static __net_exit void sysctl_route_net_exit(struct net *net)
3325 {
3326 	struct ctl_table *tbl;
3327 
3328 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3329 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3330 	BUG_ON(tbl == ipv4_route_flush_table);
3331 	kfree(tbl);
3332 }
3333 
3334 static __net_initdata struct pernet_operations sysctl_route_ops = {
3335 	.init = sysctl_route_net_init,
3336 	.exit = sysctl_route_net_exit,
3337 };
3338 #endif
3339 
3340 
3341 static __net_init int rt_secret_timer_init(struct net *net)
3342 {
3343 	atomic_set(&net->ipv4.rt_genid,
3344 			(int) ((num_physpages ^ (num_physpages>>8)) ^
3345 			(jiffies ^ (jiffies >> 7))));
3346 
3347 	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3348 	net->ipv4.rt_secret_timer.data = (unsigned long)net;
3349 	init_timer_deferrable(&net->ipv4.rt_secret_timer);
3350 
3351 	if (ip_rt_secret_interval) {
3352 		net->ipv4.rt_secret_timer.expires =
3353 			jiffies + net_random() % ip_rt_secret_interval +
3354 			ip_rt_secret_interval;
3355 		add_timer(&net->ipv4.rt_secret_timer);
3356 	}
3357 	return 0;
3358 }
3359 
3360 static __net_exit void rt_secret_timer_exit(struct net *net)
3361 {
3362 	del_timer_sync(&net->ipv4.rt_secret_timer);
3363 }
3364 
3365 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3366 	.init = rt_secret_timer_init,
3367 	.exit = rt_secret_timer_exit,
3368 };
3369 
3370 
3371 #ifdef CONFIG_NET_CLS_ROUTE
3372 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3373 #endif /* CONFIG_NET_CLS_ROUTE */
3374 
3375 static __initdata unsigned long rhash_entries;
3376 static int __init set_rhash_entries(char *str)
3377 {
3378 	if (!str)
3379 		return 0;
3380 	rhash_entries = simple_strtoul(str, &str, 0);
3381 	return 1;
3382 }
3383 __setup("rhash_entries=", set_rhash_entries);
3384 
3385 int __init ip_rt_init(void)
3386 {
3387 	int rc = 0;
3388 
3389 #ifdef CONFIG_NET_CLS_ROUTE
3390 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3391 	if (!ip_rt_acct)
3392 		panic("IP: failed to allocate ip_rt_acct\n");
3393 #endif
3394 
3395 	ipv4_dst_ops.kmem_cachep =
3396 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3397 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3398 
3399 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3400 
3401 	rt_hash_table = (struct rt_hash_bucket *)
3402 		alloc_large_system_hash("IP route cache",
3403 					sizeof(struct rt_hash_bucket),
3404 					rhash_entries,
3405 					(totalram_pages >= 128 * 1024) ?
3406 					15 : 17,
3407 					0,
3408 					&rt_hash_log,
3409 					&rt_hash_mask,
3410 					rhash_entries ? 0 : 512 * 1024);
3411 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3412 	rt_hash_lock_init();
3413 
3414 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3415 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3416 
3417 	devinet_init();
3418 	ip_fib_init();
3419 
3420 	/* All the timers started at system startup tend
3421 	   to synchronize. Perturb them a bit.
3422 	 */
3423 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3424 	expires_ljiffies = jiffies;
3425 	schedule_delayed_work(&expires_work,
3426 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3427 
3428 	if (register_pernet_subsys(&rt_secret_timer_ops))
3429 		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3430 
3431 	if (ip_rt_proc_init())
3432 		printk(KERN_ERR "Unable to create route proc files\n");
3433 #ifdef CONFIG_XFRM
3434 	xfrm_init();
3435 	xfrm4_init(ip_rt_max_size);
3436 #endif
3437 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3438 
3439 #ifdef CONFIG_SYSCTL
3440 	register_pernet_subsys(&sysctl_route_ops);
3441 #endif
3442 	return rc;
3443 }
3444 
3445 #ifdef CONFIG_SYSCTL
3446 /*
3447  * We really need to sanitize the damn ipv4 init order, then all
3448  * this nonsense will go away.
3449  */
3450 void __init ip_static_sysctl_init(void)
3451 {
3452 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3453 }
3454 #endif
3455 
3456 EXPORT_SYMBOL(__ip_select_ident);
3457 EXPORT_SYMBOL(ip_route_input);
3458 EXPORT_SYMBOL(ip_route_output_key);
3459