xref: /openbmc/linux/net/ipv4/route.c (revision d78c317f)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
113 
114 #define RT_FL_TOS(oldflp4) \
115 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
116 
117 #define IP_MAX_MTU	0xFFF0
118 
119 #define RT_GC_TIMEOUT (300*HZ)
120 
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
124 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
125 static int ip_rt_redirect_number __read_mostly	= 9;
126 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly	= HZ;
129 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
130 static int ip_rt_gc_elasticity __read_mostly	= 8;
131 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
132 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly	= 256;
134 static int rt_chain_length_max __read_mostly	= 20;
135 static int redirect_genid;
136 
137 static struct delayed_work expires_work;
138 static unsigned long expires_ljiffies;
139 
140 /*
141  *	Interface to generic destination cache.
142  */
143 
144 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
145 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
146 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
147 static void		 ipv4_dst_destroy(struct dst_entry *dst);
148 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
149 static void		 ipv4_link_failure(struct sk_buff *skb);
150 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
151 static int rt_garbage_collect(struct dst_ops *ops);
152 
153 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
154 			    int how)
155 {
156 }
157 
158 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
159 {
160 	struct rtable *rt = (struct rtable *) dst;
161 	struct inet_peer *peer;
162 	u32 *p = NULL;
163 
164 	if (!rt->peer)
165 		rt_bind_peer(rt, rt->rt_dst, 1);
166 
167 	peer = rt->peer;
168 	if (peer) {
169 		u32 *old_p = __DST_METRICS_PTR(old);
170 		unsigned long prev, new;
171 
172 		p = peer->metrics;
173 		if (inet_metrics_new(peer))
174 			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
175 
176 		new = (unsigned long) p;
177 		prev = cmpxchg(&dst->_metrics, old, new);
178 
179 		if (prev != old) {
180 			p = __DST_METRICS_PTR(prev);
181 			if (prev & DST_METRICS_READ_ONLY)
182 				p = NULL;
183 		} else {
184 			if (rt->fi) {
185 				fib_info_put(rt->fi);
186 				rt->fi = NULL;
187 			}
188 		}
189 	}
190 	return p;
191 }
192 
193 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
194 
195 static struct dst_ops ipv4_dst_ops = {
196 	.family =		AF_INET,
197 	.protocol =		cpu_to_be16(ETH_P_IP),
198 	.gc =			rt_garbage_collect,
199 	.check =		ipv4_dst_check,
200 	.default_advmss =	ipv4_default_advmss,
201 	.mtu =			ipv4_mtu,
202 	.cow_metrics =		ipv4_cow_metrics,
203 	.destroy =		ipv4_dst_destroy,
204 	.ifdown =		ipv4_dst_ifdown,
205 	.negative_advice =	ipv4_negative_advice,
206 	.link_failure =		ipv4_link_failure,
207 	.update_pmtu =		ip_rt_update_pmtu,
208 	.local_out =		__ip_local_out,
209 	.neigh_lookup =		ipv4_neigh_lookup,
210 };
211 
212 #define ECN_OR_COST(class)	TC_PRIO_##class
213 
214 const __u8 ip_tos2prio[16] = {
215 	TC_PRIO_BESTEFFORT,
216 	ECN_OR_COST(BESTEFFORT),
217 	TC_PRIO_BESTEFFORT,
218 	ECN_OR_COST(BESTEFFORT),
219 	TC_PRIO_BULK,
220 	ECN_OR_COST(BULK),
221 	TC_PRIO_BULK,
222 	ECN_OR_COST(BULK),
223 	TC_PRIO_INTERACTIVE,
224 	ECN_OR_COST(INTERACTIVE),
225 	TC_PRIO_INTERACTIVE,
226 	ECN_OR_COST(INTERACTIVE),
227 	TC_PRIO_INTERACTIVE_BULK,
228 	ECN_OR_COST(INTERACTIVE_BULK),
229 	TC_PRIO_INTERACTIVE_BULK,
230 	ECN_OR_COST(INTERACTIVE_BULK)
231 };
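/*
 * Illustrative indexing of the table above (a sketch, not code from this
 * file): the rt_tos2priority() helper in include/net/route.h indexes it
 * with the four TOS bits shifted down by one, i.e.
 * ip_tos2prio[IPTOS_TOS(tos) >> 1], so a TOS of 0x10 (low delay) yields
 * index 8 and maps to TC_PRIO_INTERACTIVE, while 0x08 (high throughput)
 * yields index 4 and TC_PRIO_BULK.
 */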
232 
233 
234 /*
235  * Route cache.
236  */
237 
238 /* The locking scheme is rather straightforward:
239  *
240  * 1) Read-Copy Update protects the buckets of the central route hash.
241  * 2) Only writers remove entries, and they hold the lock
242  *    as they look at rtable reference counts.
243  * 3) Only readers acquire references to rtable entries,
244  *    they do so with atomic increments and with the
245  *    lock held.
246  */
247 
248 struct rt_hash_bucket {
249 	struct rtable __rcu	*chain;
250 };
251 
252 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
253 	defined(CONFIG_PROVE_LOCKING)
254 /*
255  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
256  * The size of this table is a power of two and depends on the number of CPUs.
257  * (under lockdep spinlock_t is quite big, so keep the table small there)
258  */
259 #ifdef CONFIG_LOCKDEP
260 # define RT_HASH_LOCK_SZ	256
261 #else
262 # if NR_CPUS >= 32
263 #  define RT_HASH_LOCK_SZ	4096
264 # elif NR_CPUS >= 16
265 #  define RT_HASH_LOCK_SZ	2048
266 # elif NR_CPUS >= 8
267 #  define RT_HASH_LOCK_SZ	1024
268 # elif NR_CPUS >= 4
269 #  define RT_HASH_LOCK_SZ	512
270 # else
271 #  define RT_HASH_LOCK_SZ	256
272 # endif
273 #endif
274 
275 static spinlock_t	*rt_hash_locks;
276 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
277 
278 static __init void rt_hash_lock_init(void)
279 {
280 	int i;
281 
282 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
283 			GFP_KERNEL);
284 	if (!rt_hash_locks)
285 		panic("IP: failed to allocate rt_hash_locks\n");
286 
287 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
288 		spin_lock_init(&rt_hash_locks[i]);
289 }
290 #else
291 # define rt_hash_lock_addr(slot) NULL
292 
293 static inline void rt_hash_lock_init(void)
294 {
295 }
296 #endif
297 
298 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
299 static unsigned			rt_hash_mask __read_mostly;
300 static unsigned int		rt_hash_log  __read_mostly;
301 
302 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
303 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
304 
305 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
306 				   int genid)
307 {
308 	return jhash_3words((__force u32)daddr, (__force u32)saddr,
309 			    idx, genid)
310 		& rt_hash_mask;
311 }
312 
313 static inline int rt_genid(struct net *net)
314 {
315 	return atomic_read(&net->ipv4.rt_genid);
316 }
317 
318 #ifdef CONFIG_PROC_FS
319 struct rt_cache_iter_state {
320 	struct seq_net_private p;
321 	int bucket;
322 	int genid;
323 };
324 
325 static struct rtable *rt_cache_get_first(struct seq_file *seq)
326 {
327 	struct rt_cache_iter_state *st = seq->private;
328 	struct rtable *r = NULL;
329 
330 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
331 		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
332 			continue;
333 		rcu_read_lock_bh();
334 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
335 		while (r) {
336 			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
337 			    r->rt_genid == st->genid)
338 				return r;
339 			r = rcu_dereference_bh(r->dst.rt_next);
340 		}
341 		rcu_read_unlock_bh();
342 	}
343 	return r;
344 }
345 
346 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
347 					  struct rtable *r)
348 {
349 	struct rt_cache_iter_state *st = seq->private;
350 
351 	r = rcu_dereference_bh(r->dst.rt_next);
352 	while (!r) {
353 		rcu_read_unlock_bh();
354 		do {
355 			if (--st->bucket < 0)
356 				return NULL;
357 		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
358 		rcu_read_lock_bh();
359 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
360 	}
361 	return r;
362 }
363 
364 static struct rtable *rt_cache_get_next(struct seq_file *seq,
365 					struct rtable *r)
366 {
367 	struct rt_cache_iter_state *st = seq->private;
368 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
369 		if (dev_net(r->dst.dev) != seq_file_net(seq))
370 			continue;
371 		if (r->rt_genid == st->genid)
372 			break;
373 	}
374 	return r;
375 }
376 
377 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
378 {
379 	struct rtable *r = rt_cache_get_first(seq);
380 
381 	if (r)
382 		while (pos && (r = rt_cache_get_next(seq, r)))
383 			--pos;
384 	return pos ? NULL : r;
385 }
386 
387 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
388 {
389 	struct rt_cache_iter_state *st = seq->private;
390 	if (*pos)
391 		return rt_cache_get_idx(seq, *pos - 1);
392 	st->genid = rt_genid(seq_file_net(seq));
393 	return SEQ_START_TOKEN;
394 }
395 
396 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
397 {
398 	struct rtable *r;
399 
400 	if (v == SEQ_START_TOKEN)
401 		r = rt_cache_get_first(seq);
402 	else
403 		r = rt_cache_get_next(seq, v);
404 	++*pos;
405 	return r;
406 }
407 
408 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
409 {
410 	if (v && v != SEQ_START_TOKEN)
411 		rcu_read_unlock_bh();
412 }
413 
414 static int rt_cache_seq_show(struct seq_file *seq, void *v)
415 {
416 	if (v == SEQ_START_TOKEN)
417 		seq_printf(seq, "%-127s\n",
418 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
419 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
420 			   "HHUptod\tSpecDst");
421 	else {
422 		struct rtable *r = v;
423 		struct neighbour *n;
424 		int len, HHUptod;
425 
426 		rcu_read_lock();
427 		n = dst_get_neighbour_noref(&r->dst);
428 		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
429 		rcu_read_unlock();
430 
431 		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
432 			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
433 			r->dst.dev ? r->dst.dev->name : "*",
434 			(__force u32)r->rt_dst,
435 			(__force u32)r->rt_gateway,
436 			r->rt_flags, atomic_read(&r->dst.__refcnt),
437 			r->dst.__use, 0, (__force u32)r->rt_src,
438 			dst_metric_advmss(&r->dst) + 40,
439 			dst_metric(&r->dst, RTAX_WINDOW),
440 			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
441 			      dst_metric(&r->dst, RTAX_RTTVAR)),
442 			r->rt_key_tos,
443 			-1,
444 			HHUptod,
445 			r->rt_spec_dst, &len);
446 
447 		seq_printf(seq, "%*s\n", 127 - len, "");
448 	}
449 	return 0;
450 }
451 
452 static const struct seq_operations rt_cache_seq_ops = {
453 	.start  = rt_cache_seq_start,
454 	.next   = rt_cache_seq_next,
455 	.stop   = rt_cache_seq_stop,
456 	.show   = rt_cache_seq_show,
457 };
458 
459 static int rt_cache_seq_open(struct inode *inode, struct file *file)
460 {
461 	return seq_open_net(inode, file, &rt_cache_seq_ops,
462 			sizeof(struct rt_cache_iter_state));
463 }
464 
465 static const struct file_operations rt_cache_seq_fops = {
466 	.owner	 = THIS_MODULE,
467 	.open	 = rt_cache_seq_open,
468 	.read	 = seq_read,
469 	.llseek	 = seq_lseek,
470 	.release = seq_release_net,
471 };
472 
473 
474 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
475 {
476 	int cpu;
477 
478 	if (*pos == 0)
479 		return SEQ_START_TOKEN;
480 
481 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
482 		if (!cpu_possible(cpu))
483 			continue;
484 		*pos = cpu+1;
485 		return &per_cpu(rt_cache_stat, cpu);
486 	}
487 	return NULL;
488 }
489 
490 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
491 {
492 	int cpu;
493 
494 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
495 		if (!cpu_possible(cpu))
496 			continue;
497 		*pos = cpu+1;
498 		return &per_cpu(rt_cache_stat, cpu);
499 	}
500 	return NULL;
501 
502 }
503 
504 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
505 {
506 
507 }
508 
509 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
510 {
511 	struct rt_cache_stat *st = v;
512 
513 	if (v == SEQ_START_TOKEN) {
514 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
515 		return 0;
516 	}
517 
518 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
519 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
520 		   dst_entries_get_slow(&ipv4_dst_ops),
521 		   st->in_hit,
522 		   st->in_slow_tot,
523 		   st->in_slow_mc,
524 		   st->in_no_route,
525 		   st->in_brd,
526 		   st->in_martian_dst,
527 		   st->in_martian_src,
528 
529 		   st->out_hit,
530 		   st->out_slow_tot,
531 		   st->out_slow_mc,
532 
533 		   st->gc_total,
534 		   st->gc_ignored,
535 		   st->gc_goal_miss,
536 		   st->gc_dst_overflow,
537 		   st->in_hlist_search,
538 		   st->out_hlist_search
539 		);
540 	return 0;
541 }
542 
543 static const struct seq_operations rt_cpu_seq_ops = {
544 	.start  = rt_cpu_seq_start,
545 	.next   = rt_cpu_seq_next,
546 	.stop   = rt_cpu_seq_stop,
547 	.show   = rt_cpu_seq_show,
548 };
549 
550 
551 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
552 {
553 	return seq_open(file, &rt_cpu_seq_ops);
554 }
555 
556 static const struct file_operations rt_cpu_seq_fops = {
557 	.owner	 = THIS_MODULE,
558 	.open	 = rt_cpu_seq_open,
559 	.read	 = seq_read,
560 	.llseek	 = seq_lseek,
561 	.release = seq_release,
562 };
563 
564 #ifdef CONFIG_IP_ROUTE_CLASSID
565 static int rt_acct_proc_show(struct seq_file *m, void *v)
566 {
567 	struct ip_rt_acct *dst, *src;
568 	unsigned int i, j;
569 
570 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
571 	if (!dst)
572 		return -ENOMEM;
573 
574 	for_each_possible_cpu(i) {
575 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
576 		for (j = 0; j < 256; j++) {
577 			dst[j].o_bytes   += src[j].o_bytes;
578 			dst[j].o_packets += src[j].o_packets;
579 			dst[j].i_bytes   += src[j].i_bytes;
580 			dst[j].i_packets += src[j].i_packets;
581 		}
582 	}
583 
584 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
585 	kfree(dst);
586 	return 0;
587 }
588 
589 static int rt_acct_proc_open(struct inode *inode, struct file *file)
590 {
591 	return single_open(file, rt_acct_proc_show, NULL);
592 }
593 
594 static const struct file_operations rt_acct_proc_fops = {
595 	.owner		= THIS_MODULE,
596 	.open		= rt_acct_proc_open,
597 	.read		= seq_read,
598 	.llseek		= seq_lseek,
599 	.release	= single_release,
600 };
601 #endif
602 
603 static int __net_init ip_rt_do_proc_init(struct net *net)
604 {
605 	struct proc_dir_entry *pde;
606 
607 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
608 			&rt_cache_seq_fops);
609 	if (!pde)
610 		goto err1;
611 
612 	pde = proc_create("rt_cache", S_IRUGO,
613 			  net->proc_net_stat, &rt_cpu_seq_fops);
614 	if (!pde)
615 		goto err2;
616 
617 #ifdef CONFIG_IP_ROUTE_CLASSID
618 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
619 	if (!pde)
620 		goto err3;
621 #endif
622 	return 0;
623 
624 #ifdef CONFIG_IP_ROUTE_CLASSID
625 err3:
626 	remove_proc_entry("rt_cache", net->proc_net_stat);
627 #endif
628 err2:
629 	remove_proc_entry("rt_cache", net->proc_net);
630 err1:
631 	return -ENOMEM;
632 }
633 
634 static void __net_exit ip_rt_do_proc_exit(struct net *net)
635 {
636 	remove_proc_entry("rt_cache", net->proc_net_stat);
637 	remove_proc_entry("rt_cache", net->proc_net);
638 #ifdef CONFIG_IP_ROUTE_CLASSID
639 	remove_proc_entry("rt_acct", net->proc_net);
640 #endif
641 }
642 
643 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
644 	.init = ip_rt_do_proc_init,
645 	.exit = ip_rt_do_proc_exit,
646 };
647 
648 static int __init ip_rt_proc_init(void)
649 {
650 	return register_pernet_subsys(&ip_rt_proc_ops);
651 }
652 
653 #else
654 static inline int ip_rt_proc_init(void)
655 {
656 	return 0;
657 }
658 #endif /* CONFIG_PROC_FS */
659 
660 static inline void rt_free(struct rtable *rt)
661 {
662 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
663 }
664 
665 static inline void rt_drop(struct rtable *rt)
666 {
667 	ip_rt_put(rt);
668 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
669 }
670 
671 static inline int rt_fast_clean(struct rtable *rth)
672 {
673 	/* Kill broadcast/multicast entries very aggressively, if they
674 	   collide in the hash table with more useful entries */
675 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
676 		rt_is_input_route(rth) && rth->dst.rt_next;
677 }
678 
679 static inline int rt_valuable(struct rtable *rth)
680 {
681 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
682 		(rth->peer && rth->peer->pmtu_expires);
683 }
684 
685 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
686 {
687 	unsigned long age;
688 	int ret = 0;
689 
690 	if (atomic_read(&rth->dst.__refcnt))
691 		goto out;
692 
693 	age = jiffies - rth->dst.lastuse;
694 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
695 	    (age <= tmo2 && rt_valuable(rth)))
696 		goto out;
697 	ret = 1;
698 out:	return ret;
699 }
700 
701 /* Bits of score are:
702  * 31: very valuable
703  * 30: not quite useless
704  * 29..0: usage counter
705  */
706 static inline u32 rt_score(struct rtable *rt)
707 {
708 	u32 score = jiffies - rt->dst.lastuse;
709 
710 	score = ~score & ~(3<<30);
711 
712 	if (rt_valuable(rt))
713 		score |= (1<<31);
714 
715 	if (rt_is_output_route(rt) ||
716 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
717 		score |= (1<<30);
718 
719 	return score;
720 }
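/*
 * Rough example of how these bits play out (made-up numbers): an unused
 * output route last touched 100 jiffies ago gets bit 30 set, so its score
 * is at least 1<<30, whereas a freshly used input broadcast entry keeps
 * only the low 30 bits (at most 0x3FFFFFFF).  The broadcast entry thus has
 * the lower score and is what rt_intern_hash() picks as the eviction
 * candidate when the chain grows past ip_rt_gc_elasticity.
 */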
721 
722 static inline bool rt_caching(const struct net *net)
723 {
724 	return net->ipv4.current_rt_cache_rebuild_count <=
725 		net->ipv4.sysctl_rt_cache_rebuild_count;
726 }
727 
728 static inline bool compare_hash_inputs(const struct rtable *rt1,
729 				       const struct rtable *rt2)
730 {
731 	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
732 		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
733 		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
734 }
735 
736 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
737 {
738 	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
739 		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
740 		(rt1->rt_mark ^ rt2->rt_mark) |
741 		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
742 		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
743 		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
744 }
745 
746 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
747 {
748 	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
749 }
750 
751 static inline int rt_is_expired(struct rtable *rth)
752 {
753 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
754 }
755 
756 /*
757  * Perform a full scan of the hash table and free all entries.
758  * Can be called from a softirq or from process context.
759  * In the latter case, we want to reschedule if necessary.
760  */
761 static void rt_do_flush(struct net *net, int process_context)
762 {
763 	unsigned int i;
764 	struct rtable *rth, *next;
765 
766 	for (i = 0; i <= rt_hash_mask; i++) {
767 		struct rtable __rcu **pprev;
768 		struct rtable *list;
769 
770 		if (process_context && need_resched())
771 			cond_resched();
772 		rth = rcu_access_pointer(rt_hash_table[i].chain);
773 		if (!rth)
774 			continue;
775 
776 		spin_lock_bh(rt_hash_lock_addr(i));
777 
778 		list = NULL;
779 		pprev = &rt_hash_table[i].chain;
780 		rth = rcu_dereference_protected(*pprev,
781 			lockdep_is_held(rt_hash_lock_addr(i)));
782 
783 		while (rth) {
784 			next = rcu_dereference_protected(rth->dst.rt_next,
785 				lockdep_is_held(rt_hash_lock_addr(i)));
786 
787 			if (!net ||
788 			    net_eq(dev_net(rth->dst.dev), net)) {
789 				rcu_assign_pointer(*pprev, next);
790 				rcu_assign_pointer(rth->dst.rt_next, list);
791 				list = rth;
792 			} else {
793 				pprev = &rth->dst.rt_next;
794 			}
795 			rth = next;
796 		}
797 
798 		spin_unlock_bh(rt_hash_lock_addr(i));
799 
800 		for (; list; list = next) {
801 			next = rcu_dereference_protected(list->dst.rt_next, 1);
802 			rt_free(list);
803 		}
804 	}
805 }
806 
807 /*
808  * While freeing expired entries, we compute average chain length
809  * and standard deviation, using fixed-point arithmetic.
810  * This gives an estimate for rt_chain_length_max:
811  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
812  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
813  */
814 
815 #define FRACT_BITS 3
816 #define ONE (1UL << FRACT_BITS)
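/*
 * Worked example of the fixed-point accounting (illustrative numbers,
 * not measured data): suppose rt_check_expire() samples four buckets
 * whose has_noalias() sums are 8, 16, 8 and 0 (i.e. 1, 2, 1 and 0
 * entries, since ONE == 8).  Then avg = 32/4 = 8, sum2/samples = 96,
 * sd = int_sqrt(96 - 64) = 5, and (avg + 4*sd) >> FRACT_BITS =
 * (8 + 20) >> 3 = 3, so rt_chain_length_max stays at the default
 * ip_rt_gc_elasticity of 8.
 */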
817 
818 /*
819  * Given a hash chain and an item in this hash chain,
820  * determine whether a previous entry has the same hash_inputs
821  * (but differs on tos, mark or oif).
822  * Returns 0 if an alias is found.
823  * Returns ONE if rth has no alias before itself.
824  */
825 static int has_noalias(const struct rtable *head, const struct rtable *rth)
826 {
827 	const struct rtable *aux = head;
828 
829 	while (aux != rth) {
830 		if (compare_hash_inputs(aux, rth))
831 			return 0;
832 		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
833 	}
834 	return ONE;
835 }
836 
837 static void rt_check_expire(void)
838 {
839 	static unsigned int rover;
840 	unsigned int i = rover, goal;
841 	struct rtable *rth;
842 	struct rtable __rcu **rthp;
843 	unsigned long samples = 0;
844 	unsigned long sum = 0, sum2 = 0;
845 	unsigned long delta;
846 	u64 mult;
847 
848 	delta = jiffies - expires_ljiffies;
849 	expires_ljiffies = jiffies;
850 	mult = ((u64)delta) << rt_hash_log;
851 	if (ip_rt_gc_timeout > 1)
852 		do_div(mult, ip_rt_gc_timeout);
853 	goal = (unsigned int)mult;
854 	if (goal > rt_hash_mask)
855 		goal = rt_hash_mask + 1;
856 	for (; goal > 0; goal--) {
857 		unsigned long tmo = ip_rt_gc_timeout;
858 		unsigned long length;
859 
860 		i = (i + 1) & rt_hash_mask;
861 		rthp = &rt_hash_table[i].chain;
862 
863 		if (need_resched())
864 			cond_resched();
865 
866 		samples++;
867 
868 		if (rcu_dereference_raw(*rthp) == NULL)
869 			continue;
870 		length = 0;
871 		spin_lock_bh(rt_hash_lock_addr(i));
872 		while ((rth = rcu_dereference_protected(*rthp,
873 					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
874 			prefetch(rth->dst.rt_next);
875 			if (rt_is_expired(rth)) {
876 				*rthp = rth->dst.rt_next;
877 				rt_free(rth);
878 				continue;
879 			}
880 			if (rth->dst.expires) {
881 				/* Entry is expired even if it is in use */
882 				if (time_before_eq(jiffies, rth->dst.expires)) {
883 nofree:
884 					tmo >>= 1;
885 					rthp = &rth->dst.rt_next;
886 					/*
887 					 * We only count entries on
888 					 * a chain with equal hash inputs once
889 					 * so that entries for different QOS
890 					 * levels, and other non-hash input
891 					 * attributes don't unfairly skew
892 					 * the length computation
893 					 */
894 					length += has_noalias(rt_hash_table[i].chain, rth);
895 					continue;
896 				}
897 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
898 				goto nofree;
899 
900 			/* Cleanup aged off entries. */
901 			*rthp = rth->dst.rt_next;
902 			rt_free(rth);
903 		}
904 		spin_unlock_bh(rt_hash_lock_addr(i));
905 		sum += length;
906 		sum2 += length*length;
907 	}
908 	if (samples) {
909 		unsigned long avg = sum / samples;
910 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
911 		rt_chain_length_max = max_t(unsigned long,
912 					ip_rt_gc_elasticity,
913 					(avg + 4*sd) >> FRACT_BITS);
914 	}
915 	rover = i;
916 }
917 
918 /*
919  * rt_worker_func() is run in process context.
920  * we call rt_check_expire() to scan part of the hash table
921  */
922 static void rt_worker_func(struct work_struct *work)
923 {
924 	rt_check_expire();
925 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
926 }
927 
928 /*
929  * Perturbation of rt_genid by a small quantity [1..256].
930  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
931  * many times (2^24) without reusing a recent rt_genid.
932  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
933  */
934 static void rt_cache_invalidate(struct net *net)
935 {
936 	unsigned char shuffle;
937 
938 	get_random_bytes(&shuffle, sizeof(shuffle));
939 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
940 	redirect_genid++;
941 }
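/*
 * Back-of-the-envelope check of the "2^24" claim above (assuming the
 * 32-bit atomic simply wraps): each invalidation advances rt_genid by at
 * most 256, so a previously issued genid value cannot recur before at
 * least 2^32 / 256 = 2^24 further calls to rt_cache_invalidate().
 */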
942 
943 /*
944  * delay < 0  : invalidate cache (fast : entries will be deleted later)
945  * delay >= 0 : invalidate & flush cache (can be long)
946  */
947 void rt_cache_flush(struct net *net, int delay)
948 {
949 	rt_cache_invalidate(net);
950 	if (delay >= 0)
951 		rt_do_flush(net, !in_softirq());
952 }
953 
954 /* Flush previous cache invalidated entries from the cache */
955 void rt_cache_flush_batch(struct net *net)
956 {
957 	rt_do_flush(net, !in_softirq());
958 }
959 
960 static void rt_emergency_hash_rebuild(struct net *net)
961 {
962 	if (net_ratelimit())
963 		printk(KERN_WARNING "Route hash chain too long!\n");
964 	rt_cache_invalidate(net);
965 }
966 
967 /*
968    Short description of GC goals.
969 
970    We want to build an algorithm which keeps the routing cache
971    at an equilibrium point, where the number of aged-off entries
972    is approximately equal to the number of newly generated ones.
973 
974    The current expiration strength is the variable "expire".
975    We try to adjust it dynamically, so that when the network is idle
976    expire is large enough to keep enough warm entries, and when load
977    increases it shrinks to limit the cache size.
978  */
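/*
 * Hypothetical numbers to illustrate the goal computation below: with
 * rt_hash_log = 17 (128K buckets) and ip_rt_gc_elasticity = 8, expiry
 * only starts once more than 8 << 17 = 1048576 entries exist; at
 * 1100000 cached routes, one GC pass aims to drop roughly
 * goal = 1100000 - 1048576 = 51424 entries before giving up.
 */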
979 
980 static int rt_garbage_collect(struct dst_ops *ops)
981 {
982 	static unsigned long expire = RT_GC_TIMEOUT;
983 	static unsigned long last_gc;
984 	static int rover;
985 	static int equilibrium;
986 	struct rtable *rth;
987 	struct rtable __rcu **rthp;
988 	unsigned long now = jiffies;
989 	int goal;
990 	int entries = dst_entries_get_fast(&ipv4_dst_ops);
991 
992 	/*
993 	 * Garbage collection is pretty expensive,
994 	 * do not make it too frequently.
995 	 */
996 
997 	RT_CACHE_STAT_INC(gc_total);
998 
999 	if (now - last_gc < ip_rt_gc_min_interval &&
1000 	    entries < ip_rt_max_size) {
1001 		RT_CACHE_STAT_INC(gc_ignored);
1002 		goto out;
1003 	}
1004 
1005 	entries = dst_entries_get_slow(&ipv4_dst_ops);
1006 	/* Calculate number of entries, which we want to expire now. */
1007 	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1008 	if (goal <= 0) {
1009 		if (equilibrium < ipv4_dst_ops.gc_thresh)
1010 			equilibrium = ipv4_dst_ops.gc_thresh;
1011 		goal = entries - equilibrium;
1012 		if (goal > 0) {
1013 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1014 			goal = entries - equilibrium;
1015 		}
1016 	} else {
1017 		/* We are in a dangerous area. Try to reduce the cache really
1018 		 * aggressively.
1019 		 */
1020 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1021 		equilibrium = entries - goal;
1022 	}
1023 
1024 	if (now - last_gc >= ip_rt_gc_min_interval)
1025 		last_gc = now;
1026 
1027 	if (goal <= 0) {
1028 		equilibrium += goal;
1029 		goto work_done;
1030 	}
1031 
1032 	do {
1033 		int i, k;
1034 
1035 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036 			unsigned long tmo = expire;
1037 
1038 			k = (k + 1) & rt_hash_mask;
1039 			rthp = &rt_hash_table[k].chain;
1040 			spin_lock_bh(rt_hash_lock_addr(k));
1041 			while ((rth = rcu_dereference_protected(*rthp,
1042 					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1043 				if (!rt_is_expired(rth) &&
1044 					!rt_may_expire(rth, tmo, expire)) {
1045 					tmo >>= 1;
1046 					rthp = &rth->dst.rt_next;
1047 					continue;
1048 				}
1049 				*rthp = rth->dst.rt_next;
1050 				rt_free(rth);
1051 				goal--;
1052 			}
1053 			spin_unlock_bh(rt_hash_lock_addr(k));
1054 			if (goal <= 0)
1055 				break;
1056 		}
1057 		rover = k;
1058 
1059 		if (goal <= 0)
1060 			goto work_done;
1061 
1062 		/* The goal is not achieved. We stop the process if:
1063 
1064 		   - expire is reduced to zero; otherwise, expire is halved.
1065 		   - the table is not full.
1066 		   - we are called from interrupt context.
1067 		   - the jiffies check is just a fallback/debug loop breaker;
1068 		     we will not spin here for a long time in any case.
1069 		 */
1070 
1071 		RT_CACHE_STAT_INC(gc_goal_miss);
1072 
1073 		if (expire == 0)
1074 			break;
1075 
1076 		expire >>= 1;
1077 
1078 		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1079 			goto out;
1080 	} while (!in_softirq() && time_before_eq(jiffies, now));
1081 
1082 	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083 		goto out;
1084 	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1085 		goto out;
1086 	if (net_ratelimit())
1087 		printk(KERN_WARNING "dst cache overflow\n");
1088 	RT_CACHE_STAT_INC(gc_dst_overflow);
1089 	return 1;
1090 
1091 work_done:
1092 	expire += ip_rt_gc_min_interval;
1093 	if (expire > ip_rt_gc_timeout ||
1094 	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1095 	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1096 		expire = ip_rt_gc_timeout;
1097 out:	return 0;
1098 }
1099 
1100 /*
1101  * Returns the number of entries in a hash chain that have different hash_inputs
1102  */
1103 static int slow_chain_length(const struct rtable *head)
1104 {
1105 	int length = 0;
1106 	const struct rtable *rth = head;
1107 
1108 	while (rth) {
1109 		length += has_noalias(head, rth);
1110 		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1111 	}
1112 	return length >> FRACT_BITS;
1113 }
1114 
1115 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1116 {
1117 	static const __be32 inaddr_any = 0;
1118 	struct net_device *dev = dst->dev;
1119 	const __be32 *pkey = daddr;
1120 	struct neighbour *n;
1121 
1122 	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1123 		pkey = &inaddr_any;
1124 
1125 	n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
1126 	if (n)
1127 		return n;
1128 	return neigh_create(&arp_tbl, pkey, dev);
1129 }
1130 
1131 static int rt_bind_neighbour(struct rtable *rt)
1132 {
1133 	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1134 	if (IS_ERR(n))
1135 		return PTR_ERR(n);
1136 	dst_set_neighbour(&rt->dst, n);
1137 
1138 	return 0;
1139 }
1140 
1141 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1142 				     struct sk_buff *skb, int ifindex)
1143 {
1144 	struct rtable	*rth, *cand;
1145 	struct rtable __rcu **rthp, **candp;
1146 	unsigned long	now;
1147 	u32 		min_score;
1148 	int		chain_length;
1149 	int attempts = !in_softirq();
1150 
1151 restart:
1152 	chain_length = 0;
1153 	min_score = ~(u32)0;
1154 	cand = NULL;
1155 	candp = NULL;
1156 	now = jiffies;
1157 
1158 	if (!rt_caching(dev_net(rt->dst.dev))) {
1159 		/*
1160 		 * If we're not caching, just tell the caller we
1161 		 * were successful and don't touch the route.  The
1162 		 * caller holds the sole reference to the cache entry, and
1163 		 * it will be released when the caller is done with it.
1164 		 * If we drop it here, the callers have no way to resolve routes
1165 		 * when we're not caching.  Instead, just point *rp at rt, so
1166 		 * the caller gets a single use out of the route
1167 		 * Note that we do rt_free on this new route entry, so that
1168 		 * once its refcount hits zero, we are still able to reap it
1169 		 * (Thanks Alexey)
1170 		 * Note: To avoid expensive rcu stuff for this uncached dst,
1171 		 * we set DST_NOCACHE so that dst_release() can free dst without
1172 		 * waiting a grace period.
1173 		 */
1174 
1175 		rt->dst.flags |= DST_NOCACHE;
1176 		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1177 			int err = rt_bind_neighbour(rt);
1178 			if (err) {
1179 				if (net_ratelimit())
1180 					printk(KERN_WARNING
1181 					    "Neighbour table failure & not caching routes.\n");
1182 				ip_rt_put(rt);
1183 				return ERR_PTR(err);
1184 			}
1185 		}
1186 
1187 		goto skip_hashing;
1188 	}
1189 
1190 	rthp = &rt_hash_table[hash].chain;
1191 
1192 	spin_lock_bh(rt_hash_lock_addr(hash));
1193 	while ((rth = rcu_dereference_protected(*rthp,
1194 			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1195 		if (rt_is_expired(rth)) {
1196 			*rthp = rth->dst.rt_next;
1197 			rt_free(rth);
1198 			continue;
1199 		}
1200 		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1201 			/* Put it first */
1202 			*rthp = rth->dst.rt_next;
1203 			/*
1204 			 * Since lookup is lockfree, the deletion
1205 			 * must be visible to another weakly ordered CPU before
1206 			 * the insertion at the start of the hash chain.
1207 			 */
1208 			rcu_assign_pointer(rth->dst.rt_next,
1209 					   rt_hash_table[hash].chain);
1210 			/*
1211 			 * Since lookup is lockfree, the update writes
1212 			 * must be ordered for consistency on SMP.
1213 			 */
1214 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1215 
1216 			dst_use(&rth->dst, now);
1217 			spin_unlock_bh(rt_hash_lock_addr(hash));
1218 
1219 			rt_drop(rt);
1220 			if (skb)
1221 				skb_dst_set(skb, &rth->dst);
1222 			return rth;
1223 		}
1224 
1225 		if (!atomic_read(&rth->dst.__refcnt)) {
1226 			u32 score = rt_score(rth);
1227 
1228 			if (score <= min_score) {
1229 				cand = rth;
1230 				candp = rthp;
1231 				min_score = score;
1232 			}
1233 		}
1234 
1235 		chain_length++;
1236 
1237 		rthp = &rth->dst.rt_next;
1238 	}
1239 
1240 	if (cand) {
1241 		/* ip_rt_gc_elasticity used to be the average chain length;
1242 		 * when it is exceeded, gc becomes really aggressive.
1243 		 *
1244 		 * The second limit is less certain. At the moment it allows
1245 		 * only 2 entries per bucket. We will see.
1246 		 */
1247 		if (chain_length > ip_rt_gc_elasticity) {
1248 			*candp = cand->dst.rt_next;
1249 			rt_free(cand);
1250 		}
1251 	} else {
1252 		if (chain_length > rt_chain_length_max &&
1253 		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1254 			struct net *net = dev_net(rt->dst.dev);
1255 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1256 			if (!rt_caching(net)) {
1257 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1258 					rt->dst.dev->name, num);
1259 			}
1260 			rt_emergency_hash_rebuild(net);
1261 			spin_unlock_bh(rt_hash_lock_addr(hash));
1262 
1263 			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1264 					ifindex, rt_genid(net));
1265 			goto restart;
1266 		}
1267 	}
1268 
1269 	/* Try to bind the route to ARP only if it is an output
1270 	   route or a unicast forwarding path.
1271 	 */
1272 	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1273 		int err = rt_bind_neighbour(rt);
1274 		if (err) {
1275 			spin_unlock_bh(rt_hash_lock_addr(hash));
1276 
1277 			if (err != -ENOBUFS) {
1278 				rt_drop(rt);
1279 				return ERR_PTR(err);
1280 			}
1281 
1282 			/* The neighbour tables are full and nothing
1283 			   can be released. Try to shrink the route cache;
1284 			   it most likely holds some neighbour records.
1285 			 */
1286 			if (attempts-- > 0) {
1287 				int saved_elasticity = ip_rt_gc_elasticity;
1288 				int saved_int = ip_rt_gc_min_interval;
1289 				ip_rt_gc_elasticity	= 1;
1290 				ip_rt_gc_min_interval	= 0;
1291 				rt_garbage_collect(&ipv4_dst_ops);
1292 				ip_rt_gc_min_interval	= saved_int;
1293 				ip_rt_gc_elasticity	= saved_elasticity;
1294 				goto restart;
1295 			}
1296 
1297 			if (net_ratelimit())
1298 				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1299 			rt_drop(rt);
1300 			return ERR_PTR(-ENOBUFS);
1301 		}
1302 	}
1303 
1304 	rt->dst.rt_next = rt_hash_table[hash].chain;
1305 
1306 	/*
1307 	 * Since lookup is lockfree, we must make sure
1308 	 * previous writes to rt are committed to memory
1309 	 * before making rt visible to other CPUS.
1310 	 */
1311 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1312 
1313 	spin_unlock_bh(rt_hash_lock_addr(hash));
1314 
1315 skip_hashing:
1316 	if (skb)
1317 		skb_dst_set(skb, &rt->dst);
1318 	return rt;
1319 }
1320 
1321 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1322 
1323 static u32 rt_peer_genid(void)
1324 {
1325 	return atomic_read(&__rt_peer_genid);
1326 }
1327 
1328 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1329 {
1330 	struct inet_peer *peer;
1331 
1332 	peer = inet_getpeer_v4(daddr, create);
1333 
1334 	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1335 		inet_putpeer(peer);
1336 	else
1337 		rt->rt_peer_genid = rt_peer_genid();
1338 }
1339 
1340 /*
1341  * Peer allocation may fail only in serious out-of-memory conditions.  However
1342  * we can still generate some output.
1343  * Random ID selection looks a bit dangerous because we have no chance of
1344  * selecting an ID that is unique over a reasonable period of time.
1345  * But a broken packet identifier may be better than no packet at all.
1346  */
1347 static void ip_select_fb_ident(struct iphdr *iph)
1348 {
1349 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1350 	static u32 ip_fallback_id;
1351 	u32 salt;
1352 
1353 	spin_lock_bh(&ip_fb_id_lock);
1354 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1355 	iph->id = htons(salt & 0xFFFF);
1356 	ip_fallback_id = salt;
1357 	spin_unlock_bh(&ip_fb_id_lock);
1358 }
1359 
1360 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1361 {
1362 	struct rtable *rt = (struct rtable *) dst;
1363 
1364 	if (rt && !(rt->dst.flags & DST_NOPEER)) {
1365 		if (rt->peer == NULL)
1366 			rt_bind_peer(rt, rt->rt_dst, 1);
1367 
1368 		/* If a peer is attached to the destination, it is never detached,
1369 		   so we need not grab a lock to dereference it.
1370 		 */
1371 		if (rt->peer) {
1372 			iph->id = htons(inet_getid(rt->peer, more));
1373 			return;
1374 		}
1375 	} else if (!rt)
1376 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1377 		       __builtin_return_address(0));
1378 
1379 	ip_select_fb_ident(iph);
1380 }
1381 EXPORT_SYMBOL(__ip_select_ident);
1382 
1383 static void rt_del(unsigned hash, struct rtable *rt)
1384 {
1385 	struct rtable __rcu **rthp;
1386 	struct rtable *aux;
1387 
1388 	rthp = &rt_hash_table[hash].chain;
1389 	spin_lock_bh(rt_hash_lock_addr(hash));
1390 	ip_rt_put(rt);
1391 	while ((aux = rcu_dereference_protected(*rthp,
1392 			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1393 		if (aux == rt || rt_is_expired(aux)) {
1394 			*rthp = aux->dst.rt_next;
1395 			rt_free(aux);
1396 			continue;
1397 		}
1398 		rthp = &aux->dst.rt_next;
1399 	}
1400 	spin_unlock_bh(rt_hash_lock_addr(hash));
1401 }
1402 
1403 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1404 {
1405 	struct rtable *rt = (struct rtable *) dst;
1406 	__be32 orig_gw = rt->rt_gateway;
1407 	struct neighbour *n, *old_n;
1408 
1409 	dst_confirm(&rt->dst);
1410 
1411 	rt->rt_gateway = peer->redirect_learned.a4;
1412 
1413 	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1414 	if (IS_ERR(n)) {
1415 		rt->rt_gateway = orig_gw;
1416 		return;
1417 	}
1418 	old_n = xchg(&rt->dst._neighbour, n);
1419 	if (old_n)
1420 		neigh_release(old_n);
1421 	if (!(n->nud_state & NUD_VALID)) {
1422 		neigh_event_send(n, NULL);
1423 	} else {
1424 		rt->rt_flags |= RTCF_REDIRECTED;
1425 		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1426 	}
1427 }
1428 
1429 /* called in rcu_read_lock() section */
1430 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1431 		    __be32 saddr, struct net_device *dev)
1432 {
1433 	int s, i;
1434 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1435 	__be32 skeys[2] = { saddr, 0 };
1436 	int    ikeys[2] = { dev->ifindex, 0 };
1437 	struct inet_peer *peer;
1438 	struct net *net;
1439 
1440 	if (!in_dev)
1441 		return;
1442 
1443 	net = dev_net(dev);
1444 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1445 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1446 	    ipv4_is_zeronet(new_gw))
1447 		goto reject_redirect;
1448 
1449 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1450 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1451 			goto reject_redirect;
1452 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1453 			goto reject_redirect;
1454 	} else {
1455 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1456 			goto reject_redirect;
1457 	}
1458 
1459 	for (s = 0; s < 2; s++) {
1460 		for (i = 0; i < 2; i++) {
1461 			unsigned int hash;
1462 			struct rtable __rcu **rthp;
1463 			struct rtable *rt;
1464 
1465 			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1466 
1467 			rthp = &rt_hash_table[hash].chain;
1468 
1469 			while ((rt = rcu_dereference(*rthp)) != NULL) {
1470 				rthp = &rt->dst.rt_next;
1471 
1472 				if (rt->rt_key_dst != daddr ||
1473 				    rt->rt_key_src != skeys[s] ||
1474 				    rt->rt_oif != ikeys[i] ||
1475 				    rt_is_input_route(rt) ||
1476 				    rt_is_expired(rt) ||
1477 				    !net_eq(dev_net(rt->dst.dev), net) ||
1478 				    rt->dst.error ||
1479 				    rt->dst.dev != dev ||
1480 				    rt->rt_gateway != old_gw)
1481 					continue;
1482 
1483 				if (!rt->peer)
1484 					rt_bind_peer(rt, rt->rt_dst, 1);
1485 
1486 				peer = rt->peer;
1487 				if (peer) {
1488 					if (peer->redirect_learned.a4 != new_gw ||
1489 					    peer->redirect_genid != redirect_genid) {
1490 						peer->redirect_learned.a4 = new_gw;
1491 						peer->redirect_genid = redirect_genid;
1492 						atomic_inc(&__rt_peer_genid);
1493 					}
1494 					check_peer_redir(&rt->dst, peer);
1495 				}
1496 			}
1497 		}
1498 	}
1499 	return;
1500 
1501 reject_redirect:
1502 #ifdef CONFIG_IP_ROUTE_VERBOSE
1503 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1504 		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1505 			"  Advised path = %pI4 -> %pI4\n",
1506 		       &old_gw, dev->name, &new_gw,
1507 		       &saddr, &daddr);
1508 #endif
1509 	;
1510 }
1511 
1512 static bool peer_pmtu_expired(struct inet_peer *peer)
1513 {
1514 	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1515 
1516 	return orig &&
1517 	       time_after_eq(jiffies, orig) &&
1518 	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1519 }
1520 
1521 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1522 {
1523 	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1524 
1525 	return orig &&
1526 	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1527 }
1528 
1529 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1530 {
1531 	struct rtable *rt = (struct rtable *)dst;
1532 	struct dst_entry *ret = dst;
1533 
1534 	if (rt) {
1535 		if (dst->obsolete > 0) {
1536 			ip_rt_put(rt);
1537 			ret = NULL;
1538 		} else if (rt->rt_flags & RTCF_REDIRECTED) {
1539 			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1540 						rt->rt_oif,
1541 						rt_genid(dev_net(dst->dev)));
1542 			rt_del(hash, rt);
1543 			ret = NULL;
1544 		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1545 			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1546 		}
1547 	}
1548 	return ret;
1549 }
1550 
1551 /*
1552  * Algorithm:
1553  *	1. The first ip_rt_redirect_number redirects are sent
1554  *	   with exponential backoff, then we stop sending them at all,
1555  *	   assuming that the host ignores our redirects.
1556  *	2. If we did not see packets requiring redirects
1557  *	   during ip_rt_redirect_silence, we assume that the host
1558  *	   has forgotten the redirected route and we start sending redirects again.
1559  *
1560  * This algorithm is much cheaper and more intelligent than dumb load limiting
1561  * in icmp.c.
1562  *
1563  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1564  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1565  */
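/*
 * Illustrative timing with the default sysctls (not normative): the
 * first redirect goes out immediately (rate_tokens == 0); redirect k
 * then has to wait ip_rt_redirect_load << k, i.e. 40 ms, 80 ms, ...,
 * about 5.1 s before the 9th with ip_rt_redirect_load = HZ/50.  After
 * ip_rt_redirect_number (9) unanswered redirects we stay silent until
 * roughly ip_rt_redirect_silence = (HZ/50) << 10 (~20 s) passes since
 * the last redirect-worthy packet, which resets rate_tokens.
 */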
1566 
1567 void ip_rt_send_redirect(struct sk_buff *skb)
1568 {
1569 	struct rtable *rt = skb_rtable(skb);
1570 	struct in_device *in_dev;
1571 	struct inet_peer *peer;
1572 	int log_martians;
1573 
1574 	rcu_read_lock();
1575 	in_dev = __in_dev_get_rcu(rt->dst.dev);
1576 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1577 		rcu_read_unlock();
1578 		return;
1579 	}
1580 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1581 	rcu_read_unlock();
1582 
1583 	if (!rt->peer)
1584 		rt_bind_peer(rt, rt->rt_dst, 1);
1585 	peer = rt->peer;
1586 	if (!peer) {
1587 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1588 		return;
1589 	}
1590 
1591 	/* No redirected packets during ip_rt_redirect_silence;
1592 	 * reset the algorithm.
1593 	 */
1594 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1595 		peer->rate_tokens = 0;
1596 
1597 	/* Too many ignored redirects; do not send anything.
1598 	 * Set peer->rate_last to the time of the last redirect-worthy packet.
1599 	 */
1600 	if (peer->rate_tokens >= ip_rt_redirect_number) {
1601 		peer->rate_last = jiffies;
1602 		return;
1603 	}
1604 
1605 	/* Check for load limit; set rate_last to the latest sent
1606 	 * redirect.
1607 	 */
1608 	if (peer->rate_tokens == 0 ||
1609 	    time_after(jiffies,
1610 		       (peer->rate_last +
1611 			(ip_rt_redirect_load << peer->rate_tokens)))) {
1612 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1613 		peer->rate_last = jiffies;
1614 		++peer->rate_tokens;
1615 #ifdef CONFIG_IP_ROUTE_VERBOSE
1616 		if (log_martians &&
1617 		    peer->rate_tokens == ip_rt_redirect_number &&
1618 		    net_ratelimit())
1619 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1620 			       &ip_hdr(skb)->saddr, rt->rt_iif,
1621 				&rt->rt_dst, &rt->rt_gateway);
1622 #endif
1623 	}
1624 }
1625 
1626 static int ip_error(struct sk_buff *skb)
1627 {
1628 	struct rtable *rt = skb_rtable(skb);
1629 	struct inet_peer *peer;
1630 	unsigned long now;
1631 	bool send;
1632 	int code;
1633 
1634 	switch (rt->dst.error) {
1635 	case EINVAL:
1636 	default:
1637 		goto out;
1638 	case EHOSTUNREACH:
1639 		code = ICMP_HOST_UNREACH;
1640 		break;
1641 	case ENETUNREACH:
1642 		code = ICMP_NET_UNREACH;
1643 		IP_INC_STATS_BH(dev_net(rt->dst.dev),
1644 				IPSTATS_MIB_INNOROUTES);
1645 		break;
1646 	case EACCES:
1647 		code = ICMP_PKT_FILTERED;
1648 		break;
1649 	}
1650 
1651 	if (!rt->peer)
1652 		rt_bind_peer(rt, rt->rt_dst, 1);
1653 	peer = rt->peer;
1654 
1655 	send = true;
1656 	if (peer) {
1657 		now = jiffies;
1658 		peer->rate_tokens += now - peer->rate_last;
1659 		if (peer->rate_tokens > ip_rt_error_burst)
1660 			peer->rate_tokens = ip_rt_error_burst;
1661 		peer->rate_last = now;
1662 		if (peer->rate_tokens >= ip_rt_error_cost)
1663 			peer->rate_tokens -= ip_rt_error_cost;
1664 		else
1665 			send = false;
1666 	}
1667 	if (send)
1668 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1669 
1670 out:	kfree_skb(skb);
1671 	return 0;
1672 }
1673 
1674 /*
1675  *	The last two values are not from the RFC but
1676  *	are needed for AMPRnet AX.25 paths.
1677  */
1678 
1679 static const unsigned short mtu_plateau[] =
1680 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1681 
1682 static inline unsigned short guess_mtu(unsigned short old_mtu)
1683 {
1684 	int i;
1685 
1686 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1687 		if (old_mtu > mtu_plateau[i])
1688 			return mtu_plateau[i];
1689 	return 68;
1690 }
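/*
 * Example walk of the plateau table (illustrative values): an ICMP
 * FRAG_NEEDED quoting tot_len 1500 with a zero next-hop MTU makes
 * guess_mtu(1500) return 1492, the first plateau strictly below the old
 * MTU; guess_mtu(296) returns 216, and anything at or below 128 falls
 * back to the protocol minimum of 68.
 */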
1691 
1692 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1693 				 unsigned short new_mtu,
1694 				 struct net_device *dev)
1695 {
1696 	unsigned short old_mtu = ntohs(iph->tot_len);
1697 	unsigned short est_mtu = 0;
1698 	struct inet_peer *peer;
1699 
1700 	peer = inet_getpeer_v4(iph->daddr, 1);
1701 	if (peer) {
1702 		unsigned short mtu = new_mtu;
1703 
1704 		if (new_mtu < 68 || new_mtu >= old_mtu) {
1705 			/* BSD 4.2 derived systems incorrectly adjust
1706 			 * tot_len by the IP header length, and report
1707 			 * a zero MTU in the ICMP message.
1708 			 */
1709 			if (mtu == 0 &&
1710 			    old_mtu >= 68 + (iph->ihl << 2))
1711 				old_mtu -= iph->ihl << 2;
1712 			mtu = guess_mtu(old_mtu);
1713 		}
1714 
1715 		if (mtu < ip_rt_min_pmtu)
1716 			mtu = ip_rt_min_pmtu;
1717 		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1718 			unsigned long pmtu_expires;
1719 
1720 			pmtu_expires = jiffies + ip_rt_mtu_expires;
1721 			if (!pmtu_expires)
1722 				pmtu_expires = 1UL;
1723 
1724 			est_mtu = mtu;
1725 			peer->pmtu_learned = mtu;
1726 			peer->pmtu_expires = pmtu_expires;
1727 			atomic_inc(&__rt_peer_genid);
1728 		}
1729 
1730 		inet_putpeer(peer);
1731 	}
1732 	return est_mtu ? : new_mtu;
1733 }
1734 
1735 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1736 {
1737 	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1738 
1739 	if (!expires)
1740 		return;
1741 	if (time_before(jiffies, expires)) {
1742 		u32 orig_dst_mtu = dst_mtu(dst);
1743 		if (peer->pmtu_learned < orig_dst_mtu) {
1744 			if (!peer->pmtu_orig)
1745 				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1746 			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1747 		}
1748 	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1749 		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1750 }
1751 
1752 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1753 {
1754 	struct rtable *rt = (struct rtable *) dst;
1755 	struct inet_peer *peer;
1756 
1757 	dst_confirm(dst);
1758 
1759 	if (!rt->peer)
1760 		rt_bind_peer(rt, rt->rt_dst, 1);
1761 	peer = rt->peer;
1762 	if (peer) {
1763 		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1764 
1765 		if (mtu < ip_rt_min_pmtu)
1766 			mtu = ip_rt_min_pmtu;
1767 		if (!pmtu_expires || mtu < peer->pmtu_learned) {
1768 
1769 			pmtu_expires = jiffies + ip_rt_mtu_expires;
1770 			if (!pmtu_expires)
1771 				pmtu_expires = 1UL;
1772 
1773 			peer->pmtu_learned = mtu;
1774 			peer->pmtu_expires = pmtu_expires;
1775 
1776 			atomic_inc(&__rt_peer_genid);
1777 			rt->rt_peer_genid = rt_peer_genid();
1778 		}
1779 		check_peer_pmtu(dst, peer);
1780 	}
1781 }
1782 
1783 
1784 static void ipv4_validate_peer(struct rtable *rt)
1785 {
1786 	if (rt->rt_peer_genid != rt_peer_genid()) {
1787 		struct inet_peer *peer;
1788 
1789 		if (!rt->peer)
1790 			rt_bind_peer(rt, rt->rt_dst, 0);
1791 
1792 		peer = rt->peer;
1793 		if (peer) {
1794 			check_peer_pmtu(&rt->dst, peer);
1795 
1796 			if (peer->redirect_genid != redirect_genid)
1797 				peer->redirect_learned.a4 = 0;
1798 			if (peer->redirect_learned.a4 &&
1799 			    peer->redirect_learned.a4 != rt->rt_gateway)
1800 				check_peer_redir(&rt->dst, peer);
1801 		}
1802 
1803 		rt->rt_peer_genid = rt_peer_genid();
1804 	}
1805 }
1806 
1807 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1808 {
1809 	struct rtable *rt = (struct rtable *) dst;
1810 
1811 	if (rt_is_expired(rt))
1812 		return NULL;
1813 	ipv4_validate_peer(rt);
1814 	return dst;
1815 }
1816 
1817 static void ipv4_dst_destroy(struct dst_entry *dst)
1818 {
1819 	struct rtable *rt = (struct rtable *) dst;
1820 	struct inet_peer *peer = rt->peer;
1821 
1822 	if (rt->fi) {
1823 		fib_info_put(rt->fi);
1824 		rt->fi = NULL;
1825 	}
1826 	if (peer) {
1827 		rt->peer = NULL;
1828 		inet_putpeer(peer);
1829 	}
1830 }
1831 
1832 
1833 static void ipv4_link_failure(struct sk_buff *skb)
1834 {
1835 	struct rtable *rt;
1836 
1837 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1838 
1839 	rt = skb_rtable(skb);
1840 	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1841 		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1842 }
1843 
1844 static int ip_rt_bug(struct sk_buff *skb)
1845 {
1846 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1847 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1848 		skb->dev ? skb->dev->name : "?");
1849 	kfree_skb(skb);
1850 	WARN_ON(1);
1851 	return 0;
1852 }
1853 
1854 /*
1855    We do not cache the source address of the outgoing interface,
1856    because it is used only by the IP RR, TS and SRR options,
1857    so it is out of the fast path.
1858 
1859    BTW remember: "addr" is allowed to be unaligned
1860    in IP options!
1861  */
1862 
1863 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1864 {
1865 	__be32 src;
1866 
1867 	if (rt_is_output_route(rt))
1868 		src = ip_hdr(skb)->saddr;
1869 	else {
1870 		struct fib_result res;
1871 		struct flowi4 fl4;
1872 		struct iphdr *iph;
1873 
1874 		iph = ip_hdr(skb);
1875 
1876 		memset(&fl4, 0, sizeof(fl4));
1877 		fl4.daddr = iph->daddr;
1878 		fl4.saddr = iph->saddr;
1879 		fl4.flowi4_tos = RT_TOS(iph->tos);
1880 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1881 		fl4.flowi4_iif = skb->dev->ifindex;
1882 		fl4.flowi4_mark = skb->mark;
1883 
1884 		rcu_read_lock();
1885 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1886 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1887 		else
1888 			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1889 					RT_SCOPE_UNIVERSE);
1890 		rcu_read_unlock();
1891 	}
1892 	memcpy(addr, &src, 4);
1893 }
1894 
1895 #ifdef CONFIG_IP_ROUTE_CLASSID
1896 static void set_class_tag(struct rtable *rt, u32 tag)
1897 {
1898 	if (!(rt->dst.tclassid & 0xFFFF))
1899 		rt->dst.tclassid |= tag & 0xFFFF;
1900 	if (!(rt->dst.tclassid & 0xFFFF0000))
1901 		rt->dst.tclassid |= tag & 0xFFFF0000;
1902 }
1903 #endif
1904 
1905 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1906 {
1907 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1908 
1909 	if (advmss == 0) {
1910 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1911 			       ip_rt_min_advmss);
1912 		if (advmss > 65535 - 40)
1913 			advmss = 65535 - 40;
1914 	}
1915 	return advmss;
1916 }
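
/*
 * Editorial note: a worked example of the calculation above, assuming a
 * plain Ethernet device and ip_rt_min_advmss left at its small default
 * (256 at the time of writing):
 *
 *	dev->mtu = 1500  =>  advmss = max(1500 - 40, 256) = 1460
 *
 * i.e. the advertised MSS is the link MTU minus 40 bytes of minimal
 * IPv4 + TCP headers, clamped to no more than 65535 - 40.
 */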
1917 
1918 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1919 {
1920 	const struct rtable *rt = (const struct rtable *) dst;
1921 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1922 
1923 	if (mtu && rt_is_output_route(rt))
1924 		return mtu;
1925 
1926 	mtu = dst->dev->mtu;
1927 
1928 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1929 
1930 		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1931 			mtu = 576;
1932 	}
1933 
1934 	if (mtu > IP_MAX_MTU)
1935 		mtu = IP_MAX_MTU;
1936 
1937 	return mtu;
1938 }
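
/*
 * Editorial note: a short worked example of the branches above, assuming
 * IP_MAX_MTU keeps its usual definition in this file:
 *
 *	output route with a cached RTAX_MTU metric      => return the metric
 *	dev->mtu = 1500, nothing locked                 => 1500
 *	dev->mtu = 9000, RTAX_MTU locked, via gateway   => 576
 *
 * 576 is the classical minimum datagram size every IPv4 host must accept,
 * so it is the safe fallback when the MTU metric is locked on a gatewayed
 * route; the final clamp to IP_MAX_MTU applies in all cases.
 */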
1939 
1940 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1941 			    struct fib_info *fi)
1942 {
1943 	struct inet_peer *peer;
1944 	int create = 0;
1945 
1946 	/* If a peer entry exists for this destination, we must hook
1947 	 * it up in order to get at cached metrics.
1948 	 */
1949 	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1950 		create = 1;
1951 
1952 	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1953 	if (peer) {
1954 		rt->rt_peer_genid = rt_peer_genid();
1955 		if (inet_metrics_new(peer))
1956 			memcpy(peer->metrics, fi->fib_metrics,
1957 			       sizeof(u32) * RTAX_MAX);
1958 		dst_init_metrics(&rt->dst, peer->metrics, false);
1959 
1960 		check_peer_pmtu(&rt->dst, peer);
1961 		if (peer->redirect_genid != redirect_genid)
1962 			peer->redirect_learned.a4 = 0;
1963 		if (peer->redirect_learned.a4 &&
1964 		    peer->redirect_learned.a4 != rt->rt_gateway) {
1965 			rt->rt_gateway = peer->redirect_learned.a4;
1966 			rt->rt_flags |= RTCF_REDIRECTED;
1967 		}
1968 	} else {
1969 		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1970 			rt->fi = fi;
1971 			atomic_inc(&fi->fib_clntref);
1972 		}
1973 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1974 	}
1975 }
1976 
1977 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1978 			   const struct fib_result *res,
1979 			   struct fib_info *fi, u16 type, u32 itag)
1980 {
1981 	struct dst_entry *dst = &rt->dst;
1982 
1983 	if (fi) {
1984 		if (FIB_RES_GW(*res) &&
1985 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1986 			rt->rt_gateway = FIB_RES_GW(*res);
1987 		rt_init_metrics(rt, fl4, fi);
1988 #ifdef CONFIG_IP_ROUTE_CLASSID
1989 		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1990 #endif
1991 	}
1992 
1993 	if (dst_mtu(dst) > IP_MAX_MTU)
1994 		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1995 	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1996 		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1997 
1998 #ifdef CONFIG_IP_ROUTE_CLASSID
1999 #ifdef CONFIG_IP_MULTIPLE_TABLES
2000 	set_class_tag(rt, fib_rules_tclass(res));
2001 #endif
2002 	set_class_tag(rt, itag);
2003 #endif
2004 }
2005 
2006 static struct rtable *rt_dst_alloc(struct net_device *dev,
2007 				   bool nopolicy, bool noxfrm)
2008 {
2009 	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2010 			 DST_HOST |
2011 			 (nopolicy ? DST_NOPOLICY : 0) |
2012 			 (noxfrm ? DST_NOXFRM : 0));
2013 }
2014 
2015 /* called in rcu_read_lock() section */
2016 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2017 				u8 tos, struct net_device *dev, int our)
2018 {
2019 	unsigned int hash;
2020 	struct rtable *rth;
2021 	__be32 spec_dst;
2022 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2023 	u32 itag = 0;
2024 	int err;
2025 
2026 	/* Primary sanity checks. */
2027 
2028 	if (in_dev == NULL)
2029 		return -EINVAL;
2030 
2031 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2032 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2033 		goto e_inval;
2034 
2035 	if (ipv4_is_zeronet(saddr)) {
2036 		if (!ipv4_is_local_multicast(daddr))
2037 			goto e_inval;
2038 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2039 	} else {
2040 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2041 					  &itag);
2042 		if (err < 0)
2043 			goto e_err;
2044 	}
2045 	rth = rt_dst_alloc(init_net.loopback_dev,
2046 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2047 	if (!rth)
2048 		goto e_nobufs;
2049 
2050 #ifdef CONFIG_IP_ROUTE_CLASSID
2051 	rth->dst.tclassid = itag;
2052 #endif
2053 	rth->dst.output = ip_rt_bug;
2054 
2055 	rth->rt_key_dst	= daddr;
2056 	rth->rt_key_src	= saddr;
2057 	rth->rt_genid	= rt_genid(dev_net(dev));
2058 	rth->rt_flags	= RTCF_MULTICAST;
2059 	rth->rt_type	= RTN_MULTICAST;
2060 	rth->rt_key_tos	= tos;
2061 	rth->rt_dst	= daddr;
2062 	rth->rt_src	= saddr;
2063 	rth->rt_route_iif = dev->ifindex;
2064 	rth->rt_iif	= dev->ifindex;
2065 	rth->rt_oif	= 0;
2066 	rth->rt_mark    = skb->mark;
2067 	rth->rt_gateway	= daddr;
2068 	rth->rt_spec_dst= spec_dst;
2069 	rth->rt_peer_genid = 0;
2070 	rth->peer = NULL;
2071 	rth->fi = NULL;
2072 	if (our) {
2073 		rth->dst.input= ip_local_deliver;
2074 		rth->rt_flags |= RTCF_LOCAL;
2075 	}
2076 
2077 #ifdef CONFIG_IP_MROUTE
2078 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2079 		rth->dst.input = ip_mr_input;
2080 #endif
2081 	RT_CACHE_STAT_INC(in_slow_mc);
2082 
2083 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2084 	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2085 	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2086 
2087 e_nobufs:
2088 	return -ENOBUFS;
2089 e_inval:
2090 	return -EINVAL;
2091 e_err:
2092 	return err;
2093 }
2094 
2095 
2096 static void ip_handle_martian_source(struct net_device *dev,
2097 				     struct in_device *in_dev,
2098 				     struct sk_buff *skb,
2099 				     __be32 daddr,
2100 				     __be32 saddr)
2101 {
2102 	RT_CACHE_STAT_INC(in_martian_src);
2103 #ifdef CONFIG_IP_ROUTE_VERBOSE
2104 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2105 		/*
2106 		 *	RFC1812 recommendation: if the source is martian,
2107 		 *	the only hint is the MAC header.
2108 		 */
2109 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2110 			&daddr, &saddr, dev->name);
2111 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2112 			int i;
2113 			const unsigned char *p = skb_mac_header(skb);
2114 			printk(KERN_WARNING "ll header: ");
2115 			for (i = 0; i < dev->hard_header_len; i++, p++) {
2116 				printk("%02x", *p);
2117 				if (i < (dev->hard_header_len - 1))
2118 					printk(":");
2119 			}
2120 			printk("\n");
2121 		}
2122 	}
2123 #endif
2124 }
2125 
2126 /* called in rcu_read_lock() section */
2127 static int __mkroute_input(struct sk_buff *skb,
2128 			   const struct fib_result *res,
2129 			   struct in_device *in_dev,
2130 			   __be32 daddr, __be32 saddr, u32 tos,
2131 			   struct rtable **result)
2132 {
2133 	struct rtable *rth;
2134 	int err;
2135 	struct in_device *out_dev;
2136 	unsigned int flags = 0;
2137 	__be32 spec_dst;
2138 	u32 itag;
2139 
2140 	/* get a working reference to the output device */
2141 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2142 	if (out_dev == NULL) {
2143 		if (net_ratelimit())
2144 			printk(KERN_CRIT "Bug in ip_route_input"
2145 			       "_slow(). Please report.\n");
2146 		return -EINVAL;
2147 	}
2148 
2149 
2150 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2151 				  in_dev->dev, &spec_dst, &itag);
2152 	if (err < 0) {
2153 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2154 					 saddr);
2155 
2156 		goto cleanup;
2157 	}
2158 
2159 	if (err)
2160 		flags |= RTCF_DIRECTSRC;
2161 
2162 	if (out_dev == in_dev && err &&
2163 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
2164 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2165 		flags |= RTCF_DOREDIRECT;
2166 
2167 	if (skb->protocol != htons(ETH_P_IP)) {
2168 		/* Not IP (i.e. ARP). Do not create a route if it is
2169 		 * invalid for proxy ARP. DNAT routes are always valid.
2170 		 *
2171 		 * The proxy ARP feature has been extended to allow ARP
2172 		 * replies back on the same interface, to support
2173 		 * Private VLAN switch technologies. See arp.c.
2174 		 */
2175 		if (out_dev == in_dev &&
2176 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2177 			err = -EINVAL;
2178 			goto cleanup;
2179 		}
2180 	}
2181 
2182 	rth = rt_dst_alloc(out_dev->dev,
2183 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2184 			   IN_DEV_CONF_GET(out_dev, NOXFRM));
2185 	if (!rth) {
2186 		err = -ENOBUFS;
2187 		goto cleanup;
2188 	}
2189 
2190 	rth->rt_key_dst	= daddr;
2191 	rth->rt_key_src	= saddr;
2192 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2193 	rth->rt_flags = flags;
2194 	rth->rt_type = res->type;
2195 	rth->rt_key_tos	= tos;
2196 	rth->rt_dst	= daddr;
2197 	rth->rt_src	= saddr;
2198 	rth->rt_route_iif = in_dev->dev->ifindex;
2199 	rth->rt_iif 	= in_dev->dev->ifindex;
2200 	rth->rt_oif 	= 0;
2201 	rth->rt_mark    = skb->mark;
2202 	rth->rt_gateway	= daddr;
2203 	rth->rt_spec_dst= spec_dst;
2204 	rth->rt_peer_genid = 0;
2205 	rth->peer = NULL;
2206 	rth->fi = NULL;
2207 
2208 	rth->dst.input = ip_forward;
2209 	rth->dst.output = ip_output;
2210 
2211 	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2212 
2213 	*result = rth;
2214 	err = 0;
2215  cleanup:
2216 	return err;
2217 }
2218 
2219 static int ip_mkroute_input(struct sk_buff *skb,
2220 			    struct fib_result *res,
2221 			    const struct flowi4 *fl4,
2222 			    struct in_device *in_dev,
2223 			    __be32 daddr, __be32 saddr, u32 tos)
2224 {
2225 	struct rtable* rth = NULL;
2226 	int err;
2227 	unsigned hash;
2228 
2229 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2230 	if (res->fi && res->fi->fib_nhs > 1)
2231 		fib_select_multipath(res);
2232 #endif
2233 
2234 	/* create a routing cache entry */
2235 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2236 	if (err)
2237 		return err;
2238 
2239 	/* put it into the cache */
2240 	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2241 		       rt_genid(dev_net(rth->dst.dev)));
2242 	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2243 	if (IS_ERR(rth))
2244 		return PTR_ERR(rth);
2245 	return 0;
2246 }
2247 
2248 /*
2249  *	NOTE. We drop all packets that have a local source
2250  *	address, because every properly looped-back packet
2251  *	must already have the correct destination attached by the output routine.
2252  *
2253  *	This approach solves two big problems:
2254  *	1. Non-simplex devices are handled properly.
2255  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2256  *	called with rcu_read_lock()
2257  */
2258 
2259 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2260 			       u8 tos, struct net_device *dev)
2261 {
2262 	struct fib_result res;
2263 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2264 	struct flowi4	fl4;
2265 	unsigned	flags = 0;
2266 	u32		itag = 0;
2267 	struct rtable * rth;
2268 	unsigned	hash;
2269 	__be32		spec_dst;
2270 	int		err = -EINVAL;
2271 	struct net    * net = dev_net(dev);
2272 
2273 	/* IP on this device is disabled. */
2274 
2275 	if (!in_dev)
2276 		goto out;
2277 
2278 	/* Check for the weirdest martians, which may not be detected
2279 	   by fib_lookup.
2280 	 */
2281 
2282 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2283 	    ipv4_is_loopback(saddr))
2284 		goto martian_source;
2285 
2286 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2287 		goto brd_input;
2288 
2289 	/* Accept zero addresses only for limited broadcast;
2290 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2291 	 */
2292 	if (ipv4_is_zeronet(saddr))
2293 		goto martian_source;
2294 
2295 	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2296 		goto martian_destination;
2297 
2298 	/*
2299 	 *	Now we are ready to route the packet.
2300 	 */
2301 	fl4.flowi4_oif = 0;
2302 	fl4.flowi4_iif = dev->ifindex;
2303 	fl4.flowi4_mark = skb->mark;
2304 	fl4.flowi4_tos = tos;
2305 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2306 	fl4.daddr = daddr;
2307 	fl4.saddr = saddr;
2308 	err = fib_lookup(net, &fl4, &res);
2309 	if (err != 0) {
2310 		if (!IN_DEV_FORWARD(in_dev))
2311 			goto e_hostunreach;
2312 		goto no_route;
2313 	}
2314 
2315 	RT_CACHE_STAT_INC(in_slow_tot);
2316 
2317 	if (res.type == RTN_BROADCAST)
2318 		goto brd_input;
2319 
2320 	if (res.type == RTN_LOCAL) {
2321 		err = fib_validate_source(skb, saddr, daddr, tos,
2322 					  net->loopback_dev->ifindex,
2323 					  dev, &spec_dst, &itag);
2324 		if (err < 0)
2325 			goto martian_source_keep_err;
2326 		if (err)
2327 			flags |= RTCF_DIRECTSRC;
2328 		spec_dst = daddr;
2329 		goto local_input;
2330 	}
2331 
2332 	if (!IN_DEV_FORWARD(in_dev))
2333 		goto e_hostunreach;
2334 	if (res.type != RTN_UNICAST)
2335 		goto martian_destination;
2336 
2337 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2338 out:	return err;
2339 
2340 brd_input:
2341 	if (skb->protocol != htons(ETH_P_IP))
2342 		goto e_inval;
2343 
2344 	if (ipv4_is_zeronet(saddr))
2345 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2346 	else {
2347 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2348 					  &itag);
2349 		if (err < 0)
2350 			goto martian_source_keep_err;
2351 		if (err)
2352 			flags |= RTCF_DIRECTSRC;
2353 	}
2354 	flags |= RTCF_BROADCAST;
2355 	res.type = RTN_BROADCAST;
2356 	RT_CACHE_STAT_INC(in_brd);
2357 
2358 local_input:
2359 	rth = rt_dst_alloc(net->loopback_dev,
2360 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2361 	if (!rth)
2362 		goto e_nobufs;
2363 
2364 	rth->dst.input= ip_local_deliver;
2365 	rth->dst.output= ip_rt_bug;
2366 #ifdef CONFIG_IP_ROUTE_CLASSID
2367 	rth->dst.tclassid = itag;
2368 #endif
2369 
2370 	rth->rt_key_dst	= daddr;
2371 	rth->rt_key_src	= saddr;
2372 	rth->rt_genid = rt_genid(net);
2373 	rth->rt_flags 	= flags|RTCF_LOCAL;
2374 	rth->rt_type	= res.type;
2375 	rth->rt_key_tos	= tos;
2376 	rth->rt_dst	= daddr;
2377 	rth->rt_src	= saddr;
2381 	rth->rt_route_iif = dev->ifindex;
2382 	rth->rt_iif	= dev->ifindex;
2383 	rth->rt_oif	= 0;
2384 	rth->rt_mark    = skb->mark;
2385 	rth->rt_gateway	= daddr;
2386 	rth->rt_spec_dst= spec_dst;
2387 	rth->rt_peer_genid = 0;
2388 	rth->peer = NULL;
2389 	rth->fi = NULL;
2390 	if (res.type == RTN_UNREACHABLE) {
2391 		rth->dst.input= ip_error;
2392 		rth->dst.error= -err;
2393 		rth->rt_flags 	&= ~RTCF_LOCAL;
2394 	}
2395 	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2396 	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2397 	err = 0;
2398 	if (IS_ERR(rth))
2399 		err = PTR_ERR(rth);
2400 	goto out;
2401 
2402 no_route:
2403 	RT_CACHE_STAT_INC(in_no_route);
2404 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2405 	res.type = RTN_UNREACHABLE;
2406 	if (err == -ESRCH)
2407 		err = -ENETUNREACH;
2408 	goto local_input;
2409 
2410 	/*
2411 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2412 	 */
2413 martian_destination:
2414 	RT_CACHE_STAT_INC(in_martian_dst);
2415 #ifdef CONFIG_IP_ROUTE_VERBOSE
2416 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2417 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2418 			&daddr, &saddr, dev->name);
2419 #endif
2420 
2421 e_hostunreach:
2422 	err = -EHOSTUNREACH;
2423 	goto out;
2424 
2425 e_inval:
2426 	err = -EINVAL;
2427 	goto out;
2428 
2429 e_nobufs:
2430 	err = -ENOBUFS;
2431 	goto out;
2432 
2433 martian_source:
2434 	err = -EINVAL;
2435 martian_source_keep_err:
2436 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2437 	goto out;
2438 }
2439 
2440 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2441 			   u8 tos, struct net_device *dev, bool noref)
2442 {
2443 	struct rtable * rth;
2444 	unsigned	hash;
2445 	int iif = dev->ifindex;
2446 	struct net *net;
2447 	int res;
2448 
2449 	net = dev_net(dev);
2450 
2451 	rcu_read_lock();
2452 
2453 	if (!rt_caching(net))
2454 		goto skip_cache;
2455 
2456 	tos &= IPTOS_RT_MASK;
2457 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2458 
2459 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2460 	     rth = rcu_dereference(rth->dst.rt_next)) {
2461 		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2462 		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2463 		     (rth->rt_route_iif ^ iif) |
2464 		     (rth->rt_key_tos ^ tos)) == 0 &&
2465 		    rth->rt_mark == skb->mark &&
2466 		    net_eq(dev_net(rth->dst.dev), net) &&
2467 		    !rt_is_expired(rth)) {
2468 			ipv4_validate_peer(rth);
2469 			if (noref) {
2470 				dst_use_noref(&rth->dst, jiffies);
2471 				skb_dst_set_noref(skb, &rth->dst);
2472 			} else {
2473 				dst_use(&rth->dst, jiffies);
2474 				skb_dst_set(skb, &rth->dst);
2475 			}
2476 			RT_CACHE_STAT_INC(in_hit);
2477 			rcu_read_unlock();
2478 			return 0;
2479 		}
2480 		RT_CACHE_STAT_INC(in_hlist_search);
2481 	}
2482 
2483 skip_cache:
2484 	/* Multicast recognition logic was moved from the route cache to here.
2485 	   The problem was that too many Ethernet cards have broken/missing
2486 	   hardware multicast filters :-( As a result, a host on a multicast
2487 	   network acquires a lot of useless route cache entries, e.g. from
2488 	   SDR messages from all over the world. Now we try to get rid of them.
2489 	   Really, provided the software IP multicast filter is organized
2490 	   reasonably (at least, hashed), this does not result in a slowdown
2491 	   compared with route cache reject entries.
2492 	   Note that multicast routers are not affected, because a
2493 	   route cache entry is created eventually.
2494 	 */
2495 	if (ipv4_is_multicast(daddr)) {
2496 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2497 
2498 		if (in_dev) {
2499 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2500 						  ip_hdr(skb)->protocol);
2501 			if (our
2502 #ifdef CONFIG_IP_MROUTE
2503 				||
2504 			    (!ipv4_is_local_multicast(daddr) &&
2505 			     IN_DEV_MFORWARD(in_dev))
2506 #endif
2507 			   ) {
2508 				int res = ip_route_input_mc(skb, daddr, saddr,
2509 							    tos, dev, our);
2510 				rcu_read_unlock();
2511 				return res;
2512 			}
2513 		}
2514 		rcu_read_unlock();
2515 		return -EINVAL;
2516 	}
2517 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2518 	rcu_read_unlock();
2519 	return res;
2520 }
2521 EXPORT_SYMBOL(ip_route_input_common);
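
/*
 * Editorial note: callers normally reach ip_route_input_common() through
 * the ip_route_input()/ip_route_input_noref() inline wrappers in
 * include/net/route.h, which differ only in the "noref" argument.  Below
 * is a minimal sketch of typical receive-path usage, loosely modelled on
 * ip_rcv_finish(); the function name is hypothetical and error handling
 * is trimmed.
 */
#if 0
static int example_rcv_finish(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (!skb_dst(skb)) {
		/* noref: the dst is not refcounted because the skb is
		 * consumed before the caller leaves its RCU section.
		 */
		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					       iph->tos, skb->dev);
		if (unlikely(err)) {
			kfree_skb(skb);
			return NET_RX_DROP;
		}
	}
	return dst_input(skb);	/* invokes rth->dst.input set up above */
}
#endif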
2522 
2523 /* called with rcu_read_lock() */
2524 static struct rtable *__mkroute_output(const struct fib_result *res,
2525 				       const struct flowi4 *fl4,
2526 				       __be32 orig_daddr, __be32 orig_saddr,
2527 				       int orig_oif, __u8 orig_rtos,
2528 				       struct net_device *dev_out,
2529 				       unsigned int flags)
2530 {
2531 	struct fib_info *fi = res->fi;
2532 	struct in_device *in_dev;
2533 	u16 type = res->type;
2534 	struct rtable *rth;
2535 
2536 	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2537 		return ERR_PTR(-EINVAL);
2538 
2539 	if (ipv4_is_lbcast(fl4->daddr))
2540 		type = RTN_BROADCAST;
2541 	else if (ipv4_is_multicast(fl4->daddr))
2542 		type = RTN_MULTICAST;
2543 	else if (ipv4_is_zeronet(fl4->daddr))
2544 		return ERR_PTR(-EINVAL);
2545 
2546 	if (dev_out->flags & IFF_LOOPBACK)
2547 		flags |= RTCF_LOCAL;
2548 
2549 	in_dev = __in_dev_get_rcu(dev_out);
2550 	if (!in_dev)
2551 		return ERR_PTR(-EINVAL);
2552 
2553 	if (type == RTN_BROADCAST) {
2554 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2555 		fi = NULL;
2556 	} else if (type == RTN_MULTICAST) {
2557 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2558 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2559 				     fl4->flowi4_proto))
2560 			flags &= ~RTCF_LOCAL;
2561 		/* If a multicast route does not exist, use the
2562 		 * default one, but do not use a gateway in this case.
2563 		 * Yes, it is a hack.
2564 		 */
2565 		if (fi && res->prefixlen < 4)
2566 			fi = NULL;
2567 	}
2568 
2569 	rth = rt_dst_alloc(dev_out,
2570 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2571 			   IN_DEV_CONF_GET(in_dev, NOXFRM));
2572 	if (!rth)
2573 		return ERR_PTR(-ENOBUFS);
2574 
2575 	rth->dst.output = ip_output;
2576 
2577 	rth->rt_key_dst	= orig_daddr;
2578 	rth->rt_key_src	= orig_saddr;
2579 	rth->rt_genid = rt_genid(dev_net(dev_out));
2580 	rth->rt_flags	= flags;
2581 	rth->rt_type	= type;
2582 	rth->rt_key_tos	= orig_rtos;
2583 	rth->rt_dst	= fl4->daddr;
2584 	rth->rt_src	= fl4->saddr;
2585 	rth->rt_route_iif = 0;
2586 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
2587 	rth->rt_oif	= orig_oif;
2588 	rth->rt_mark    = fl4->flowi4_mark;
2589 	rth->rt_gateway = fl4->daddr;
2590 	rth->rt_spec_dst= fl4->saddr;
2591 	rth->rt_peer_genid = 0;
2592 	rth->peer = NULL;
2593 	rth->fi = NULL;
2594 
2595 	RT_CACHE_STAT_INC(out_slow_tot);
2596 
2597 	if (flags & RTCF_LOCAL) {
2598 		rth->dst.input = ip_local_deliver;
2599 		rth->rt_spec_dst = fl4->daddr;
2600 	}
2601 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2602 		rth->rt_spec_dst = fl4->saddr;
2603 		if (flags & RTCF_LOCAL &&
2604 		    !(dev_out->flags & IFF_LOOPBACK)) {
2605 			rth->dst.output = ip_mc_output;
2606 			RT_CACHE_STAT_INC(out_slow_mc);
2607 		}
2608 #ifdef CONFIG_IP_MROUTE
2609 		if (type == RTN_MULTICAST) {
2610 			if (IN_DEV_MFORWARD(in_dev) &&
2611 			    !ipv4_is_local_multicast(fl4->daddr)) {
2612 				rth->dst.input = ip_mr_input;
2613 				rth->dst.output = ip_mc_output;
2614 			}
2615 		}
2616 #endif
2617 	}
2618 
2619 	rt_set_nexthop(rth, fl4, res, fi, type, 0);
2620 
2621 	return rth;
2622 }
2623 
2624 /*
2625  * Major route resolver routine.
2626  * called with rcu_read_lock();
2627  */
2628 
2629 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2630 {
2631 	struct net_device *dev_out = NULL;
2632 	__u8 tos = RT_FL_TOS(fl4);
2633 	unsigned int flags = 0;
2634 	struct fib_result res;
2635 	struct rtable *rth;
2636 	__be32 orig_daddr;
2637 	__be32 orig_saddr;
2638 	int orig_oif;
2639 
2640 	res.fi		= NULL;
2641 #ifdef CONFIG_IP_MULTIPLE_TABLES
2642 	res.r		= NULL;
2643 #endif
2644 
2645 	orig_daddr = fl4->daddr;
2646 	orig_saddr = fl4->saddr;
2647 	orig_oif = fl4->flowi4_oif;
2648 
2649 	fl4->flowi4_iif = net->loopback_dev->ifindex;
2650 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2651 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2652 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2653 
2654 	rcu_read_lock();
2655 	if (fl4->saddr) {
2656 		rth = ERR_PTR(-EINVAL);
2657 		if (ipv4_is_multicast(fl4->saddr) ||
2658 		    ipv4_is_lbcast(fl4->saddr) ||
2659 		    ipv4_is_zeronet(fl4->saddr))
2660 			goto out;
2661 
2662 		/* I removed the check for oif == dev_out->oif here.
2663 		   It was wrong for two reasons:
2664 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2665 		      is assigned to multiple interfaces.
2666 		   2. Moreover, we are allowed to send packets with the saddr
2667 		      of another iface. --ANK
2668 		 */
2669 
2670 		if (fl4->flowi4_oif == 0 &&
2671 		    (ipv4_is_multicast(fl4->daddr) ||
2672 		     ipv4_is_lbcast(fl4->daddr))) {
2673 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2674 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2675 			if (dev_out == NULL)
2676 				goto out;
2677 
2678 			/* Special hack: the user can direct multicasts
2679 			   and limited broadcast via the necessary interface
2680 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2681 			   This hack is not just for fun, it allows
2682 			   vic, vat and friends to work.
2683 			   They bind the socket to loopback, set the ttl to zero
2684 			   and expect that it will work.
2685 			   From the viewpoint of the routing cache they are broken,
2686 			   because we are not allowed to build a multicast path
2687 			   with a loopback source addr (look, the routing cache
2688 			   cannot know that the ttl is zero, so the packet
2689 			   will not leave this host and the route is valid).
2690 			   Luckily, this hack is a good workaround.
2691 			 */
2692 
2693 			fl4->flowi4_oif = dev_out->ifindex;
2694 			goto make_route;
2695 		}
2696 
2697 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2698 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2699 			if (!__ip_dev_find(net, fl4->saddr, false))
2700 				goto out;
2701 		}
2702 	}
2703 
2704 
2705 	if (fl4->flowi4_oif) {
2706 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2707 		rth = ERR_PTR(-ENODEV);
2708 		if (dev_out == NULL)
2709 			goto out;
2710 
2711 		/* RACE: Check return value of inet_select_addr instead. */
2712 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2713 			rth = ERR_PTR(-ENETUNREACH);
2714 			goto out;
2715 		}
2716 		if (ipv4_is_local_multicast(fl4->daddr) ||
2717 		    ipv4_is_lbcast(fl4->daddr)) {
2718 			if (!fl4->saddr)
2719 				fl4->saddr = inet_select_addr(dev_out, 0,
2720 							      RT_SCOPE_LINK);
2721 			goto make_route;
2722 		}
2723 		if (fl4->saddr) {
2724 			if (ipv4_is_multicast(fl4->daddr))
2725 				fl4->saddr = inet_select_addr(dev_out, 0,
2726 							      fl4->flowi4_scope);
2727 			else if (!fl4->daddr)
2728 				fl4->saddr = inet_select_addr(dev_out, 0,
2729 							      RT_SCOPE_HOST);
2730 		}
2731 	}
2732 
2733 	if (!fl4->daddr) {
2734 		fl4->daddr = fl4->saddr;
2735 		if (!fl4->daddr)
2736 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2737 		dev_out = net->loopback_dev;
2738 		fl4->flowi4_oif = net->loopback_dev->ifindex;
2739 		res.type = RTN_LOCAL;
2740 		flags |= RTCF_LOCAL;
2741 		goto make_route;
2742 	}
2743 
2744 	if (fib_lookup(net, fl4, &res)) {
2745 		res.fi = NULL;
2746 		if (fl4->flowi4_oif) {
2747 			/* Apparently, the routing tables are wrong. Assume
2748 			   that the destination is on-link.
2749 
2750 			   WHY? DW.
2751 			   Because we are allowed to send to an iface
2752 			   even if it has NO routes and NO assigned
2753 			   addresses. When oif is specified, the routing
2754 			   tables are looked up with only one purpose:
2755 			   to catch whether the destination is gatewayed rather
2756 			   than direct. Moreover, if MSG_DONTROUTE is set,
2757 			   we send the packet, ignoring both routing tables
2758 			   and ifaddr state. --ANK
2759 
2760 
2761 			   We could do this even when oif is unknown,
2762 			   as IPv6 likely does, but we do not.
2763 			 */
2764 
2765 			if (fl4->saddr == 0)
2766 				fl4->saddr = inet_select_addr(dev_out, 0,
2767 							      RT_SCOPE_LINK);
2768 			res.type = RTN_UNICAST;
2769 			goto make_route;
2770 		}
2771 		rth = ERR_PTR(-ENETUNREACH);
2772 		goto out;
2773 	}
2774 
2775 	if (res.type == RTN_LOCAL) {
2776 		if (!fl4->saddr) {
2777 			if (res.fi->fib_prefsrc)
2778 				fl4->saddr = res.fi->fib_prefsrc;
2779 			else
2780 				fl4->saddr = fl4->daddr;
2781 		}
2782 		dev_out = net->loopback_dev;
2783 		fl4->flowi4_oif = dev_out->ifindex;
2784 		res.fi = NULL;
2785 		flags |= RTCF_LOCAL;
2786 		goto make_route;
2787 	}
2788 
2789 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2790 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2791 		fib_select_multipath(&res);
2792 	else
2793 #endif
2794 	if (!res.prefixlen &&
2795 	    res.table->tb_num_default > 1 &&
2796 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2797 		fib_select_default(&res);
2798 
2799 	if (!fl4->saddr)
2800 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2801 
2802 	dev_out = FIB_RES_DEV(res);
2803 	fl4->flowi4_oif = dev_out->ifindex;
2804 
2805 
2806 make_route:
2807 	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2808 			       tos, dev_out, flags);
2809 	if (!IS_ERR(rth)) {
2810 		unsigned int hash;
2811 
2812 		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2813 			       rt_genid(dev_net(dev_out)));
2814 		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2815 	}
2816 
2817 out:
2818 	rcu_read_unlock();
2819 	return rth;
2820 }
2821 
2822 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2823 {
2824 	struct rtable *rth;
2825 	unsigned int hash;
2826 
2827 	if (!rt_caching(net))
2828 		goto slow_output;
2829 
2830 	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2831 
2832 	rcu_read_lock_bh();
2833 	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2834 		rth = rcu_dereference_bh(rth->dst.rt_next)) {
2835 		if (rth->rt_key_dst == flp4->daddr &&
2836 		    rth->rt_key_src == flp4->saddr &&
2837 		    rt_is_output_route(rth) &&
2838 		    rth->rt_oif == flp4->flowi4_oif &&
2839 		    rth->rt_mark == flp4->flowi4_mark &&
2840 		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2841 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2842 		    net_eq(dev_net(rth->dst.dev), net) &&
2843 		    !rt_is_expired(rth)) {
2844 			ipv4_validate_peer(rth);
2845 			dst_use(&rth->dst, jiffies);
2846 			RT_CACHE_STAT_INC(out_hit);
2847 			rcu_read_unlock_bh();
2848 			if (!flp4->saddr)
2849 				flp4->saddr = rth->rt_src;
2850 			if (!flp4->daddr)
2851 				flp4->daddr = rth->rt_dst;
2852 			return rth;
2853 		}
2854 		RT_CACHE_STAT_INC(out_hlist_search);
2855 	}
2856 	rcu_read_unlock_bh();
2857 
2858 slow_output:
2859 	return ip_route_output_slow(net, flp4);
2860 }
2861 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2862 
2863 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2864 {
2865 	return NULL;
2866 }
2867 
2868 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2869 {
2870 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2871 
2872 	return mtu ? : dst->dev->mtu;
2873 }
2874 
2875 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2876 {
2877 }
2878 
2879 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2880 					  unsigned long old)
2881 {
2882 	return NULL;
2883 }
2884 
2885 static struct dst_ops ipv4_dst_blackhole_ops = {
2886 	.family			=	AF_INET,
2887 	.protocol		=	cpu_to_be16(ETH_P_IP),
2888 	.destroy		=	ipv4_dst_destroy,
2889 	.check			=	ipv4_blackhole_dst_check,
2890 	.mtu			=	ipv4_blackhole_mtu,
2891 	.default_advmss		=	ipv4_default_advmss,
2892 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2893 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2894 	.neigh_lookup		=	ipv4_neigh_lookup,
2895 };
2896 
2897 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2898 {
2899 	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2900 	struct rtable *ort = (struct rtable *) dst_orig;
2901 
2902 	if (rt) {
2903 		struct dst_entry *new = &rt->dst;
2904 
2905 		new->__use = 1;
2906 		new->input = dst_discard;
2907 		new->output = dst_discard;
2908 		dst_copy_metrics(new, &ort->dst);
2909 
2910 		new->dev = ort->dst.dev;
2911 		if (new->dev)
2912 			dev_hold(new->dev);
2913 
2914 		rt->rt_key_dst = ort->rt_key_dst;
2915 		rt->rt_key_src = ort->rt_key_src;
2916 		rt->rt_key_tos = ort->rt_key_tos;
2917 		rt->rt_route_iif = ort->rt_route_iif;
2918 		rt->rt_iif = ort->rt_iif;
2919 		rt->rt_oif = ort->rt_oif;
2920 		rt->rt_mark = ort->rt_mark;
2921 
2922 		rt->rt_genid = rt_genid(net);
2923 		rt->rt_flags = ort->rt_flags;
2924 		rt->rt_type = ort->rt_type;
2925 		rt->rt_dst = ort->rt_dst;
2926 		rt->rt_src = ort->rt_src;
2927 		rt->rt_gateway = ort->rt_gateway;
2928 		rt->rt_spec_dst = ort->rt_spec_dst;
2929 		rt->peer = ort->peer;
2930 		if (rt->peer)
2931 			atomic_inc(&rt->peer->refcnt);
2932 		rt->fi = ort->fi;
2933 		if (rt->fi)
2934 			atomic_inc(&rt->fi->fib_clntref);
2935 
2936 		dst_free(new);
2937 	}
2938 
2939 	dst_release(dst_orig);
2940 
2941 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2942 }
2943 
2944 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2945 				    struct sock *sk)
2946 {
2947 	struct rtable *rt = __ip_route_output_key(net, flp4);
2948 
2949 	if (IS_ERR(rt))
2950 		return rt;
2951 
2952 	if (flp4->flowi4_proto)
2953 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2954 						   flowi4_to_flowi(flp4),
2955 						   sk, 0);
2956 
2957 	return rt;
2958 }
2959 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2960 
2961 static int rt_fill_info(struct net *net,
2962 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2963 			int nowait, unsigned int flags)
2964 {
2965 	struct rtable *rt = skb_rtable(skb);
2966 	struct rtmsg *r;
2967 	struct nlmsghdr *nlh;
2968 	unsigned long expires = 0;
2969 	const struct inet_peer *peer = rt->peer;
2970 	u32 id = 0, ts = 0, tsage = 0, error;
2971 
2972 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2973 	if (nlh == NULL)
2974 		return -EMSGSIZE;
2975 
2976 	r = nlmsg_data(nlh);
2977 	r->rtm_family	 = AF_INET;
2978 	r->rtm_dst_len	= 32;
2979 	r->rtm_src_len	= 0;
2980 	r->rtm_tos	= rt->rt_key_tos;
2981 	r->rtm_table	= RT_TABLE_MAIN;
2982 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2983 	r->rtm_type	= rt->rt_type;
2984 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2985 	r->rtm_protocol = RTPROT_UNSPEC;
2986 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2987 	if (rt->rt_flags & RTCF_NOTIFY)
2988 		r->rtm_flags |= RTM_F_NOTIFY;
2989 
2990 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2991 
2992 	if (rt->rt_key_src) {
2993 		r->rtm_src_len = 32;
2994 		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2995 	}
2996 	if (rt->dst.dev)
2997 		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2998 #ifdef CONFIG_IP_ROUTE_CLASSID
2999 	if (rt->dst.tclassid)
3000 		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
3001 #endif
3002 	if (rt_is_input_route(rt))
3003 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
3004 	else if (rt->rt_src != rt->rt_key_src)
3005 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3006 
3007 	if (rt->rt_dst != rt->rt_gateway)
3008 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3009 
3010 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3011 		goto nla_put_failure;
3012 
3013 	if (rt->rt_mark)
3014 		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3015 
3016 	error = rt->dst.error;
3017 	if (peer) {
3018 		inet_peer_refcheck(rt->peer);
3019 		id = atomic_read(&peer->ip_id_count) & 0xffff;
3020 		if (peer->tcp_ts_stamp) {
3021 			ts = peer->tcp_ts;
3022 			tsage = get_seconds() - peer->tcp_ts_stamp;
3023 		}
3024 		expires = ACCESS_ONCE(peer->pmtu_expires);
3025 		if (expires) {
3026 			if (time_before(jiffies, expires))
3027 				expires -= jiffies;
3028 			else
3029 				expires = 0;
3030 		}
3031 	}
3032 
3033 	if (rt_is_input_route(rt)) {
3034 #ifdef CONFIG_IP_MROUTE
3035 		__be32 dst = rt->rt_dst;
3036 
3037 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3038 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3039 			int err = ipmr_get_route(net, skb,
3040 						 rt->rt_src, rt->rt_dst,
3041 						 r, nowait);
3042 			if (err <= 0) {
3043 				if (!nowait) {
3044 					if (err == 0)
3045 						return 0;
3046 					goto nla_put_failure;
3047 				} else {
3048 					if (err == -EMSGSIZE)
3049 						goto nla_put_failure;
3050 					error = err;
3051 				}
3052 			}
3053 		} else
3054 #endif
3055 			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3056 	}
3057 
3058 	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3059 			       expires, error) < 0)
3060 		goto nla_put_failure;
3061 
3062 	return nlmsg_end(skb, nlh);
3063 
3064 nla_put_failure:
3065 	nlmsg_cancel(skb, nlh);
3066 	return -EMSGSIZE;
3067 }
3068 
3069 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3070 {
3071 	struct net *net = sock_net(in_skb->sk);
3072 	struct rtmsg *rtm;
3073 	struct nlattr *tb[RTA_MAX+1];
3074 	struct rtable *rt = NULL;
3075 	__be32 dst = 0;
3076 	__be32 src = 0;
3077 	u32 iif;
3078 	int err;
3079 	int mark;
3080 	struct sk_buff *skb;
3081 
3082 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3083 	if (err < 0)
3084 		goto errout;
3085 
3086 	rtm = nlmsg_data(nlh);
3087 
3088 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3089 	if (skb == NULL) {
3090 		err = -ENOBUFS;
3091 		goto errout;
3092 	}
3093 
3094 	/* Reserve room for dummy headers; this skb can pass
3095 	   through a good chunk of the routing engine.
3096 	 */
3097 	skb_reset_mac_header(skb);
3098 	skb_reset_network_header(skb);
3099 
3100 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3101 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
3102 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3103 
3104 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3105 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3106 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3107 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3108 
3109 	if (iif) {
3110 		struct net_device *dev;
3111 
3112 		dev = __dev_get_by_index(net, iif);
3113 		if (dev == NULL) {
3114 			err = -ENODEV;
3115 			goto errout_free;
3116 		}
3117 
3118 		skb->protocol	= htons(ETH_P_IP);
3119 		skb->dev	= dev;
3120 		skb->mark	= mark;
3121 		local_bh_disable();
3122 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3123 		local_bh_enable();
3124 
3125 		rt = skb_rtable(skb);
3126 		if (err == 0 && rt->dst.error)
3127 			err = -rt->dst.error;
3128 	} else {
3129 		struct flowi4 fl4 = {
3130 			.daddr = dst,
3131 			.saddr = src,
3132 			.flowi4_tos = rtm->rtm_tos,
3133 			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3134 			.flowi4_mark = mark,
3135 		};
3136 		rt = ip_route_output_key(net, &fl4);
3137 
3138 		err = 0;
3139 		if (IS_ERR(rt))
3140 			err = PTR_ERR(rt);
3141 	}
3142 
3143 	if (err)
3144 		goto errout_free;
3145 
3146 	skb_dst_set(skb, &rt->dst);
3147 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3148 		rt->rt_flags |= RTCF_NOTIFY;
3149 
3150 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3151 			   RTM_NEWROUTE, 0, 0);
3152 	if (err <= 0)
3153 		goto errout_free;
3154 
3155 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3156 errout:
3157 	return err;
3158 
3159 errout_free:
3160 	kfree_skb(skb);
3161 	goto errout;
3162 }
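
/*
 * Editorial note: inet_rtm_getroute() is the kernel-side handler for
 * RTM_GETROUTE requests; it is what a userspace query such as
 *
 *	ip route get 8.8.8.8
 *
 * ends up invoking (via the rtnl_register() call in ip_rt_init() below),
 * with the reply assembled by rt_fill_info() above.
 */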
3163 
3164 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3165 {
3166 	struct rtable *rt;
3167 	int h, s_h;
3168 	int idx, s_idx;
3169 	struct net *net;
3170 
3171 	net = sock_net(skb->sk);
3172 
3173 	s_h = cb->args[0];
3174 	if (s_h < 0)
3175 		s_h = 0;
3176 	s_idx = idx = cb->args[1];
3177 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3178 		if (!rt_hash_table[h].chain)
3179 			continue;
3180 		rcu_read_lock_bh();
3181 		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3182 		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3183 			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3184 				continue;
3185 			if (rt_is_expired(rt))
3186 				continue;
3187 			skb_dst_set_noref(skb, &rt->dst);
3188 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3189 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3190 					 1, NLM_F_MULTI) <= 0) {
3191 				skb_dst_drop(skb);
3192 				rcu_read_unlock_bh();
3193 				goto done;
3194 			}
3195 			skb_dst_drop(skb);
3196 		}
3197 		rcu_read_unlock_bh();
3198 	}
3199 
3200 done:
3201 	cb->args[0] = h;
3202 	cb->args[1] = idx;
3203 	return skb->len;
3204 }
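
/*
 * Editorial note: ip_rt_dump() walks rt_hash_table and emits one
 * RTM_NEWROUTE message per non-expired cache entry via rt_fill_info();
 * cb->args[0] and cb->args[1] store the hash bucket and chain index so a
 * multi-part netlink dump can resume where the previous skb filled up.
 */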
3205 
3206 void ip_rt_multicast_event(struct in_device *in_dev)
3207 {
3208 	rt_cache_flush(dev_net(in_dev->dev), 0);
3209 }
3210 
3211 #ifdef CONFIG_SYSCTL
3212 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3213 					void __user *buffer,
3214 					size_t *lenp, loff_t *ppos)
3215 {
3216 	if (write) {
3217 		int flush_delay;
3218 		ctl_table ctl;
3219 		struct net *net;
3220 
3221 		memcpy(&ctl, __ctl, sizeof(ctl));
3222 		ctl.data = &flush_delay;
3223 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3224 
3225 		net = (struct net *)__ctl->extra1;
3226 		rt_cache_flush(net, flush_delay);
3227 		return 0;
3228 	}
3229 
3230 	return -EINVAL;
3231 }
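
/*
 * Editorial note: the handler above only accepts writes (the "flush"
 * entry below is registered with mode 0200).  Assuming the usual sysctl
 * mount point, an administrator flushes the route cache with e.g.
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * where the written integer is parsed by proc_dointvec() and passed to
 * rt_cache_flush() as the flush delay.
 */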
3232 
3233 static ctl_table ipv4_route_table[] = {
3234 	{
3235 		.procname	= "gc_thresh",
3236 		.data		= &ipv4_dst_ops.gc_thresh,
3237 		.maxlen		= sizeof(int),
3238 		.mode		= 0644,
3239 		.proc_handler	= proc_dointvec,
3240 	},
3241 	{
3242 		.procname	= "max_size",
3243 		.data		= &ip_rt_max_size,
3244 		.maxlen		= sizeof(int),
3245 		.mode		= 0644,
3246 		.proc_handler	= proc_dointvec,
3247 	},
3248 	{
3249 		/*  Deprecated. Use gc_min_interval_ms */
3250 
3251 		.procname	= "gc_min_interval",
3252 		.data		= &ip_rt_gc_min_interval,
3253 		.maxlen		= sizeof(int),
3254 		.mode		= 0644,
3255 		.proc_handler	= proc_dointvec_jiffies,
3256 	},
3257 	{
3258 		.procname	= "gc_min_interval_ms",
3259 		.data		= &ip_rt_gc_min_interval,
3260 		.maxlen		= sizeof(int),
3261 		.mode		= 0644,
3262 		.proc_handler	= proc_dointvec_ms_jiffies,
3263 	},
3264 	{
3265 		.procname	= "gc_timeout",
3266 		.data		= &ip_rt_gc_timeout,
3267 		.maxlen		= sizeof(int),
3268 		.mode		= 0644,
3269 		.proc_handler	= proc_dointvec_jiffies,
3270 	},
3271 	{
3272 		.procname	= "gc_interval",
3273 		.data		= &ip_rt_gc_interval,
3274 		.maxlen		= sizeof(int),
3275 		.mode		= 0644,
3276 		.proc_handler	= proc_dointvec_jiffies,
3277 	},
3278 	{
3279 		.procname	= "redirect_load",
3280 		.data		= &ip_rt_redirect_load,
3281 		.maxlen		= sizeof(int),
3282 		.mode		= 0644,
3283 		.proc_handler	= proc_dointvec,
3284 	},
3285 	{
3286 		.procname	= "redirect_number",
3287 		.data		= &ip_rt_redirect_number,
3288 		.maxlen		= sizeof(int),
3289 		.mode		= 0644,
3290 		.proc_handler	= proc_dointvec,
3291 	},
3292 	{
3293 		.procname	= "redirect_silence",
3294 		.data		= &ip_rt_redirect_silence,
3295 		.maxlen		= sizeof(int),
3296 		.mode		= 0644,
3297 		.proc_handler	= proc_dointvec,
3298 	},
3299 	{
3300 		.procname	= "error_cost",
3301 		.data		= &ip_rt_error_cost,
3302 		.maxlen		= sizeof(int),
3303 		.mode		= 0644,
3304 		.proc_handler	= proc_dointvec,
3305 	},
3306 	{
3307 		.procname	= "error_burst",
3308 		.data		= &ip_rt_error_burst,
3309 		.maxlen		= sizeof(int),
3310 		.mode		= 0644,
3311 		.proc_handler	= proc_dointvec,
3312 	},
3313 	{
3314 		.procname	= "gc_elasticity",
3315 		.data		= &ip_rt_gc_elasticity,
3316 		.maxlen		= sizeof(int),
3317 		.mode		= 0644,
3318 		.proc_handler	= proc_dointvec,
3319 	},
3320 	{
3321 		.procname	= "mtu_expires",
3322 		.data		= &ip_rt_mtu_expires,
3323 		.maxlen		= sizeof(int),
3324 		.mode		= 0644,
3325 		.proc_handler	= proc_dointvec_jiffies,
3326 	},
3327 	{
3328 		.procname	= "min_pmtu",
3329 		.data		= &ip_rt_min_pmtu,
3330 		.maxlen		= sizeof(int),
3331 		.mode		= 0644,
3332 		.proc_handler	= proc_dointvec,
3333 	},
3334 	{
3335 		.procname	= "min_adv_mss",
3336 		.data		= &ip_rt_min_advmss,
3337 		.maxlen		= sizeof(int),
3338 		.mode		= 0644,
3339 		.proc_handler	= proc_dointvec,
3340 	},
3341 	{ }
3342 };
3343 
3344 static struct ctl_table empty[1];
3345 
3346 static struct ctl_table ipv4_skeleton[] = {
3348 	{ .procname = "route",
3349 	  .mode = 0555, .child = ipv4_route_table},
3350 	{ .procname = "neigh",
3351 	  .mode = 0555, .child = empty},
3352 	{ }
3353 };
3354 
3355 static __net_initdata struct ctl_path ipv4_path[] = {
3356 	{ .procname = "net", },
3357 	{ .procname = "ipv4", },
3358 	{ },
3359 };
3360 
3361 static struct ctl_table ipv4_route_flush_table[] = {
3362 	{
3363 		.procname	= "flush",
3364 		.maxlen		= sizeof(int),
3365 		.mode		= 0200,
3366 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3367 	},
3368 	{ },
3369 };
3370 
3371 static __net_initdata struct ctl_path ipv4_route_path[] = {
3372 	{ .procname = "net", },
3373 	{ .procname = "ipv4", },
3374 	{ .procname = "route", },
3375 	{ },
3376 };
3377 
3378 static __net_init int sysctl_route_net_init(struct net *net)
3379 {
3380 	struct ctl_table *tbl;
3381 
3382 	tbl = ipv4_route_flush_table;
3383 	if (!net_eq(net, &init_net)) {
3384 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3385 		if (tbl == NULL)
3386 			goto err_dup;
3387 	}
3388 	tbl[0].extra1 = net;
3389 
3390 	net->ipv4.route_hdr =
3391 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3392 	if (net->ipv4.route_hdr == NULL)
3393 		goto err_reg;
3394 	return 0;
3395 
3396 err_reg:
3397 	if (tbl != ipv4_route_flush_table)
3398 		kfree(tbl);
3399 err_dup:
3400 	return -ENOMEM;
3401 }
3402 
3403 static __net_exit void sysctl_route_net_exit(struct net *net)
3404 {
3405 	struct ctl_table *tbl;
3406 
3407 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3408 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3409 	BUG_ON(tbl == ipv4_route_flush_table);
3410 	kfree(tbl);
3411 }
3412 
3413 static __net_initdata struct pernet_operations sysctl_route_ops = {
3414 	.init = sysctl_route_net_init,
3415 	.exit = sysctl_route_net_exit,
3416 };
3417 #endif
3418 
3419 static __net_init int rt_genid_init(struct net *net)
3420 {
3421 	get_random_bytes(&net->ipv4.rt_genid,
3422 			 sizeof(net->ipv4.rt_genid));
3423 	get_random_bytes(&net->ipv4.dev_addr_genid,
3424 			 sizeof(net->ipv4.dev_addr_genid));
3425 	return 0;
3426 }
3427 
3428 static __net_initdata struct pernet_operations rt_genid_ops = {
3429 	.init = rt_genid_init,
3430 };
3431 
3432 
3433 #ifdef CONFIG_IP_ROUTE_CLASSID
3434 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3435 #endif /* CONFIG_IP_ROUTE_CLASSID */
3436 
3437 static __initdata unsigned long rhash_entries;
3438 static int __init set_rhash_entries(char *str)
3439 {
3440 	if (!str)
3441 		return 0;
3442 	rhash_entries = simple_strtoul(str, &str, 0);
3443 	return 1;
3444 }
3445 __setup("rhash_entries=", set_rhash_entries);
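
/*
 * Editorial note: "rhash_entries" is a boot-time kernel parameter; booting
 * with e.g.
 *
 *	rhash_entries=262144
 *
 * requests that many route-cache hash buckets instead of letting
 * alloc_large_system_hash() size the table from available memory in
 * ip_rt_init() below.
 */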
3446 
3447 int __init ip_rt_init(void)
3448 {
3449 	int rc = 0;
3450 
3451 #ifdef CONFIG_IP_ROUTE_CLASSID
3452 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3453 	if (!ip_rt_acct)
3454 		panic("IP: failed to allocate ip_rt_acct\n");
3455 #endif
3456 
3457 	ipv4_dst_ops.kmem_cachep =
3458 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3459 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3460 
3461 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3462 
3463 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3464 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3465 
3466 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3467 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3468 
3469 	rt_hash_table = (struct rt_hash_bucket *)
3470 		alloc_large_system_hash("IP route cache",
3471 					sizeof(struct rt_hash_bucket),
3472 					rhash_entries,
3473 					(totalram_pages >= 128 * 1024) ?
3474 					15 : 17,
3475 					0,
3476 					&rt_hash_log,
3477 					&rt_hash_mask,
3478 					rhash_entries ? 0 : 512 * 1024);
3479 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3480 	rt_hash_lock_init();
3481 
3482 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3483 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3484 
3485 	devinet_init();
3486 	ip_fib_init();
3487 
3488 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3489 	expires_ljiffies = jiffies;
3490 	schedule_delayed_work(&expires_work,
3491 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3492 
3493 	if (ip_rt_proc_init())
3494 		printk(KERN_ERR "Unable to create route proc files\n");
3495 #ifdef CONFIG_XFRM
3496 	xfrm_init();
3497 	xfrm4_init(ip_rt_max_size);
3498 #endif
3499 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3500 
3501 #ifdef CONFIG_SYSCTL
3502 	register_pernet_subsys(&sysctl_route_ops);
3503 #endif
3504 	register_pernet_subsys(&rt_genid_ops);
3505 	return rc;
3506 }
3507 
3508 #ifdef CONFIG_SYSCTL
3509 /*
3510  * We really need to sanitize the damn ipv4 init order, then all
3511  * this nonsense will go away.
3512  */
3513 void __init ip_static_sysctl_init(void)
3514 {
3515 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3516 }
3517 #endif
3518