/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly	= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
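
/* Note (added for orientation, not from the original header): the ip_rt_*
 * values above are the defaults behind the net.ipv4.route.* sysctls, which
 * are registered elsewhere in this file.  ip_rt_min_pmtu is written as
 * 512 + 20 + 20, i.e. 552 bytes, presumably 512 bytes of payload plus a
 * 20-byte IP header and 20 bytes for the transport header.
 */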

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void		 ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

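/* /proc/net/stat/rt_cache: per-cpu counters.  *pos encodes "cpu index + 1",
 * with position 0 reserved for the SEQ_START_TOKEN header line, so start()
 * and next() simply walk the possible CPUs.
 */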
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", 0444, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", 0444,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
				 rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	struct neighbour *n;

	rcu_read_lock_bh();

	if (likely(rt->rt_gw_family == AF_INET)) {
		n = ip_neigh_gw4(dev, rt->rt_gw4);
	} else if (rt->rt_gw_family == AF_INET6) {
		n = ip_neigh_gw6(dev, &rt->rt_gw6);
	} else {
		__be32 pkey;

		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
		n = ip_neigh_gw4(dev, pkey);
	}

	if (n && !refcount_inc_not_zero(&n->refcnt))
		n = NULL;

	rcu_read_unlock_bh();

	return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;

	if (rt->rt_gw_family == AF_INET) {
		pkey = (const __be32 *)&rt->rt_gw4;
	} else if (rt->rt_gw_family == AF_INET6) {
		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
	} else if (!daddr ||
		   (rt->rt_flags &
		    (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
		return;
	}
	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

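/* Pick an IP ID for a packet (or a train of 'segs' GSO segments): the flow
 * is hashed with siphash into one of IP_IDENTS_SZ generators and a block of
 * consecutive IDs is reserved from it.
 */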
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay. */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gw_family = AF_INET;
		rt->rt_gw4 = fnhe->fnhe_gw;
	}
}

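/* Record or refresh a nexthop exception for daddr: a gateway learned from an
 * ICMP redirect and/or a PMTU learned from "fragmentation needed".  Each hash
 * bucket is capped at FNHE_RECLAIM_DEPTH entries; beyond that the oldest
 * entry is recycled.  Routes cached against the nexthop are marked
 * DST_OBSOLETE_KILL so their users revalidate against the new exception.
 */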
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, bool lock, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nh->fib_nh_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh_common *nhc = FIB_RES_NHC(res);
				struct fib_nh *nh;

				nh = container_of(nhc, struct fib_nh, nh_common);
				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, false,
						      jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     " Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

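/* dst_ops->negative_advice: a socket has reported trouble with this route.
 * Drop it if it is already obsolete, was learned from a redirect, or carries
 * an expiry, so the next lookup goes back to the FIB.
 */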
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything
	 * and set rate_last to the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
		++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	u32 old_mtu = ipv4_mtu(dst);
	struct fib_result res;
	bool lock = false;

	if (ip_mtu_locked(dst))
		return;

	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc = FIB_RES_NHC(res);
		struct fib_nh *nh;

		nh = container_of(nhc, struct fib_nh, nh_common);
		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

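/* PMTU update on behalf of a connected socket: update the socket's cached
 * dst (following the xfrm bundle to the inner route if needed), and rebuild
 * the route when the cached one has become obsolete.
 */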
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD.
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
	struct ip_options opt;
	int res;

	/* Recompile ip options since IPCB may not be valid anymore.
	 * Also check we have a reasonable ipv4 header.
	 */
	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
		return;

	memset(&opt, 0, sizeof(opt));
	if (ip_hdr(skb)->ihl > 5) {
		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
			return;
		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

		rcu_read_lock();
		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
		rcu_read_unlock();

		if (res)
			return;
	}
	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	ipv4_send_dest_unreach(skb);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

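/* Effective MTU of a route, in order of preference: a learned (and not yet
 * expired) PMTU, the RTAX_MTU metric, then the device MTU, which is clamped
 * to 576 for MTU-locked routes with a gateway and reduced by any lwtunnel
 * encapsulation headroom.
 */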
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_gw_family && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nh, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_nh_common *nhc = res->nhc;
	struct net_device *dev = nhc->nhc_dev;
	struct fib_info *fi = res->fi;
	u32 mtu = 0;

	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nh, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

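/* Bind a freshly built route to a nexthop exception: copy the learned
 * gateway/PMTU into the rtable and, when caching is allowed, publish it as
 * fnhe_rth_input/fnhe_rth_output so later lookups can reuse it directly.
 */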
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gw4) {
			rt->rt_gw4 = daddr;
			rt->rt_gw_family = AF_INET;
		}

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;

	ip_dst_metrics_put(dst);
	rt_del_uncached_list(rt);
}

void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
		struct fib_nh *nh;

		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
			rt->rt_gw_family = nhc->nhc_gw_family;
			/* only INET and INET6 are supported */
			if (likely(nhc->nhc_gw_family == AF_INET))
				rt->rt_gw4 = nhc->nhc_gw.ipv4;
			else
				rt->rt_gw6 = nhc->nhc_gw.ipv6;
		}

		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

		nh = container_of(nhc, struct fib_nh, nh_common);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->fib_nh_lws);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gw4) {
				rt->rt_gw_family = AF_INET;
				rt->rt_gw4 = daddr;
			}
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

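/* Allocate a dst entry for an IPv4 route.  Routes that will not be cached
 * get DST_HOST; DST_NOPOLICY and DST_NOXFRM suppress the corresponding xfrm
 * policy handling for this dst.
 */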
struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_gw_family = 0;
		rt->rt_gw4 = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr) &&
		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input= 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint we can give is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, false);
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
	struct net_device *dev = nhc->nhc_dev;
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	struct fib_nh *nh;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(dev);
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP)) {
		__be32 gw;

		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
		if (IN_DEV_SHARED_MEDIA(out_dev) ||
		    inet_addr_onlink(out_dev, saddr, gw))
			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp.  DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	nh = container_of(nhc, struct fib_nh, nh_common);
	fnhe = find_exception(nh, daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(nh->nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}

/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	if (multipath_hash)
		mhash = jhash_2words(mhash, multipath_hash, 0);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */

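/* For reference: hash policy 0 keys only on saddr/daddr, so all flows
 * between the same pair of hosts stick to one nexthop; policy 1 also mixes
 * in the L4 ports and protocol when they are available.
 */
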
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped back packet
 *	must already have the correct destination attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flow_keys *flkeys = NULL, _flkeys;
	struct net *net = dev_net(dev);
	struct ip_tunnel_info *tun_info;
	int err = -EINVAL;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	struct flowi4 fl4;
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	 * by fib_lookup.
	 */

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
	 * more than once when daddr and/or saddr are loopback addresses.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
2038 */ 2039 fl4.flowi4_oif = 0; 2040 fl4.flowi4_iif = dev->ifindex; 2041 fl4.flowi4_mark = skb->mark; 2042 fl4.flowi4_tos = tos; 2043 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 2044 fl4.flowi4_flags = 0; 2045 fl4.daddr = daddr; 2046 fl4.saddr = saddr; 2047 fl4.flowi4_uid = sock_net_uid(net, NULL); 2048 2049 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) { 2050 flkeys = &_flkeys; 2051 } else { 2052 fl4.flowi4_proto = 0; 2053 fl4.fl4_sport = 0; 2054 fl4.fl4_dport = 0; 2055 } 2056 2057 err = fib_lookup(net, &fl4, res, 0); 2058 if (err != 0) { 2059 if (!IN_DEV_FORWARD(in_dev)) 2060 err = -EHOSTUNREACH; 2061 goto no_route; 2062 } 2063 2064 if (res->type == RTN_BROADCAST) { 2065 if (IN_DEV_BFORWARD(in_dev)) 2066 goto make_route; 2067 goto brd_input; 2068 } 2069 2070 if (res->type == RTN_LOCAL) { 2071 err = fib_validate_source(skb, saddr, daddr, tos, 2072 0, dev, in_dev, &itag); 2073 if (err < 0) 2074 goto martian_source; 2075 goto local_input; 2076 } 2077 2078 if (!IN_DEV_FORWARD(in_dev)) { 2079 err = -EHOSTUNREACH; 2080 goto no_route; 2081 } 2082 if (res->type != RTN_UNICAST) 2083 goto martian_destination; 2084 2085 make_route: 2086 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); 2087 out: return err; 2088 2089 brd_input: 2090 if (skb->protocol != htons(ETH_P_IP)) 2091 goto e_inval; 2092 2093 if (!ipv4_is_zeronet(saddr)) { 2094 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, 2095 in_dev, &itag); 2096 if (err < 0) 2097 goto martian_source; 2098 } 2099 flags |= RTCF_BROADCAST; 2100 res->type = RTN_BROADCAST; 2101 RT_CACHE_STAT_INC(in_brd); 2102 2103 local_input: 2104 do_cache = false; 2105 if (res->fi) { 2106 if (!itag) { 2107 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 2108 struct fib_nh *nh; 2109 2110 nh = container_of(nhc, struct fib_nh, nh_common); 2111 rth = rcu_dereference(nh->nh_rth_input); 2112 if (rt_cache_valid(rth)) { 2113 skb_dst_set_noref(skb, &rth->dst); 2114 err = 0; 2115 goto out; 2116 } 2117 do_cache = true; 2118 } 2119 } 2120 2121 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? 
: net->loopback_dev, 2122 flags | RTCF_LOCAL, res->type, 2123 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); 2124 if (!rth) 2125 goto e_nobufs; 2126 2127 rth->dst.output = ip_rt_bug; 2128 #ifdef CONFIG_IP_ROUTE_CLASSID 2129 rth->dst.tclassid = itag; 2130 #endif 2131 rth->rt_is_input = 1; 2132 2133 RT_CACHE_STAT_INC(in_slow_tot); 2134 if (res->type == RTN_UNREACHABLE) { 2135 rth->dst.input = ip_error; 2136 rth->dst.error = -err; 2137 rth->rt_flags &= ~RTCF_LOCAL; 2138 } 2139 2140 if (do_cache) { 2141 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 2142 struct fib_nh *nh; 2143 2144 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); 2145 if (lwtunnel_input_redirect(rth->dst.lwtstate)) { 2146 WARN_ON(rth->dst.input == lwtunnel_input); 2147 rth->dst.lwtstate->orig_input = rth->dst.input; 2148 rth->dst.input = lwtunnel_input; 2149 } 2150 2151 nh = container_of(nhc, struct fib_nh, nh_common); 2152 if (unlikely(!rt_cache_route(nh, rth))) 2153 rt_add_uncached_list(rth); 2154 } 2155 skb_dst_set(skb, &rth->dst); 2156 err = 0; 2157 goto out; 2158 2159 no_route: 2160 RT_CACHE_STAT_INC(in_no_route); 2161 res->type = RTN_UNREACHABLE; 2162 res->fi = NULL; 2163 res->table = NULL; 2164 goto local_input; 2165 2166 /* 2167 * Do not cache martian addresses: they should be logged (RFC1812) 2168 */ 2169 martian_destination: 2170 RT_CACHE_STAT_INC(in_martian_dst); 2171 #ifdef CONFIG_IP_ROUTE_VERBOSE 2172 if (IN_DEV_LOG_MARTIANS(in_dev)) 2173 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", 2174 &daddr, &saddr, dev->name); 2175 #endif 2176 2177 e_inval: 2178 err = -EINVAL; 2179 goto out; 2180 2181 e_nobufs: 2182 err = -ENOBUFS; 2183 goto out; 2184 2185 martian_source: 2186 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2187 goto out; 2188 } 2189 2190 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2191 u8 tos, struct net_device *dev) 2192 { 2193 struct fib_result res; 2194 int err; 2195 2196 tos &= IPTOS_RT_MASK; 2197 rcu_read_lock(); 2198 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); 2199 rcu_read_unlock(); 2200 2201 return err; 2202 } 2203 EXPORT_SYMBOL(ip_route_input_noref); 2204 2205 /* called with rcu_read_lock held */ 2206 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2207 u8 tos, struct net_device *dev, struct fib_result *res) 2208 { 2209 /* Multicast recognition logic was moved from the route cache to here. 2210 The problem was that too many Ethernet cards have broken/missing 2211 hardware multicast filters :-( As a result, a host on a multicasting 2212 network acquires a lot of useless route cache entries, e.g. for 2213 SDR messages from all over the world. Now we try to get rid of them. 2214 Really, provided the software IP multicast filter is organized 2215 reasonably (at least, hashed), this does not cause a slowdown 2216 compared with route cache reject entries. 2217 Note that multicast routers are not affected, because 2218 a route cache entry is created for them eventually.
2219 */ 2220 if (ipv4_is_multicast(daddr)) { 2221 struct in_device *in_dev = __in_dev_get_rcu(dev); 2222 int our = 0; 2223 int err = -EINVAL; 2224 2225 if (!in_dev) 2226 return err; 2227 our = ip_check_mc_rcu(in_dev, daddr, saddr, 2228 ip_hdr(skb)->protocol); 2229 2230 /* check l3 master if no match yet */ 2231 if (!our && netif_is_l3_slave(dev)) { 2232 struct in_device *l3_in_dev; 2233 2234 l3_in_dev = __in_dev_get_rcu(skb->dev); 2235 if (l3_in_dev) 2236 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, 2237 ip_hdr(skb)->protocol); 2238 } 2239 2240 if (our 2241 #ifdef CONFIG_IP_MROUTE 2242 || 2243 (!ipv4_is_local_multicast(daddr) && 2244 IN_DEV_MFORWARD(in_dev)) 2245 #endif 2246 ) { 2247 err = ip_route_input_mc(skb, daddr, saddr, 2248 tos, dev, our); 2249 } 2250 return err; 2251 } 2252 2253 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res); 2254 } 2255 2256 /* called with rcu_read_lock() */ 2257 static struct rtable *__mkroute_output(const struct fib_result *res, 2258 const struct flowi4 *fl4, int orig_oif, 2259 struct net_device *dev_out, 2260 unsigned int flags) 2261 { 2262 struct fib_info *fi = res->fi; 2263 struct fib_nh_exception *fnhe; 2264 struct in_device *in_dev; 2265 u16 type = res->type; 2266 struct rtable *rth; 2267 bool do_cache; 2268 2269 in_dev = __in_dev_get_rcu(dev_out); 2270 if (!in_dev) 2271 return ERR_PTR(-EINVAL); 2272 2273 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) 2274 if (ipv4_is_loopback(fl4->saddr) && 2275 !(dev_out->flags & IFF_LOOPBACK) && 2276 !netif_is_l3_master(dev_out)) 2277 return ERR_PTR(-EINVAL); 2278 2279 if (ipv4_is_lbcast(fl4->daddr)) 2280 type = RTN_BROADCAST; 2281 else if (ipv4_is_multicast(fl4->daddr)) 2282 type = RTN_MULTICAST; 2283 else if (ipv4_is_zeronet(fl4->daddr)) 2284 return ERR_PTR(-EINVAL); 2285 2286 if (dev_out->flags & IFF_LOOPBACK) 2287 flags |= RTCF_LOCAL; 2288 2289 do_cache = true; 2290 if (type == RTN_BROADCAST) { 2291 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2292 fi = NULL; 2293 } else if (type == RTN_MULTICAST) { 2294 flags |= RTCF_MULTICAST | RTCF_LOCAL; 2295 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, 2296 fl4->flowi4_proto)) 2297 flags &= ~RTCF_LOCAL; 2298 else 2299 do_cache = false; 2300 /* If multicast route do not exist use 2301 * default one, but do not gateway in this case. 2302 * Yes, it is hack. 2303 */ 2304 if (fi && res->prefixlen < 4) 2305 fi = NULL; 2306 } else if ((type == RTN_LOCAL) && (orig_oif != 0) && 2307 (orig_oif != dev_out->ifindex)) { 2308 /* For local routes that require a particular output interface 2309 * we do not want to cache the result. Caching the result 2310 * causes incorrect behaviour when there are multiple source 2311 * addresses on the interface, the end result being that if the 2312 * intended recipient is waiting on that interface for the 2313 * packet he won't receive it because it will be delivered on 2314 * the loopback interface and the IP_PKTINFO ipi_ifindex will 2315 * be set to the loopback interface as well. 
2316 */ 2317 do_cache = false; 2318 } 2319 2320 fnhe = NULL; 2321 do_cache &= fi != NULL; 2322 if (fi) { 2323 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 2324 struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); 2325 struct rtable __rcu **prth; 2326 2327 fnhe = find_exception(nh, fl4->daddr); 2328 if (!do_cache) 2329 goto add; 2330 if (fnhe) { 2331 prth = &fnhe->fnhe_rth_output; 2332 } else { 2333 if (unlikely(fl4->flowi4_flags & 2334 FLOWI_FLAG_KNOWN_NH && 2335 !(nhc->nhc_gw_family && 2336 nhc->nhc_scope == RT_SCOPE_LINK))) { 2337 do_cache = false; 2338 goto add; 2339 } 2340 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); 2341 } 2342 rth = rcu_dereference(*prth); 2343 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) 2344 return rth; 2345 } 2346 2347 add: 2348 rth = rt_dst_alloc(dev_out, flags, type, 2349 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2350 IN_DEV_CONF_GET(in_dev, NOXFRM), 2351 do_cache); 2352 if (!rth) 2353 return ERR_PTR(-ENOBUFS); 2354 2355 rth->rt_iif = orig_oif; 2356 2357 RT_CACHE_STAT_INC(out_slow_tot); 2358 2359 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2360 if (flags & RTCF_LOCAL && 2361 !(dev_out->flags & IFF_LOOPBACK)) { 2362 rth->dst.output = ip_mc_output; 2363 RT_CACHE_STAT_INC(out_slow_mc); 2364 } 2365 #ifdef CONFIG_IP_MROUTE 2366 if (type == RTN_MULTICAST) { 2367 if (IN_DEV_MFORWARD(in_dev) && 2368 !ipv4_is_local_multicast(fl4->daddr)) { 2369 rth->dst.input = ip_mr_input; 2370 rth->dst.output = ip_mc_output; 2371 } 2372 } 2373 #endif 2374 } 2375 2376 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); 2377 lwtunnel_set_redirect(&rth->dst); 2378 2379 return rth; 2380 } 2381 2382 /* 2383 * Major route resolver routine. 2384 */ 2385 2386 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, 2387 const struct sk_buff *skb) 2388 { 2389 __u8 tos = RT_FL_TOS(fl4); 2390 struct fib_result res = { 2391 .type = RTN_UNSPEC, 2392 .fi = NULL, 2393 .table = NULL, 2394 .tclassid = 0, 2395 }; 2396 struct rtable *rth; 2397 2398 fl4->flowi4_iif = LOOPBACK_IFINDEX; 2399 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 2400 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 2401 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 2402 2403 rcu_read_lock(); 2404 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); 2405 rcu_read_unlock(); 2406 2407 return rth; 2408 } 2409 EXPORT_SYMBOL_GPL(ip_route_output_key_hash); 2410 2411 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, 2412 struct fib_result *res, 2413 const struct sk_buff *skb) 2414 { 2415 struct net_device *dev_out = NULL; 2416 int orig_oif = fl4->flowi4_oif; 2417 unsigned int flags = 0; 2418 struct rtable *rth; 2419 int err = -ENETUNREACH; 2420 2421 if (fl4->saddr) { 2422 rth = ERR_PTR(-EINVAL); 2423 if (ipv4_is_multicast(fl4->saddr) || 2424 ipv4_is_lbcast(fl4->saddr) || 2425 ipv4_is_zeronet(fl4->saddr)) 2426 goto out; 2427 2428 /* I removed check for oif == dev_out->oif here. 2429 It was wrong for two reasons: 2430 1. ip_dev_find(net, saddr) can return wrong iface, if saddr 2431 is assigned to multiple interfaces. 2432 2. Moreover, we are allowed to send packets with saddr 2433 of another iface. 
--ANK 2434 */ 2435 2436 if (fl4->flowi4_oif == 0 && 2437 (ipv4_is_multicast(fl4->daddr) || 2438 ipv4_is_lbcast(fl4->daddr))) { 2439 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2440 dev_out = __ip_dev_find(net, fl4->saddr, false); 2441 if (!dev_out) 2442 goto out; 2443 2444 /* Special hack: the user can direct multicasts 2445 and limited broadcast via the necessary interface 2446 without fiddling with IP_MULTICAST_IF or IP_PKTINFO. 2447 This hack is not just for fun; it allows 2448 vic, vat and friends to work. 2449 They bind a socket to loopback, set the ttl to zero 2450 and expect that it will work. 2451 From the viewpoint of the routing cache they are broken, 2452 because we are not allowed to build a multicast path 2453 with a loopback source address (the routing cache 2454 cannot know that the ttl is zero, so the packet 2455 will not leave this host even though the route is valid). 2456 Luckily, this hack is a good workaround. 2457 */ 2458 2459 fl4->flowi4_oif = dev_out->ifindex; 2460 goto make_route; 2461 } 2462 2463 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { 2464 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2465 if (!__ip_dev_find(net, fl4->saddr, false)) 2466 goto out; 2467 } 2468 } 2469 2470 2471 if (fl4->flowi4_oif) { 2472 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); 2473 rth = ERR_PTR(-ENODEV); 2474 if (!dev_out) 2475 goto out; 2476 2477 /* RACE: Check return value of inet_select_addr instead. */ 2478 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { 2479 rth = ERR_PTR(-ENETUNREACH); 2480 goto out; 2481 } 2482 if (ipv4_is_local_multicast(fl4->daddr) || 2483 ipv4_is_lbcast(fl4->daddr) || 2484 fl4->flowi4_proto == IPPROTO_IGMP) { 2485 if (!fl4->saddr) 2486 fl4->saddr = inet_select_addr(dev_out, 0, 2487 RT_SCOPE_LINK); 2488 goto make_route; 2489 } 2490 if (!fl4->saddr) { 2491 if (ipv4_is_multicast(fl4->daddr)) 2492 fl4->saddr = inet_select_addr(dev_out, 0, 2493 fl4->flowi4_scope); 2494 else if (!fl4->daddr) 2495 fl4->saddr = inet_select_addr(dev_out, 0, 2496 RT_SCOPE_HOST); 2497 } 2498 } 2499 2500 if (!fl4->daddr) { 2501 fl4->daddr = fl4->saddr; 2502 if (!fl4->daddr) 2503 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); 2504 dev_out = net->loopback_dev; 2505 fl4->flowi4_oif = LOOPBACK_IFINDEX; 2506 res->type = RTN_LOCAL; 2507 flags |= RTCF_LOCAL; 2508 goto make_route; 2509 } 2510 2511 err = fib_lookup(net, fl4, res, 0); 2512 if (err) { 2513 res->fi = NULL; 2514 res->table = NULL; 2515 if (fl4->flowi4_oif && 2516 (ipv4_is_multicast(fl4->daddr) || 2517 !netif_index_is_l3_master(net, fl4->flowi4_oif))) { 2518 /* Apparently, the routing tables are wrong. Assume 2519 that the destination is on-link. 2520 2521 WHY? DW. 2522 Because we are allowed to send to an interface 2523 even if it has NO routes and NO assigned 2524 addresses. When oif is specified, the routing 2525 tables are looked up with only one purpose: 2526 to catch whether the destination is gatewayed rather than 2527 direct. Moreover, if MSG_DONTROUTE is set, 2528 we send the packet ignoring both the routing tables 2529 and the ifaddr state. --ANK 2530 2531 2532 We could do the same even when oif is unknown, 2533 as IPv6 likely does, but we do not.
2534 */ 2535 2536 if (fl4->saddr == 0) 2537 fl4->saddr = inet_select_addr(dev_out, 0, 2538 RT_SCOPE_LINK); 2539 res->type = RTN_UNICAST; 2540 goto make_route; 2541 } 2542 rth = ERR_PTR(err); 2543 goto out; 2544 } 2545 2546 if (res->type == RTN_LOCAL) { 2547 if (!fl4->saddr) { 2548 if (res->fi->fib_prefsrc) 2549 fl4->saddr = res->fi->fib_prefsrc; 2550 else 2551 fl4->saddr = fl4->daddr; 2552 } 2553 2554 /* L3 master device is the loopback for that domain */ 2555 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? : 2556 net->loopback_dev; 2557 2558 /* make sure orig_oif points to fib result device even 2559 * though packet rx/tx happens over loopback or l3mdev 2560 */ 2561 orig_oif = FIB_RES_OIF(*res); 2562 2563 fl4->flowi4_oif = dev_out->ifindex; 2564 flags |= RTCF_LOCAL; 2565 goto make_route; 2566 } 2567 2568 fib_select_path(net, res, fl4, skb); 2569 2570 dev_out = FIB_RES_DEV(*res); 2571 fl4->flowi4_oif = dev_out->ifindex; 2572 2573 2574 make_route: 2575 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); 2576 2577 out: 2578 return rth; 2579 } 2580 2581 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 2582 { 2583 return NULL; 2584 } 2585 2586 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) 2587 { 2588 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 2589 2590 return mtu ? : dst->dev->mtu; 2591 } 2592 2593 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 2594 struct sk_buff *skb, u32 mtu) 2595 { 2596 } 2597 2598 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 2599 struct sk_buff *skb) 2600 { 2601 } 2602 2603 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, 2604 unsigned long old) 2605 { 2606 return NULL; 2607 } 2608 2609 static struct dst_ops ipv4_dst_blackhole_ops = { 2610 .family = AF_INET, 2611 .check = ipv4_blackhole_dst_check, 2612 .mtu = ipv4_blackhole_mtu, 2613 .default_advmss = ipv4_default_advmss, 2614 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2615 .redirect = ipv4_rt_blackhole_redirect, 2616 .cow_metrics = ipv4_rt_blackhole_cow_metrics, 2617 .neigh_lookup = ipv4_neigh_lookup, 2618 }; 2619 2620 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2621 { 2622 struct rtable *ort = (struct rtable *) dst_orig; 2623 struct rtable *rt; 2624 2625 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); 2626 if (rt) { 2627 struct dst_entry *new = &rt->dst; 2628 2629 new->__use = 1; 2630 new->input = dst_discard; 2631 new->output = dst_discard_out; 2632 2633 new->dev = net->loopback_dev; 2634 if (new->dev) 2635 dev_hold(new->dev); 2636 2637 rt->rt_is_input = ort->rt_is_input; 2638 rt->rt_iif = ort->rt_iif; 2639 rt->rt_pmtu = ort->rt_pmtu; 2640 rt->rt_mtu_locked = ort->rt_mtu_locked; 2641 2642 rt->rt_genid = rt_genid_ipv4(net); 2643 rt->rt_flags = ort->rt_flags; 2644 rt->rt_type = ort->rt_type; 2645 rt->rt_gw_family = ort->rt_gw_family; 2646 if (rt->rt_gw_family == AF_INET) 2647 rt->rt_gw4 = ort->rt_gw4; 2648 else if (rt->rt_gw_family == AF_INET6) 2649 rt->rt_gw6 = ort->rt_gw6; 2650 2651 INIT_LIST_HEAD(&rt->rt_uncached); 2652 } 2653 2654 dst_release(dst_orig); 2655 2656 return rt ? 
&rt->dst : ERR_PTR(-ENOMEM); 2657 } 2658 2659 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, 2660 const struct sock *sk) 2661 { 2662 struct rtable *rt = __ip_route_output_key(net, flp4); 2663 2664 if (IS_ERR(rt)) 2665 return rt; 2666 2667 if (flp4->flowi4_proto) 2668 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, 2669 flowi4_to_flowi(flp4), 2670 sk, 0); 2671 2672 return rt; 2673 } 2674 EXPORT_SYMBOL_GPL(ip_route_output_flow); 2675 2676 /* called with rcu_read_lock held */ 2677 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, 2678 struct rtable *rt, u32 table_id, struct flowi4 *fl4, 2679 struct sk_buff *skb, u32 portid, u32 seq) 2680 { 2681 struct rtmsg *r; 2682 struct nlmsghdr *nlh; 2683 unsigned long expires = 0; 2684 u32 error; 2685 u32 metrics[RTAX_MAX]; 2686 2687 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0); 2688 if (!nlh) 2689 return -EMSGSIZE; 2690 2691 r = nlmsg_data(nlh); 2692 r->rtm_family = AF_INET; 2693 r->rtm_dst_len = 32; 2694 r->rtm_src_len = 0; 2695 r->rtm_tos = fl4->flowi4_tos; 2696 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT; 2697 if (nla_put_u32(skb, RTA_TABLE, table_id)) 2698 goto nla_put_failure; 2699 r->rtm_type = rt->rt_type; 2700 r->rtm_scope = RT_SCOPE_UNIVERSE; 2701 r->rtm_protocol = RTPROT_UNSPEC; 2702 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 2703 if (rt->rt_flags & RTCF_NOTIFY) 2704 r->rtm_flags |= RTM_F_NOTIFY; 2705 if (IPCB(skb)->flags & IPSKB_DOREDIRECT) 2706 r->rtm_flags |= RTCF_DOREDIRECT; 2707 2708 if (nla_put_in_addr(skb, RTA_DST, dst)) 2709 goto nla_put_failure; 2710 if (src) { 2711 r->rtm_src_len = 32; 2712 if (nla_put_in_addr(skb, RTA_SRC, src)) 2713 goto nla_put_failure; 2714 } 2715 if (rt->dst.dev && 2716 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 2717 goto nla_put_failure; 2718 #ifdef CONFIG_IP_ROUTE_CLASSID 2719 if (rt->dst.tclassid && 2720 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) 2721 goto nla_put_failure; 2722 #endif 2723 if (!rt_is_input_route(rt) && 2724 fl4->saddr != src) { 2725 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) 2726 goto nla_put_failure; 2727 } 2728 if (rt->rt_gw_family == AF_INET && 2729 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { 2730 goto nla_put_failure; 2731 } else if (rt->rt_gw_family == AF_INET6) { 2732 int alen = sizeof(struct in6_addr); 2733 struct nlattr *nla; 2734 struct rtvia *via; 2735 2736 nla = nla_reserve(skb, RTA_VIA, alen + 2); 2737 if (!nla) 2738 goto nla_put_failure; 2739 2740 via = nla_data(nla); 2741 via->rtvia_family = AF_INET6; 2742 memcpy(via->rtvia_addr, &rt->rt_gw6, alen); 2743 } 2744 2745 expires = rt->dst.expires; 2746 if (expires) { 2747 unsigned long now = jiffies; 2748 2749 if (time_before(now, expires)) 2750 expires -= now; 2751 else 2752 expires = 0; 2753 } 2754 2755 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 2756 if (rt->rt_pmtu && expires) 2757 metrics[RTAX_MTU - 1] = rt->rt_pmtu; 2758 if (rt->rt_mtu_locked && expires) 2759 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU); 2760 if (rtnetlink_put_metrics(skb, metrics) < 0) 2761 goto nla_put_failure; 2762 2763 if (fl4->flowi4_mark && 2764 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) 2765 goto nla_put_failure; 2766 2767 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && 2768 nla_put_u32(skb, RTA_UID, 2769 from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) 2770 goto nla_put_failure; 2771 2772 error = rt->dst.error; 2773 2774 if (rt_is_input_route(rt)) { 2775 #ifdef CONFIG_IP_MROUTE 2776 if (ipv4_is_multicast(dst) && 
!ipv4_is_local_multicast(dst) && 2777 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2778 int err = ipmr_get_route(net, skb, 2779 fl4->saddr, fl4->daddr, 2780 r, portid); 2781 2782 if (err <= 0) { 2783 if (err == 0) 2784 return 0; 2785 goto nla_put_failure; 2786 } 2787 } else 2788 #endif 2789 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) 2790 goto nla_put_failure; 2791 } 2792 2793 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) 2794 goto nla_put_failure; 2795 2796 nlmsg_end(skb, nlh); 2797 return 0; 2798 2799 nla_put_failure: 2800 nlmsg_cancel(skb, nlh); 2801 return -EMSGSIZE; 2802 } 2803 2804 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, 2805 u8 ip_proto, __be16 sport, 2806 __be16 dport) 2807 { 2808 struct sk_buff *skb; 2809 struct iphdr *iph; 2810 2811 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2812 if (!skb) 2813 return NULL; 2814 2815 /* Reserve room for dummy headers, this skb can pass 2816 * through good chunk of routing engine. 2817 */ 2818 skb_reset_mac_header(skb); 2819 skb_reset_network_header(skb); 2820 skb->protocol = htons(ETH_P_IP); 2821 iph = skb_put(skb, sizeof(struct iphdr)); 2822 iph->protocol = ip_proto; 2823 iph->saddr = src; 2824 iph->daddr = dst; 2825 iph->version = 0x4; 2826 iph->frag_off = 0; 2827 iph->ihl = 0x5; 2828 skb_set_transport_header(skb, skb->len); 2829 2830 switch (iph->protocol) { 2831 case IPPROTO_UDP: { 2832 struct udphdr *udph; 2833 2834 udph = skb_put_zero(skb, sizeof(struct udphdr)); 2835 udph->source = sport; 2836 udph->dest = dport; 2837 udph->len = sizeof(struct udphdr); 2838 udph->check = 0; 2839 break; 2840 } 2841 case IPPROTO_TCP: { 2842 struct tcphdr *tcph; 2843 2844 tcph = skb_put_zero(skb, sizeof(struct tcphdr)); 2845 tcph->source = sport; 2846 tcph->dest = dport; 2847 tcph->doff = sizeof(struct tcphdr) / 4; 2848 tcph->rst = 1; 2849 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), 2850 src, dst, 0); 2851 break; 2852 } 2853 case IPPROTO_ICMP: { 2854 struct icmphdr *icmph; 2855 2856 icmph = skb_put_zero(skb, sizeof(struct icmphdr)); 2857 icmph->type = ICMP_ECHO; 2858 icmph->code = 0; 2859 } 2860 } 2861 2862 return skb; 2863 } 2864 2865 static int inet_rtm_valid_getroute_req(struct sk_buff *skb, 2866 const struct nlmsghdr *nlh, 2867 struct nlattr **tb, 2868 struct netlink_ext_ack *extack) 2869 { 2870 struct rtmsg *rtm; 2871 int i, err; 2872 2873 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 2874 NL_SET_ERR_MSG(extack, 2875 "ipv4: Invalid header for route get request"); 2876 return -EINVAL; 2877 } 2878 2879 if (!netlink_strict_get_check(skb)) 2880 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, 2881 rtm_ipv4_policy, extack); 2882 2883 rtm = nlmsg_data(nlh); 2884 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || 2885 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || 2886 rtm->rtm_table || rtm->rtm_protocol || 2887 rtm->rtm_scope || rtm->rtm_type) { 2888 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); 2889 return -EINVAL; 2890 } 2891 2892 if (rtm->rtm_flags & ~(RTM_F_NOTIFY | 2893 RTM_F_LOOKUP_TABLE | 2894 RTM_F_FIB_MATCH)) { 2895 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); 2896 return -EINVAL; 2897 } 2898 2899 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 2900 rtm_ipv4_policy, extack); 2901 if (err) 2902 return err; 2903 2904 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 2905 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 2906 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); 2907 return -EINVAL; 2908 
} 2909 2910 for (i = 0; i <= RTA_MAX; i++) { 2911 if (!tb[i]) 2912 continue; 2913 2914 switch (i) { 2915 case RTA_IIF: 2916 case RTA_OIF: 2917 case RTA_SRC: 2918 case RTA_DST: 2919 case RTA_IP_PROTO: 2920 case RTA_SPORT: 2921 case RTA_DPORT: 2922 case RTA_MARK: 2923 case RTA_UID: 2924 break; 2925 default: 2926 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); 2927 return -EINVAL; 2928 } 2929 } 2930 2931 return 0; 2932 } 2933 2934 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 2935 struct netlink_ext_ack *extack) 2936 { 2937 struct net *net = sock_net(in_skb->sk); 2938 struct nlattr *tb[RTA_MAX+1]; 2939 u32 table_id = RT_TABLE_MAIN; 2940 __be16 sport = 0, dport = 0; 2941 struct fib_result res = {}; 2942 u8 ip_proto = IPPROTO_UDP; 2943 struct rtable *rt = NULL; 2944 struct sk_buff *skb; 2945 struct rtmsg *rtm; 2946 struct flowi4 fl4 = {}; 2947 __be32 dst = 0; 2948 __be32 src = 0; 2949 kuid_t uid; 2950 u32 iif; 2951 int err; 2952 int mark; 2953 2954 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 2955 if (err < 0) 2956 return err; 2957 2958 rtm = nlmsg_data(nlh); 2959 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; 2960 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; 2961 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2962 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 2963 if (tb[RTA_UID]) 2964 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); 2965 else 2966 uid = (iif ? INVALID_UID : current_uid()); 2967 2968 if (tb[RTA_IP_PROTO]) { 2969 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 2970 &ip_proto, AF_INET, extack); 2971 if (err) 2972 return err; 2973 } 2974 2975 if (tb[RTA_SPORT]) 2976 sport = nla_get_be16(tb[RTA_SPORT]); 2977 2978 if (tb[RTA_DPORT]) 2979 dport = nla_get_be16(tb[RTA_DPORT]); 2980 2981 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport); 2982 if (!skb) 2983 return -ENOBUFS; 2984 2985 fl4.daddr = dst; 2986 fl4.saddr = src; 2987 fl4.flowi4_tos = rtm->rtm_tos; 2988 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; 2989 fl4.flowi4_mark = mark; 2990 fl4.flowi4_uid = uid; 2991 if (sport) 2992 fl4.fl4_sport = sport; 2993 if (dport) 2994 fl4.fl4_dport = dport; 2995 fl4.flowi4_proto = ip_proto; 2996 2997 rcu_read_lock(); 2998 2999 if (iif) { 3000 struct net_device *dev; 3001 3002 dev = dev_get_by_index_rcu(net, iif); 3003 if (!dev) { 3004 err = -ENODEV; 3005 goto errout_rcu; 3006 } 3007 3008 fl4.flowi4_iif = iif; /* for rt_fill_info */ 3009 skb->dev = dev; 3010 skb->mark = mark; 3011 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, 3012 dev, &res); 3013 3014 rt = skb_rtable(skb); 3015 if (err == 0 && rt->dst.error) 3016 err = -rt->dst.error; 3017 } else { 3018 fl4.flowi4_iif = LOOPBACK_IFINDEX; 3019 skb->dev = net->loopback_dev; 3020 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); 3021 err = 0; 3022 if (IS_ERR(rt)) 3023 err = PTR_ERR(rt); 3024 else 3025 skb_dst_set(skb, &rt->dst); 3026 } 3027 3028 if (err) 3029 goto errout_rcu; 3030 3031 if (rtm->rtm_flags & RTM_F_NOTIFY) 3032 rt->rt_flags |= RTCF_NOTIFY; 3033 3034 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) 3035 table_id = res.table ? 
res.table->tb_id : 0; 3036 3037 /* reset skb for netlink reply msg */ 3038 skb_trim(skb, 0); 3039 skb_reset_network_header(skb); 3040 skb_reset_transport_header(skb); 3041 skb_reset_mac_header(skb); 3042 3043 if (rtm->rtm_flags & RTM_F_FIB_MATCH) { 3044 if (!res.fi) { 3045 err = fib_props[res.type].error; 3046 if (!err) 3047 err = -EHOSTUNREACH; 3048 goto errout_rcu; 3049 } 3050 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, 3051 nlh->nlmsg_seq, RTM_NEWROUTE, table_id, 3052 rt->rt_type, res.prefix, res.prefixlen, 3053 fl4.flowi4_tos, res.fi, 0); 3054 } else { 3055 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, 3056 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); 3057 } 3058 if (err < 0) 3059 goto errout_rcu; 3060 3061 rcu_read_unlock(); 3062 3063 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 3064 3065 errout_free: 3066 return err; 3067 errout_rcu: 3068 rcu_read_unlock(); 3069 kfree_skb(skb); 3070 goto errout_free; 3071 } 3072 3073 void ip_rt_multicast_event(struct in_device *in_dev) 3074 { 3075 rt_cache_flush(dev_net(in_dev->dev)); 3076 } 3077 3078 #ifdef CONFIG_SYSCTL 3079 static int ip_rt_gc_interval __read_mostly = 60 * HZ; 3080 static int ip_rt_gc_min_interval __read_mostly = HZ / 2; 3081 static int ip_rt_gc_elasticity __read_mostly = 8; 3082 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; 3083 3084 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, 3085 void __user *buffer, 3086 size_t *lenp, loff_t *ppos) 3087 { 3088 struct net *net = (struct net *)__ctl->extra1; 3089 3090 if (write) { 3091 rt_cache_flush(net); 3092 fnhe_genid_bump(net); 3093 return 0; 3094 } 3095 3096 return -EINVAL; 3097 } 3098 3099 static struct ctl_table ipv4_route_table[] = { 3100 { 3101 .procname = "gc_thresh", 3102 .data = &ipv4_dst_ops.gc_thresh, 3103 .maxlen = sizeof(int), 3104 .mode = 0644, 3105 .proc_handler = proc_dointvec, 3106 }, 3107 { 3108 .procname = "max_size", 3109 .data = &ip_rt_max_size, 3110 .maxlen = sizeof(int), 3111 .mode = 0644, 3112 .proc_handler = proc_dointvec, 3113 }, 3114 { 3115 /* Deprecated. 
Use gc_min_interval_ms */ 3116 3117 .procname = "gc_min_interval", 3118 .data = &ip_rt_gc_min_interval, 3119 .maxlen = sizeof(int), 3120 .mode = 0644, 3121 .proc_handler = proc_dointvec_jiffies, 3122 }, 3123 { 3124 .procname = "gc_min_interval_ms", 3125 .data = &ip_rt_gc_min_interval, 3126 .maxlen = sizeof(int), 3127 .mode = 0644, 3128 .proc_handler = proc_dointvec_ms_jiffies, 3129 }, 3130 { 3131 .procname = "gc_timeout", 3132 .data = &ip_rt_gc_timeout, 3133 .maxlen = sizeof(int), 3134 .mode = 0644, 3135 .proc_handler = proc_dointvec_jiffies, 3136 }, 3137 { 3138 .procname = "gc_interval", 3139 .data = &ip_rt_gc_interval, 3140 .maxlen = sizeof(int), 3141 .mode = 0644, 3142 .proc_handler = proc_dointvec_jiffies, 3143 }, 3144 { 3145 .procname = "redirect_load", 3146 .data = &ip_rt_redirect_load, 3147 .maxlen = sizeof(int), 3148 .mode = 0644, 3149 .proc_handler = proc_dointvec, 3150 }, 3151 { 3152 .procname = "redirect_number", 3153 .data = &ip_rt_redirect_number, 3154 .maxlen = sizeof(int), 3155 .mode = 0644, 3156 .proc_handler = proc_dointvec, 3157 }, 3158 { 3159 .procname = "redirect_silence", 3160 .data = &ip_rt_redirect_silence, 3161 .maxlen = sizeof(int), 3162 .mode = 0644, 3163 .proc_handler = proc_dointvec, 3164 }, 3165 { 3166 .procname = "error_cost", 3167 .data = &ip_rt_error_cost, 3168 .maxlen = sizeof(int), 3169 .mode = 0644, 3170 .proc_handler = proc_dointvec, 3171 }, 3172 { 3173 .procname = "error_burst", 3174 .data = &ip_rt_error_burst, 3175 .maxlen = sizeof(int), 3176 .mode = 0644, 3177 .proc_handler = proc_dointvec, 3178 }, 3179 { 3180 .procname = "gc_elasticity", 3181 .data = &ip_rt_gc_elasticity, 3182 .maxlen = sizeof(int), 3183 .mode = 0644, 3184 .proc_handler = proc_dointvec, 3185 }, 3186 { 3187 .procname = "mtu_expires", 3188 .data = &ip_rt_mtu_expires, 3189 .maxlen = sizeof(int), 3190 .mode = 0644, 3191 .proc_handler = proc_dointvec_jiffies, 3192 }, 3193 { 3194 .procname = "min_pmtu", 3195 .data = &ip_rt_min_pmtu, 3196 .maxlen = sizeof(int), 3197 .mode = 0644, 3198 .proc_handler = proc_dointvec_minmax, 3199 .extra1 = &ip_min_valid_pmtu, 3200 }, 3201 { 3202 .procname = "min_adv_mss", 3203 .data = &ip_rt_min_advmss, 3204 .maxlen = sizeof(int), 3205 .mode = 0644, 3206 .proc_handler = proc_dointvec, 3207 }, 3208 { } 3209 }; 3210 3211 static struct ctl_table ipv4_route_flush_table[] = { 3212 { 3213 .procname = "flush", 3214 .maxlen = sizeof(int), 3215 .mode = 0200, 3216 .proc_handler = ipv4_sysctl_rtcache_flush, 3217 }, 3218 { }, 3219 }; 3220 3221 static __net_init int sysctl_route_net_init(struct net *net) 3222 { 3223 struct ctl_table *tbl; 3224 3225 tbl = ipv4_route_flush_table; 3226 if (!net_eq(net, &init_net)) { 3227 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 3228 if (!tbl) 3229 goto err_dup; 3230 3231 /* Don't export sysctls to unprivileged users */ 3232 if (net->user_ns != &init_user_ns) 3233 tbl[0].procname = NULL; 3234 } 3235 tbl[0].extra1 = net; 3236 3237 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); 3238 if (!net->ipv4.route_hdr) 3239 goto err_reg; 3240 return 0; 3241 3242 err_reg: 3243 if (tbl != ipv4_route_flush_table) 3244 kfree(tbl); 3245 err_dup: 3246 return -ENOMEM; 3247 } 3248 3249 static __net_exit void sysctl_route_net_exit(struct net *net) 3250 { 3251 struct ctl_table *tbl; 3252 3253 tbl = net->ipv4.route_hdr->ctl_table_arg; 3254 unregister_net_sysctl_table(net->ipv4.route_hdr); 3255 BUG_ON(tbl == ipv4_route_flush_table); 3256 kfree(tbl); 3257 } 3258 3259 static __net_initdata struct pernet_operations 
sysctl_route_ops = { 3260 .init = sysctl_route_net_init, 3261 .exit = sysctl_route_net_exit, 3262 }; 3263 #endif 3264 3265 static __net_init int rt_genid_init(struct net *net) 3266 { 3267 atomic_set(&net->ipv4.rt_genid, 0); 3268 atomic_set(&net->fnhe_genid, 0); 3269 atomic_set(&net->ipv4.dev_addr_genid, get_random_int()); 3270 return 0; 3271 } 3272 3273 static __net_initdata struct pernet_operations rt_genid_ops = { 3274 .init = rt_genid_init, 3275 }; 3276 3277 static int __net_init ipv4_inetpeer_init(struct net *net) 3278 { 3279 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 3280 3281 if (!bp) 3282 return -ENOMEM; 3283 inet_peer_base_init(bp); 3284 net->ipv4.peers = bp; 3285 return 0; 3286 } 3287 3288 static void __net_exit ipv4_inetpeer_exit(struct net *net) 3289 { 3290 struct inet_peer_base *bp = net->ipv4.peers; 3291 3292 net->ipv4.peers = NULL; 3293 inetpeer_invalidate_tree(bp); 3294 kfree(bp); 3295 } 3296 3297 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { 3298 .init = ipv4_inetpeer_init, 3299 .exit = ipv4_inetpeer_exit, 3300 }; 3301 3302 #ifdef CONFIG_IP_ROUTE_CLASSID 3303 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3304 #endif /* CONFIG_IP_ROUTE_CLASSID */ 3305 3306 int __init ip_rt_init(void) 3307 { 3308 int cpu; 3309 3310 ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents), 3311 GFP_KERNEL); 3312 if (!ip_idents) 3313 panic("IP: failed to allocate ip_idents\n"); 3314 3315 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); 3316 3317 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); 3318 if (!ip_tstamps) 3319 panic("IP: failed to allocate ip_tstamps\n"); 3320 3321 for_each_possible_cpu(cpu) { 3322 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); 3323 3324 INIT_LIST_HEAD(&ul->head); 3325 spin_lock_init(&ul->lock); 3326 } 3327 #ifdef CONFIG_IP_ROUTE_CLASSID 3328 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3329 if (!ip_rt_acct) 3330 panic("IP: failed to allocate ip_rt_acct\n"); 3331 #endif 3332 3333 ipv4_dst_ops.kmem_cachep = 3334 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, 3335 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 3336 3337 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3338 3339 if (dst_entries_init(&ipv4_dst_ops) < 0) 3340 panic("IP: failed to allocate ipv4_dst_ops counter\n"); 3341 3342 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 3343 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 3344 3345 ipv4_dst_ops.gc_thresh = ~0; 3346 ip_rt_max_size = INT_MAX; 3347 3348 devinet_init(); 3349 ip_fib_init(); 3350 3351 if (ip_rt_proc_init()) 3352 pr_err("Unable to create route proc files\n"); 3353 #ifdef CONFIG_XFRM 3354 xfrm_init(); 3355 xfrm4_init(); 3356 #endif 3357 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, 3358 RTNL_FLAG_DOIT_UNLOCKED); 3359 3360 #ifdef CONFIG_SYSCTL 3361 register_pernet_subsys(&sysctl_route_ops); 3362 #endif 3363 register_pernet_subsys(&rt_genid_ops); 3364 register_pernet_subsys(&ipv4_inetpeer_ops); 3365 return 0; 3366 } 3367 3368 #ifdef CONFIG_SYSCTL 3369 /* 3370 * We really need to sanitize the damn ipv4 init order, then all 3371 * this nonsense will go away. 3372 */ 3373 void __init ip_static_sysctl_init(void) 3374 { 3375 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); 3376 } 3377 #endif 3378
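/*
 * Usage sketch (illustrative only, not compiled as part of this file): the
 * RTM_GETROUTE doit registered above via rtnl_register() is what answers
 * "ip route get <addr>".  A minimal userspace query over raw netlink could
 * look roughly like the helper below; the name "query_route" is made up for
 * illustration and all error handling is omitted:
 *
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *	#include <linux/netlink.h>
 *	#include <linux/rtnetlink.h>
 *
 *	static void query_route(const char *dst)
 *	{
 *		struct {
 *			struct nlmsghdr nlh;
 *			struct rtmsg rtm;
 *			char attrs[64];
 *		} req;
 *		struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
 *		struct rtattr *rta;
 *		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *		memset(&req, 0, sizeof(req));
 *		req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
 *		req.nlh.nlmsg_type = RTM_GETROUTE;
 *		req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *		req.rtm.rtm_family = AF_INET;
 *		// under strict checking (inet_rtm_valid_getroute_req above),
 *		// rtm_dst_len must be 32 whenever RTA_DST is supplied
 *		req.rtm.rtm_dst_len = 32;
 *
 *		rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
 *		rta->rta_type = RTA_DST;
 *		rta->rta_len = RTA_LENGTH(4);
 *		inet_pton(AF_INET, dst, RTA_DATA(rta));
 *		req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;
 *
 *		sendto(fd, &req, req.nlh.nlmsg_len, 0,
 *		       (struct sockaddr *)&kernel, sizeof(kernel));
 *		// recv() then returns the RTM_NEWROUTE reply filled in by
 *		// rt_fill_info() (or fib_dump_info() when RTM_F_FIB_MATCH is set)
 *		close(fd);
 *	}
 *
 * Under strict checking only the attributes accepted by
 * inet_rtm_valid_getroute_req() (RTA_IIF, RTA_OIF, RTA_SRC, RTA_DST,
 * RTA_IP_PROTO, RTA_SPORT, RTA_DPORT, RTA_MARK, RTA_UID) may be carried in
 * such a request.
 */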