// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
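/* Worked example for the tunables above, assuming HZ=1000: ip_rt_redirect_load
 * is 20 jiffies (~20ms), and ip_rt_redirect_silence is that value shifted left
 * by (ip_rt_redirect_number + 1), i.e. roughly 20 seconds of quiet before the
 * redirect back-off state kept by ip_rt_send_redirect() is reset.
 */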
/*
 *	Interface to generic destination cache.
 */

INDIRECT_CALLABLE_SCOPE
struct dst_entry	*ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int		ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu,
					   bool confirm_neigh);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void		 ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
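/* Example, assuming the consumer is the rt_tos2priority() helper from
 * <net/route.h>: the table is indexed by the four TOS bits shifted right by
 * one, so e.g. IPTOS_LOWDELAY (0x10) selects entry 8, TC_PRIO_INTERACTIVE.
 */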
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start	= rt_cache_seq_start,
	.next	= rt_cache_seq_next,
	.stop	= rt_cache_seq_stop,
	.show	= rt_cache_seq_show,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	(*pos)++;
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start	= rt_cpu_seq_start,
	.next	= rt_cpu_seq_next,
	.stop	= rt_cpu_seq_stop,
	.show	= rt_cpu_seq_show,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create_seq("rt_cache", 0444, net->proc_net,
			      &rt_cache_seq_ops);
	if (!pde)
		goto err1;

	pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
			      &rt_cpu_seq_ops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
				 rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	struct neighbour *n;

	rcu_read_lock_bh();

	if (likely(rt->rt_gw_family == AF_INET)) {
		n = ip_neigh_gw4(dev, rt->rt_gw4);
	} else if (rt->rt_gw_family == AF_INET6) {
		n = ip_neigh_gw6(dev, &rt->rt_gw6);
	} else {
		__be32 pkey;

		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
		n = ip_neigh_gw4(dev, pkey);
	}

	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
		n = NULL;

	rcu_read_unlock_bh();

	return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;

	if (rt->rt_gw_family == AF_INET) {
		pkey = (const __be32 *)&rt->rt_gw4;
	} else if (rt->rt_gw_family == AF_INET6) {
		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
	} else if (!daddr ||
		   (rt->rt_flags &
		    (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
		return;
	}
	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* If UBSAN reports an error there, please make sure your compiler
	 * supports -fno-strict-overflow before reporting it that was a bug
	 * in UBSAN, and it has been fixed in GCC-8.
	 */
	return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
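/* Worked example: if a bucket was last used 500 jiffies ago, the next
 * reservation advances its counter by "segs" plus a random value in [0, 500),
 * so an observer cannot tell how many IDs were actually consumed while the
 * bucket sat idle.
 */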
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay. */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32)daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_uses_gateway = 1;
		rt->rt_gw_family = AF_INET;
		rt->rt_gw4 = fnhe->fnhe_gw;
	}
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
				  __be32 gw, u32 pmtu, bool lock,
				  unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nhc->nhc_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nhc->nhc_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nhc->nhc_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nhc->nhc_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;

			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh_common *nhc;

				fib_select_path(net, &res, fl4, skb);
				nhc = FIB_RES_NHC(res);
				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
						      0, false,
						      jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
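/* Worked example, assuming HZ=1000 and the defaults above: the first redirect
 * is sent immediately; the second no sooner than ip_rt_redirect_load << 1 = 40
 * jiffies (~40ms) later, then ~80ms, ~160ms and so on.  After
 * ip_rt_redirect_number (9) redirects we stop, and resume only once
 * ip_rt_redirect_silence (~20s) passes without redirect-worthy traffic from
 * that peer.
 */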
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->n_redirects == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->n_redirects)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->n_redirects == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct net *net = dev_net(dst->dev);
	struct fib_result res;
	bool lock = false;
	u32 old_mtu;

	if (ip_mtu_locked(dst))
		return;

	old_mtu = ipv4_mtu(dst);
	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(net, fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc;

		fib_select_path(net, &res, fl4, NULL);
		nhc = FIB_RES_NHC(res);
		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu,
			      bool confirm_neigh)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);

	/* Don't make lookup fail for bridged encapsulations */
	if (skb && netif_is_any_bridge_port(skb->dev))
		fl4.flowi4_oif = 0;

	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
							  u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD.
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}
EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
	struct ip_options opt;
	int res;

	/* Recompile ip options since IPCB may not be valid anymore.
	 * Also check we have a reasonable ipv4 header.
	 */
	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
		return;

	memset(&opt, 0, sizeof(opt));
	if (ip_hdr(skb)->ihl > 5) {
		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
			return;
		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

		rcu_read_lock();
		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
		rcu_read_unlock();

		if (res)
			return;
	}
	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	ipv4_send_dest_unreach(skb);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *)dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
EXPORT_INDIRECT_CALLABLE(ipv4_mtu);

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nhc->nhc_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
					       __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nhc, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */
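/* For example, assuming a route installed with "ip route add ... mtu lock
 * 1400": step 1 pins the MTU to 1400 and PMTU exceptions are not consulted;
 * without the lock, a nexthop exception created by an ICMP "fragmentation
 * needed" message (step 2) takes precedence over the egress device MTU
 * (step 3).
 */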
u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_nh_common *nhc = res->nhc;
	struct net_device *dev = nhc->nhc_dev;
	struct fib_info *fi = res->fi;
	u32 mtu = 0;

	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nhc, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gw4) {
			rt->rt_gw4 = daddr;
			rt->rt_gw_family = AF_INET;
		}

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nhc->nhc_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			rt_add_uncached_list(orig);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;

	ip_dst_metrics_put(dst);
	rt_del_uncached_list(rt);
}

void rt_flush_dev(struct net_device *dev)
{
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = blackhole_netdev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
			rt->rt_uses_gateway = 1;
			rt->rt_gw_family = nhc->nhc_gw_family;
			/* only INET and INET6 are supported */
			if (likely(nhc->nhc_gw_family == AF_INET))
				rt->rt_gw4 = nhc->nhc_gw.ipv4;
			else
				rt->rt_gw6 = nhc->nhc_gw.ipv6;
		}

		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
		if (nhc->nhc_family == AF_INET) {
			struct fib_nh *nh;

			nh = container_of(nhc, struct fib_nh, nh_common);
			rt->dst.tclassid = nh->nh_tclassid;
		}
#endif
		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nhc, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gw4) {
				rt->rt_gw_family = AF_INET;
				rt->rt_gw4 = daddr;
			}
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_uses_gateway = 0;
		rt->rt_gw_family = 0;
		rt->rt_gw4 = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
{
	struct rtable *new_rt;

	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			   rt->dst.flags);

	if (new_rt) {
		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		new_rt->rt_flags = rt->rt_flags;
		new_rt->rt_type = rt->rt_type;
		new_rt->rt_is_input = rt->rt_is_input;
		new_rt->rt_iif = rt->rt_iif;
		new_rt->rt_pmtu = rt->rt_pmtu;
		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
		new_rt->rt_gw_family = rt->rt_gw_family;
		if (rt->rt_gw_family == AF_INET)
			new_rt->rt_gw4 = rt->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			new_rt->rt_gw6 = rt->rt_gw6;
		INIT_LIST_HEAD(&new_rt->rt_uncached);

		new_rt->dst.input = rt->dst.input;
		new_rt->dst.output = rt->dst.output;
		new_rt->dst.error = rt->dst.error;
		new_rt->dst.lastuse = jiffies;
		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
	}
	return new_rt;
}
EXPORT_SYMBOL(rt_dst_clone);

/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr) &&
		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, false);
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
	struct net_device *dev = nhc->nhc_dev;
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(dev);
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP)) {
		__be32 gw;

		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
		if (IN_DEV_SHARED_MEDIA(out_dev) ||
		    inet_addr_onlink(out_dev, saddr, gw))
			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(nhc, daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_ORCONF(in_dev, NOPOLICY),
			   IN_DEV_ORCONF(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	if (!icmp_is_err(icmph->type))
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}
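/* Sketch of the policy values handled below (net.ipv4.fib_multipath_hash_policy):
 * 0 hashes on the outer L3 addresses (or the inner ones for ICMP errors, via
 * ip_multipath_l3_keys() above), 1 also mixes in the L4 ports and protocol,
 * and 2 hashes on the innermost L3 header when the packet is encapsulated.
 */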
/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	case 2:
		memset(&hash_keys, 0, sizeof(hash_keys));
		/* skb is currently provided only when forwarding */
		if (skb) {
			struct flow_keys keys;

			skb_flow_dissect_flow_keys(skb, &keys, 0);
			/* Inner can be v4 or v6 */
			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
				hash_keys.tags.flow_label = keys.tags.flow_label;
				hash_keys.basic.ip_proto = keys.basic.ip_proto;
			} else {
				/* Same as case 0 */
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
				ip_multipath_l3_keys(skb, &hash_keys);
			}
		} else {
			/* Same as case 0 */
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	if (multipath_hash)
		mhash = jhash_2words(mhash, multipath_hash, 0);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && fib_info_num_path(res->fi) > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
*/ 2009 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); 2010 } 2011 2012 /* Implements all the saddr-related checks as ip_route_input_slow(), 2013 * assuming daddr is valid and the destination is not a local broadcast one. 2014 * Uses the provided hint instead of performing a route lookup. 2015 */ 2016 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2017 u8 tos, struct net_device *dev, 2018 const struct sk_buff *hint) 2019 { 2020 struct in_device *in_dev = __in_dev_get_rcu(dev); 2021 struct rtable *rt = skb_rtable(hint); 2022 struct net *net = dev_net(dev); 2023 int err = -EINVAL; 2024 u32 tag = 0; 2025 2026 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) 2027 goto martian_source; 2028 2029 if (ipv4_is_zeronet(saddr)) 2030 goto martian_source; 2031 2032 if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) 2033 goto martian_source; 2034 2035 if (rt->rt_type != RTN_LOCAL) 2036 goto skip_validate_source; 2037 2038 tos &= IPTOS_RT_MASK; 2039 err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); 2040 if (err < 0) 2041 goto martian_source; 2042 2043 skip_validate_source: 2044 skb_dst_copy(skb, hint); 2045 return 0; 2046 2047 martian_source: 2048 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2049 return err; 2050 } 2051 2052 /* 2053 * NOTE. We drop all the packets that has local source 2054 * addresses, because every properly looped back packet 2055 * must have correct destination already attached by output routine. 2056 * Changes in the enforced policies must be applied also to 2057 * ip_route_use_hint(). 2058 * 2059 * Such approach solves two big problems: 2060 * 1. Not simplex devices are handled properly. 2061 * 2. IP spoofing attempts are filtered with 100% of guarantee. 2062 * called with rcu_read_lock() 2063 */ 2064 2065 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2066 u8 tos, struct net_device *dev, 2067 struct fib_result *res) 2068 { 2069 struct in_device *in_dev = __in_dev_get_rcu(dev); 2070 struct flow_keys *flkeys = NULL, _flkeys; 2071 struct net *net = dev_net(dev); 2072 struct ip_tunnel_info *tun_info; 2073 int err = -EINVAL; 2074 unsigned int flags = 0; 2075 u32 itag = 0; 2076 struct rtable *rth; 2077 struct flowi4 fl4; 2078 bool do_cache = true; 2079 2080 /* IP on this device is disabled. */ 2081 2082 if (!in_dev) 2083 goto out; 2084 2085 /* Check for the most weird martians, which can be not detected 2086 * by fib_lookup. 2087 */ 2088 2089 tun_info = skb_tunnel_info(skb); 2090 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2091 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; 2092 else 2093 fl4.flowi4_tun_key.tun_id = 0; 2094 skb_dst_drop(skb); 2095 2096 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) 2097 goto martian_source; 2098 2099 res->fi = NULL; 2100 res->table = NULL; 2101 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) 2102 goto brd_input; 2103 2104 /* Accept zero addresses only to limited broadcast; 2105 * I even do not know to fix it or not. 
Waiting for complains :-) 2106 */ 2107 if (ipv4_is_zeronet(saddr)) 2108 goto martian_source; 2109 2110 if (ipv4_is_zeronet(daddr)) 2111 goto martian_destination; 2112 2113 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), 2114 * and call it once if daddr or/and saddr are loopback addresses 2115 */ 2116 if (ipv4_is_loopback(daddr)) { 2117 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) 2118 goto martian_destination; 2119 } else if (ipv4_is_loopback(saddr)) { 2120 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) 2121 goto martian_source; 2122 } 2123 2124 /* 2125 * Now we are ready to route packet. 2126 */ 2127 fl4.flowi4_oif = 0; 2128 fl4.flowi4_iif = dev->ifindex; 2129 fl4.flowi4_mark = skb->mark; 2130 fl4.flowi4_tos = tos; 2131 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 2132 fl4.flowi4_flags = 0; 2133 fl4.daddr = daddr; 2134 fl4.saddr = saddr; 2135 fl4.flowi4_uid = sock_net_uid(net, NULL); 2136 fl4.flowi4_multipath_hash = 0; 2137 2138 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) { 2139 flkeys = &_flkeys; 2140 } else { 2141 fl4.flowi4_proto = 0; 2142 fl4.fl4_sport = 0; 2143 fl4.fl4_dport = 0; 2144 } 2145 2146 err = fib_lookup(net, &fl4, res, 0); 2147 if (err != 0) { 2148 if (!IN_DEV_FORWARD(in_dev)) 2149 err = -EHOSTUNREACH; 2150 goto no_route; 2151 } 2152 2153 if (res->type == RTN_BROADCAST) { 2154 if (IN_DEV_BFORWARD(in_dev)) 2155 goto make_route; 2156 /* not do cache if bc_forwarding is enabled */ 2157 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING)) 2158 do_cache = false; 2159 goto brd_input; 2160 } 2161 2162 if (res->type == RTN_LOCAL) { 2163 err = fib_validate_source(skb, saddr, daddr, tos, 2164 0, dev, in_dev, &itag); 2165 if (err < 0) 2166 goto martian_source; 2167 goto local_input; 2168 } 2169 2170 if (!IN_DEV_FORWARD(in_dev)) { 2171 err = -EHOSTUNREACH; 2172 goto no_route; 2173 } 2174 if (res->type != RTN_UNICAST) 2175 goto martian_destination; 2176 2177 make_route: 2178 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); 2179 out: return err; 2180 2181 brd_input: 2182 if (skb->protocol != htons(ETH_P_IP)) 2183 goto e_inval; 2184 2185 if (!ipv4_is_zeronet(saddr)) { 2186 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, 2187 in_dev, &itag); 2188 if (err < 0) 2189 goto martian_source; 2190 } 2191 flags |= RTCF_BROADCAST; 2192 res->type = RTN_BROADCAST; 2193 RT_CACHE_STAT_INC(in_brd); 2194 2195 local_input: 2196 do_cache &= res->fi && !itag; 2197 if (do_cache) { 2198 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 2199 2200 rth = rcu_dereference(nhc->nhc_rth_input); 2201 if (rt_cache_valid(rth)) { 2202 skb_dst_set_noref(skb, &rth->dst); 2203 err = 0; 2204 goto out; 2205 } 2206 } 2207 2208 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? 
: net->loopback_dev, 2209 flags | RTCF_LOCAL, res->type, 2210 IN_DEV_ORCONF(in_dev, NOPOLICY), false); 2211 if (!rth) 2212 goto e_nobufs; 2213 2214 rth->dst.output= ip_rt_bug; 2215 #ifdef CONFIG_IP_ROUTE_CLASSID 2216 rth->dst.tclassid = itag; 2217 #endif 2218 rth->rt_is_input = 1; 2219 2220 RT_CACHE_STAT_INC(in_slow_tot); 2221 if (res->type == RTN_UNREACHABLE) { 2222 rth->dst.input= ip_error; 2223 rth->dst.error= -err; 2224 rth->rt_flags &= ~RTCF_LOCAL; 2225 } 2226 2227 if (do_cache) { 2228 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 2229 2230 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); 2231 if (lwtunnel_input_redirect(rth->dst.lwtstate)) { 2232 WARN_ON(rth->dst.input == lwtunnel_input); 2233 rth->dst.lwtstate->orig_input = rth->dst.input; 2234 rth->dst.input = lwtunnel_input; 2235 } 2236 2237 if (unlikely(!rt_cache_route(nhc, rth))) 2238 rt_add_uncached_list(rth); 2239 } 2240 skb_dst_set(skb, &rth->dst); 2241 err = 0; 2242 goto out; 2243 2244 no_route: 2245 RT_CACHE_STAT_INC(in_no_route); 2246 res->type = RTN_UNREACHABLE; 2247 res->fi = NULL; 2248 res->table = NULL; 2249 goto local_input; 2250 2251 /* 2252 * Do not cache martian addresses: they should be logged (RFC1812) 2253 */ 2254 martian_destination: 2255 RT_CACHE_STAT_INC(in_martian_dst); 2256 #ifdef CONFIG_IP_ROUTE_VERBOSE 2257 if (IN_DEV_LOG_MARTIANS(in_dev)) 2258 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", 2259 &daddr, &saddr, dev->name); 2260 #endif 2261 2262 e_inval: 2263 err = -EINVAL; 2264 goto out; 2265 2266 e_nobufs: 2267 err = -ENOBUFS; 2268 goto out; 2269 2270 martian_source: 2271 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2272 goto out; 2273 } 2274 2275 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2276 u8 tos, struct net_device *dev) 2277 { 2278 struct fib_result res; 2279 int err; 2280 2281 tos &= IPTOS_RT_MASK; 2282 rcu_read_lock(); 2283 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); 2284 rcu_read_unlock(); 2285 2286 return err; 2287 } 2288 EXPORT_SYMBOL(ip_route_input_noref); 2289 2290 /* called with rcu_read_lock held */ 2291 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2292 u8 tos, struct net_device *dev, struct fib_result *res) 2293 { 2294 /* Multicast recognition logic is moved from route cache to here. 2295 * The problem was that too many Ethernet cards have broken/missing 2296 * hardware multicast filters :-( As result the host on multicasting 2297 * network acquires a lot of useless route cache entries, sort of 2298 * SDR messages from all the world. Now we try to get rid of them. 2299 * Really, provided software IP multicast filter is organized 2300 * reasonably (at least, hashed), it does not result in a slowdown 2301 * comparing with route cache reject entries. 2302 * Note, that multicast routers are not affected, because 2303 * route cache entry is created eventually. 
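 * The code below therefore asks ip_check_mc_rcu() whether this device
 * (or, when the device is an L3 slave, its master) has actually joined
 * the group, and only then, or when multicast forwarding is enabled for
 * a non-link-local group, hands the packet to ip_route_input_mc().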
2304 */ 2305 if (ipv4_is_multicast(daddr)) { 2306 struct in_device *in_dev = __in_dev_get_rcu(dev); 2307 int our = 0; 2308 int err = -EINVAL; 2309 2310 if (!in_dev) 2311 return err; 2312 our = ip_check_mc_rcu(in_dev, daddr, saddr, 2313 ip_hdr(skb)->protocol); 2314 2315 /* check l3 master if no match yet */ 2316 if (!our && netif_is_l3_slave(dev)) { 2317 struct in_device *l3_in_dev; 2318 2319 l3_in_dev = __in_dev_get_rcu(skb->dev); 2320 if (l3_in_dev) 2321 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, 2322 ip_hdr(skb)->protocol); 2323 } 2324 2325 if (our 2326 #ifdef CONFIG_IP_MROUTE 2327 || 2328 (!ipv4_is_local_multicast(daddr) && 2329 IN_DEV_MFORWARD(in_dev)) 2330 #endif 2331 ) { 2332 err = ip_route_input_mc(skb, daddr, saddr, 2333 tos, dev, our); 2334 } 2335 return err; 2336 } 2337 2338 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res); 2339 } 2340 2341 /* called with rcu_read_lock() */ 2342 static struct rtable *__mkroute_output(const struct fib_result *res, 2343 const struct flowi4 *fl4, int orig_oif, 2344 struct net_device *dev_out, 2345 unsigned int flags) 2346 { 2347 struct fib_info *fi = res->fi; 2348 struct fib_nh_exception *fnhe; 2349 struct in_device *in_dev; 2350 u16 type = res->type; 2351 struct rtable *rth; 2352 bool do_cache; 2353 2354 in_dev = __in_dev_get_rcu(dev_out); 2355 if (!in_dev) 2356 return ERR_PTR(-EINVAL); 2357 2358 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) 2359 if (ipv4_is_loopback(fl4->saddr) && 2360 !(dev_out->flags & IFF_LOOPBACK) && 2361 !netif_is_l3_master(dev_out)) 2362 return ERR_PTR(-EINVAL); 2363 2364 if (ipv4_is_lbcast(fl4->daddr)) 2365 type = RTN_BROADCAST; 2366 else if (ipv4_is_multicast(fl4->daddr)) 2367 type = RTN_MULTICAST; 2368 else if (ipv4_is_zeronet(fl4->daddr)) 2369 return ERR_PTR(-EINVAL); 2370 2371 if (dev_out->flags & IFF_LOOPBACK) 2372 flags |= RTCF_LOCAL; 2373 2374 do_cache = true; 2375 if (type == RTN_BROADCAST) { 2376 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2377 fi = NULL; 2378 } else if (type == RTN_MULTICAST) { 2379 flags |= RTCF_MULTICAST | RTCF_LOCAL; 2380 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, 2381 fl4->flowi4_proto)) 2382 flags &= ~RTCF_LOCAL; 2383 else 2384 do_cache = false; 2385 /* If multicast route do not exist use 2386 * default one, but do not gateway in this case. 2387 * Yes, it is hack. 2388 */ 2389 if (fi && res->prefixlen < 4) 2390 fi = NULL; 2391 } else if ((type == RTN_LOCAL) && (orig_oif != 0) && 2392 (orig_oif != dev_out->ifindex)) { 2393 /* For local routes that require a particular output interface 2394 * we do not want to cache the result. Caching the result 2395 * causes incorrect behaviour when there are multiple source 2396 * addresses on the interface, the end result being that if the 2397 * intended recipient is waiting on that interface for the 2398 * packet he won't receive it because it will be delivered on 2399 * the loopback interface and the IP_PKTINFO ipi_ifindex will 2400 * be set to the loopback interface as well. 
2401 */ 2402 do_cache = false; 2403 } 2404 2405 fnhe = NULL; 2406 do_cache &= fi != NULL; 2407 if (fi) { 2408 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 2409 struct rtable __rcu **prth; 2410 2411 fnhe = find_exception(nhc, fl4->daddr); 2412 if (!do_cache) 2413 goto add; 2414 if (fnhe) { 2415 prth = &fnhe->fnhe_rth_output; 2416 } else { 2417 if (unlikely(fl4->flowi4_flags & 2418 FLOWI_FLAG_KNOWN_NH && 2419 !(nhc->nhc_gw_family && 2420 nhc->nhc_scope == RT_SCOPE_LINK))) { 2421 do_cache = false; 2422 goto add; 2423 } 2424 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output); 2425 } 2426 rth = rcu_dereference(*prth); 2427 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) 2428 return rth; 2429 } 2430 2431 add: 2432 rth = rt_dst_alloc(dev_out, flags, type, 2433 IN_DEV_ORCONF(in_dev, NOPOLICY), 2434 IN_DEV_ORCONF(in_dev, NOXFRM)); 2435 if (!rth) 2436 return ERR_PTR(-ENOBUFS); 2437 2438 rth->rt_iif = orig_oif; 2439 2440 RT_CACHE_STAT_INC(out_slow_tot); 2441 2442 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2443 if (flags & RTCF_LOCAL && 2444 !(dev_out->flags & IFF_LOOPBACK)) { 2445 rth->dst.output = ip_mc_output; 2446 RT_CACHE_STAT_INC(out_slow_mc); 2447 } 2448 #ifdef CONFIG_IP_MROUTE 2449 if (type == RTN_MULTICAST) { 2450 if (IN_DEV_MFORWARD(in_dev) && 2451 !ipv4_is_local_multicast(fl4->daddr)) { 2452 rth->dst.input = ip_mr_input; 2453 rth->dst.output = ip_mc_output; 2454 } 2455 } 2456 #endif 2457 } 2458 2459 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); 2460 lwtunnel_set_redirect(&rth->dst); 2461 2462 return rth; 2463 } 2464 2465 /* 2466 * Major route resolver routine. 2467 */ 2468 2469 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, 2470 const struct sk_buff *skb) 2471 { 2472 __u8 tos = RT_FL_TOS(fl4); 2473 struct fib_result res = { 2474 .type = RTN_UNSPEC, 2475 .fi = NULL, 2476 .table = NULL, 2477 .tclassid = 0, 2478 }; 2479 struct rtable *rth; 2480 2481 fl4->flowi4_iif = LOOPBACK_IFINDEX; 2482 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 2483 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 2484 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 2485 2486 rcu_read_lock(); 2487 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); 2488 rcu_read_unlock(); 2489 2490 return rth; 2491 } 2492 EXPORT_SYMBOL_GPL(ip_route_output_key_hash); 2493 2494 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, 2495 struct fib_result *res, 2496 const struct sk_buff *skb) 2497 { 2498 struct net_device *dev_out = NULL; 2499 int orig_oif = fl4->flowi4_oif; 2500 unsigned int flags = 0; 2501 struct rtable *rth; 2502 int err; 2503 2504 if (fl4->saddr) { 2505 if (ipv4_is_multicast(fl4->saddr) || 2506 ipv4_is_lbcast(fl4->saddr) || 2507 ipv4_is_zeronet(fl4->saddr)) { 2508 rth = ERR_PTR(-EINVAL); 2509 goto out; 2510 } 2511 2512 rth = ERR_PTR(-ENETUNREACH); 2513 2514 /* I removed check for oif == dev_out->oif here. 2515 * It was wrong for two reasons: 2516 * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr 2517 * is assigned to multiple interfaces. 2518 * 2. Moreover, we are allowed to send packets with saddr 2519 * of another iface. 
--ANK 2520 */ 2521 2522 if (fl4->flowi4_oif == 0 && 2523 (ipv4_is_multicast(fl4->daddr) || 2524 ipv4_is_lbcast(fl4->daddr))) { 2525 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2526 dev_out = __ip_dev_find(net, fl4->saddr, false); 2527 if (!dev_out) 2528 goto out; 2529 2530 /* Special hack: user can direct multicasts 2531 * and limited broadcast via necessary interface 2532 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO. 2533 * This hack is not just for fun, it allows 2534 * vic,vat and friends to work. 2535 * They bind socket to loopback, set ttl to zero 2536 * and expect that it will work. 2537 * From the viewpoint of routing cache they are broken, 2538 * because we are not allowed to build multicast path 2539 * with loopback source addr (look, routing cache 2540 * cannot know, that ttl is zero, so that packet 2541 * will not leave this host and route is valid). 2542 * Luckily, this hack is good workaround. 2543 */ 2544 2545 fl4->flowi4_oif = dev_out->ifindex; 2546 goto make_route; 2547 } 2548 2549 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { 2550 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2551 if (!__ip_dev_find(net, fl4->saddr, false)) 2552 goto out; 2553 } 2554 } 2555 2556 2557 if (fl4->flowi4_oif) { 2558 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); 2559 rth = ERR_PTR(-ENODEV); 2560 if (!dev_out) 2561 goto out; 2562 2563 /* RACE: Check return value of inet_select_addr instead. */ 2564 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { 2565 rth = ERR_PTR(-ENETUNREACH); 2566 goto out; 2567 } 2568 if (ipv4_is_local_multicast(fl4->daddr) || 2569 ipv4_is_lbcast(fl4->daddr) || 2570 fl4->flowi4_proto == IPPROTO_IGMP) { 2571 if (!fl4->saddr) 2572 fl4->saddr = inet_select_addr(dev_out, 0, 2573 RT_SCOPE_LINK); 2574 goto make_route; 2575 } 2576 if (!fl4->saddr) { 2577 if (ipv4_is_multicast(fl4->daddr)) 2578 fl4->saddr = inet_select_addr(dev_out, 0, 2579 fl4->flowi4_scope); 2580 else if (!fl4->daddr) 2581 fl4->saddr = inet_select_addr(dev_out, 0, 2582 RT_SCOPE_HOST); 2583 } 2584 } 2585 2586 if (!fl4->daddr) { 2587 fl4->daddr = fl4->saddr; 2588 if (!fl4->daddr) 2589 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); 2590 dev_out = net->loopback_dev; 2591 fl4->flowi4_oif = LOOPBACK_IFINDEX; 2592 res->type = RTN_LOCAL; 2593 flags |= RTCF_LOCAL; 2594 goto make_route; 2595 } 2596 2597 err = fib_lookup(net, fl4, res, 0); 2598 if (err) { 2599 res->fi = NULL; 2600 res->table = NULL; 2601 if (fl4->flowi4_oif && 2602 (ipv4_is_multicast(fl4->daddr) || 2603 !netif_index_is_l3_master(net, fl4->flowi4_oif))) { 2604 /* Apparently, routing tables are wrong. Assume, 2605 * that the destination is on link. 2606 * 2607 * WHY? DW. 2608 * Because we are allowed to send to iface 2609 * even if it has NO routes and NO assigned 2610 * addresses. When oif is specified, routing 2611 * tables are looked up with only one purpose: 2612 * to catch if destination is gatewayed, rather than 2613 * direct. Moreover, if MSG_DONTROUTE is set, 2614 * we send packet, ignoring both routing tables 2615 * and ifaddr state. --ANK 2616 * 2617 * 2618 * We could make it even if oif is unknown, 2619 * likely IPv6, but we do not. 
2620 */ 2621 2622 if (fl4->saddr == 0) 2623 fl4->saddr = inet_select_addr(dev_out, 0, 2624 RT_SCOPE_LINK); 2625 res->type = RTN_UNICAST; 2626 goto make_route; 2627 } 2628 rth = ERR_PTR(err); 2629 goto out; 2630 } 2631 2632 if (res->type == RTN_LOCAL) { 2633 if (!fl4->saddr) { 2634 if (res->fi->fib_prefsrc) 2635 fl4->saddr = res->fi->fib_prefsrc; 2636 else 2637 fl4->saddr = fl4->daddr; 2638 } 2639 2640 /* L3 master device is the loopback for that domain */ 2641 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? : 2642 net->loopback_dev; 2643 2644 /* make sure orig_oif points to fib result device even 2645 * though packet rx/tx happens over loopback or l3mdev 2646 */ 2647 orig_oif = FIB_RES_OIF(*res); 2648 2649 fl4->flowi4_oif = dev_out->ifindex; 2650 flags |= RTCF_LOCAL; 2651 goto make_route; 2652 } 2653 2654 fib_select_path(net, res, fl4, skb); 2655 2656 dev_out = FIB_RES_DEV(*res); 2657 2658 make_route: 2659 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); 2660 2661 out: 2662 return rth; 2663 } 2664 2665 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 2666 { 2667 return NULL; 2668 } 2669 2670 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) 2671 { 2672 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 2673 2674 return mtu ? : dst->dev->mtu; 2675 } 2676 2677 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 2678 struct sk_buff *skb, u32 mtu, 2679 bool confirm_neigh) 2680 { 2681 } 2682 2683 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 2684 struct sk_buff *skb) 2685 { 2686 } 2687 2688 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, 2689 unsigned long old) 2690 { 2691 return NULL; 2692 } 2693 2694 static struct dst_ops ipv4_dst_blackhole_ops = { 2695 .family = AF_INET, 2696 .check = ipv4_blackhole_dst_check, 2697 .mtu = ipv4_blackhole_mtu, 2698 .default_advmss = ipv4_default_advmss, 2699 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2700 .redirect = ipv4_rt_blackhole_redirect, 2701 .cow_metrics = ipv4_rt_blackhole_cow_metrics, 2702 .neigh_lookup = ipv4_neigh_lookup, 2703 }; 2704 2705 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2706 { 2707 struct rtable *ort = (struct rtable *) dst_orig; 2708 struct rtable *rt; 2709 2710 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); 2711 if (rt) { 2712 struct dst_entry *new = &rt->dst; 2713 2714 new->__use = 1; 2715 new->input = dst_discard; 2716 new->output = dst_discard_out; 2717 2718 new->dev = net->loopback_dev; 2719 if (new->dev) 2720 dev_hold(new->dev); 2721 2722 rt->rt_is_input = ort->rt_is_input; 2723 rt->rt_iif = ort->rt_iif; 2724 rt->rt_pmtu = ort->rt_pmtu; 2725 rt->rt_mtu_locked = ort->rt_mtu_locked; 2726 2727 rt->rt_genid = rt_genid_ipv4(net); 2728 rt->rt_flags = ort->rt_flags; 2729 rt->rt_type = ort->rt_type; 2730 rt->rt_uses_gateway = ort->rt_uses_gateway; 2731 rt->rt_gw_family = ort->rt_gw_family; 2732 if (rt->rt_gw_family == AF_INET) 2733 rt->rt_gw4 = ort->rt_gw4; 2734 else if (rt->rt_gw_family == AF_INET6) 2735 rt->rt_gw6 = ort->rt_gw6; 2736 2737 INIT_LIST_HEAD(&rt->rt_uncached); 2738 } 2739 2740 dst_release(dst_orig); 2741 2742 return rt ? 
&rt->dst : ERR_PTR(-ENOMEM); 2743 } 2744 2745 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, 2746 const struct sock *sk) 2747 { 2748 struct rtable *rt = __ip_route_output_key(net, flp4); 2749 2750 if (IS_ERR(rt)) 2751 return rt; 2752 2753 if (flp4->flowi4_proto) { 2754 flp4->flowi4_oif = rt->dst.dev->ifindex; 2755 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, 2756 flowi4_to_flowi(flp4), 2757 sk, 0); 2758 } 2759 2760 return rt; 2761 } 2762 EXPORT_SYMBOL_GPL(ip_route_output_flow); 2763 2764 struct rtable *ip_route_output_tunnel(struct sk_buff *skb, 2765 struct net_device *dev, 2766 struct net *net, __be32 *saddr, 2767 const struct ip_tunnel_info *info, 2768 u8 protocol, bool use_cache) 2769 { 2770 #ifdef CONFIG_DST_CACHE 2771 struct dst_cache *dst_cache; 2772 #endif 2773 struct rtable *rt = NULL; 2774 struct flowi4 fl4; 2775 __u8 tos; 2776 2777 #ifdef CONFIG_DST_CACHE 2778 dst_cache = (struct dst_cache *)&info->dst_cache; 2779 if (use_cache) { 2780 rt = dst_cache_get_ip4(dst_cache, saddr); 2781 if (rt) 2782 return rt; 2783 } 2784 #endif 2785 memset(&fl4, 0, sizeof(fl4)); 2786 fl4.flowi4_mark = skb->mark; 2787 fl4.flowi4_proto = protocol; 2788 fl4.daddr = info->key.u.ipv4.dst; 2789 fl4.saddr = info->key.u.ipv4.src; 2790 tos = info->key.tos; 2791 fl4.flowi4_tos = RT_TOS(tos); 2792 2793 rt = ip_route_output_key(net, &fl4); 2794 if (IS_ERR(rt)) { 2795 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr); 2796 return ERR_PTR(-ENETUNREACH); 2797 } 2798 if (rt->dst.dev == dev) { /* is this necessary? */ 2799 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr); 2800 ip_rt_put(rt); 2801 return ERR_PTR(-ELOOP); 2802 } 2803 #ifdef CONFIG_DST_CACHE 2804 if (use_cache) 2805 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr); 2806 #endif 2807 *saddr = fl4.saddr; 2808 return rt; 2809 } 2810 EXPORT_SYMBOL_GPL(ip_route_output_tunnel); 2811 2812 /* called with rcu_read_lock held */ 2813 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, 2814 struct rtable *rt, u32 table_id, struct flowi4 *fl4, 2815 struct sk_buff *skb, u32 portid, u32 seq, 2816 unsigned int flags) 2817 { 2818 struct rtmsg *r; 2819 struct nlmsghdr *nlh; 2820 unsigned long expires = 0; 2821 u32 error; 2822 u32 metrics[RTAX_MAX]; 2823 2824 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags); 2825 if (!nlh) 2826 return -EMSGSIZE; 2827 2828 r = nlmsg_data(nlh); 2829 r->rtm_family = AF_INET; 2830 r->rtm_dst_len = 32; 2831 r->rtm_src_len = 0; 2832 r->rtm_tos = fl4 ? fl4->flowi4_tos : 0; 2833 r->rtm_table = table_id < 256 ? 
table_id : RT_TABLE_COMPAT; 2834 if (nla_put_u32(skb, RTA_TABLE, table_id)) 2835 goto nla_put_failure; 2836 r->rtm_type = rt->rt_type; 2837 r->rtm_scope = RT_SCOPE_UNIVERSE; 2838 r->rtm_protocol = RTPROT_UNSPEC; 2839 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 2840 if (rt->rt_flags & RTCF_NOTIFY) 2841 r->rtm_flags |= RTM_F_NOTIFY; 2842 if (IPCB(skb)->flags & IPSKB_DOREDIRECT) 2843 r->rtm_flags |= RTCF_DOREDIRECT; 2844 2845 if (nla_put_in_addr(skb, RTA_DST, dst)) 2846 goto nla_put_failure; 2847 if (src) { 2848 r->rtm_src_len = 32; 2849 if (nla_put_in_addr(skb, RTA_SRC, src)) 2850 goto nla_put_failure; 2851 } 2852 if (rt->dst.dev && 2853 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 2854 goto nla_put_failure; 2855 if (rt->dst.lwtstate && 2856 lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) 2857 goto nla_put_failure; 2858 #ifdef CONFIG_IP_ROUTE_CLASSID 2859 if (rt->dst.tclassid && 2860 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) 2861 goto nla_put_failure; 2862 #endif 2863 if (fl4 && !rt_is_input_route(rt) && 2864 fl4->saddr != src) { 2865 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) 2866 goto nla_put_failure; 2867 } 2868 if (rt->rt_uses_gateway) { 2869 if (rt->rt_gw_family == AF_INET && 2870 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { 2871 goto nla_put_failure; 2872 } else if (rt->rt_gw_family == AF_INET6) { 2873 int alen = sizeof(struct in6_addr); 2874 struct nlattr *nla; 2875 struct rtvia *via; 2876 2877 nla = nla_reserve(skb, RTA_VIA, alen + 2); 2878 if (!nla) 2879 goto nla_put_failure; 2880 2881 via = nla_data(nla); 2882 via->rtvia_family = AF_INET6; 2883 memcpy(via->rtvia_addr, &rt->rt_gw6, alen); 2884 } 2885 } 2886 2887 expires = rt->dst.expires; 2888 if (expires) { 2889 unsigned long now = jiffies; 2890 2891 if (time_before(now, expires)) 2892 expires -= now; 2893 else 2894 expires = 0; 2895 } 2896 2897 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 2898 if (rt->rt_pmtu && expires) 2899 metrics[RTAX_MTU - 1] = rt->rt_pmtu; 2900 if (rt->rt_mtu_locked && expires) 2901 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU); 2902 if (rtnetlink_put_metrics(skb, metrics) < 0) 2903 goto nla_put_failure; 2904 2905 if (fl4) { 2906 if (fl4->flowi4_mark && 2907 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) 2908 goto nla_put_failure; 2909 2910 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && 2911 nla_put_u32(skb, RTA_UID, 2912 from_kuid_munged(current_user_ns(), 2913 fl4->flowi4_uid))) 2914 goto nla_put_failure; 2915 2916 if (rt_is_input_route(rt)) { 2917 #ifdef CONFIG_IP_MROUTE 2918 if (ipv4_is_multicast(dst) && 2919 !ipv4_is_local_multicast(dst) && 2920 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2921 int err = ipmr_get_route(net, skb, 2922 fl4->saddr, fl4->daddr, 2923 r, portid); 2924 2925 if (err <= 0) { 2926 if (err == 0) 2927 return 0; 2928 goto nla_put_failure; 2929 } 2930 } else 2931 #endif 2932 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) 2933 goto nla_put_failure; 2934 } 2935 } 2936 2937 error = rt->dst.error; 2938 2939 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) 2940 goto nla_put_failure; 2941 2942 nlmsg_end(skb, nlh); 2943 return 0; 2944 2945 nla_put_failure: 2946 nlmsg_cancel(skb, nlh); 2947 return -EMSGSIZE; 2948 } 2949 2950 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, 2951 struct netlink_callback *cb, u32 table_id, 2952 struct fnhe_hash_bucket *bucket, int genid, 2953 int *fa_index, int fa_start, unsigned int flags) 2954 { 2955 int i; 2956 2957 for (i = 0; i < FNHE_HASH_SIZE; i++) { 2958 struct 
fib_nh_exception *fnhe; 2959 2960 for (fnhe = rcu_dereference(bucket[i].chain); fnhe; 2961 fnhe = rcu_dereference(fnhe->fnhe_next)) { 2962 struct rtable *rt; 2963 int err; 2964 2965 if (*fa_index < fa_start) 2966 goto next; 2967 2968 if (fnhe->fnhe_genid != genid) 2969 goto next; 2970 2971 if (fnhe->fnhe_expires && 2972 time_after(jiffies, fnhe->fnhe_expires)) 2973 goto next; 2974 2975 rt = rcu_dereference(fnhe->fnhe_rth_input); 2976 if (!rt) 2977 rt = rcu_dereference(fnhe->fnhe_rth_output); 2978 if (!rt) 2979 goto next; 2980 2981 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, 2982 table_id, NULL, skb, 2983 NETLINK_CB(cb->skb).portid, 2984 cb->nlh->nlmsg_seq, flags); 2985 if (err) 2986 return err; 2987 next: 2988 (*fa_index)++; 2989 } 2990 } 2991 2992 return 0; 2993 } 2994 2995 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, 2996 u32 table_id, struct fib_info *fi, 2997 int *fa_index, int fa_start, unsigned int flags) 2998 { 2999 struct net *net = sock_net(cb->skb->sk); 3000 int nhsel, genid = fnhe_genid(net); 3001 3002 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { 3003 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); 3004 struct fnhe_hash_bucket *bucket; 3005 int err; 3006 3007 if (nhc->nhc_flags & RTNH_F_DEAD) 3008 continue; 3009 3010 rcu_read_lock(); 3011 bucket = rcu_dereference(nhc->nhc_exceptions); 3012 err = 0; 3013 if (bucket) 3014 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, 3015 genid, fa_index, fa_start, 3016 flags); 3017 rcu_read_unlock(); 3018 if (err) 3019 return err; 3020 } 3021 3022 return 0; 3023 } 3024 3025 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, 3026 u8 ip_proto, __be16 sport, 3027 __be16 dport) 3028 { 3029 struct sk_buff *skb; 3030 struct iphdr *iph; 3031 3032 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 3033 if (!skb) 3034 return NULL; 3035 3036 /* Reserve room for dummy headers, this skb can pass 3037 * through good chunk of routing engine. 
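 * A minimal IPv4 header is filled in below, followed by a UDP, TCP or
 * ICMP header matching the requested protocol, so that the lookup can
 * use the source/destination ports (e.g. for fib rules and multipath
 * hashing).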
3038 */ 3039 skb_reset_mac_header(skb); 3040 skb_reset_network_header(skb); 3041 skb->protocol = htons(ETH_P_IP); 3042 iph = skb_put(skb, sizeof(struct iphdr)); 3043 iph->protocol = ip_proto; 3044 iph->saddr = src; 3045 iph->daddr = dst; 3046 iph->version = 0x4; 3047 iph->frag_off = 0; 3048 iph->ihl = 0x5; 3049 skb_set_transport_header(skb, skb->len); 3050 3051 switch (iph->protocol) { 3052 case IPPROTO_UDP: { 3053 struct udphdr *udph; 3054 3055 udph = skb_put_zero(skb, sizeof(struct udphdr)); 3056 udph->source = sport; 3057 udph->dest = dport; 3058 udph->len = sizeof(struct udphdr); 3059 udph->check = 0; 3060 break; 3061 } 3062 case IPPROTO_TCP: { 3063 struct tcphdr *tcph; 3064 3065 tcph = skb_put_zero(skb, sizeof(struct tcphdr)); 3066 tcph->source = sport; 3067 tcph->dest = dport; 3068 tcph->doff = sizeof(struct tcphdr) / 4; 3069 tcph->rst = 1; 3070 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), 3071 src, dst, 0); 3072 break; 3073 } 3074 case IPPROTO_ICMP: { 3075 struct icmphdr *icmph; 3076 3077 icmph = skb_put_zero(skb, sizeof(struct icmphdr)); 3078 icmph->type = ICMP_ECHO; 3079 icmph->code = 0; 3080 } 3081 } 3082 3083 return skb; 3084 } 3085 3086 static int inet_rtm_valid_getroute_req(struct sk_buff *skb, 3087 const struct nlmsghdr *nlh, 3088 struct nlattr **tb, 3089 struct netlink_ext_ack *extack) 3090 { 3091 struct rtmsg *rtm; 3092 int i, err; 3093 3094 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 3095 NL_SET_ERR_MSG(extack, 3096 "ipv4: Invalid header for route get request"); 3097 return -EINVAL; 3098 } 3099 3100 if (!netlink_strict_get_check(skb)) 3101 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 3102 rtm_ipv4_policy, extack); 3103 3104 rtm = nlmsg_data(nlh); 3105 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || 3106 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || 3107 rtm->rtm_table || rtm->rtm_protocol || 3108 rtm->rtm_scope || rtm->rtm_type) { 3109 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); 3110 return -EINVAL; 3111 } 3112 3113 if (rtm->rtm_flags & ~(RTM_F_NOTIFY | 3114 RTM_F_LOOKUP_TABLE | 3115 RTM_F_FIB_MATCH)) { 3116 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); 3117 return -EINVAL; 3118 } 3119 3120 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 3121 rtm_ipv4_policy, extack); 3122 if (err) 3123 return err; 3124 3125 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 3126 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 3127 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); 3128 return -EINVAL; 3129 } 3130 3131 for (i = 0; i <= RTA_MAX; i++) { 3132 if (!tb[i]) 3133 continue; 3134 3135 switch (i) { 3136 case RTA_IIF: 3137 case RTA_OIF: 3138 case RTA_SRC: 3139 case RTA_DST: 3140 case RTA_IP_PROTO: 3141 case RTA_SPORT: 3142 case RTA_DPORT: 3143 case RTA_MARK: 3144 case RTA_UID: 3145 break; 3146 default: 3147 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); 3148 return -EINVAL; 3149 } 3150 } 3151 3152 return 0; 3153 } 3154 3155 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 3156 struct netlink_ext_ack *extack) 3157 { 3158 struct net *net = sock_net(in_skb->sk); 3159 struct nlattr *tb[RTA_MAX+1]; 3160 u32 table_id = RT_TABLE_MAIN; 3161 __be16 sport = 0, dport = 0; 3162 struct fib_result res = {}; 3163 u8 ip_proto = IPPROTO_UDP; 3164 struct rtable *rt = NULL; 3165 struct sk_buff *skb; 3166 struct rtmsg *rtm; 3167 struct flowi4 fl4 = {}; 3168 __be32 dst = 0; 3169 __be32 src = 0; 3170 kuid_t uid; 3171 
u32 iif; 3172 int err; 3173 int mark; 3174 3175 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 3176 if (err < 0) 3177 return err; 3178 3179 rtm = nlmsg_data(nlh); 3180 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; 3181 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; 3182 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 3183 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 3184 if (tb[RTA_UID]) 3185 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); 3186 else 3187 uid = (iif ? INVALID_UID : current_uid()); 3188 3189 if (tb[RTA_IP_PROTO]) { 3190 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 3191 &ip_proto, AF_INET, extack); 3192 if (err) 3193 return err; 3194 } 3195 3196 if (tb[RTA_SPORT]) 3197 sport = nla_get_be16(tb[RTA_SPORT]); 3198 3199 if (tb[RTA_DPORT]) 3200 dport = nla_get_be16(tb[RTA_DPORT]); 3201 3202 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport); 3203 if (!skb) 3204 return -ENOBUFS; 3205 3206 fl4.daddr = dst; 3207 fl4.saddr = src; 3208 fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK; 3209 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; 3210 fl4.flowi4_mark = mark; 3211 fl4.flowi4_uid = uid; 3212 if (sport) 3213 fl4.fl4_sport = sport; 3214 if (dport) 3215 fl4.fl4_dport = dport; 3216 fl4.flowi4_proto = ip_proto; 3217 3218 rcu_read_lock(); 3219 3220 if (iif) { 3221 struct net_device *dev; 3222 3223 dev = dev_get_by_index_rcu(net, iif); 3224 if (!dev) { 3225 err = -ENODEV; 3226 goto errout_rcu; 3227 } 3228 3229 fl4.flowi4_iif = iif; /* for rt_fill_info */ 3230 skb->dev = dev; 3231 skb->mark = mark; 3232 err = ip_route_input_rcu(skb, dst, src, 3233 rtm->rtm_tos & IPTOS_RT_MASK, dev, 3234 &res); 3235 3236 rt = skb_rtable(skb); 3237 if (err == 0 && rt->dst.error) 3238 err = -rt->dst.error; 3239 } else { 3240 fl4.flowi4_iif = LOOPBACK_IFINDEX; 3241 skb->dev = net->loopback_dev; 3242 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); 3243 err = 0; 3244 if (IS_ERR(rt)) 3245 err = PTR_ERR(rt); 3246 else 3247 skb_dst_set(skb, &rt->dst); 3248 } 3249 3250 if (err) 3251 goto errout_rcu; 3252 3253 if (rtm->rtm_flags & RTM_F_NOTIFY) 3254 rt->rt_flags |= RTCF_NOTIFY; 3255 3256 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) 3257 table_id = res.table ? 
res.table->tb_id : 0; 3258 3259 /* reset skb for netlink reply msg */ 3260 skb_trim(skb, 0); 3261 skb_reset_network_header(skb); 3262 skb_reset_transport_header(skb); 3263 skb_reset_mac_header(skb); 3264 3265 if (rtm->rtm_flags & RTM_F_FIB_MATCH) { 3266 struct fib_rt_info fri; 3267 3268 if (!res.fi) { 3269 err = fib_props[res.type].error; 3270 if (!err) 3271 err = -EHOSTUNREACH; 3272 goto errout_rcu; 3273 } 3274 fri.fi = res.fi; 3275 fri.tb_id = table_id; 3276 fri.dst = res.prefix; 3277 fri.dst_len = res.prefixlen; 3278 fri.tos = fl4.flowi4_tos; 3279 fri.type = rt->rt_type; 3280 fri.offload = 0; 3281 fri.trap = 0; 3282 fri.offload_failed = 0; 3283 if (res.fa_head) { 3284 struct fib_alias *fa; 3285 3286 hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) { 3287 u8 slen = 32 - fri.dst_len; 3288 3289 if (fa->fa_slen == slen && 3290 fa->tb_id == fri.tb_id && 3291 fa->fa_tos == fri.tos && 3292 fa->fa_info == res.fi && 3293 fa->fa_type == fri.type) { 3294 fri.offload = fa->offload; 3295 fri.trap = fa->trap; 3296 break; 3297 } 3298 } 3299 } 3300 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, 3301 nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); 3302 } else { 3303 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, 3304 NETLINK_CB(in_skb).portid, 3305 nlh->nlmsg_seq, 0); 3306 } 3307 if (err < 0) 3308 goto errout_rcu; 3309 3310 rcu_read_unlock(); 3311 3312 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 3313 3314 errout_free: 3315 return err; 3316 errout_rcu: 3317 rcu_read_unlock(); 3318 kfree_skb(skb); 3319 goto errout_free; 3320 } 3321 3322 void ip_rt_multicast_event(struct in_device *in_dev) 3323 { 3324 rt_cache_flush(dev_net(in_dev->dev)); 3325 } 3326 3327 #ifdef CONFIG_SYSCTL 3328 static int ip_rt_gc_interval __read_mostly = 60 * HZ; 3329 static int ip_rt_gc_min_interval __read_mostly = HZ / 2; 3330 static int ip_rt_gc_elasticity __read_mostly = 8; 3331 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; 3332 3333 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, 3334 void *buffer, size_t *lenp, loff_t *ppos) 3335 { 3336 struct net *net = (struct net *)__ctl->extra1; 3337 3338 if (write) { 3339 rt_cache_flush(net); 3340 fnhe_genid_bump(net); 3341 return 0; 3342 } 3343 3344 return -EINVAL; 3345 } 3346 3347 static struct ctl_table ipv4_route_table[] = { 3348 { 3349 .procname = "gc_thresh", 3350 .data = &ipv4_dst_ops.gc_thresh, 3351 .maxlen = sizeof(int), 3352 .mode = 0644, 3353 .proc_handler = proc_dointvec, 3354 }, 3355 { 3356 .procname = "max_size", 3357 .data = &ip_rt_max_size, 3358 .maxlen = sizeof(int), 3359 .mode = 0644, 3360 .proc_handler = proc_dointvec, 3361 }, 3362 { 3363 /* Deprecated. 
Use gc_min_interval_ms */ 3364 3365 .procname = "gc_min_interval", 3366 .data = &ip_rt_gc_min_interval, 3367 .maxlen = sizeof(int), 3368 .mode = 0644, 3369 .proc_handler = proc_dointvec_jiffies, 3370 }, 3371 { 3372 .procname = "gc_min_interval_ms", 3373 .data = &ip_rt_gc_min_interval, 3374 .maxlen = sizeof(int), 3375 .mode = 0644, 3376 .proc_handler = proc_dointvec_ms_jiffies, 3377 }, 3378 { 3379 .procname = "gc_timeout", 3380 .data = &ip_rt_gc_timeout, 3381 .maxlen = sizeof(int), 3382 .mode = 0644, 3383 .proc_handler = proc_dointvec_jiffies, 3384 }, 3385 { 3386 .procname = "gc_interval", 3387 .data = &ip_rt_gc_interval, 3388 .maxlen = sizeof(int), 3389 .mode = 0644, 3390 .proc_handler = proc_dointvec_jiffies, 3391 }, 3392 { 3393 .procname = "redirect_load", 3394 .data = &ip_rt_redirect_load, 3395 .maxlen = sizeof(int), 3396 .mode = 0644, 3397 .proc_handler = proc_dointvec, 3398 }, 3399 { 3400 .procname = "redirect_number", 3401 .data = &ip_rt_redirect_number, 3402 .maxlen = sizeof(int), 3403 .mode = 0644, 3404 .proc_handler = proc_dointvec, 3405 }, 3406 { 3407 .procname = "redirect_silence", 3408 .data = &ip_rt_redirect_silence, 3409 .maxlen = sizeof(int), 3410 .mode = 0644, 3411 .proc_handler = proc_dointvec, 3412 }, 3413 { 3414 .procname = "error_cost", 3415 .data = &ip_rt_error_cost, 3416 .maxlen = sizeof(int), 3417 .mode = 0644, 3418 .proc_handler = proc_dointvec, 3419 }, 3420 { 3421 .procname = "error_burst", 3422 .data = &ip_rt_error_burst, 3423 .maxlen = sizeof(int), 3424 .mode = 0644, 3425 .proc_handler = proc_dointvec, 3426 }, 3427 { 3428 .procname = "gc_elasticity", 3429 .data = &ip_rt_gc_elasticity, 3430 .maxlen = sizeof(int), 3431 .mode = 0644, 3432 .proc_handler = proc_dointvec, 3433 }, 3434 { 3435 .procname = "mtu_expires", 3436 .data = &ip_rt_mtu_expires, 3437 .maxlen = sizeof(int), 3438 .mode = 0644, 3439 .proc_handler = proc_dointvec_jiffies, 3440 }, 3441 { 3442 .procname = "min_pmtu", 3443 .data = &ip_rt_min_pmtu, 3444 .maxlen = sizeof(int), 3445 .mode = 0644, 3446 .proc_handler = proc_dointvec_minmax, 3447 .extra1 = &ip_min_valid_pmtu, 3448 }, 3449 { 3450 .procname = "min_adv_mss", 3451 .data = &ip_rt_min_advmss, 3452 .maxlen = sizeof(int), 3453 .mode = 0644, 3454 .proc_handler = proc_dointvec, 3455 }, 3456 { } 3457 }; 3458 3459 static const char ipv4_route_flush_procname[] = "flush"; 3460 3461 static struct ctl_table ipv4_route_flush_table[] = { 3462 { 3463 .procname = ipv4_route_flush_procname, 3464 .maxlen = sizeof(int), 3465 .mode = 0200, 3466 .proc_handler = ipv4_sysctl_rtcache_flush, 3467 }, 3468 { }, 3469 }; 3470 3471 static __net_init int sysctl_route_net_init(struct net *net) 3472 { 3473 struct ctl_table *tbl; 3474 3475 tbl = ipv4_route_flush_table; 3476 if (!net_eq(net, &init_net)) { 3477 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 3478 if (!tbl) 3479 goto err_dup; 3480 3481 /* Don't export non-whitelisted sysctls to unprivileged users */ 3482 if (net->user_ns != &init_user_ns) { 3483 if (tbl[0].procname != ipv4_route_flush_procname) 3484 tbl[0].procname = NULL; 3485 } 3486 } 3487 tbl[0].extra1 = net; 3488 3489 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); 3490 if (!net->ipv4.route_hdr) 3491 goto err_reg; 3492 return 0; 3493 3494 err_reg: 3495 if (tbl != ipv4_route_flush_table) 3496 kfree(tbl); 3497 err_dup: 3498 return -ENOMEM; 3499 } 3500 3501 static __net_exit void sysctl_route_net_exit(struct net *net) 3502 { 3503 struct ctl_table *tbl; 3504 3505 tbl = net->ipv4.route_hdr->ctl_table_arg; 3506 
unregister_net_sysctl_table(net->ipv4.route_hdr); 3507 BUG_ON(tbl == ipv4_route_flush_table); 3508 kfree(tbl); 3509 } 3510 3511 static __net_initdata struct pernet_operations sysctl_route_ops = { 3512 .init = sysctl_route_net_init, 3513 .exit = sysctl_route_net_exit, 3514 }; 3515 #endif 3516 3517 static __net_init int rt_genid_init(struct net *net) 3518 { 3519 atomic_set(&net->ipv4.rt_genid, 0); 3520 atomic_set(&net->fnhe_genid, 0); 3521 atomic_set(&net->ipv4.dev_addr_genid, get_random_int()); 3522 return 0; 3523 } 3524 3525 static __net_initdata struct pernet_operations rt_genid_ops = { 3526 .init = rt_genid_init, 3527 }; 3528 3529 static int __net_init ipv4_inetpeer_init(struct net *net) 3530 { 3531 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 3532 3533 if (!bp) 3534 return -ENOMEM; 3535 inet_peer_base_init(bp); 3536 net->ipv4.peers = bp; 3537 return 0; 3538 } 3539 3540 static void __net_exit ipv4_inetpeer_exit(struct net *net) 3541 { 3542 struct inet_peer_base *bp = net->ipv4.peers; 3543 3544 net->ipv4.peers = NULL; 3545 inetpeer_invalidate_tree(bp); 3546 kfree(bp); 3547 } 3548 3549 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { 3550 .init = ipv4_inetpeer_init, 3551 .exit = ipv4_inetpeer_exit, 3552 }; 3553 3554 #ifdef CONFIG_IP_ROUTE_CLASSID 3555 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3556 #endif /* CONFIG_IP_ROUTE_CLASSID */ 3557 3558 int __init ip_rt_init(void) 3559 { 3560 int cpu; 3561 3562 ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents), 3563 GFP_KERNEL); 3564 if (!ip_idents) 3565 panic("IP: failed to allocate ip_idents\n"); 3566 3567 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); 3568 3569 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); 3570 if (!ip_tstamps) 3571 panic("IP: failed to allocate ip_tstamps\n"); 3572 3573 for_each_possible_cpu(cpu) { 3574 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); 3575 3576 INIT_LIST_HEAD(&ul->head); 3577 spin_lock_init(&ul->lock); 3578 } 3579 #ifdef CONFIG_IP_ROUTE_CLASSID 3580 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3581 if (!ip_rt_acct) 3582 panic("IP: failed to allocate ip_rt_acct\n"); 3583 #endif 3584 3585 ipv4_dst_ops.kmem_cachep = 3586 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, 3587 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 3588 3589 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3590 3591 if (dst_entries_init(&ipv4_dst_ops) < 0) 3592 panic("IP: failed to allocate ipv4_dst_ops counter\n"); 3593 3594 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 3595 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 3596 3597 ipv4_dst_ops.gc_thresh = ~0; 3598 ip_rt_max_size = INT_MAX; 3599 3600 devinet_init(); 3601 ip_fib_init(); 3602 3603 if (ip_rt_proc_init()) 3604 pr_err("Unable to create route proc files\n"); 3605 #ifdef CONFIG_XFRM 3606 xfrm_init(); 3607 xfrm4_init(); 3608 #endif 3609 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, 3610 RTNL_FLAG_DOIT_UNLOCKED); 3611 3612 #ifdef CONFIG_SYSCTL 3613 register_pernet_subsys(&sysctl_route_ops); 3614 #endif 3615 register_pernet_subsys(&rt_genid_ops); 3616 register_pernet_subsys(&ipv4_inetpeer_ops); 3617 return 0; 3618 } 3619 3620 #ifdef CONFIG_SYSCTL 3621 /* 3622 * We really need to sanitize the damn ipv4 init order, then all 3623 * this nonsense will go away. 
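 * For now, ip_static_sysctl_init() below registers the global route
 * tunables (ipv4_route_table) for the initial namespace only, while
 * sysctl_route_net_init() above registers just the per-namespace
 * "flush" entry.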
3624 */ 3625 void __init ip_static_sysctl_init(void) 3626 { 3627 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); 3628 } 3629 #endif 3630
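/* Usage sketch (illustrative only, not part of the original file):
 * an output-route lookup in the style of ip_route_output_tunnel()
 * above; "net", "daddr" and "saddr" stand in for the caller's context.
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr = daddr;
 *	fl4.saddr = saddr;
 *	fl4.flowi4_proto = IPPROTO_UDP;
 *
 *	rt = ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	... transmit via rt->dst ...
 *	ip_rt_put(rt);
 */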