// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable. otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <linux/siphash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>
#include <linux/btf_ids.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

INDIRECT_CALLABLE_SCOPE
struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu,
			       bool confirm_neigh);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *f6i);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = blackhole_netdev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
				dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.default_advmss		= ip6_default_advmss,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.check			= ip6_dst_check,
	.destroy		= ip6_dst_destroy,
	.cow_metrics		= dst_cow_metrics_generic,
	.update_pmtu		= dst_blackhole_update_pmtu,
	.redirect		= dst_blackhole_redirect,
	.mtu			= dst_blackhole_mtu,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	memset_after(rt, 0, dst);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
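
/*
 * ip6_dst_destroy() below undoes the setup done by ip6_dst_alloc() and
 * ip6_rt_copy_init(): it drops the metrics reference, unlinks the entry
 * from the per-cpu uncached list, and releases the inet6_dev and the
 * fib6_info the dst was derived from.
 */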

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
		goto out;

	if (match->nh && have_oif_match && res->nh)
		return;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash &&
	    (!match->nh || nexthop_is_multipath(match->nh)))
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (unlikely(match->nh)) {
		nexthop_path_fib6_result(res, fl6->mp_hash);
		return;
	}

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = match->fib6_nh;
}
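
/*
 * A sketch of the hash/upper-bound scheme fib6_select_path() relies on:
 * each sibling's fib_nh_upper_bound marks the top of a slice of the hash
 * space sized in proportion to the nexthop weight (the bound values
 * themselves are maintained by the fib/nexthop code when the multipath
 * route changes, not here), so comparing fl6->mp_hash against the bounds
 * selects exactly one sibling per flow.
 */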

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}

struct fib6_nh_dm_arg {
	struct net		*net;
	const struct in6_addr	*saddr;
	int			oif;
	int			flags;
	struct fib6_nh		*nh;
};

static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_dm_arg *arg = _arg;

	arg->nh = nh;
	return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
				  arg->flags);
}

/* returns fib6_nh from nexthop or NULL */
static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
					struct fib6_result *res,
					const struct in6_addr *saddr,
					int oif, int flags)
{
	struct fib6_nh_dm_arg arg = {
		.net   = net,
		.saddr = saddr,
		.oif   = oif,
		.flags = flags,
	};

	if (nexthop_is_blackhole(nh))
		return NULL;

	if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
		return arg.nh;

	return NULL;
}

static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	if (!oif && ipv6_addr_any(saddr)) {
		if (unlikely(f6i->nh)) {
			nh = nexthop_fib6_nh(f6i->nh);
			if (nexthop_is_blackhole(f6i->nh))
				goto out_blackhole;
		} else {
			nh = f6i->fib6_nh;
		}
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		bool matched = false;

		if (unlikely(spf6i->nh)) {
			nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
					      oif, flags);
			if (nh)
				matched = true;
		} else {
			nh = spf6i->fib6_nh;
			if (__rt6_device_match(net, nh, saddr, oif, flags))
				matched = true;
		}
		if (matched) {
			res->f6i = spf6i;
			goto out;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = res->f6i->fib6_nh;
		goto out;
	}

	if (unlikely(f6i->nh)) {
		nh = nexthop_fib6_nh(f6i->nh);
		if (nexthop_is_blackhole(f6i->nh))
			goto out_blackhole;
	} else {
		nh = f6i->fib6_nh;
	}

	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
	return;

out_blackhole:
	res->fib6_flags |= RTF_REJECT;
	res->fib6_type = RTN_BLACKHOLE;
	res->nh = nh;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
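
/*
 * rt6_probe() below implements the rate-limited Router Reachability
 * Probe: the neighbour solicitation itself is sent from the work item
 * above so that it never runs in atomic context, and last_probe together
 * with __neigh_set_probe_once() bounds probing to one per
 * rtr_probe_interval.
 */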
static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	unsigned long last_probe;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock_bh();
	last_probe = READ_ONCE(fib6_nh->last_probe);
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (!work || cmpxchg(&fib6_nh->last_probe,
			     last_probe, jiffies) != last_probe) {
		kfree(work);
	} else {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
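
/*
 * Score layout used by rt6_score_route() below: bit 1 (value 2) means
 * the nexthop device matches the requested oif (or no oif was given);
 * with CONFIG_IPV6_ROUTER_PREF the decoded router preference from the
 * RA is folded in at bits 2 and up. Negative return values are the
 * rt6_nud_state failures reported by rt6_check_neigh().
 */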
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}

static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}

struct fib6_nh_frl_arg {
	u32		flags;
	int		oif;
	int		strict;
	int		*mpri;
	bool		*do_rr;
	struct fib6_nh	*nh;
};

static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_frl_arg *arg = _arg;

	arg->nh = nh;
	return find_match(nh, arg->flags, arg->oif, arg->strict,
			  arg->mpri, arg->do_rr);
}

static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		bool matched = false;
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		if (unlikely(f6i->nh)) {
			struct fib6_nh_frl_arg arg = {
				.flags	= f6i->fib6_flags,
				.oif	= oif,
				.strict	= strict,
				.mpri	= mpri,
				.do_rr	= do_rr
			};

			if (nexthop_is_blackhole(f6i->nh)) {
				res->fib6_flags = RTF_REJECT;
				res->fib6_type = RTN_BLACKHOLE;
				res->f6i = f6i;
				res->nh = nexthop_fib6_nh(f6i->nh);
				return;
			}
			if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
						     &arg)) {
				matched = true;
				nh = arg.nh;
			}
		} else {
			nh = f6i->fib6_nh;
			if (find_match(nh, f6i->fib6_flags, oif, strict,
				       mpri, do_rr))
				matched = true;
		}
		if (matched) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}
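
/*
 * find_rr_leaf() below drives up to three passes of the scan above:
 * from the round-robin head to the end of its metric group, then from
 * the leaf up to (but not including) the head, and finally over the
 * next metric group ("cont") if nothing has matched so far.
 */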
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}

static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt, false);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
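
/*
 * Lookup flow used by ip6_pol_route_lookup() below: walk the trie, let
 * rt6_device_match() pick a nexthop, apply fib6_select_path() for
 * multipath, then prefer a cached exception route and otherwise create
 * a dst on the fly (falling back to the null entry while backtracking).
 */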
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	} else if (res.fib6_flags & RTF_REJECT) {
		goto do_create;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
do_create:
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
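
/*
 * ip6_rt_cache_alloc() below builds an RTF_CACHE clone: a dst that lives
 * in the exception table (or on the uncached list) rather than in the
 * fib tree, keyed by the full /128 destination taken from the flow.
 * ip6_rt_pcpu_alloc() further down is the analogous helper for the
 * per-cpu route copies.
 */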
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;

	if (f6i->nh)
		pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));

	return pcpu_rt;
}

static bool rt6_is_valid(const struct rt6_info *rt6)
{
	return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt;

	pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);

	if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
		struct rt6_info *prev, **p;

		p = this_cpu_ptr(res->nh->rt6i_pcpu);
		prev = xchg(p, NULL);
		if (prev) {
			dst_dev_put(&prev->dst);
			dst_release(&prev->dst);
		}

		pcpu_rt = NULL;
	}

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt)
		return NULL;

	p = this_cpu_ptr(res->nh->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (res->f6i->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static siphash_aligned_key_t rt6_exception_key;
	struct {
		struct in6_addr dst;
		struct in6_addr src;
	} __aligned(SIPHASH_ALIGNMENT) combined = {
		.dst = *dst,
	};
	u64 val;

	net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		combined.src = *src;
#endif
	val = siphash(&combined, sizeof(combined), &rt6_exception_key);

	return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
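
/*
 * fib6_mtu() below resolves the effective MTU for a lookup result: an
 * explicit route PMTU wins, otherwise the device MTU is used, clamped
 * to IP6_MAX_MTU and reduced by any lwtunnel encapsulation headroom.
 */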
static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}

#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL

/* used when the flushed bit is not relevant, only access to the bucket
 * (i.e., all bucket users except rt6_insert_exception);
 *
 * called under rcu lock; sometimes called with rt6_exception_lock held
 */
static
struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
						       spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;

	if (lock)
		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
						   lockdep_is_held(lock));
	else
		bucket = rcu_dereference(nh->rt6i_exception_bucket);

	/* remove bucket flushed bit if set */
	if (bucket) {
		unsigned long p = (unsigned long)bucket;

		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
		bucket = (struct rt6_exception_bucket *)p;
	}

	return bucket;
}

static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
{
	unsigned long p = (unsigned long)bucket;

	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
}

/* called with rt6_exception_lock held */
static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
					      spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;
	unsigned long p;

	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					   lockdep_is_held(lock));

	p = (unsigned long)bucket;
	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
	bucket = (struct rt6_exception_bucket *)p;
	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
}
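
/*
 * The "flushed" marker is kept in bit 0 of the bucket pointer itself:
 * the kcalloc()ed bucket array is suitably aligned, so the low bit is
 * free to record that the nexthop is being torn down and that
 * rt6_insert_exception() must refuse new entries with -EINVAL.
 */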
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct fib6_info *f6i = res->f6i;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_nh *nh = res->nh;
	int max_depth;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
		err = -EINVAL;
		goto out;
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* Randomize max depth to avoid some side channel attacks. */
	max_depth = FIB6_MAX_DEPTH + prandom_u32_max(FIB6_MAX_DEPTH);
	while (bucket->depth > max_depth)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		goto out;

	/* Prevent rt6_insert_exception() from recreating the bucket list */
	if (!from)
		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
			if (!from ||
			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
				rt6_remove_exception(bucket, rt6_ex);
		}
		WARN_ON_ONCE(!from && bucket->depth);
		bucket++;
	}
out:
	spin_unlock_bh(&rt6_exception_lock);
}

static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
{
	struct fib6_info *f6i = arg;

	fib6_nh_flush_exceptions(nh, f6i);

	return 0;
}

void rt6_flush_exceptions(struct fib6_info *f6i)
{
	if (f6i->nh)
		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
					 f6i);
	else
		fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
				    const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int err;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

struct fib6_nh_excptn_arg {
	struct rt6_info	*rt;
	int		plen;
};

static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_excptn_arg *arg = _arg;
	int err;

	err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
	if (err == 0)
		return 1;

	return 0;
}

static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (from->nh) {
		struct fib6_nh_excptn_arg arg = {
			.rt = rt,
			.plen = from->fib6_src.plen
		};
		int rc;

		/* rc = 1 means an entry was found */
		rc = nexthop_for_each_fib6_nh(from->nh,
					      rt6_nh_remove_exception_rt,
					      &arg);
		return rc ? 0 : -ENOENT;
	}

	return fib6_nh_remove_exception(from->fib6_nh,
					from->fib6_src.plen, rt);
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
				     const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;
}

struct fib6_nh_match_arg {
	const struct net_device *dev;
	const struct in6_addr	*gw;
	struct fib6_nh		*match;
};

/* determine if fib6_nh has given device and gateway */
static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_match_arg *arg = _arg;

	if (arg->dev != nh->fib_nh_dev ||
	    (arg->gw && !nh->fib_nh_gw_family) ||
	    (!arg->gw && nh->fib_nh_gw_family) ||
	    (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
		return 0;

	arg->match = nh;

	/* found a match, break the loop */
	return 1;
}

static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct fib6_info *from;
	struct fib6_nh *fib6_nh;

	rcu_read_lock();

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	if (from->nh) {
		struct fib6_nh_match_arg arg = {
			.dev = rt->dst.dev,
			.gw = &rt->rt6i_gateway,
		};

		nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);

		if (!arg.match)
			goto unlock;
		fib6_nh = arg.match;
	} else {
		fib6_nh = from->fib6_nh;
	}
	fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
unlock:
	rcu_read_unlock();
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
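
/*
 * Worked example of the rules above: if the device MTU drops from 1500
 * to 1400, dst_mtu() >= mtu holds and every exception PMTU may be
 * lowered. If it later rises back to 1500, only exceptions whose PMTU
 * still equals the old local MTU (1400) are raised; lower PMTUs learned
 * from the path are left for PMTU discovery to correct.
 */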
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       const struct fib6_nh *nh, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
					    const struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still hold references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);

		if (!(neigh && (neigh->flags & NTF_ROUTER))) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
				   struct fib6_gc_args *gc_args,
				   unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
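
/*
 * Wrapper glue so the aging walk above can also be driven once per
 * fib6_nh of a shared nexthop object via nexthop_for_each_fib6_nh().
 */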
fib6_nh_age_exceptions(nh, arg->gc_args, arg->now); 2150 return 0; 2151 } 2152 2153 void rt6_age_exceptions(struct fib6_info *f6i, 2154 struct fib6_gc_args *gc_args, 2155 unsigned long now) 2156 { 2157 if (f6i->nh) { 2158 struct fib6_nh_age_excptn_arg arg = { 2159 .gc_args = gc_args, 2160 .now = now 2161 }; 2162 2163 nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, 2164 &arg); 2165 } else { 2166 fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); 2167 } 2168 } 2169 2170 /* must be called with the RCU read lock held */ 2171 int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, 2172 struct flowi6 *fl6, struct fib6_result *res, int strict) 2173 { 2174 struct fib6_node *fn, *saved_fn; 2175 2176 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2177 saved_fn = fn; 2178 2179 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 2180 oif = 0; 2181 2182 redo_rt6_select: 2183 rt6_select(net, fn, oif, res, strict); 2184 if (res->f6i == net->ipv6.fib6_null_entry) { 2185 fn = fib6_backtrack(fn, &fl6->saddr); 2186 if (fn) 2187 goto redo_rt6_select; 2188 else if (strict & RT6_LOOKUP_F_REACHABLE) { 2189 /* also consider unreachable routes */ 2190 strict &= ~RT6_LOOKUP_F_REACHABLE; 2191 fn = saved_fn; 2192 goto redo_rt6_select; 2193 } 2194 } 2195 2196 trace_fib6_table_lookup(net, res, table, fl6); 2197 2198 return 0; 2199 } 2200 2201 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, 2202 int oif, struct flowi6 *fl6, 2203 const struct sk_buff *skb, int flags) 2204 { 2205 struct fib6_result res = {}; 2206 struct rt6_info *rt = NULL; 2207 int strict = 0; 2208 2209 WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && 2210 !rcu_read_lock_held()); 2211 2212 strict |= flags & RT6_LOOKUP_F_IFACE; 2213 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; 2214 if (net->ipv6.devconf_all->forwarding == 0) 2215 strict |= RT6_LOOKUP_F_REACHABLE; 2216 2217 rcu_read_lock(); 2218 2219 fib6_table_lookup(net, table, oif, fl6, &res, strict); 2220 if (res.f6i == net->ipv6.fib6_null_entry) 2221 goto out; 2222 2223 fib6_select_path(net, &res, fl6, oif, false, skb, strict); 2224 2225 /* Search through the exception table */ 2226 rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); 2227 if (rt) { 2228 goto out; 2229 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && 2230 !res.nh->fib_nh_gw_family)) { 2231 /* Create a RTF_CACHE clone which will not be 2232 * owned by the fib6 tree. It is for the special case where 2233 * the daddr in the skb during the neighbor look-up is different 2234 * from the fl6->daddr used to look up the route here. 2235 */ 2236 rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); 2237 2238 if (rt) { 2239 /* 1 refcnt is taken during ip6_rt_cache_alloc(). 2240 * As rt6_uncached_list_add() does not consume refcnt, 2241 * this refcnt is always returned to the caller even 2242 * if the caller sets the RT6_LOOKUP_F_DST_NOREF flag.
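* In other words, on this uncached-clone path the caller always ends up owning a reference and must eventually dst_release() it, unlike the paths below where RT6_LOOKUP_F_DST_NOREF suppresses the hold.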
2243 */ 2244 rt6_uncached_list_add(rt); 2245 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2246 rcu_read_unlock(); 2247 2248 return rt; 2249 } 2250 } else { 2251 /* Get a percpu copy */ 2252 local_bh_disable(); 2253 rt = rt6_get_pcpu_route(&res); 2254 2255 if (!rt) 2256 rt = rt6_make_pcpu_route(net, &res); 2257 2258 local_bh_enable(); 2259 } 2260 out: 2261 if (!rt) 2262 rt = net->ipv6.ip6_null_entry; 2263 if (!(flags & RT6_LOOKUP_F_DST_NOREF)) 2264 ip6_hold_safe(net, &rt); 2265 rcu_read_unlock(); 2266 2267 return rt; 2268 } 2269 EXPORT_SYMBOL_GPL(ip6_pol_route); 2270 2271 INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net, 2272 struct fib6_table *table, 2273 struct flowi6 *fl6, 2274 const struct sk_buff *skb, 2275 int flags) 2276 { 2277 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); 2278 } 2279 2280 struct dst_entry *ip6_route_input_lookup(struct net *net, 2281 struct net_device *dev, 2282 struct flowi6 *fl6, 2283 const struct sk_buff *skb, 2284 int flags) 2285 { 2286 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) 2287 flags |= RT6_LOOKUP_F_IFACE; 2288 2289 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); 2290 } 2291 EXPORT_SYMBOL_GPL(ip6_route_input_lookup); 2292 2293 static void ip6_multipath_l3_keys(const struct sk_buff *skb, 2294 struct flow_keys *keys, 2295 struct flow_keys *flkeys) 2296 { 2297 const struct ipv6hdr *outer_iph = ipv6_hdr(skb); 2298 const struct ipv6hdr *key_iph = outer_iph; 2299 struct flow_keys *_flkeys = flkeys; 2300 const struct ipv6hdr *inner_iph; 2301 const struct icmp6hdr *icmph; 2302 struct ipv6hdr _inner_iph; 2303 struct icmp6hdr _icmph; 2304 2305 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) 2306 goto out; 2307 2308 icmph = skb_header_pointer(skb, skb_transport_offset(skb), 2309 sizeof(_icmph), &_icmph); 2310 if (!icmph) 2311 goto out; 2312 2313 if (!icmpv6_is_err(icmph->icmp6_type)) 2314 goto out; 2315 2316 inner_iph = skb_header_pointer(skb, 2317 skb_transport_offset(skb) + sizeof(*icmph), 2318 sizeof(_inner_iph), &_inner_iph); 2319 if (!inner_iph) 2320 goto out; 2321 2322 key_iph = inner_iph; 2323 _flkeys = NULL; 2324 out: 2325 if (_flkeys) { 2326 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; 2327 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; 2328 keys->tags.flow_label = _flkeys->tags.flow_label; 2329 keys->basic.ip_proto = _flkeys->basic.ip_proto; 2330 } else { 2331 keys->addrs.v6addrs.src = key_iph->saddr; 2332 keys->addrs.v6addrs.dst = key_iph->daddr; 2333 keys->tags.flow_label = ip6_flowlabel(key_iph); 2334 keys->basic.ip_proto = key_iph->nexthdr; 2335 } 2336 } 2337 2338 static u32 rt6_multipath_custom_hash_outer(const struct net *net, 2339 const struct sk_buff *skb, 2340 bool *p_has_inner) 2341 { 2342 u32 hash_fields = ip6_multipath_hash_fields(net); 2343 struct flow_keys keys, hash_keys; 2344 2345 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) 2346 return 0; 2347 2348 memset(&hash_keys, 0, sizeof(hash_keys)); 2349 skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); 2350 2351 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2352 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) 2353 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 2354 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) 2355 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; 2356 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) 2357 hash_keys.basic.ip_proto = keys.basic.ip_proto; 2358 if (hash_fields & 
FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) 2359 hash_keys.tags.flow_label = keys.tags.flow_label; 2360 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) 2361 hash_keys.ports.src = keys.ports.src; 2362 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) 2363 hash_keys.ports.dst = keys.ports.dst; 2364 2365 *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); 2366 return flow_hash_from_keys(&hash_keys); 2367 } 2368 2369 static u32 rt6_multipath_custom_hash_inner(const struct net *net, 2370 const struct sk_buff *skb, 2371 bool has_inner) 2372 { 2373 u32 hash_fields = ip6_multipath_hash_fields(net); 2374 struct flow_keys keys, hash_keys; 2375 2376 /* We assume the packet carries an encapsulation, but if none was 2377 * encountered during dissection of the outer flow, then there is no 2378 * point in calling the flow dissector again. 2379 */ 2380 if (!has_inner) 2381 return 0; 2382 2383 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) 2384 return 0; 2385 2386 memset(&hash_keys, 0, sizeof(hash_keys)); 2387 skb_flow_dissect_flow_keys(skb, &keys, 0); 2388 2389 if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) 2390 return 0; 2391 2392 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { 2393 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; 2394 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) 2395 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; 2396 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) 2397 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; 2398 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { 2399 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2400 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) 2401 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 2402 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) 2403 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; 2404 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) 2405 hash_keys.tags.flow_label = keys.tags.flow_label; 2406 } 2407 2408 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) 2409 hash_keys.basic.ip_proto = keys.basic.ip_proto; 2410 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) 2411 hash_keys.ports.src = keys.ports.src; 2412 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) 2413 hash_keys.ports.dst = keys.ports.dst; 2414 2415 return flow_hash_from_keys(&hash_keys); 2416 } 2417 2418 static u32 rt6_multipath_custom_hash_skb(const struct net *net, 2419 const struct sk_buff *skb) 2420 { 2421 u32 mhash, mhash_inner; 2422 bool has_inner = true; 2423 2424 mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner); 2425 mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner); 2426 2427 return jhash_2words(mhash, mhash_inner, 0); 2428 } 2429 2430 static u32 rt6_multipath_custom_hash_fl6(const struct net *net, 2431 const struct flowi6 *fl6) 2432 { 2433 u32 hash_fields = ip6_multipath_hash_fields(net); 2434 struct flow_keys hash_keys; 2435 2436 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) 2437 return 0; 2438 2439 memset(&hash_keys, 0, sizeof(hash_keys)); 2440 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2441 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) 2442 hash_keys.addrs.v6addrs.src = fl6->saddr; 2443 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) 2444 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2445 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) 2446 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2447 if (hash_fields & 
FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) 2448 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2449 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) 2450 hash_keys.ports.src = fl6->fl6_sport; 2451 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) 2452 hash_keys.ports.dst = fl6->fl6_dport; 2453 2454 return flow_hash_from_keys(&hash_keys); 2455 } 2456 2457 /* if skb is set it will be used and fl6 can be NULL */ 2458 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 2459 const struct sk_buff *skb, struct flow_keys *flkeys) 2460 { 2461 struct flow_keys hash_keys; 2462 u32 mhash = 0; 2463 2464 switch (ip6_multipath_hash_policy(net)) { 2465 case 0: 2466 memset(&hash_keys, 0, sizeof(hash_keys)); 2467 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2468 if (skb) { 2469 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2470 } else { 2471 hash_keys.addrs.v6addrs.src = fl6->saddr; 2472 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2473 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2474 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2475 } 2476 mhash = flow_hash_from_keys(&hash_keys); 2477 break; 2478 case 1: 2479 if (skb) { 2480 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 2481 struct flow_keys keys; 2482 2483 /* short-circuit if we already have L4 hash present */ 2484 if (skb->l4_hash) 2485 return skb_get_hash_raw(skb) >> 1; 2486 2487 memset(&hash_keys, 0, sizeof(hash_keys)); 2488 2489 if (!flkeys) { 2490 skb_flow_dissect_flow_keys(skb, &keys, flag); 2491 flkeys = &keys; 2492 } 2493 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2494 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2495 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2496 hash_keys.ports.src = flkeys->ports.src; 2497 hash_keys.ports.dst = flkeys->ports.dst; 2498 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2499 } else { 2500 memset(&hash_keys, 0, sizeof(hash_keys)); 2501 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2502 hash_keys.addrs.v6addrs.src = fl6->saddr; 2503 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2504 hash_keys.ports.src = fl6->fl6_sport; 2505 hash_keys.ports.dst = fl6->fl6_dport; 2506 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2507 } 2508 mhash = flow_hash_from_keys(&hash_keys); 2509 break; 2510 case 2: 2511 memset(&hash_keys, 0, sizeof(hash_keys)); 2512 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2513 if (skb) { 2514 struct flow_keys keys; 2515 2516 if (!flkeys) { 2517 skb_flow_dissect_flow_keys(skb, &keys, 0); 2518 flkeys = &keys; 2519 } 2520 2521 /* Inner can be v4 or v6 */ 2522 if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { 2523 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; 2524 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; 2525 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; 2526 } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { 2527 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2528 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2529 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2530 hash_keys.tags.flow_label = flkeys->tags.flow_label; 2531 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2532 } else { 2533 /* Same as case 0 */ 2534 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2535 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2536 } 2537 } else { 2538 /* Same as case 0 */ 2539 hash_keys.control.addr_type = 
FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2540 hash_keys.addrs.v6addrs.src = fl6->saddr; 2541 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2542 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2543 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2544 } 2545 mhash = flow_hash_from_keys(&hash_keys); 2546 break; 2547 case 3: 2548 if (skb) 2549 mhash = rt6_multipath_custom_hash_skb(net, skb); 2550 else 2551 mhash = rt6_multipath_custom_hash_fl6(net, fl6); 2552 break; 2553 } 2554 2555 return mhash >> 1; 2556 } 2557 2558 /* Called with rcu held */ 2559 void ip6_route_input(struct sk_buff *skb) 2560 { 2561 const struct ipv6hdr *iph = ipv6_hdr(skb); 2562 struct net *net = dev_net(skb->dev); 2563 int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; 2564 struct ip_tunnel_info *tun_info; 2565 struct flowi6 fl6 = { 2566 .flowi6_iif = skb->dev->ifindex, 2567 .daddr = iph->daddr, 2568 .saddr = iph->saddr, 2569 .flowlabel = ip6_flowinfo(iph), 2570 .flowi6_mark = skb->mark, 2571 .flowi6_proto = iph->nexthdr, 2572 }; 2573 struct flow_keys *flkeys = NULL, _flkeys; 2574 2575 tun_info = skb_tunnel_info(skb); 2576 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2577 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2578 2579 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2580 flkeys = &_flkeys; 2581 2582 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2583 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2584 skb_dst_drop(skb); 2585 skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, 2586 &fl6, skb, flags)); 2587 } 2588 2589 INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, 2590 struct fib6_table *table, 2591 struct flowi6 *fl6, 2592 const struct sk_buff *skb, 2593 int flags) 2594 { 2595 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2596 } 2597 2598 struct dst_entry *ip6_route_output_flags_noref(struct net *net, 2599 const struct sock *sk, 2600 struct flowi6 *fl6, int flags) 2601 { 2602 bool any_src; 2603 2604 if (ipv6_addr_type(&fl6->daddr) & 2605 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { 2606 struct dst_entry *dst; 2607 2608 /* This function does not take refcnt on the dst */ 2609 dst = l3mdev_link_scope_lookup(net, fl6); 2610 if (dst) 2611 return dst; 2612 } 2613 2614 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2615 2616 flags |= RT6_LOOKUP_F_DST_NOREF; 2617 any_src = ipv6_addr_any(&fl6->saddr); 2618 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2619 (fl6->flowi6_oif && any_src)) 2620 flags |= RT6_LOOKUP_F_IFACE; 2621 2622 if (!any_src) 2623 flags |= RT6_LOOKUP_F_HAS_SADDR; 2624 else if (sk) 2625 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2626 2627 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2628 } 2629 EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); 2630 2631 struct dst_entry *ip6_route_output_flags(struct net *net, 2632 const struct sock *sk, 2633 struct flowi6 *fl6, 2634 int flags) 2635 { 2636 struct dst_entry *dst; 2637 struct rt6_info *rt6; 2638 2639 rcu_read_lock(); 2640 dst = ip6_route_output_flags_noref(net, sk, fl6, flags); 2641 rt6 = (struct rt6_info *)dst; 2642 /* For dst cached in uncached_list, refcnt is already taken. 
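* Everything else needs an explicit dst_hold_safe(); if that fails, the entry is already being torn down, so fall back to the reference-counted null entry instead.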
*/ 2643 if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { 2644 dst = &net->ipv6.ip6_null_entry->dst; 2645 dst_hold(dst); 2646 } 2647 rcu_read_unlock(); 2648 2649 return dst; 2650 } 2651 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2652 2653 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2654 { 2655 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2656 struct net_device *loopback_dev = net->loopback_dev; 2657 struct dst_entry *new = NULL; 2658 2659 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2660 DST_OBSOLETE_DEAD, 0); 2661 if (rt) { 2662 rt6_info_init(rt); 2663 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2664 2665 new = &rt->dst; 2666 new->__use = 1; 2667 new->input = dst_discard; 2668 new->output = dst_discard_out; 2669 2670 dst_copy_metrics(new, &ort->dst); 2671 2672 rt->rt6i_idev = in6_dev_get(loopback_dev); 2673 rt->rt6i_gateway = ort->rt6i_gateway; 2674 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2675 2676 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2677 #ifdef CONFIG_IPV6_SUBTREES 2678 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2679 #endif 2680 } 2681 2682 dst_release(dst_orig); 2683 return new ? new : ERR_PTR(-ENOMEM); 2684 } 2685 2686 /* 2687 * Destination cache support functions 2688 */ 2689 2690 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2691 { 2692 u32 rt_cookie = 0; 2693 2694 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2695 return false; 2696 2697 if (fib6_check_expired(f6i)) 2698 return false; 2699 2700 return true; 2701 } 2702 2703 static struct dst_entry *rt6_check(struct rt6_info *rt, 2704 struct fib6_info *from, 2705 u32 cookie) 2706 { 2707 u32 rt_cookie = 0; 2708 2709 if (!from || !fib6_get_cookie_safe(from, &rt_cookie) || 2710 rt_cookie != cookie) 2711 return NULL; 2712 2713 if (rt6_check_expired(rt)) 2714 return NULL; 2715 2716 return &rt->dst; 2717 } 2718 2719 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2720 struct fib6_info *from, 2721 u32 cookie) 2722 { 2723 if (!__rt6_check_expired(rt) && 2724 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2725 fib6_check(from, cookie)) 2726 return &rt->dst; 2727 else 2728 return NULL; 2729 } 2730 2731 INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, 2732 u32 cookie) 2733 { 2734 struct dst_entry *dst_ret; 2735 struct fib6_info *from; 2736 struct rt6_info *rt; 2737 2738 rt = container_of(dst, struct rt6_info, dst); 2739 2740 if (rt->sernum) 2741 return rt6_is_valid(rt) ? dst : NULL; 2742 2743 rcu_read_lock(); 2744 2745 /* All IPV6 dsts are created with ->obsolete set to the value 2746 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2747 * into this function always. 
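* A NULL return from here tells the caller the dst is stale and a fresh route lookup is required; returning dst revalidates it. Callers pass back the cookie they saved when the dst was stored, e.g. dst->ops->check(dst, inet6_sk(sk)->dst_cookie) in ip6_sk_update_pmtu() below.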
2748 */ 2749 2750 from = rcu_dereference(rt->from); 2751 2752 if (from && (rt->rt6i_flags & RTF_PCPU || 2753 unlikely(!list_empty(&rt->rt6i_uncached)))) 2754 dst_ret = rt6_dst_from_check(rt, from, cookie); 2755 else 2756 dst_ret = rt6_check(rt, from, cookie); 2757 2758 rcu_read_unlock(); 2759 2760 return dst_ret; 2761 } 2762 EXPORT_INDIRECT_CALLABLE(ip6_dst_check); 2763 2764 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2765 { 2766 struct rt6_info *rt = (struct rt6_info *) dst; 2767 2768 if (rt) { 2769 if (rt->rt6i_flags & RTF_CACHE) { 2770 rcu_read_lock(); 2771 if (rt6_check_expired(rt)) { 2772 rt6_remove_exception_rt(rt); 2773 dst = NULL; 2774 } 2775 rcu_read_unlock(); 2776 } else { 2777 dst_release(dst); 2778 dst = NULL; 2779 } 2780 } 2781 return dst; 2782 } 2783 2784 static void ip6_link_failure(struct sk_buff *skb) 2785 { 2786 struct rt6_info *rt; 2787 2788 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2789 2790 rt = (struct rt6_info *) skb_dst(skb); 2791 if (rt) { 2792 rcu_read_lock(); 2793 if (rt->rt6i_flags & RTF_CACHE) { 2794 rt6_remove_exception_rt(rt); 2795 } else { 2796 struct fib6_info *from; 2797 struct fib6_node *fn; 2798 2799 from = rcu_dereference(rt->from); 2800 if (from) { 2801 fn = rcu_dereference(from->fib6_node); 2802 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2803 fn->fn_sernum = -1; 2804 } 2805 } 2806 rcu_read_unlock(); 2807 } 2808 } 2809 2810 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2811 { 2812 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2813 struct fib6_info *from; 2814 2815 rcu_read_lock(); 2816 from = rcu_dereference(rt0->from); 2817 if (from) 2818 rt0->dst.expires = from->expires; 2819 rcu_read_unlock(); 2820 } 2821 2822 dst_set_expires(&rt0->dst, timeout); 2823 rt0->rt6i_flags |= RTF_EXPIRES; 2824 } 2825 2826 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2827 { 2828 struct net *net = dev_net(rt->dst.dev); 2829 2830 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2831 rt->rt6i_flags |= RTF_MODIFIED; 2832 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2833 } 2834 2835 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2836 { 2837 return !(rt->rt6i_flags & RTF_CACHE) && 2838 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2839 } 2840 2841 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2842 const struct ipv6hdr *iph, u32 mtu, 2843 bool confirm_neigh) 2844 { 2845 const struct in6_addr *daddr, *saddr; 2846 struct rt6_info *rt6 = (struct rt6_info *)dst; 2847 2848 /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU) 2849 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it. 
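* The only hard floor is IPV6_MIN_MTU (1280): updates below it are ignored, as are increases, since a cached PMTU may only shrink here.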
2850 * [see also comment in rt6_mtu_change_route()] 2851 */ 2852 2853 if (iph) { 2854 daddr = &iph->daddr; 2855 saddr = &iph->saddr; 2856 } else if (sk) { 2857 daddr = &sk->sk_v6_daddr; 2858 saddr = &inet6_sk(sk)->saddr; 2859 } else { 2860 daddr = NULL; 2861 saddr = NULL; 2862 } 2863 2864 if (confirm_neigh) 2865 dst_confirm_neigh(dst, daddr); 2866 2867 if (mtu < IPV6_MIN_MTU) 2868 return; 2869 if (mtu >= dst_mtu(dst)) 2870 return; 2871 2872 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2873 rt6_do_update_pmtu(rt6, mtu); 2874 /* update rt6_ex->stamp for cache */ 2875 if (rt6->rt6i_flags & RTF_CACHE) 2876 rt6_update_exception_stamp_rt(rt6); 2877 } else if (daddr) { 2878 struct fib6_result res = {}; 2879 struct rt6_info *nrt6; 2880 2881 rcu_read_lock(); 2882 res.f6i = rcu_dereference(rt6->from); 2883 if (!res.f6i) 2884 goto out_unlock; 2885 2886 res.fib6_flags = res.f6i->fib6_flags; 2887 res.fib6_type = res.f6i->fib6_type; 2888 2889 if (res.f6i->nh) { 2890 struct fib6_nh_match_arg arg = { 2891 .dev = dst->dev, 2892 .gw = &rt6->rt6i_gateway, 2893 }; 2894 2895 nexthop_for_each_fib6_nh(res.f6i->nh, 2896 fib6_nh_find_match, &arg); 2897 2898 /* fib6_info uses a nexthop that does not have fib6_nh 2899 * using the dst->dev + gw. Should be impossible. 2900 */ 2901 if (!arg.match) 2902 goto out_unlock; 2903 2904 res.nh = arg.match; 2905 } else { 2906 res.nh = res.f6i->fib6_nh; 2907 } 2908 2909 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); 2910 if (nrt6) { 2911 rt6_do_update_pmtu(nrt6, mtu); 2912 if (rt6_insert_exception(nrt6, &res)) 2913 dst_release_immediate(&nrt6->dst); 2914 } 2915 out_unlock: 2916 rcu_read_unlock(); 2917 } 2918 } 2919 2920 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2921 struct sk_buff *skb, u32 mtu, 2922 bool confirm_neigh) 2923 { 2924 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu, 2925 confirm_neigh); 2926 } 2927 2928 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2929 int oif, u32 mark, kuid_t uid) 2930 { 2931 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2932 struct dst_entry *dst; 2933 struct flowi6 fl6 = { 2934 .flowi6_oif = oif, 2935 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2936 .daddr = iph->daddr, 2937 .saddr = iph->saddr, 2938 .flowlabel = ip6_flowinfo(iph), 2939 .flowi6_uid = uid, 2940 }; 2941 2942 dst = ip6_route_output(net, NULL, &fl6); 2943 if (!dst->error) 2944 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true); 2945 dst_release(dst); 2946 } 2947 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2948 2949 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2950 { 2951 int oif = sk->sk_bound_dev_if; 2952 struct dst_entry *dst; 2953 2954 if (!oif && skb->dev) 2955 oif = l3mdev_master_ifindex(skb->dev); 2956 2957 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2958 2959 dst = __sk_dst_get(sk); 2960 if (!dst || !dst->obsolete || 2961 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2962 return; 2963 2964 bh_lock_sock(sk); 2965 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2966 ip6_datagram_dst_update(sk, false); 2967 bh_unlock_sock(sk); 2968 } 2969 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2970 2971 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2972 const struct flowi6 *fl6) 2973 { 2974 #ifdef CONFIG_IPV6_SUBTREES 2975 struct ipv6_pinfo *np = inet6_sk(sk); 2976 #endif 2977 2978 ip6_dst_store(sk, dst, 2979 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 
2980 &sk->sk_v6_daddr : NULL, 2981 #ifdef CONFIG_IPV6_SUBTREES 2982 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2983 &np->saddr : 2984 #endif 2985 NULL); 2986 } 2987 2988 static bool ip6_redirect_nh_match(const struct fib6_result *res, 2989 struct flowi6 *fl6, 2990 const struct in6_addr *gw, 2991 struct rt6_info **ret) 2992 { 2993 const struct fib6_nh *nh = res->nh; 2994 2995 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || 2996 fl6->flowi6_oif != nh->fib_nh_dev->ifindex) 2997 return false; 2998 2999 /* rt_cache's gateway might be different from its 'parent' 3000 * in the case of an ip redirect. 3001 * So we keep searching in the exception table if the gateway 3002 * is different. 3003 */ 3004 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { 3005 struct rt6_info *rt_cache; 3006 3007 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); 3008 if (rt_cache && 3009 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { 3010 *ret = rt_cache; 3011 return true; 3012 } 3013 return false; 3014 } 3015 return true; 3016 } 3017 3018 struct fib6_nh_rd_arg { 3019 struct fib6_result *res; 3020 struct flowi6 *fl6; 3021 const struct in6_addr *gw; 3022 struct rt6_info **ret; 3023 }; 3024 3025 static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) 3026 { 3027 struct fib6_nh_rd_arg *arg = _arg; 3028 3029 arg->res->nh = nh; 3030 return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); 3031 } 3032 3033 /* Handle redirects */ 3034 struct ip6rd_flowi { 3035 struct flowi6 fl6; 3036 struct in6_addr gateway; 3037 }; 3038 3039 INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net, 3040 struct fib6_table *table, 3041 struct flowi6 *fl6, 3042 const struct sk_buff *skb, 3043 int flags) 3044 { 3045 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 3046 struct rt6_info *ret = NULL; 3047 struct fib6_result res = {}; 3048 struct fib6_nh_rd_arg arg = { 3049 .res = &res, 3050 .fl6 = fl6, 3051 .gw = &rdfl->gateway, 3052 .ret = &ret 3053 }; 3054 struct fib6_info *rt; 3055 struct fib6_node *fn; 3056 3057 /* l3mdev_update_flow overrides oif if the device is enslaved; in 3058 * this case we must match on the real ingress device, so reset it 3059 */ 3060 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 3061 fl6->flowi6_oif = skb->dev->ifindex; 3062 3063 /* Get the "current" route for this destination and 3064 * check if the redirect has come from appropriate router. 3065 * 3066 * RFC 4861 specifies that redirects should only be 3067 * accepted if they come from the nexthop to the target. 3068 * Due to the way the routes are chosen, this notion 3069 * is a bit fuzzy and one might need to check all possible 3070 * routes. 
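* Hence the walk below visits every route on the matching node (and, for a multipath nexthop object, every fib6_nh), accepting the first one whose gateway, or whose cached exception's gateway, matches the router that sent the redirect.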
3071 */ 3072 3073 rcu_read_lock(); 3074 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 3075 restart: 3076 for_each_fib6_node_rt_rcu(fn) { 3077 res.f6i = rt; 3078 if (fib6_check_expired(rt)) 3079 continue; 3080 if (rt->fib6_flags & RTF_REJECT) 3081 break; 3082 if (unlikely(rt->nh)) { 3083 if (nexthop_is_blackhole(rt->nh)) 3084 continue; 3085 /* on match, res->nh is filled in and potentially ret */ 3086 if (nexthop_for_each_fib6_nh(rt->nh, 3087 fib6_nh_redirect_match, 3088 &arg)) 3089 goto out; 3090 } else { 3091 res.nh = rt->fib6_nh; 3092 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, 3093 &ret)) 3094 goto out; 3095 } 3096 } 3097 3098 if (!rt) 3099 rt = net->ipv6.fib6_null_entry; 3100 else if (rt->fib6_flags & RTF_REJECT) { 3101 ret = net->ipv6.ip6_null_entry; 3102 goto out; 3103 } 3104 3105 if (rt == net->ipv6.fib6_null_entry) { 3106 fn = fib6_backtrack(fn, &fl6->saddr); 3107 if (fn) 3108 goto restart; 3109 } 3110 3111 res.f6i = rt; 3112 res.nh = rt->fib6_nh; 3113 out: 3114 if (ret) { 3115 ip6_hold_safe(net, &ret); 3116 } else { 3117 res.fib6_flags = res.f6i->fib6_flags; 3118 res.fib6_type = res.f6i->fib6_type; 3119 ret = ip6_create_rt_rcu(&res); 3120 } 3121 3122 rcu_read_unlock(); 3123 3124 trace_fib6_table_lookup(net, &res, table, fl6); 3125 return ret; 3126 } 3127 3128 static struct dst_entry *ip6_route_redirect(struct net *net, 3129 const struct flowi6 *fl6, 3130 const struct sk_buff *skb, 3131 const struct in6_addr *gateway) 3132 { 3133 int flags = RT6_LOOKUP_F_HAS_SADDR; 3134 struct ip6rd_flowi rdfl; 3135 3136 rdfl.fl6 = *fl6; 3137 rdfl.gateway = *gateway; 3138 3139 return fib6_rule_lookup(net, &rdfl.fl6, skb, 3140 flags, __ip6_route_redirect); 3141 } 3142 3143 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 3144 kuid_t uid) 3145 { 3146 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 3147 struct dst_entry *dst; 3148 struct flowi6 fl6 = { 3149 .flowi6_iif = LOOPBACK_IFINDEX, 3150 .flowi6_oif = oif, 3151 .flowi6_mark = mark, 3152 .daddr = iph->daddr, 3153 .saddr = iph->saddr, 3154 .flowlabel = ip6_flowinfo(iph), 3155 .flowi6_uid = uid, 3156 }; 3157 3158 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 3159 rt6_do_redirect(dst, NULL, skb); 3160 dst_release(dst); 3161 } 3162 EXPORT_SYMBOL_GPL(ip6_redirect); 3163 3164 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 3165 { 3166 const struct ipv6hdr *iph = ipv6_hdr(skb); 3167 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 3168 struct dst_entry *dst; 3169 struct flowi6 fl6 = { 3170 .flowi6_iif = LOOPBACK_IFINDEX, 3171 .flowi6_oif = oif, 3172 .daddr = msg->dest, 3173 .saddr = iph->daddr, 3174 .flowi6_uid = sock_net_uid(net, NULL), 3175 }; 3176 3177 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 3178 rt6_do_redirect(dst, NULL, skb); 3179 dst_release(dst); 3180 } 3181 3182 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 3183 { 3184 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 3185 sk->sk_uid); 3186 } 3187 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 3188 3189 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 3190 { 3191 struct net_device *dev = dst->dev; 3192 unsigned int mtu = dst_mtu(dst); 3193 struct net *net = dev_net(dev); 3194 3195 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 3196 3197 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 3198 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 3199 3200 /* 3201 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 3202
corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 3203 * IPV6_MAXPLEN is also valid and means: "any MSS, 3204 * rely only on pmtu discovery" 3205 */ 3206 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 3207 mtu = IPV6_MAXPLEN; 3208 return mtu; 3209 } 3210 3211 INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) 3212 { 3213 return ip6_dst_mtu_maybe_forward(dst, false); 3214 } 3215 EXPORT_INDIRECT_CALLABLE(ip6_mtu); 3216 3217 /* MTU selection: 3218 * 1. mtu on route is locked - use it 3219 * 2. mtu from nexthop exception 3220 * 3. mtu from egress device 3221 * 3222 * based on ip6_dst_mtu_forward and exception logic of 3223 * rt6_find_cached_rt; called with rcu_read_lock 3224 */ 3225 u32 ip6_mtu_from_fib6(const struct fib6_result *res, 3226 const struct in6_addr *daddr, 3227 const struct in6_addr *saddr) 3228 { 3229 const struct fib6_nh *nh = res->nh; 3230 struct fib6_info *f6i = res->f6i; 3231 struct inet6_dev *idev; 3232 struct rt6_info *rt; 3233 u32 mtu = 0; 3234 3235 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 3236 mtu = f6i->fib6_pmtu; 3237 if (mtu) 3238 goto out; 3239 } 3240 3241 rt = rt6_find_cached_rt(res, daddr, saddr); 3242 if (unlikely(rt)) { 3243 mtu = dst_metric_raw(&rt->dst, RTAX_MTU); 3244 } else { 3245 struct net_device *dev = nh->fib_nh_dev; 3246 3247 mtu = IPV6_MIN_MTU; 3248 idev = __in6_dev_get(dev); 3249 if (idev && idev->cnf.mtu6 > mtu) 3250 mtu = idev->cnf.mtu6; 3251 } 3252 3253 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 3254 out: 3255 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); 3256 } 3257 3258 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 3259 struct flowi6 *fl6) 3260 { 3261 struct dst_entry *dst; 3262 struct rt6_info *rt; 3263 struct inet6_dev *idev = in6_dev_get(dev); 3264 struct net *net = dev_net(dev); 3265 3266 if (unlikely(!idev)) 3267 return ERR_PTR(-ENODEV); 3268 3269 rt = ip6_dst_alloc(net, dev, 0); 3270 if (unlikely(!rt)) { 3271 in6_dev_put(idev); 3272 dst = ERR_PTR(-ENOMEM); 3273 goto out; 3274 } 3275 3276 rt->dst.input = ip6_input; 3277 rt->dst.output = ip6_output; 3278 rt->rt6i_gateway = fl6->daddr; 3279 rt->rt6i_dst.addr = fl6->daddr; 3280 rt->rt6i_dst.plen = 128; 3281 rt->rt6i_idev = idev; 3282 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 3283 3284 /* Add this dst into uncached_list so that rt6_disable_ip() can 3285 * do proper release of the net_device 3286 */ 3287 rt6_uncached_list_add(rt); 3288 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 3289 3290 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 3291 3292 out: 3293 return dst; 3294 } 3295 3296 static int ip6_dst_gc(struct dst_ops *ops) 3297 { 3298 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 3299 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 3300 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 3301 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 3302 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 3303 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 3304 int entries; 3305 3306 entries = dst_entries_get_fast(ops); 3307 if (entries > rt_max_size) 3308 entries = dst_entries_get_slow(ops); 3309 3310 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 3311 entries <= rt_max_size) 3312 goto out; 3313 3314 net->ipv6.ip6_rt_gc_expire++; 3315 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 3316 entries = dst_entries_get_slow(ops); 3317 if (entries < ops->gc_thresh) 3318 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 3319 out: 3320 net->ipv6.ip6_rt_gc_expire -= 
net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 3321 return entries > rt_max_size; 3322 } 3323 3324 static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, 3325 const struct in6_addr *gw_addr, u32 tbid, 3326 int flags, struct fib6_result *res) 3327 { 3328 struct flowi6 fl6 = { 3329 .flowi6_oif = cfg->fc_ifindex, 3330 .daddr = *gw_addr, 3331 .saddr = cfg->fc_prefsrc, 3332 }; 3333 struct fib6_table *table; 3334 int err; 3335 3336 table = fib6_get_table(net, tbid); 3337 if (!table) 3338 return -EINVAL; 3339 3340 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 3341 flags |= RT6_LOOKUP_F_HAS_SADDR; 3342 3343 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 3344 3345 err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); 3346 if (!err && res->f6i != net->ipv6.fib6_null_entry) 3347 fib6_select_path(net, res, &fl6, cfg->fc_ifindex, 3348 cfg->fc_ifindex != 0, NULL, flags); 3349 3350 return err; 3351 } 3352 3353 static int ip6_route_check_nh_onlink(struct net *net, 3354 struct fib6_config *cfg, 3355 const struct net_device *dev, 3356 struct netlink_ext_ack *extack) 3357 { 3358 u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; 3359 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3360 struct fib6_result res = {}; 3361 int err; 3362 3363 err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); 3364 if (!err && !(res.fib6_flags & RTF_REJECT) && 3365 /* ignore match if it is the default route */ 3366 !ipv6_addr_any(&res.f6i->fib6_dst.addr) && 3367 (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { 3368 NL_SET_ERR_MSG(extack, 3369 "Nexthop has invalid gateway or device mismatch"); 3370 err = -EINVAL; 3371 } 3372 3373 return err; 3374 } 3375 3376 static int ip6_route_check_nh(struct net *net, 3377 struct fib6_config *cfg, 3378 struct net_device **_dev, 3379 struct inet6_dev **idev) 3380 { 3381 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3382 struct net_device *dev = _dev ? *_dev : NULL; 3383 int flags = RT6_LOOKUP_F_IFACE; 3384 struct fib6_result res = {}; 3385 int err = -EHOSTUNREACH; 3386 3387 if (cfg->fc_table) { 3388 err = ip6_nh_lookup_table(net, cfg, gw_addr, 3389 cfg->fc_table, flags, &res); 3390 /* gw_addr can not require a gateway or resolve to a reject 3391 * route. If a device is given, it must match the result. 3392 */ 3393 if (err || res.fib6_flags & RTF_REJECT || 3394 res.nh->fib_nh_gw_family || 3395 (dev && dev != res.nh->fib_nh_dev)) 3396 err = -EHOSTUNREACH; 3397 } 3398 3399 if (err < 0) { 3400 struct flowi6 fl6 = { 3401 .flowi6_oif = cfg->fc_ifindex, 3402 .daddr = *gw_addr, 3403 }; 3404 3405 err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); 3406 if (err || res.fib6_flags & RTF_REJECT || 3407 res.nh->fib_nh_gw_family) 3408 err = -EHOSTUNREACH; 3409 3410 if (err) 3411 return err; 3412 3413 fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, 3414 cfg->fc_ifindex != 0, NULL, flags); 3415 } 3416 3417 err = 0; 3418 if (dev) { 3419 if (dev != res.nh->fib_nh_dev) 3420 err = -EHOSTUNREACH; 3421 } else { 3422 *_dev = dev = res.nh->fib_nh_dev; 3423 dev_hold(dev); 3424 *idev = in6_dev_get(dev); 3425 } 3426 3427 return err; 3428 } 3429 3430 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 3431 struct net_device **_dev, struct inet6_dev **idev, 3432 struct netlink_ext_ack *extack) 3433 { 3434 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3435 int gwa_type = ipv6_addr_type(gw_addr); 3436 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? 
false : true; 3437 const struct net_device *dev = *_dev; 3438 bool need_addr_check = !dev; 3439 int err = -EINVAL; 3440 3441 /* if gw_addr is local we will fail to detect this in case the 3442 * address is still TENTATIVE (DAD in progress). rt6_lookup() 3443 * will return the already-added prefix route via the interface that 3444 * the prefix route was assigned to, which might be non-loopback. 3445 */ 3446 if (dev && 3447 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 3448 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 3449 goto out; 3450 } 3451 3452 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 3453 /* IPv6 strictly inhibits using non-link-local 3454 * addresses as a nexthop address; 3455 * otherwise, the router will not be able to send redirects. 3456 * That is very good, but in some (rare!) circumstances 3457 * (SIT, PtP, NBMA NOARP links) it is handy to allow 3458 * some exceptions. --ANK 3459 * We allow IPv4-mapped nexthops to support RFC4798-type 3460 * addressing 3461 */ 3462 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 3463 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 3464 goto out; 3465 } 3466 3467 rcu_read_lock(); 3468 3469 if (cfg->fc_flags & RTNH_F_ONLINK) 3470 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 3471 else 3472 err = ip6_route_check_nh(net, cfg, _dev, idev); 3473 3474 rcu_read_unlock(); 3475 3476 if (err) 3477 goto out; 3478 } 3479 3480 /* reload in case device was changed */ 3481 dev = *_dev; 3482 3483 err = -EINVAL; 3484 if (!dev) { 3485 NL_SET_ERR_MSG(extack, "Egress device not specified"); 3486 goto out; 3487 } else if (dev->flags & IFF_LOOPBACK) { 3488 NL_SET_ERR_MSG(extack, 3489 "Egress device can not be loopback device for this route"); 3490 goto out; 3491 } 3492 3493 /* if we did not check gw_addr above, do so now that the 3494 * egress device has been resolved.
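* This covers the case where the caller supplied no device and one was discovered via the route lookup in ip6_route_check_nh(); need_addr_check records exactly that.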
3495 */ 3496 if (need_addr_check && 3497 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 3498 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 3499 goto out; 3500 } 3501 3502 err = 0; 3503 out: 3504 return err; 3505 } 3506 3507 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) 3508 { 3509 if ((flags & RTF_REJECT) || 3510 (dev && (dev->flags & IFF_LOOPBACK) && 3511 !(addr_type & IPV6_ADDR_LOOPBACK) && 3512 !(flags & (RTF_ANYCAST | RTF_LOCAL)))) 3513 return true; 3514 3515 return false; 3516 } 3517 3518 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, 3519 struct fib6_config *cfg, gfp_t gfp_flags, 3520 struct netlink_ext_ack *extack) 3521 { 3522 struct net_device *dev = NULL; 3523 struct inet6_dev *idev = NULL; 3524 int addr_type; 3525 int err; 3526 3527 fib6_nh->fib_nh_family = AF_INET6; 3528 #ifdef CONFIG_IPV6_ROUTER_PREF 3529 fib6_nh->last_probe = jiffies; 3530 #endif 3531 if (cfg->fc_is_fdb) { 3532 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3533 fib6_nh->fib_nh_gw_family = AF_INET6; 3534 return 0; 3535 } 3536 3537 err = -ENODEV; 3538 if (cfg->fc_ifindex) { 3539 dev = dev_get_by_index(net, cfg->fc_ifindex); 3540 if (!dev) 3541 goto out; 3542 idev = in6_dev_get(dev); 3543 if (!idev) 3544 goto out; 3545 } 3546 3547 if (cfg->fc_flags & RTNH_F_ONLINK) { 3548 if (!dev) { 3549 NL_SET_ERR_MSG(extack, 3550 "Nexthop device required for onlink"); 3551 goto out; 3552 } 3553 3554 if (!(dev->flags & IFF_UP)) { 3555 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3556 err = -ENETDOWN; 3557 goto out; 3558 } 3559 3560 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; 3561 } 3562 3563 fib6_nh->fib_nh_weight = 1; 3564 3565 /* We cannot add true routes via loopback here, 3566 * they would result in kernel looping; promote them to reject routes 3567 */ 3568 addr_type = ipv6_addr_type(&cfg->fc_dst); 3569 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { 3570 /* hold loopback dev/idev if we haven't done so. 
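* and drop any other device we may already hold: reject routes are always bound to the loopback device. */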
*/ 3571 if (dev != net->loopback_dev) { 3572 if (dev) { 3573 dev_put(dev); 3574 in6_dev_put(idev); 3575 } 3576 dev = net->loopback_dev; 3577 dev_hold(dev); 3578 idev = in6_dev_get(dev); 3579 if (!idev) { 3580 err = -ENODEV; 3581 goto out; 3582 } 3583 } 3584 goto pcpu_alloc; 3585 } 3586 3587 if (cfg->fc_flags & RTF_GATEWAY) { 3588 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3589 if (err) 3590 goto out; 3591 3592 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3593 fib6_nh->fib_nh_gw_family = AF_INET6; 3594 } 3595 3596 err = -ENODEV; 3597 if (!dev) 3598 goto out; 3599 3600 if (idev->cnf.disable_ipv6) { 3601 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3602 err = -EACCES; 3603 goto out; 3604 } 3605 3606 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { 3607 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3608 err = -ENETDOWN; 3609 goto out; 3610 } 3611 3612 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3613 !netif_carrier_ok(dev)) 3614 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 3615 3616 err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap, 3617 cfg->fc_encap_type, cfg, gfp_flags, extack); 3618 if (err) 3619 goto out; 3620 3621 pcpu_alloc: 3622 fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); 3623 if (!fib6_nh->rt6i_pcpu) { 3624 err = -ENOMEM; 3625 goto out; 3626 } 3627 3628 fib6_nh->fib_nh_dev = dev; 3629 fib6_nh->fib_nh_oif = dev->ifindex; 3630 err = 0; 3631 out: 3632 if (idev) 3633 in6_dev_put(idev); 3634 3635 if (err) { 3636 lwtstate_put(fib6_nh->fib_nh_lws); 3637 fib6_nh->fib_nh_lws = NULL; 3638 dev_put(dev); 3639 } 3640 3641 return err; 3642 } 3643 3644 void fib6_nh_release(struct fib6_nh *fib6_nh) 3645 { 3646 struct rt6_exception_bucket *bucket; 3647 3648 rcu_read_lock(); 3649 3650 fib6_nh_flush_exceptions(fib6_nh, NULL); 3651 bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); 3652 if (bucket) { 3653 rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); 3654 kfree(bucket); 3655 } 3656 3657 rcu_read_unlock(); 3658 3659 fib6_nh_release_dsts(fib6_nh); 3660 free_percpu(fib6_nh->rt6i_pcpu); 3661 3662 fib_nh_common_release(&fib6_nh->nh_common); 3663 } 3664 3665 void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) 3666 { 3667 int cpu; 3668 3669 if (!fib6_nh->rt6i_pcpu) 3670 return; 3671 3672 for_each_possible_cpu(cpu) { 3673 struct rt6_info *pcpu_rt, **ppcpu_rt; 3674 3675 ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); 3676 pcpu_rt = xchg(ppcpu_rt, NULL); 3677 if (pcpu_rt) { 3678 dst_dev_put(&pcpu_rt->dst); 3679 dst_release(&pcpu_rt->dst); 3680 } 3681 } 3682 } 3683 3684 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3685 gfp_t gfp_flags, 3686 struct netlink_ext_ack *extack) 3687 { 3688 struct net *net = cfg->fc_nlinfo.nl_net; 3689 struct fib6_info *rt = NULL; 3690 struct nexthop *nh = NULL; 3691 struct fib6_table *table; 3692 struct fib6_nh *fib6_nh; 3693 int err = -EINVAL; 3694 int addr_type; 3695 3696 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3697 if (cfg->fc_flags & RTF_PCPU) { 3698 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3699 goto out; 3700 } 3701 3702 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3703 if (cfg->fc_flags & RTF_CACHE) { 3704 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3705 goto out; 3706 } 3707 3708 if (cfg->fc_type > RTN_MAX) { 3709 NL_SET_ERR_MSG(extack, "Invalid route type"); 3710 goto out; 3711 } 3712 3713 if (cfg->fc_dst_len > 128) { 3714 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3715 goto out; 
3716 } 3717 if (cfg->fc_src_len > 128) { 3718 NL_SET_ERR_MSG(extack, "Invalid source address length"); 3719 goto out; 3720 } 3721 #ifndef CONFIG_IPV6_SUBTREES 3722 if (cfg->fc_src_len) { 3723 NL_SET_ERR_MSG(extack, 3724 "Specifying source address requires IPV6_SUBTREES to be enabled"); 3725 goto out; 3726 } 3727 #endif 3728 if (cfg->fc_nh_id) { 3729 nh = nexthop_find_by_id(net, cfg->fc_nh_id); 3730 if (!nh) { 3731 NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); 3732 goto out; 3733 } 3734 err = fib6_check_nexthop(nh, cfg, extack); 3735 if (err) 3736 goto out; 3737 } 3738 3739 err = -ENOBUFS; 3740 if (cfg->fc_nlinfo.nlh && 3741 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3742 table = fib6_get_table(net, cfg->fc_table); 3743 if (!table) { 3744 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3745 table = fib6_new_table(net, cfg->fc_table); 3746 } 3747 } else { 3748 table = fib6_new_table(net, cfg->fc_table); 3749 } 3750 3751 if (!table) 3752 goto out; 3753 3754 err = -ENOMEM; 3755 rt = fib6_info_alloc(gfp_flags, !nh); 3756 if (!rt) 3757 goto out; 3758 3759 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 3760 extack); 3761 if (IS_ERR(rt->fib6_metrics)) { 3762 err = PTR_ERR(rt->fib6_metrics); 3763 /* Do not leave garbage there. */ 3764 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3765 goto out_free; 3766 } 3767 3768 if (cfg->fc_flags & RTF_ADDRCONF) 3769 rt->dst_nocount = true; 3770 3771 if (cfg->fc_flags & RTF_EXPIRES) 3772 fib6_set_expires(rt, jiffies + 3773 clock_t_to_jiffies(cfg->fc_expires)); 3774 else 3775 fib6_clean_expires(rt); 3776 3777 if (cfg->fc_protocol == RTPROT_UNSPEC) 3778 cfg->fc_protocol = RTPROT_BOOT; 3779 rt->fib6_protocol = cfg->fc_protocol; 3780 3781 rt->fib6_table = table; 3782 rt->fib6_metric = cfg->fc_metric; 3783 rt->fib6_type = cfg->fc_type ? 
: RTN_UNICAST; 3784 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3785 3786 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3787 rt->fib6_dst.plen = cfg->fc_dst_len; 3788 3789 #ifdef CONFIG_IPV6_SUBTREES 3790 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3791 rt->fib6_src.plen = cfg->fc_src_len; 3792 #endif 3793 if (nh) { 3794 if (rt->fib6_src.plen) { 3795 NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); 3796 goto out_free; 3797 } 3798 if (!nexthop_get(nh)) { 3799 NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); 3800 goto out_free; 3801 } 3802 rt->nh = nh; 3803 fib6_nh = nexthop_fib6_nh(rt->nh); 3804 } else { 3805 err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); 3806 if (err) 3807 goto out; 3808 3809 fib6_nh = rt->fib6_nh; 3810 3811 /* We cannot add true routes via loopback here, they would 3812 * result in kernel looping; promote them to reject routes 3813 */ 3814 addr_type = ipv6_addr_type(&cfg->fc_dst); 3815 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, 3816 addr_type)) 3817 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3818 } 3819 3820 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3821 struct net_device *dev = fib6_nh->fib_nh_dev; 3822 3823 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3824 NL_SET_ERR_MSG(extack, "Invalid source address"); 3825 err = -EINVAL; 3826 goto out; 3827 } 3828 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3829 rt->fib6_prefsrc.plen = 128; 3830 } else 3831 rt->fib6_prefsrc.plen = 0; 3832 3833 return rt; 3834 out: 3835 fib6_info_release(rt); 3836 return ERR_PTR(err); 3837 out_free: 3838 ip_fib_metrics_put(rt->fib6_metrics); 3839 kfree(rt); 3840 return ERR_PTR(err); 3841 } 3842 3843 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3844 struct netlink_ext_ack *extack) 3845 { 3846 struct fib6_info *rt; 3847 int err; 3848 3849 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3850 if (IS_ERR(rt)) 3851 return PTR_ERR(rt); 3852 3853 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3854 fib6_info_release(rt); 3855 3856 return err; 3857 } 3858 3859 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3860 { 3861 struct net *net = info->nl_net; 3862 struct fib6_table *table; 3863 int err; 3864 3865 if (rt == net->ipv6.fib6_null_entry) { 3866 err = -ENOENT; 3867 goto out; 3868 } 3869 3870 table = rt->fib6_table; 3871 spin_lock_bh(&table->tb6_lock); 3872 err = fib6_del(rt, info); 3873 spin_unlock_bh(&table->tb6_lock); 3874 3875 out: 3876 fib6_info_release(rt); 3877 return err; 3878 } 3879 3880 int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) 3881 { 3882 struct nl_info info = { 3883 .nl_net = net, 3884 .skip_notify = skip_notify 3885 }; 3886 3887 return __ip6_del_rt(rt, &info); 3888 } 3889 3890 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3891 { 3892 struct nl_info *info = &cfg->fc_nlinfo; 3893 struct net *net = info->nl_net; 3894 struct sk_buff *skb = NULL; 3895 struct fib6_table *table; 3896 int err = -ENOENT; 3897 3898 if (rt == net->ipv6.fib6_null_entry) 3899 goto out_put; 3900 table = rt->fib6_table; 3901 spin_lock_bh(&table->tb6_lock); 3902 3903 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3904 struct fib6_info *sibling, *next_sibling; 3905 struct fib6_node *fn; 3906 3907 /* prefer to send a single notification with all hops */ 3908 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3909 if (skb) { 3910 u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; 3911 3912 if (rt6_fill_node(net, skb, rt, NULL, 3913 NULL, NULL, 0, RTM_DELROUTE, 3914 info->portid, seq, 0) < 0) { 3915 kfree_skb(skb); 3916 skb = NULL; 3917 } else 3918 info->skip_notify = 1; 3919 } 3920 3921 /* 'rt' points to the first sibling route. If it is not the 3922 * leaf, then we do not need to send a notification. Otherwise, 3923 * we need to check if the last sibling has a next route or not 3924 * and emit a replace or delete notification, respectively. 3925 */ 3926 info->skip_notify_kernel = 1; 3927 fn = rcu_dereference_protected(rt->fib6_node, 3928 lockdep_is_held(&table->tb6_lock)); 3929 if (rcu_access_pointer(fn->leaf) == rt) { 3930 struct fib6_info *last_sibling, *replace_rt; 3931 3932 last_sibling = list_last_entry(&rt->fib6_siblings, 3933 struct fib6_info, 3934 fib6_siblings); 3935 replace_rt = rcu_dereference_protected( 3936 last_sibling->fib6_next, 3937 lockdep_is_held(&table->tb6_lock)); 3938 if (replace_rt) 3939 call_fib6_entry_notifiers_replace(net, 3940 replace_rt); 3941 else 3942 call_fib6_multipath_entry_notifiers(net, 3943 FIB_EVENT_ENTRY_DEL, 3944 rt, rt->fib6_nsiblings, 3945 NULL); 3946 } 3947 list_for_each_entry_safe(sibling, next_sibling, 3948 &rt->fib6_siblings, 3949 fib6_siblings) { 3950 err = fib6_del(sibling, info); 3951 if (err) 3952 goto out_unlock; 3953 } 3954 } 3955 3956 err = fib6_del(rt, info); 3957 out_unlock: 3958 spin_unlock_bh(&table->tb6_lock); 3959 out_put: 3960 fib6_info_release(rt); 3961 3962 if (skb) { 3963 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3964 info->nlh, gfp_any()); 3965 } 3966 return err; 3967 } 3968 3969 static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3970 { 3971 int rc = -ESRCH; 3972 3973 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3974 goto out; 3975 3976 if (cfg->fc_flags & RTF_GATEWAY && 3977 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3978 goto out; 3979 3980 rc = rt6_remove_exception_rt(rt); 3981 out: 3982 return rc; 3983 } 3984 3985 static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, 3986 struct fib6_nh *nh) 3987 { 3988 struct fib6_result res = { 3989 .f6i = rt, 3990 .nh = nh, 3991 }; 3992 struct rt6_info *rt_cache; 3993 3994 rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); 3995 if (rt_cache) 3996 return __ip6_del_cached_rt(rt_cache, cfg); 3997 3998 return 0; 3999 } 4000 4001 struct fib6_nh_del_cached_rt_arg { 4002 struct fib6_config *cfg; 4003 struct fib6_info *f6i; 4004 }; 4005 4006 static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) 4007 { 4008 struct fib6_nh_del_cached_rt_arg *arg = _arg; 4009 int rc; 4010 4011 rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); 4012 return rc != -ESRCH ? 
rc : 0; 4013 } 4014 4015 static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) 4016 { 4017 struct fib6_nh_del_cached_rt_arg arg = { 4018 .cfg = cfg, 4019 .f6i = f6i 4020 }; 4021 4022 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); 4023 } 4024 4025 static int ip6_route_del(struct fib6_config *cfg, 4026 struct netlink_ext_ack *extack) 4027 { 4028 struct fib6_table *table; 4029 struct fib6_info *rt; 4030 struct fib6_node *fn; 4031 int err = -ESRCH; 4032 4033 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 4034 if (!table) { 4035 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 4036 return err; 4037 } 4038 4039 rcu_read_lock(); 4040 4041 fn = fib6_locate(&table->tb6_root, 4042 &cfg->fc_dst, cfg->fc_dst_len, 4043 &cfg->fc_src, cfg->fc_src_len, 4044 !(cfg->fc_flags & RTF_CACHE)); 4045 4046 if (fn) { 4047 for_each_fib6_node_rt_rcu(fn) { 4048 struct fib6_nh *nh; 4049 4050 if (rt->nh && cfg->fc_nh_id && 4051 rt->nh->id != cfg->fc_nh_id) 4052 continue; 4053 4054 if (cfg->fc_flags & RTF_CACHE) { 4055 int rc = 0; 4056 4057 if (rt->nh) { 4058 rc = ip6_del_cached_rt_nh(cfg, rt); 4059 } else if (cfg->fc_nh_id) { 4060 continue; 4061 } else { 4062 nh = rt->fib6_nh; 4063 rc = ip6_del_cached_rt(cfg, rt, nh); 4064 } 4065 if (rc != -ESRCH) { 4066 rcu_read_unlock(); 4067 return rc; 4068 } 4069 continue; 4070 } 4071 4072 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 4073 continue; 4074 if (cfg->fc_protocol && 4075 cfg->fc_protocol != rt->fib6_protocol) 4076 continue; 4077 4078 if (rt->nh) { 4079 if (!fib6_info_hold_safe(rt)) 4080 continue; 4081 rcu_read_unlock(); 4082 4083 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 4084 } 4085 if (cfg->fc_nh_id) 4086 continue; 4087 4088 nh = rt->fib6_nh; 4089 if (cfg->fc_ifindex && 4090 (!nh->fib_nh_dev || 4091 nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) 4092 continue; 4093 if (cfg->fc_flags & RTF_GATEWAY && 4094 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) 4095 continue; 4096 if (!fib6_info_hold_safe(rt)) 4097 continue; 4098 rcu_read_unlock(); 4099 4100 /* if gateway was specified only delete the one hop */ 4101 if (cfg->fc_flags & RTF_GATEWAY) 4102 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 4103 4104 return __ip6_del_rt_siblings(rt, cfg); 4105 } 4106 } 4107 rcu_read_unlock(); 4108 4109 return err; 4110 } 4111 4112 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 4113 { 4114 struct netevent_redirect netevent; 4115 struct rt6_info *rt, *nrt = NULL; 4116 struct fib6_result res = {}; 4117 struct ndisc_options ndopts; 4118 struct inet6_dev *in6_dev; 4119 struct neighbour *neigh; 4120 struct rd_msg *msg; 4121 int optlen, on_link; 4122 u8 *lladdr; 4123 4124 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 4125 optlen -= sizeof(*msg); 4126 4127 if (optlen < 0) { 4128 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 4129 return; 4130 } 4131 4132 msg = (struct rd_msg *)icmp6_hdr(skb); 4133 4134 if (ipv6_addr_is_multicast(&msg->dest)) { 4135 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 4136 return; 4137 } 4138 4139 on_link = 0; 4140 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 4141 on_link = 1; 4142 } else if (ipv6_addr_type(&msg->target) != 4143 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 4144 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 4145 return; 4146 } 4147 4148 in6_dev = __in6_dev_get(skb->dev); 4149 if (!in6_dev) 4150 return; 4151 if (in6_dev->cnf.forwarding || 
!in6_dev->cnf.accept_redirects) 4152 return; 4153 4154 /* RFC2461 8.1: 4155 * The IP source address of the Redirect MUST be the same as the current 4156 * first-hop router for the specified ICMP Destination Address. 4157 */ 4158 4159 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 4160 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 4161 return; 4162 } 4163 4164 lladdr = NULL; 4165 if (ndopts.nd_opts_tgt_lladdr) { 4166 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 4167 skb->dev); 4168 if (!lladdr) { 4169 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 4170 return; 4171 } 4172 } 4173 4174 rt = (struct rt6_info *) dst; 4175 if (rt->rt6i_flags & RTF_REJECT) { 4176 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 4177 return; 4178 } 4179 4180 /* Redirect received -> path was valid. 4181 * Look, redirects are sent only in response to data packets, 4182 * so that this nexthop apparently is reachable. --ANK 4183 */ 4184 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 4185 4186 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 4187 if (!neigh) 4188 return; 4189 4190 /* 4191 * We have finally decided to accept it. 4192 */ 4193 4194 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 4195 NEIGH_UPDATE_F_WEAK_OVERRIDE| 4196 NEIGH_UPDATE_F_OVERRIDE| 4197 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 4198 NEIGH_UPDATE_F_ISROUTER)), 4199 NDISC_REDIRECT, &ndopts); 4200 4201 rcu_read_lock(); 4202 res.f6i = rcu_dereference(rt->from); 4203 if (!res.f6i) 4204 goto out; 4205 4206 if (res.f6i->nh) { 4207 struct fib6_nh_match_arg arg = { 4208 .dev = dst->dev, 4209 .gw = &rt->rt6i_gateway, 4210 }; 4211 4212 nexthop_for_each_fib6_nh(res.f6i->nh, 4213 fib6_nh_find_match, &arg); 4214 4215 /* fib6_info uses a nexthop that does not have fib6_nh 4216 * using the dst->dev. Should be impossible 4217 */ 4218 if (!arg.match) 4219 goto out; 4220 res.nh = arg.match; 4221 } else { 4222 res.nh = res.f6i->fib6_nh; 4223 } 4224 4225 res.fib6_flags = res.f6i->fib6_flags; 4226 res.fib6_type = res.f6i->fib6_type; 4227 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); 4228 if (!nrt) 4229 goto out; 4230 4231 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 4232 if (on_link) 4233 nrt->rt6i_flags &= ~RTF_GATEWAY; 4234 4235 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 4236 4237 /* rt6_insert_exception() will take care of duplicated exceptions */ 4238 if (rt6_insert_exception(nrt, &res)) { 4239 dst_release_immediate(&nrt->dst); 4240 goto out; 4241 } 4242 4243 netevent.old = &rt->dst; 4244 netevent.new = &nrt->dst; 4245 netevent.daddr = &msg->dest; 4246 netevent.neigh = neigh; 4247 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 4248 4249 out: 4250 rcu_read_unlock(); 4251 neigh_release(neigh); 4252 } 4253 4254 #ifdef CONFIG_IPV6_ROUTE_INFO 4255 static struct fib6_info *rt6_get_route_info(struct net *net, 4256 const struct in6_addr *prefix, int prefixlen, 4257 const struct in6_addr *gwaddr, 4258 struct net_device *dev) 4259 { 4260 u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_INFO; 4261 int ifindex = dev->ifindex; 4262 struct fib6_node *fn; 4263 struct fib6_info *rt = NULL; 4264 struct fib6_table *table; 4265 4266 table = fib6_get_table(net, tb_id); 4267 if (!table) 4268 return NULL; 4269 4270 rcu_read_lock(); 4271 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 4272 if (!fn) 4273 goto out; 4274 4275 for_each_fib6_node_rt_rcu(fn) { 4276 /* these routes do not use nexthops */ 4277 if (rt->nh) 4278 continue; 4279 if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) 4280 continue; 4281 if (!(rt->fib6_flags & RTF_ROUTEINFO) || 4282 !rt->fib6_nh->fib_nh_gw_family) 4283 continue; 4284 if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) 4285 continue; 4286 if (!fib6_info_hold_safe(rt)) 4287 continue; 4288 break; 4289 } 4290 out: 4291 rcu_read_unlock(); 4292 return rt; 4293 } 4294 4295 static struct fib6_info *rt6_add_route_info(struct net *net, 4296 const struct in6_addr *prefix, int prefixlen, 4297 const struct in6_addr *gwaddr, 4298 struct net_device *dev, 4299 unsigned int pref) 4300 { 4301 struct fib6_config cfg = { 4302 .fc_metric = IP6_RT_PRIO_USER, 4303 .fc_ifindex = dev->ifindex, 4304 .fc_dst_len = prefixlen, 4305 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 4306 RTF_UP | RTF_PREF(pref), 4307 .fc_protocol = RTPROT_RA, 4308 .fc_type = RTN_UNICAST, 4309 .fc_nlinfo.portid = 0, 4310 .fc_nlinfo.nlh = NULL, 4311 .fc_nlinfo.nl_net = net, 4312 }; 4313 4314 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 4315 cfg.fc_dst = *prefix; 4316 cfg.fc_gateway = *gwaddr; 4317 4318 /* We should treat it as a default route if prefix length is 0. */ 4319 if (!prefixlen) 4320 cfg.fc_flags |= RTF_DEFAULT; 4321 4322 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 4323 4324 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 4325 } 4326 #endif 4327 4328 struct fib6_info *rt6_get_dflt_router(struct net *net, 4329 const struct in6_addr *addr, 4330 struct net_device *dev) 4331 { 4332 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 4333 struct fib6_info *rt; 4334 struct fib6_table *table; 4335 4336 table = fib6_get_table(net, tb_id); 4337 if (!table) 4338 return NULL; 4339 4340 rcu_read_lock(); 4341 for_each_fib6_node_rt_rcu(&table->tb6_root) { 4342 struct fib6_nh *nh; 4343 4344 /* RA routes do not use nexthops */ 4345 if (rt->nh) 4346 continue; 4347 4348 nh = rt->fib6_nh; 4349 if (dev == nh->fib_nh_dev && 4350 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 4351 ipv6_addr_equal(&nh->fib_nh_gw6, addr)) 4352 break; 4353 } 4354 if (rt && !fib6_info_hold_safe(rt)) 4355 rt = NULL; 4356 rcu_read_unlock(); 4357 return rt; 4358 } 4359 4360 struct fib6_info *rt6_add_dflt_router(struct net *net, 4361 const struct in6_addr *gwaddr, 4362 struct net_device *dev, 4363 unsigned int pref, 4364 u32 defrtr_usr_metric) 4365 { 4366 struct fib6_config cfg = { 4367 .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT, 4368 .fc_metric = defrtr_usr_metric, 4369 .fc_ifindex = dev->ifindex, 4370 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 4371 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 4372 .fc_protocol = RTPROT_RA, 4373 .fc_type = RTN_UNICAST, 4374 .fc_nlinfo.portid = 0, 4375 .fc_nlinfo.nlh = NULL, 4376 .fc_nlinfo.nl_net = net, 4377 }; 4378 4379 cfg.fc_gateway = *gwaddr; 4380 4381 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 4382 struct fib6_table *table; 4383 4384 table = fib6_get_table(dev_net(dev), cfg.fc_table); 4385 if (table) 4386 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 4387 } 4388 4389 return rt6_get_dflt_router(net, gwaddr, dev); 4390 } 4391 4392 static void __rt6_purge_dflt_routers(struct net *net, 4393 struct fib6_table *table) 4394 { 4395 struct fib6_info *rt; 4396 4397 restart: 4398 rcu_read_lock(); 4399 for_each_fib6_node_rt_rcu(&table->tb6_root) { 4400 struct net_device *dev = fib6_info_nh_dev(rt); 4401 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 4402 4403 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 4404 (!idev || idev->cnf.accept_ra != 2) && 4405 fib6_info_hold_safe(rt)) { 4406 rcu_read_unlock(); 4407 ip6_del_rt(net, rt, false); 4408 goto restart; 4409 } 4410 } 4411 rcu_read_unlock(); 4412 4413 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 4414 } 4415 4416 void rt6_purge_dflt_routers(struct net *net) 4417 { 4418 struct fib6_table *table; 4419 struct hlist_head *head; 4420 unsigned int h; 4421 4422 rcu_read_lock(); 4423 4424 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 4425 head = &net->ipv6.fib_table_hash[h]; 4426 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 4427 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 4428 __rt6_purge_dflt_routers(net, table); 4429 } 4430 } 4431 4432 rcu_read_unlock(); 4433 } 4434 4435 static void rtmsg_to_fib6_config(struct net *net, 4436 struct in6_rtmsg *rtmsg, 4437 struct fib6_config *cfg) 4438 { 4439 *cfg = (struct fib6_config){ 4440 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 4441 : RT6_TABLE_MAIN, 4442 .fc_ifindex = rtmsg->rtmsg_ifindex, 4443 .fc_metric = rtmsg->rtmsg_metric ? 
: IP6_RT_PRIO_USER, 4444 .fc_expires = rtmsg->rtmsg_info, 4445 .fc_dst_len = rtmsg->rtmsg_dst_len, 4446 .fc_src_len = rtmsg->rtmsg_src_len, 4447 .fc_flags = rtmsg->rtmsg_flags, 4448 .fc_type = rtmsg->rtmsg_type, 4449 4450 .fc_nlinfo.nl_net = net, 4451 4452 .fc_dst = rtmsg->rtmsg_dst, 4453 .fc_src = rtmsg->rtmsg_src, 4454 .fc_gateway = rtmsg->rtmsg_gateway, 4455 }; 4456 } 4457 4458 int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) 4459 { 4460 struct fib6_config cfg; 4461 int err; 4462 4463 if (cmd != SIOCADDRT && cmd != SIOCDELRT) 4464 return -EINVAL; 4465 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 4466 return -EPERM; 4467 4468 rtmsg_to_fib6_config(net, rtmsg, &cfg); 4469 4470 rtnl_lock(); 4471 switch (cmd) { 4472 case SIOCADDRT: 4473 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 4474 break; 4475 case SIOCDELRT: 4476 err = ip6_route_del(&cfg, NULL); 4477 break; 4478 } 4479 rtnl_unlock(); 4480 return err; 4481 } 4482 4483 /* 4484 * Drop the packet on the floor 4485 */ 4486 4487 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 4488 { 4489 struct dst_entry *dst = skb_dst(skb); 4490 struct net *net = dev_net(dst->dev); 4491 struct inet6_dev *idev; 4492 int type; 4493 4494 if (netif_is_l3_master(skb->dev) && 4495 dst->dev == net->loopback_dev) 4496 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); 4497 else 4498 idev = ip6_dst_idev(dst); 4499 4500 switch (ipstats_mib_noroutes) { 4501 case IPSTATS_MIB_INNOROUTES: 4502 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 4503 if (type == IPV6_ADDR_ANY) { 4504 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 4505 break; 4506 } 4507 fallthrough; 4508 case IPSTATS_MIB_OUTNOROUTES: 4509 IP6_INC_STATS(net, idev, ipstats_mib_noroutes); 4510 break; 4511 } 4512 4513 /* Start over by dropping the dst for l3mdev case */ 4514 if (netif_is_l3_master(skb->dev)) 4515 skb_dst_drop(skb); 4516 4517 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 4518 kfree_skb(skb); 4519 return 0; 4520 } 4521 4522 static int ip6_pkt_discard(struct sk_buff *skb) 4523 { 4524 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 4525 } 4526 4527 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 4528 { 4529 skb->dev = skb_dst(skb)->dev; 4530 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 4531 } 4532 4533 static int ip6_pkt_prohibit(struct sk_buff *skb) 4534 { 4535 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 4536 } 4537 4538 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 4539 { 4540 skb->dev = skb_dst(skb)->dev; 4541 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 4542 } 4543 4544 /* 4545 * Allocate a dst for local (unicast / anycast) address. 4546 */ 4547 4548 struct fib6_info *addrconf_f6i_alloc(struct net *net, 4549 struct inet6_dev *idev, 4550 const struct in6_addr *addr, 4551 bool anycast, gfp_t gfp_flags) 4552 { 4553 struct fib6_config cfg = { 4554 .fc_table = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL,
4555 .fc_ifindex = idev->dev->ifindex,
4556 .fc_flags = RTF_UP | RTF_NONEXTHOP,
4557 .fc_dst = *addr,
4558 .fc_dst_len = 128,
4559 .fc_protocol = RTPROT_KERNEL,
4560 .fc_nlinfo.nl_net = net,
4561 .fc_ignore_dev_down = true,
4562 };
4563 struct fib6_info *f6i;
4564
4565 if (anycast) {
4566 cfg.fc_type = RTN_ANYCAST;
4567 cfg.fc_flags |= RTF_ANYCAST;
4568 } else {
4569 cfg.fc_type = RTN_LOCAL;
4570 cfg.fc_flags |= RTF_LOCAL;
4571 }
4572
4573 f6i = ip6_route_info_create(&cfg, gfp_flags, NULL);
4574 if (!IS_ERR(f6i))
4575 f6i->dst_nocount = true;
4576 return f6i;
4577 }
4578
4579 /* remove a deleted IP from prefsrc entries */
4580 struct arg_dev_net_ip {
4581 struct net_device *dev;
4582 struct net *net;
4583 struct in6_addr *addr;
4584 };
4585
4586 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
4587 {
4588 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
4589 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
4590 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
4591
4592 if (!rt->nh &&
4593 ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
4594 rt != net->ipv6.fib6_null_entry &&
4595 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
4596 spin_lock_bh(&rt6_exception_lock);
4597 /* remove prefsrc entry */
4598 rt->fib6_prefsrc.plen = 0;
4599 spin_unlock_bh(&rt6_exception_lock);
4600 }
4601 return 0;
4602 }
4603
4604 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
4605 {
4606 struct net *net = dev_net(ifp->idev->dev);
4607 struct arg_dev_net_ip adni = {
4608 .dev = ifp->idev->dev,
4609 .net = net,
4610 .addr = &ifp->addr,
4611 };
4612 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
4613 }
4614
4615 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT)
4616
4617 /* Remove routers and update dst entries when a gateway turns into a host. */
4618 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
4619 {
4620 struct in6_addr *gateway = (struct in6_addr *)arg;
4621 struct fib6_nh *nh;
4622
4623 /* RA routes do not use nexthops */
4624 if (rt->nh)
4625 return 0;
4626
4627 nh = rt->fib6_nh;
4628 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
4629 nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
4630 return -1;
4631
4632 /* Further clean up cached routes in exception table.
4633 * This is needed because a cached route may have a different
4634 * gateway than its 'parent' in the case of an ip redirect.
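 * Exceptions hang off each fib6_nh rather than the fib6_info, so the
 * helper below walks this nh's exception bucket and drops any cached
 * entry whose gateway matches the address that just became a host.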
4635 */ 4636 fib6_nh_exceptions_clean_tohost(nh, gateway); 4637 4638 return 0; 4639 } 4640 4641 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 4642 { 4643 fib6_clean_all(net, fib6_clean_tohost, gateway); 4644 } 4645 4646 struct arg_netdev_event { 4647 const struct net_device *dev; 4648 union { 4649 unsigned char nh_flags; 4650 unsigned long event; 4651 }; 4652 }; 4653 4654 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 4655 { 4656 struct fib6_info *iter; 4657 struct fib6_node *fn; 4658 4659 fn = rcu_dereference_protected(rt->fib6_node, 4660 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4661 iter = rcu_dereference_protected(fn->leaf, 4662 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4663 while (iter) { 4664 if (iter->fib6_metric == rt->fib6_metric && 4665 rt6_qualify_for_ecmp(iter)) 4666 return iter; 4667 iter = rcu_dereference_protected(iter->fib6_next, 4668 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4669 } 4670 4671 return NULL; 4672 } 4673 4674 /* only called for fib entries with builtin fib6_nh */ 4675 static bool rt6_is_dead(const struct fib6_info *rt) 4676 { 4677 if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || 4678 (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && 4679 ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) 4680 return true; 4681 4682 return false; 4683 } 4684 4685 static int rt6_multipath_total_weight(const struct fib6_info *rt) 4686 { 4687 struct fib6_info *iter; 4688 int total = 0; 4689 4690 if (!rt6_is_dead(rt)) 4691 total += rt->fib6_nh->fib_nh_weight; 4692 4693 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 4694 if (!rt6_is_dead(iter)) 4695 total += iter->fib6_nh->fib_nh_weight; 4696 } 4697 4698 return total; 4699 } 4700 4701 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 4702 { 4703 int upper_bound = -1; 4704 4705 if (!rt6_is_dead(rt)) { 4706 *weight += rt->fib6_nh->fib_nh_weight; 4707 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 4708 total) - 1; 4709 } 4710 atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); 4711 } 4712 4713 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 4714 { 4715 struct fib6_info *iter; 4716 int weight = 0; 4717 4718 rt6_upper_bound_set(rt, &weight, total); 4719 4720 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4721 rt6_upper_bound_set(iter, &weight, total); 4722 } 4723 4724 void rt6_multipath_rebalance(struct fib6_info *rt) 4725 { 4726 struct fib6_info *first; 4727 int total; 4728 4729 /* In case the entire multipath route was marked for flushing, 4730 * then there is no need to rebalance upon the removal of every 4731 * sibling route. 4732 */ 4733 if (!rt->fib6_nsiblings || rt->should_flush) 4734 return; 4735 4736 /* During lookup routes are evaluated in order, so we need to 4737 * make sure upper bounds are assigned from the first sibling 4738 * onwards. 
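 * Each live nexthop gets an upper bound equal to its cumulative share
 * of the total weight, scaled into a 31-bit hash space (see
 * rt6_upper_bound_set() above), while dead nexthops get -1 and are
 * never selected. E.g. weights 1 and 2 give bounds at roughly 1/3 of
 * the range and at the top of the range, respectively.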
4739 */ 4740 first = rt6_multipath_first_sibling(rt); 4741 if (WARN_ON_ONCE(!first)) 4742 return; 4743 4744 total = rt6_multipath_total_weight(first); 4745 rt6_multipath_upper_bound_set(first, total); 4746 } 4747 4748 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 4749 { 4750 const struct arg_netdev_event *arg = p_arg; 4751 struct net *net = dev_net(arg->dev); 4752 4753 if (rt != net->ipv6.fib6_null_entry && !rt->nh && 4754 rt->fib6_nh->fib_nh_dev == arg->dev) { 4755 rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; 4756 fib6_update_sernum_upto_root(net, rt); 4757 rt6_multipath_rebalance(rt); 4758 } 4759 4760 return 0; 4761 } 4762 4763 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) 4764 { 4765 struct arg_netdev_event arg = { 4766 .dev = dev, 4767 { 4768 .nh_flags = nh_flags, 4769 }, 4770 }; 4771 4772 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 4773 arg.nh_flags |= RTNH_F_LINKDOWN; 4774 4775 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 4776 } 4777 4778 /* only called for fib entries with inline fib6_nh */ 4779 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 4780 const struct net_device *dev) 4781 { 4782 struct fib6_info *iter; 4783 4784 if (rt->fib6_nh->fib_nh_dev == dev) 4785 return true; 4786 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4787 if (iter->fib6_nh->fib_nh_dev == dev) 4788 return true; 4789 4790 return false; 4791 } 4792 4793 static void rt6_multipath_flush(struct fib6_info *rt) 4794 { 4795 struct fib6_info *iter; 4796 4797 rt->should_flush = 1; 4798 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4799 iter->should_flush = 1; 4800 } 4801 4802 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 4803 const struct net_device *down_dev) 4804 { 4805 struct fib6_info *iter; 4806 unsigned int dead = 0; 4807 4808 if (rt->fib6_nh->fib_nh_dev == down_dev || 4809 rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4810 dead++; 4811 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4812 if (iter->fib6_nh->fib_nh_dev == down_dev || 4813 iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4814 dead++; 4815 4816 return dead; 4817 } 4818 4819 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4820 const struct net_device *dev, 4821 unsigned char nh_flags) 4822 { 4823 struct fib6_info *iter; 4824 4825 if (rt->fib6_nh->fib_nh_dev == dev) 4826 rt->fib6_nh->fib_nh_flags |= nh_flags; 4827 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4828 if (iter->fib6_nh->fib_nh_dev == dev) 4829 iter->fib6_nh->fib_nh_flags |= nh_flags; 4830 } 4831 4832 /* called with write lock held for table with rt */ 4833 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4834 { 4835 const struct arg_netdev_event *arg = p_arg; 4836 const struct net_device *dev = arg->dev; 4837 struct net *net = dev_net(dev); 4838 4839 if (rt == net->ipv6.fib6_null_entry || rt->nh) 4840 return 0; 4841 4842 switch (arg->event) { 4843 case NETDEV_UNREGISTER: 4844 return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; 4845 case NETDEV_DOWN: 4846 if (rt->should_flush) 4847 return -1; 4848 if (!rt->fib6_nsiblings) 4849 return rt->fib6_nh->fib_nh_dev == dev ? 
-1 : 0;
4850 if (rt6_multipath_uses_dev(rt, dev)) {
4851 unsigned int count;
4852
4853 count = rt6_multipath_dead_count(rt, dev);
4854 if (rt->fib6_nsiblings + 1 == count) {
4855 rt6_multipath_flush(rt);
4856 return -1;
4857 }
4858 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4859 RTNH_F_LINKDOWN);
4860 fib6_update_sernum(net, rt);
4861 rt6_multipath_rebalance(rt);
4862 }
4863 return -2;
4864 case NETDEV_CHANGE:
4865 if (rt->fib6_nh->fib_nh_dev != dev ||
4866 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4867 break;
4868 rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
4869 rt6_multipath_rebalance(rt);
4870 break;
4871 }
4872
4873 return 0;
4874 }
4875
4876 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4877 {
4878 struct arg_netdev_event arg = {
4879 .dev = dev,
4880 {
4881 .event = event,
4882 },
4883 };
4884 struct net *net = dev_net(dev);
4885
4886 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4887 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4888 else
4889 fib6_clean_all(net, fib6_ifdown, &arg);
4890 }
4891
4892 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4893 {
4894 rt6_sync_down_dev(dev, event);
4895 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4896 neigh_ifdown(&nd_tbl, dev);
4897 }
4898
4899 struct rt6_mtu_change_arg {
4900 struct net_device *dev;
4901 unsigned int mtu;
4902 struct fib6_info *f6i;
4903 };
4904
4905 static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
4906 {
4907 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
4908 struct fib6_info *f6i = arg->f6i;
4909
4910 /* For an administrative MTU increase there is no way to discover
4911 * the corresponding IPv6 PMTU increase, so it has to be updated here.
4912 * Since RFC 1981 doesn't cover administrative MTU increases, updating
4913 * the PMTU on increase is a MUST (e.g. when jumbo frames are enabled).
4914 */
4915 if (nh->fib_nh_dev == arg->dev) {
4916 struct inet6_dev *idev = __in6_dev_get(arg->dev);
4917 u32 mtu = f6i->fib6_pmtu;
4918
4919 if (mtu >= arg->mtu ||
4920 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4921 fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
4922
4923 spin_lock_bh(&rt6_exception_lock);
4924 rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
4925 spin_unlock_bh(&rt6_exception_lock);
4926 }
4927
4928 return 0;
4929 }
4930
4931 static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
4932 {
4933 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4934 struct inet6_dev *idev;
4935
4936 /* In IPv6 pmtu discovery is not optional,
4937 so the RTAX_MTU lock cannot disable it.
4938 We still use this lock to block changes
4939 caused by addrconf/ndisc.
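In other words, a locked RTAX_MTU only makes us skip the
device-driven metric update below; it does not stop PMTU
discovery itself.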
4940 */ 4941 4942 idev = __in6_dev_get(arg->dev); 4943 if (!idev) 4944 return 0; 4945 4946 if (fib6_metric_locked(f6i, RTAX_MTU)) 4947 return 0; 4948 4949 arg->f6i = f6i; 4950 if (f6i->nh) { 4951 /* fib6_nh_mtu_change only returns 0, so this is safe */ 4952 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, 4953 arg); 4954 } 4955 4956 return fib6_nh_mtu_change(f6i->fib6_nh, arg); 4957 } 4958 4959 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4960 { 4961 struct rt6_mtu_change_arg arg = { 4962 .dev = dev, 4963 .mtu = mtu, 4964 }; 4965 4966 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4967 } 4968 4969 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4970 [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, 4971 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4972 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4973 [RTA_OIF] = { .type = NLA_U32 }, 4974 [RTA_IIF] = { .type = NLA_U32 }, 4975 [RTA_PRIORITY] = { .type = NLA_U32 }, 4976 [RTA_METRICS] = { .type = NLA_NESTED }, 4977 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4978 [RTA_PREF] = { .type = NLA_U8 }, 4979 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4980 [RTA_ENCAP] = { .type = NLA_NESTED }, 4981 [RTA_EXPIRES] = { .type = NLA_U32 }, 4982 [RTA_UID] = { .type = NLA_U32 }, 4983 [RTA_MARK] = { .type = NLA_U32 }, 4984 [RTA_TABLE] = { .type = NLA_U32 }, 4985 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4986 [RTA_SPORT] = { .type = NLA_U16 }, 4987 [RTA_DPORT] = { .type = NLA_U16 }, 4988 [RTA_NH_ID] = { .type = NLA_U32 }, 4989 }; 4990 4991 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4992 struct fib6_config *cfg, 4993 struct netlink_ext_ack *extack) 4994 { 4995 struct rtmsg *rtm; 4996 struct nlattr *tb[RTA_MAX+1]; 4997 unsigned int pref; 4998 int err; 4999 5000 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 5001 rtm_ipv6_policy, extack); 5002 if (err < 0) 5003 goto errout; 5004 5005 err = -EINVAL; 5006 rtm = nlmsg_data(nlh); 5007 5008 *cfg = (struct fib6_config){ 5009 .fc_table = rtm->rtm_table, 5010 .fc_dst_len = rtm->rtm_dst_len, 5011 .fc_src_len = rtm->rtm_src_len, 5012 .fc_flags = RTF_UP, 5013 .fc_protocol = rtm->rtm_protocol, 5014 .fc_type = rtm->rtm_type, 5015 5016 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 5017 .fc_nlinfo.nlh = nlh, 5018 .fc_nlinfo.nl_net = sock_net(skb->sk), 5019 }; 5020 5021 if (rtm->rtm_type == RTN_UNREACHABLE || 5022 rtm->rtm_type == RTN_BLACKHOLE || 5023 rtm->rtm_type == RTN_PROHIBIT || 5024 rtm->rtm_type == RTN_THROW) 5025 cfg->fc_flags |= RTF_REJECT; 5026 5027 if (rtm->rtm_type == RTN_LOCAL) 5028 cfg->fc_flags |= RTF_LOCAL; 5029 5030 if (rtm->rtm_flags & RTM_F_CLONED) 5031 cfg->fc_flags |= RTF_CACHE; 5032 5033 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 5034 5035 if (tb[RTA_NH_ID]) { 5036 if (tb[RTA_GATEWAY] || tb[RTA_OIF] || 5037 tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { 5038 NL_SET_ERR_MSG(extack, 5039 "Nexthop specification and nexthop id are mutually exclusive"); 5040 goto errout; 5041 } 5042 cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); 5043 } 5044 5045 if (tb[RTA_GATEWAY]) { 5046 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 5047 cfg->fc_flags |= RTF_GATEWAY; 5048 } 5049 if (tb[RTA_VIA]) { 5050 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 5051 goto errout; 5052 } 5053 5054 if (tb[RTA_DST]) { 5055 int plen = (rtm->rtm_dst_len + 7) >> 3; 5056 5057 if (nla_len(tb[RTA_DST]) < plen) 5058 goto errout; 5059 5060 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 5061 } 5062 5063 if 
(tb[RTA_SRC]) { 5064 int plen = (rtm->rtm_src_len + 7) >> 3; 5065 5066 if (nla_len(tb[RTA_SRC]) < plen) 5067 goto errout; 5068 5069 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 5070 } 5071 5072 if (tb[RTA_PREFSRC]) 5073 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 5074 5075 if (tb[RTA_OIF]) 5076 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 5077 5078 if (tb[RTA_PRIORITY]) 5079 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 5080 5081 if (tb[RTA_METRICS]) { 5082 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 5083 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 5084 } 5085 5086 if (tb[RTA_TABLE]) 5087 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 5088 5089 if (tb[RTA_MULTIPATH]) { 5090 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 5091 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 5092 5093 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 5094 cfg->fc_mp_len, extack); 5095 if (err < 0) 5096 goto errout; 5097 } 5098 5099 if (tb[RTA_PREF]) { 5100 pref = nla_get_u8(tb[RTA_PREF]); 5101 if (pref != ICMPV6_ROUTER_PREF_LOW && 5102 pref != ICMPV6_ROUTER_PREF_HIGH) 5103 pref = ICMPV6_ROUTER_PREF_MEDIUM; 5104 cfg->fc_flags |= RTF_PREF(pref); 5105 } 5106 5107 if (tb[RTA_ENCAP]) 5108 cfg->fc_encap = tb[RTA_ENCAP]; 5109 5110 if (tb[RTA_ENCAP_TYPE]) { 5111 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 5112 5113 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 5114 if (err < 0) 5115 goto errout; 5116 } 5117 5118 if (tb[RTA_EXPIRES]) { 5119 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 5120 5121 if (addrconf_finite_timeout(timeout)) { 5122 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 5123 cfg->fc_flags |= RTF_EXPIRES; 5124 } 5125 } 5126 5127 err = 0; 5128 errout: 5129 return err; 5130 } 5131 5132 struct rt6_nh { 5133 struct fib6_info *fib6_info; 5134 struct fib6_config r_cfg; 5135 struct list_head next; 5136 }; 5137 5138 static int ip6_route_info_append(struct net *net, 5139 struct list_head *rt6_nh_list, 5140 struct fib6_info *rt, 5141 struct fib6_config *r_cfg) 5142 { 5143 struct rt6_nh *nh; 5144 int err = -EEXIST; 5145 5146 list_for_each_entry(nh, rt6_nh_list, next) { 5147 /* check if fib6_info already exists */ 5148 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 5149 return err; 5150 } 5151 5152 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 5153 if (!nh) 5154 return -ENOMEM; 5155 nh->fib6_info = rt; 5156 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 5157 list_add_tail(&nh->next, rt6_nh_list); 5158 5159 return 0; 5160 } 5161 5162 static void ip6_route_mpath_notify(struct fib6_info *rt, 5163 struct fib6_info *rt_last, 5164 struct nl_info *info, 5165 __u16 nlflags) 5166 { 5167 /* if this is an APPEND route, then rt points to the first route 5168 * inserted and rt_last points to last route inserted. Userspace 5169 * wants a consistent dump of the route which starts at the first 5170 * nexthop. 
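 * On append, the first route inserted by this request is not
 * necessarily the first nexthop of the resulting route.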
Since sibling routes are always added at the end of 5171 * the list, find the first sibling of the last route appended 5172 */ 5173 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 5174 rt = list_first_entry(&rt_last->fib6_siblings, 5175 struct fib6_info, 5176 fib6_siblings); 5177 } 5178 5179 if (rt) 5180 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 5181 } 5182 5183 static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) 5184 { 5185 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); 5186 bool should_notify = false; 5187 struct fib6_info *leaf; 5188 struct fib6_node *fn; 5189 5190 rcu_read_lock(); 5191 fn = rcu_dereference(rt->fib6_node); 5192 if (!fn) 5193 goto out; 5194 5195 leaf = rcu_dereference(fn->leaf); 5196 if (!leaf) 5197 goto out; 5198 5199 if (rt == leaf || 5200 (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric && 5201 rt6_qualify_for_ecmp(leaf))) 5202 should_notify = true; 5203 out: 5204 rcu_read_unlock(); 5205 5206 return should_notify; 5207 } 5208 5209 static int ip6_route_multipath_add(struct fib6_config *cfg, 5210 struct netlink_ext_ack *extack) 5211 { 5212 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 5213 struct nl_info *info = &cfg->fc_nlinfo; 5214 struct fib6_config r_cfg; 5215 struct rtnexthop *rtnh; 5216 struct fib6_info *rt; 5217 struct rt6_nh *err_nh; 5218 struct rt6_nh *nh, *nh_safe; 5219 __u16 nlflags; 5220 int remaining; 5221 int attrlen; 5222 int err = 1; 5223 int nhn = 0; 5224 int replace = (cfg->fc_nlinfo.nlh && 5225 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 5226 LIST_HEAD(rt6_nh_list); 5227 5228 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 5229 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 5230 nlflags |= NLM_F_APPEND; 5231 5232 remaining = cfg->fc_mp_len; 5233 rtnh = (struct rtnexthop *)cfg->fc_mp; 5234 5235 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 5236 * fib6_info structs per nexthop 5237 */ 5238 while (rtnh_ok(rtnh, remaining)) { 5239 memcpy(&r_cfg, cfg, sizeof(*cfg)); 5240 if (rtnh->rtnh_ifindex) 5241 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 5242 5243 attrlen = rtnh_attrlen(rtnh); 5244 if (attrlen > 0) { 5245 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 5246 5247 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 5248 if (nla) { 5249 r_cfg.fc_gateway = nla_get_in6_addr(nla); 5250 r_cfg.fc_flags |= RTF_GATEWAY; 5251 } 5252 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 5253 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 5254 if (nla) 5255 r_cfg.fc_encap_type = nla_get_u16(nla); 5256 } 5257 5258 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 5259 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 5260 if (IS_ERR(rt)) { 5261 err = PTR_ERR(rt); 5262 rt = NULL; 5263 goto cleanup; 5264 } 5265 if (!rt6_qualify_for_ecmp(rt)) { 5266 err = -EINVAL; 5267 NL_SET_ERR_MSG(extack, 5268 "Device only routes can not be added for IPv6 using the multipath API."); 5269 fib6_info_release(rt); 5270 goto cleanup; 5271 } 5272 5273 rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; 5274 5275 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 5276 rt, &r_cfg); 5277 if (err) { 5278 fib6_info_release(rt); 5279 goto cleanup; 5280 } 5281 5282 rtnh = rtnh_next(rtnh, &remaining); 5283 } 5284 5285 if (list_empty(&rt6_nh_list)) { 5286 NL_SET_ERR_MSG(extack, 5287 "Invalid nexthop configuration - no valid nexthops"); 5288 return -EINVAL; 5289 } 5290 5291 /* for add and replace send one notification with all nexthops. 
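 * Otherwise each __ip6_ins_rt() call below would generate its own
 * RTM_NEWROUTE message.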
5292 * Skip the notification in fib6_add_rt2node and send one with
5293 * the full route when done
5294 */
5295 info->skip_notify = 1;
5296
5297 /* For add and replace, send one notification with all nexthops. For
5298 * append, send one notification with all appended nexthops.
5299 */
5300 info->skip_notify_kernel = 1;
5301
5302 err_nh = NULL;
5303 list_for_each_entry(nh, &rt6_nh_list, next) {
5304 err = __ip6_ins_rt(nh->fib6_info, info, extack);
5305 fib6_info_release(nh->fib6_info);
5306
5307 if (!err) {
5308 /* save reference to last route successfully inserted */
5309 rt_last = nh->fib6_info;
5310
5311 /* save reference to first route for notification */
5312 if (!rt_notif)
5313 rt_notif = nh->fib6_info;
5314 }
5315
5316 /* nh->fib6_info is used or freed at this point, reset to NULL */
5317 nh->fib6_info = NULL;
5318 if (err) {
5319 if (replace && nhn)
5320 NL_SET_ERR_MSG_MOD(extack,
5321 "multipath route replace failed (check consistency of installed routes)");
5322 err_nh = nh;
5323 goto add_errout;
5324 }
5325
5326 /* Because each route is added like a single route, we remove
5327 * the NLM_F_EXCL and NLM_F_REPLACE flags after the first nexthop:
5328 * if there is a collision, we have already failed to add the
5329 * first nexthop, since fib6_add_rt2node() rejected it; when
5330 * replacing, the old nexthops have been replaced by the first new
5331 * one, and the rest should be appended to it.
5332 */
5333 if (cfg->fc_nlinfo.nlh) {
5334 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
5335 NLM_F_REPLACE);
5336 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
5337 }
5338 nhn++;
5339 }
5340
5341 /* An in-kernel notification should only be sent in case the new
5342 * multipath route is added as the first route in the node, or if
5343 * it was appended to it. We pass 'rt_notif' since it is the first
5344 * sibling and might allow us to skip some checks in the replace case.
5345 */
5346 if (ip6_route_mpath_should_notify(rt_notif)) {
5347 enum fib_event_type fib_event;
5348
5349 if (rt_notif->fib6_nsiblings != nhn - 1)
5350 fib_event = FIB_EVENT_ENTRY_APPEND;
5351 else
5352 fib_event = FIB_EVENT_ENTRY_REPLACE;
5353
5354 err = call_fib6_multipath_entry_notifiers(info->nl_net,
5355 fib_event, rt_notif,
5356 nhn - 1, extack);
5357 if (err) {
5358 /* Delete all the siblings that were just added */
5359 err_nh = NULL;
5360 goto add_errout;
5361 }
5362 }
5363
5364 /* success ...
tell user about new route */ 5365 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 5366 goto cleanup; 5367 5368 add_errout: 5369 /* send notification for routes that were added so that 5370 * the delete notifications sent by ip6_route_del are 5371 * coherent 5372 */ 5373 if (rt_notif) 5374 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 5375 5376 /* Delete routes that were already added */ 5377 list_for_each_entry(nh, &rt6_nh_list, next) { 5378 if (err_nh == nh) 5379 break; 5380 ip6_route_del(&nh->r_cfg, extack); 5381 } 5382 5383 cleanup: 5384 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 5385 if (nh->fib6_info) 5386 fib6_info_release(nh->fib6_info); 5387 list_del(&nh->next); 5388 kfree(nh); 5389 } 5390 5391 return err; 5392 } 5393 5394 static int ip6_route_multipath_del(struct fib6_config *cfg, 5395 struct netlink_ext_ack *extack) 5396 { 5397 struct fib6_config r_cfg; 5398 struct rtnexthop *rtnh; 5399 int last_err = 0; 5400 int remaining; 5401 int attrlen; 5402 int err; 5403 5404 remaining = cfg->fc_mp_len; 5405 rtnh = (struct rtnexthop *)cfg->fc_mp; 5406 5407 /* Parse a Multipath Entry */ 5408 while (rtnh_ok(rtnh, remaining)) { 5409 memcpy(&r_cfg, cfg, sizeof(*cfg)); 5410 if (rtnh->rtnh_ifindex) 5411 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 5412 5413 attrlen = rtnh_attrlen(rtnh); 5414 if (attrlen > 0) { 5415 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 5416 5417 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 5418 if (nla) { 5419 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 5420 r_cfg.fc_flags |= RTF_GATEWAY; 5421 } 5422 } 5423 err = ip6_route_del(&r_cfg, extack); 5424 if (err) 5425 last_err = err; 5426 5427 rtnh = rtnh_next(rtnh, &remaining); 5428 } 5429 5430 return last_err; 5431 } 5432 5433 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 5434 struct netlink_ext_ack *extack) 5435 { 5436 struct fib6_config cfg; 5437 int err; 5438 5439 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 5440 if (err < 0) 5441 return err; 5442 5443 if (cfg.fc_nh_id && 5444 !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { 5445 NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); 5446 return -EINVAL; 5447 } 5448 5449 if (cfg.fc_mp) 5450 return ip6_route_multipath_del(&cfg, extack); 5451 else { 5452 cfg.fc_delete_all_nh = 1; 5453 return ip6_route_del(&cfg, extack); 5454 } 5455 } 5456 5457 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 5458 struct netlink_ext_ack *extack) 5459 { 5460 struct fib6_config cfg; 5461 int err; 5462 5463 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 5464 if (err < 0) 5465 return err; 5466 5467 if (cfg.fc_metric == 0) 5468 cfg.fc_metric = IP6_RT_PRIO_USER; 5469 5470 if (cfg.fc_mp) 5471 return ip6_route_multipath_add(&cfg, extack); 5472 else 5473 return ip6_route_add(&cfg, GFP_KERNEL, extack); 5474 } 5475 5476 /* add the overhead of this fib6_nh to nexthop_len */ 5477 static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) 5478 { 5479 int *nexthop_len = arg; 5480 5481 *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ 5482 + NLA_ALIGN(sizeof(struct rtnexthop)) 5483 + nla_total_size(16); /* RTA_GATEWAY */ 5484 5485 if (nh->fib_nh_lws) { 5486 /* RTA_ENCAP_TYPE */ 5487 *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); 5488 /* RTA_ENCAP */ 5489 *nexthop_len += nla_total_size(2); 5490 } 5491 5492 return 0; 5493 } 5494 5495 static size_t rt6_nlmsg_size(struct fib6_info *f6i) 5496 { 5497 int nexthop_len; 5498 5499 if (f6i->nh) { 5500 nexthop_len = nla_total_size(4); /* RTA_NH_ID */ 5501 
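/* Walk each fib6_nh in the nexthop group and add its RTA_MULTIPATH
 * overhead (rtnexthop, gateway, optional encap) to nexthop_len; see
 * rt6_nh_nlmsg_size() above.
 */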
nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, 5502 &nexthop_len); 5503 } else { 5504 struct fib6_nh *nh = f6i->fib6_nh; 5505 5506 nexthop_len = 0; 5507 if (f6i->fib6_nsiblings) { 5508 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 5509 + NLA_ALIGN(sizeof(struct rtnexthop)) 5510 + nla_total_size(16) /* RTA_GATEWAY */ 5511 + lwtunnel_get_encap_size(nh->fib_nh_lws); 5512 5513 nexthop_len *= f6i->fib6_nsiblings; 5514 } 5515 nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); 5516 } 5517 5518 return NLMSG_ALIGN(sizeof(struct rtmsg)) 5519 + nla_total_size(16) /* RTA_SRC */ 5520 + nla_total_size(16) /* RTA_DST */ 5521 + nla_total_size(16) /* RTA_GATEWAY */ 5522 + nla_total_size(16) /* RTA_PREFSRC */ 5523 + nla_total_size(4) /* RTA_TABLE */ 5524 + nla_total_size(4) /* RTA_IIF */ 5525 + nla_total_size(4) /* RTA_OIF */ 5526 + nla_total_size(4) /* RTA_PRIORITY */ 5527 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 5528 + nla_total_size(sizeof(struct rta_cacheinfo)) 5529 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 5530 + nla_total_size(1) /* RTA_PREF */ 5531 + nexthop_len; 5532 } 5533 5534 static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, 5535 unsigned char *flags) 5536 { 5537 if (nexthop_is_multipath(nh)) { 5538 struct nlattr *mp; 5539 5540 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 5541 if (!mp) 5542 goto nla_put_failure; 5543 5544 if (nexthop_mpath_fill_node(skb, nh, AF_INET6)) 5545 goto nla_put_failure; 5546 5547 nla_nest_end(skb, mp); 5548 } else { 5549 struct fib6_nh *fib6_nh; 5550 5551 fib6_nh = nexthop_fib6_nh(nh); 5552 if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6, 5553 flags, false) < 0) 5554 goto nla_put_failure; 5555 } 5556 5557 return 0; 5558 5559 nla_put_failure: 5560 return -EMSGSIZE; 5561 } 5562 5563 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 5564 struct fib6_info *rt, struct dst_entry *dst, 5565 struct in6_addr *dest, struct in6_addr *src, 5566 int iif, int type, u32 portid, u32 seq, 5567 unsigned int flags) 5568 { 5569 struct rt6_info *rt6 = (struct rt6_info *)dst; 5570 struct rt6key *rt6_dst, *rt6_src; 5571 u32 *pmetrics, table, rt6_flags; 5572 unsigned char nh_flags = 0; 5573 struct nlmsghdr *nlh; 5574 struct rtmsg *rtm; 5575 long expires = 0; 5576 5577 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 5578 if (!nlh) 5579 return -EMSGSIZE; 5580 5581 if (rt6) { 5582 rt6_dst = &rt6->rt6i_dst; 5583 rt6_src = &rt6->rt6i_src; 5584 rt6_flags = rt6->rt6i_flags; 5585 } else { 5586 rt6_dst = &rt->fib6_dst; 5587 rt6_src = &rt->fib6_src; 5588 rt6_flags = rt->fib6_flags; 5589 } 5590 5591 rtm = nlmsg_data(nlh); 5592 rtm->rtm_family = AF_INET6; 5593 rtm->rtm_dst_len = rt6_dst->plen; 5594 rtm->rtm_src_len = rt6_src->plen; 5595 rtm->rtm_tos = 0; 5596 if (rt->fib6_table) 5597 table = rt->fib6_table->tb6_id; 5598 else 5599 table = RT6_TABLE_UNSPEC; 5600 rtm->rtm_table = table < 256 ? 
table : RT_TABLE_COMPAT; 5601 if (nla_put_u32(skb, RTA_TABLE, table)) 5602 goto nla_put_failure; 5603 5604 rtm->rtm_type = rt->fib6_type; 5605 rtm->rtm_flags = 0; 5606 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 5607 rtm->rtm_protocol = rt->fib6_protocol; 5608 5609 if (rt6_flags & RTF_CACHE) 5610 rtm->rtm_flags |= RTM_F_CLONED; 5611 5612 if (dest) { 5613 if (nla_put_in6_addr(skb, RTA_DST, dest)) 5614 goto nla_put_failure; 5615 rtm->rtm_dst_len = 128; 5616 } else if (rtm->rtm_dst_len) 5617 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 5618 goto nla_put_failure; 5619 #ifdef CONFIG_IPV6_SUBTREES 5620 if (src) { 5621 if (nla_put_in6_addr(skb, RTA_SRC, src)) 5622 goto nla_put_failure; 5623 rtm->rtm_src_len = 128; 5624 } else if (rtm->rtm_src_len && 5625 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 5626 goto nla_put_failure; 5627 #endif 5628 if (iif) { 5629 #ifdef CONFIG_IPV6_MROUTE 5630 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 5631 int err = ip6mr_get_route(net, skb, rtm, portid); 5632 5633 if (err == 0) 5634 return 0; 5635 if (err < 0) 5636 goto nla_put_failure; 5637 } else 5638 #endif 5639 if (nla_put_u32(skb, RTA_IIF, iif)) 5640 goto nla_put_failure; 5641 } else if (dest) { 5642 struct in6_addr saddr_buf; 5643 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 5644 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 5645 goto nla_put_failure; 5646 } 5647 5648 if (rt->fib6_prefsrc.plen) { 5649 struct in6_addr saddr_buf; 5650 saddr_buf = rt->fib6_prefsrc.addr; 5651 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 5652 goto nla_put_failure; 5653 } 5654 5655 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 5656 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 5657 goto nla_put_failure; 5658 5659 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 5660 goto nla_put_failure; 5661 5662 /* For multipath routes, walk the siblings list and add 5663 * each as a nexthop within RTA_MULTIPATH. 
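 * The route's own fib6_nh is not on the siblings list, so it is
 * emitted first, before the list walk below.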
5664 */ 5665 if (rt6) { 5666 if (rt6_flags & RTF_GATEWAY && 5667 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 5668 goto nla_put_failure; 5669 5670 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 5671 goto nla_put_failure; 5672 5673 if (dst->lwtstate && 5674 lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) 5675 goto nla_put_failure; 5676 } else if (rt->fib6_nsiblings) { 5677 struct fib6_info *sibling, *next_sibling; 5678 struct nlattr *mp; 5679 5680 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 5681 if (!mp) 5682 goto nla_put_failure; 5683 5684 if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, 5685 rt->fib6_nh->fib_nh_weight, AF_INET6, 5686 0) < 0) 5687 goto nla_put_failure; 5688 5689 list_for_each_entry_safe(sibling, next_sibling, 5690 &rt->fib6_siblings, fib6_siblings) { 5691 if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, 5692 sibling->fib6_nh->fib_nh_weight, 5693 AF_INET6, 0) < 0) 5694 goto nla_put_failure; 5695 } 5696 5697 nla_nest_end(skb, mp); 5698 } else if (rt->nh) { 5699 if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) 5700 goto nla_put_failure; 5701 5702 if (nexthop_is_blackhole(rt->nh)) 5703 rtm->rtm_type = RTN_BLACKHOLE; 5704 5705 if (net->ipv4.sysctl_nexthop_compat_mode && 5706 rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) 5707 goto nla_put_failure; 5708 5709 rtm->rtm_flags |= nh_flags; 5710 } else { 5711 if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6, 5712 &nh_flags, false) < 0) 5713 goto nla_put_failure; 5714 5715 rtm->rtm_flags |= nh_flags; 5716 } 5717 5718 if (rt6_flags & RTF_EXPIRES) { 5719 expires = dst ? dst->expires : rt->expires; 5720 expires -= jiffies; 5721 } 5722 5723 if (!dst) { 5724 if (rt->offload) 5725 rtm->rtm_flags |= RTM_F_OFFLOAD; 5726 if (rt->trap) 5727 rtm->rtm_flags |= RTM_F_TRAP; 5728 if (rt->offload_failed) 5729 rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; 5730 } 5731 5732 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? 
dst->error : 0) < 0) 5733 goto nla_put_failure; 5734 5735 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 5736 goto nla_put_failure; 5737 5738 5739 nlmsg_end(skb, nlh); 5740 return 0; 5741 5742 nla_put_failure: 5743 nlmsg_cancel(skb, nlh); 5744 return -EMSGSIZE; 5745 } 5746 5747 static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) 5748 { 5749 const struct net_device *dev = arg; 5750 5751 if (nh->fib_nh_dev == dev) 5752 return 1; 5753 5754 return 0; 5755 } 5756 5757 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 5758 const struct net_device *dev) 5759 { 5760 if (f6i->nh) { 5761 struct net_device *_dev = (struct net_device *)dev; 5762 5763 return !!nexthop_for_each_fib6_nh(f6i->nh, 5764 fib6_info_nh_uses_dev, 5765 _dev); 5766 } 5767 5768 if (f6i->fib6_nh->fib_nh_dev == dev) 5769 return true; 5770 5771 if (f6i->fib6_nsiblings) { 5772 struct fib6_info *sibling, *next_sibling; 5773 5774 list_for_each_entry_safe(sibling, next_sibling, 5775 &f6i->fib6_siblings, fib6_siblings) { 5776 if (sibling->fib6_nh->fib_nh_dev == dev) 5777 return true; 5778 } 5779 } 5780 5781 return false; 5782 } 5783 5784 struct fib6_nh_exception_dump_walker { 5785 struct rt6_rtnl_dump_arg *dump; 5786 struct fib6_info *rt; 5787 unsigned int flags; 5788 unsigned int skip; 5789 unsigned int count; 5790 }; 5791 5792 static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) 5793 { 5794 struct fib6_nh_exception_dump_walker *w = arg; 5795 struct rt6_rtnl_dump_arg *dump = w->dump; 5796 struct rt6_exception_bucket *bucket; 5797 struct rt6_exception *rt6_ex; 5798 int i, err; 5799 5800 bucket = fib6_nh_get_excptn_bucket(nh, NULL); 5801 if (!bucket) 5802 return 0; 5803 5804 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 5805 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 5806 if (w->skip) { 5807 w->skip--; 5808 continue; 5809 } 5810 5811 /* Expiration of entries doesn't bump sernum, insertion 5812 * does. Removal is triggered by insertion, so we can 5813 * rely on the fact that if entries change between two 5814 * partial dumps, this node is scanned again completely, 5815 * see rt6_insert_exception() and fib6_dump_table(). 5816 * 5817 * Count expired entries we go through as handled 5818 * entries that we'll skip next time, in case of partial 5819 * node dump. Otherwise, if entries expire meanwhile, 5820 * we'll skip the wrong amount. 
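 * That is why the expired branch below still bumps w->count even
 * though nothing is dumped for the entry.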
5821 */ 5822 if (rt6_check_expired(rt6_ex->rt6i)) { 5823 w->count++; 5824 continue; 5825 } 5826 5827 err = rt6_fill_node(dump->net, dump->skb, w->rt, 5828 &rt6_ex->rt6i->dst, NULL, NULL, 0, 5829 RTM_NEWROUTE, 5830 NETLINK_CB(dump->cb->skb).portid, 5831 dump->cb->nlh->nlmsg_seq, w->flags); 5832 if (err) 5833 return err; 5834 5835 w->count++; 5836 } 5837 bucket++; 5838 } 5839 5840 return 0; 5841 } 5842 5843 /* Return -1 if done with node, number of handled routes on partial dump */ 5844 int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) 5845 { 5846 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 5847 struct fib_dump_filter *filter = &arg->filter; 5848 unsigned int flags = NLM_F_MULTI; 5849 struct net *net = arg->net; 5850 int count = 0; 5851 5852 if (rt == net->ipv6.fib6_null_entry) 5853 return -1; 5854 5855 if ((filter->flags & RTM_F_PREFIX) && 5856 !(rt->fib6_flags & RTF_PREFIX_RT)) { 5857 /* success since this is not a prefix route */ 5858 return -1; 5859 } 5860 if (filter->filter_set && 5861 ((filter->rt_type && rt->fib6_type != filter->rt_type) || 5862 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 5863 (filter->protocol && rt->fib6_protocol != filter->protocol))) { 5864 return -1; 5865 } 5866 5867 if (filter->filter_set || 5868 !filter->dump_routes || !filter->dump_exceptions) { 5869 flags |= NLM_F_DUMP_FILTERED; 5870 } 5871 5872 if (filter->dump_routes) { 5873 if (skip) { 5874 skip--; 5875 } else { 5876 if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 5877 0, RTM_NEWROUTE, 5878 NETLINK_CB(arg->cb->skb).portid, 5879 arg->cb->nlh->nlmsg_seq, flags)) { 5880 return 0; 5881 } 5882 count++; 5883 } 5884 } 5885 5886 if (filter->dump_exceptions) { 5887 struct fib6_nh_exception_dump_walker w = { .dump = arg, 5888 .rt = rt, 5889 .flags = flags, 5890 .skip = skip, 5891 .count = 0 }; 5892 int err; 5893 5894 rcu_read_lock(); 5895 if (rt->nh) { 5896 err = nexthop_for_each_fib6_nh(rt->nh, 5897 rt6_nh_dump_exceptions, 5898 &w); 5899 } else { 5900 err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); 5901 } 5902 rcu_read_unlock(); 5903 5904 if (err) 5905 return count += w.count; 5906 } 5907 5908 return -1; 5909 } 5910 5911 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, 5912 const struct nlmsghdr *nlh, 5913 struct nlattr **tb, 5914 struct netlink_ext_ack *extack) 5915 { 5916 struct rtmsg *rtm; 5917 int i, err; 5918 5919 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 5920 NL_SET_ERR_MSG_MOD(extack, 5921 "Invalid header for get route request"); 5922 return -EINVAL; 5923 } 5924 5925 if (!netlink_strict_get_check(skb)) 5926 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 5927 rtm_ipv6_policy, extack); 5928 5929 rtm = nlmsg_data(nlh); 5930 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || 5931 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || 5932 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || 5933 rtm->rtm_type) { 5934 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); 5935 return -EINVAL; 5936 } 5937 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { 5938 NL_SET_ERR_MSG_MOD(extack, 5939 "Invalid flags for get route request"); 5940 return -EINVAL; 5941 } 5942 5943 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 5944 rtm_ipv6_policy, extack); 5945 if (err) 5946 return err; 5947 5948 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 5949 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 5950 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); 5951 return -EINVAL; 5952 } 
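/* With strict checking, only attributes that can actually select a
 * route may appear in a get request; reject everything else.
 */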
5953 5954 for (i = 0; i <= RTA_MAX; i++) { 5955 if (!tb[i]) 5956 continue; 5957 5958 switch (i) { 5959 case RTA_SRC: 5960 case RTA_DST: 5961 case RTA_IIF: 5962 case RTA_OIF: 5963 case RTA_MARK: 5964 case RTA_UID: 5965 case RTA_SPORT: 5966 case RTA_DPORT: 5967 case RTA_IP_PROTO: 5968 break; 5969 default: 5970 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); 5971 return -EINVAL; 5972 } 5973 } 5974 5975 return 0; 5976 } 5977 5978 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 5979 struct netlink_ext_ack *extack) 5980 { 5981 struct net *net = sock_net(in_skb->sk); 5982 struct nlattr *tb[RTA_MAX+1]; 5983 int err, iif = 0, oif = 0; 5984 struct fib6_info *from; 5985 struct dst_entry *dst; 5986 struct rt6_info *rt; 5987 struct sk_buff *skb; 5988 struct rtmsg *rtm; 5989 struct flowi6 fl6 = {}; 5990 bool fibmatch; 5991 5992 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 5993 if (err < 0) 5994 goto errout; 5995 5996 err = -EINVAL; 5997 rtm = nlmsg_data(nlh); 5998 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 5999 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 6000 6001 if (tb[RTA_SRC]) { 6002 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 6003 goto errout; 6004 6005 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 6006 } 6007 6008 if (tb[RTA_DST]) { 6009 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 6010 goto errout; 6011 6012 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 6013 } 6014 6015 if (tb[RTA_IIF]) 6016 iif = nla_get_u32(tb[RTA_IIF]); 6017 6018 if (tb[RTA_OIF]) 6019 oif = nla_get_u32(tb[RTA_OIF]); 6020 6021 if (tb[RTA_MARK]) 6022 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 6023 6024 if (tb[RTA_UID]) 6025 fl6.flowi6_uid = make_kuid(current_user_ns(), 6026 nla_get_u32(tb[RTA_UID])); 6027 else 6028 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 6029 6030 if (tb[RTA_SPORT]) 6031 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 6032 6033 if (tb[RTA_DPORT]) 6034 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 6035 6036 if (tb[RTA_IP_PROTO]) { 6037 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 6038 &fl6.flowi6_proto, AF_INET6, 6039 extack); 6040 if (err) 6041 goto errout; 6042 } 6043 6044 if (iif) { 6045 struct net_device *dev; 6046 int flags = 0; 6047 6048 rcu_read_lock(); 6049 6050 dev = dev_get_by_index_rcu(net, iif); 6051 if (!dev) { 6052 rcu_read_unlock(); 6053 err = -ENODEV; 6054 goto errout; 6055 } 6056 6057 fl6.flowi6_iif = iif; 6058 6059 if (!ipv6_addr_any(&fl6.saddr)) 6060 flags |= RT6_LOOKUP_F_HAS_SADDR; 6061 6062 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 6063 6064 rcu_read_unlock(); 6065 } else { 6066 fl6.flowi6_oif = oif; 6067 6068 dst = ip6_route_output(net, NULL, &fl6); 6069 } 6070 6071 6072 rt = container_of(dst, struct rt6_info, dst); 6073 if (rt->dst.error) { 6074 err = rt->dst.error; 6075 ip6_rt_put(rt); 6076 goto errout; 6077 } 6078 6079 if (rt == net->ipv6.ip6_null_entry) { 6080 err = rt->dst.error; 6081 ip6_rt_put(rt); 6082 goto errout; 6083 } 6084 6085 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 6086 if (!skb) { 6087 ip6_rt_put(rt); 6088 err = -ENOBUFS; 6089 goto errout; 6090 } 6091 6092 skb_dst_set(skb, &rt->dst); 6093 6094 rcu_read_lock(); 6095 from = rcu_dereference(rt->from); 6096 if (from) { 6097 if (fibmatch) 6098 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, 6099 iif, RTM_NEWROUTE, 6100 NETLINK_CB(in_skb).portid, 6101 nlh->nlmsg_seq, 0); 6102 else 6103 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 6104 &fl6.saddr, iif, RTM_NEWROUTE, 6105 NETLINK_CB(in_skb).portid, 6106 nlh->nlmsg_seq, 0); 6107 } else { 6108 err = -ENETUNREACH; 6109 } 6110 rcu_read_unlock(); 6111 6112 if (err < 0) { 6113 kfree_skb(skb); 6114 goto errout; 6115 } 6116 6117 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 6118 errout: 6119 return err; 6120 } 6121 6122 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 6123 unsigned int nlm_flags) 6124 { 6125 struct sk_buff *skb; 6126 struct net *net = info->nl_net; 6127 u32 seq; 6128 int err; 6129 6130 err = -ENOBUFS; 6131 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 6132 6133 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 6134 if (!skb) 6135 goto errout; 6136 6137 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 6138 event, info->portid, seq, nlm_flags); 6139 if (err < 0) { 6140 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 6141 WARN_ON(err == -EMSGSIZE); 6142 kfree_skb(skb); 6143 goto errout; 6144 } 6145 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 6146 info->nlh, gfp_any()); 6147 return; 6148 errout: 6149 if (err < 0) 6150 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 6151 } 6152 6153 void fib6_rt_update(struct net *net, struct fib6_info *rt, 6154 struct nl_info *info) 6155 { 6156 u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0;
6157 struct sk_buff *skb;
6158 int err = -ENOBUFS;
6159
6160 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
6161 if (!skb)
6162 goto errout;
6163
6164 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
6165 RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
6166 if (err < 0) {
6167 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6168 WARN_ON(err == -EMSGSIZE);
6169 kfree_skb(skb);
6170 goto errout;
6171 }
6172 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
6173 info->nlh, gfp_any());
6174 return;
6175 errout:
6176 if (err < 0)
6177 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
6178 }
6179
6180 void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
6181 bool offload, bool trap, bool offload_failed)
6182 {
6183 struct sk_buff *skb;
6184 int err;
6185
6186 if (f6i->offload == offload && f6i->trap == trap &&
6187 f6i->offload_failed == offload_failed)
6188 return;
6189
6190 f6i->offload = offload;
6191 f6i->trap = trap;
6192
6193 /* 2 means send notifications only if offload_failed was changed. */
6194 if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 &&
6195 f6i->offload_failed == offload_failed)
6196 return;
6197
6198 f6i->offload_failed = offload_failed;
6199
6200 if (!rcu_access_pointer(f6i->fib6_node))
6201 /* The route was removed from the tree, do not send
6202 * notification.
6203 */
6204 return;
6205
6206 if (!net->ipv6.sysctl.fib_notify_on_flag_change)
6207 return;
6208
6209 skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL);
6210 if (!skb) {
6211 err = -ENOBUFS;
6212 goto errout;
6213 }
6214
6215 err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0,
6216 0, 0);
6217 if (err < 0) {
6218 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6219 WARN_ON(err == -EMSGSIZE);
6220 kfree_skb(skb);
6221 goto errout;
6222 }
6223
6224 rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL);
6225 return;
6226
6227 errout:
6228 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
6229 }
6230 EXPORT_SYMBOL(fib6_info_hw_flags_set);
6231
6232 static int ip6_route_dev_notify(struct notifier_block *this,
6233 unsigned long event, void *ptr)
6234 {
6235 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
6236 struct net *net = dev_net(dev);
6237
6238 if (!(dev->flags & IFF_LOOPBACK))
6239 return NOTIFY_OK;
6240
6241 if (event == NETDEV_REGISTER) {
6242 net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
6243 net->ipv6.ip6_null_entry->dst.dev = dev;
6244 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
6245 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
6246 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
6247 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
6248 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
6249 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
6250 #endif
6251 } else if (event == NETDEV_UNREGISTER &&
6252 dev->reg_state != NETREG_UNREGISTERED) {
6253 /* NETDEV_UNREGISTER can be fired multiple times by
6254 * netdev_wait_allrefs(). Make sure we only call this once.
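 * in6_dev_put_clear() also NULLs the pointer, so a repeated event
 * finds NULL and becomes a no-op.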
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
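/* Template for the per-netns sysctls exposed under
 * /proc/sys/net/ipv6/route/.  ipv6_route_sysctl_init() below clones
 * this table and repoints each .data field at the namespace's own
 * copy.  For example, from a shell (assuming the table is registered
 * on the usual path for this netns):
 *
 *	sysctl -w net.ipv6.route.flush=1	# trigger fib6 GC now
 *	sysctl net.ipv6.route.gc_thresh
 */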
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_minmax,
		.extra1		=	SYSCTL_ZERO,
		.extra2		=	SYSCTL_ONE,
	},
	{ }
};

struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.flush_delay;
		table[2].extra1 = net;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users: a NULL
		 * procname in the first slot terminates the whole table.
		 */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif
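/* Per-netns init: clone the dst_ops template, allocate the fib6/ip6
 * null entries (plus the prohibit and blackhole entries with
 * CONFIG_IPV6_MULTIPLE_TABLES), and seed the sysctl defaults.  The
 * error path unwinds the allocations in reverse order.
 */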
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;
	memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
	       sizeof(*net->ipv6.fib6_null_entry));

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
#ifdef CONFIG_IPV6_SUBTREES
	net->ipv6.fib6_routes_require_src = 0;
#endif
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
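/* Expose the ipv6_route seq_file walker as a BPF iterator target so
 * BPF programs can dump fib6_info entries.  Only compiled when IPv6
 * is built in (see the IS_BUILTIN() guard below).
 */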
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)

BTF_ID_LIST(btf_fib6_info_id)
BTF_ID(struct, fib6_info)

static const struct bpf_iter_seq_info ipv6_route_seq_info = {
	.seq_ops		= &ipv6_route_seq_ops,
	.init_seq_private	= bpf_iter_init_seq_net,
	.fini_seq_private	= bpf_iter_fini_seq_net,
	.seq_priv_size		= sizeof(struct ipv6_route_iter),
};

static struct bpf_iter_reg ipv6_route_reg_info = {
	.target			= "ipv6_route",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__ipv6_route, rt),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &ipv6_route_seq_info,
};

static int __init bpf_iter_register(void)
{
	ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id;
	return bpf_iter_reg_target(&ipv6_route_reg_info);
}

static void bpf_iter_unregister(void)
{
	bpf_iter_unreg_target(&ipv6_route_reg_info);
}
#endif
#endif
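/* Module init: create the rt6_info dst cache, register the pernet
 * subsystems, bring up the FIB and policy-rule layers, and hook up
 * the RTM_{NEW,DEL,GET}ROUTE handlers.  Each failure label below
 * unwinds everything registered before it, in reverse order.
 */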
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	ret = bpf_iter_register();
	if (ret)
		goto out_register_late_subsys;
#endif
#endif

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

void ip6_route_cleanup(void)
{
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_unregister();
#endif
#endif
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}