1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Linux INET6 implementation 4 * FIB front-end. 5 * 6 * Authors: 7 * Pedro Roque <roque@di.fc.ul.pt> 8 */ 9 10 /* Changes: 11 * 12 * YOSHIFUJI Hideaki @USAGI 13 * reworked default router selection. 14 * - respect outgoing interface 15 * - select from (probably) reachable routers (i.e. 16 * routers in REACHABLE, STALE, DELAY or PROBE states). 17 * - always select the same router if it is (probably) 18 * reachable. otherwise, round-robin the list. 19 * Ville Nuorvala 20 * Fixed routing subtrees. 21 */ 22 23 #define pr_fmt(fmt) "IPv6: " fmt 24 25 #include <linux/capability.h> 26 #include <linux/errno.h> 27 #include <linux/export.h> 28 #include <linux/types.h> 29 #include <linux/times.h> 30 #include <linux/socket.h> 31 #include <linux/sockios.h> 32 #include <linux/net.h> 33 #include <linux/route.h> 34 #include <linux/netdevice.h> 35 #include <linux/in6.h> 36 #include <linux/mroute6.h> 37 #include <linux/init.h> 38 #include <linux/if_arp.h> 39 #include <linux/proc_fs.h> 40 #include <linux/seq_file.h> 41 #include <linux/nsproxy.h> 42 #include <linux/slab.h> 43 #include <linux/jhash.h> 44 #include <linux/siphash.h> 45 #include <net/net_namespace.h> 46 #include <net/snmp.h> 47 #include <net/ipv6.h> 48 #include <net/ip6_fib.h> 49 #include <net/ip6_route.h> 50 #include <net/ndisc.h> 51 #include <net/addrconf.h> 52 #include <net/tcp.h> 53 #include <linux/rtnetlink.h> 54 #include <net/dst.h> 55 #include <net/dst_metadata.h> 56 #include <net/xfrm.h> 57 #include <net/netevent.h> 58 #include <net/netlink.h> 59 #include <net/rtnh.h> 60 #include <net/lwtunnel.h> 61 #include <net/ip_tunnels.h> 62 #include <net/l3mdev.h> 63 #include <net/ip.h> 64 #include <linux/uaccess.h> 65 #include <linux/btf_ids.h> 66 67 #ifdef CONFIG_SYSCTL 68 #include <linux/sysctl.h> 69 #endif 70 71 static int ip6_rt_type_to_error(u8 fib6_type); 72 73 #define CREATE_TRACE_POINTS 74 #include <trace/events/fib6.h> 75 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); 76 #undef CREATE_TRACE_POINTS 77 78 enum rt6_nud_state { 79 RT6_NUD_FAIL_HARD = -3, 80 RT6_NUD_FAIL_PROBE = -2, 81 RT6_NUD_FAIL_DO_RR = -1, 82 RT6_NUD_SUCCEED = 1 83 }; 84 85 INDIRECT_CALLABLE_SCOPE 86 struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); 87 static unsigned int ip6_default_advmss(const struct dst_entry *dst); 88 INDIRECT_CALLABLE_SCOPE 89 unsigned int ip6_mtu(const struct dst_entry *dst); 90 static void ip6_negative_advice(struct sock *sk, 91 struct dst_entry *dst); 92 static void ip6_dst_destroy(struct dst_entry *); 93 static void ip6_dst_ifdown(struct dst_entry *, 94 struct net_device *dev); 95 static void ip6_dst_gc(struct dst_ops *ops); 96 97 static int ip6_pkt_discard(struct sk_buff *skb); 98 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); 99 static int ip6_pkt_prohibit(struct sk_buff *skb); 100 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); 101 static void ip6_link_failure(struct sk_buff *skb); 102 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 103 struct sk_buff *skb, u32 mtu, 104 bool confirm_neigh); 105 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, 106 struct sk_buff *skb); 107 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, 108 int strict); 109 static size_t rt6_nlmsg_size(struct fib6_info *f6i); 110 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 111 struct fib6_info *rt, struct dst_entry *dst, 112 struct in6_addr *dest, struct in6_addr *src, 113 int iif, int type, u32 portid, u32 seq, 114 unsigned int flags); 115 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, 116 const struct in6_addr *daddr, 117 const struct in6_addr *saddr); 118 119 #ifdef CONFIG_IPV6_ROUTE_INFO 120 static struct fib6_info *rt6_add_route_info(struct net *net, 121 const struct in6_addr *prefix, int prefixlen, 122 const struct in6_addr *gwaddr, 123 struct net_device *dev, 124 unsigned int pref); 125 static struct fib6_info *rt6_get_route_info(struct net *net, 126 const struct in6_addr *prefix, int prefixlen, 127 const struct in6_addr *gwaddr, 128 struct net_device *dev); 129 #endif 130 131 struct uncached_list { 132 spinlock_t lock; 133 struct list_head head; 134 struct list_head quarantine; 135 }; 136 137 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); 138 139 void rt6_uncached_list_add(struct rt6_info *rt) 140 { 141 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); 142 143 rt->dst.rt_uncached_list = ul; 144 145 spin_lock_bh(&ul->lock); 146 list_add_tail(&rt->dst.rt_uncached, &ul->head); 147 spin_unlock_bh(&ul->lock); 148 } 149 150 void rt6_uncached_list_del(struct rt6_info *rt) 151 { 152 if (!list_empty(&rt->dst.rt_uncached)) { 153 struct uncached_list *ul = rt->dst.rt_uncached_list; 154 155 spin_lock_bh(&ul->lock); 156 list_del_init(&rt->dst.rt_uncached); 157 spin_unlock_bh(&ul->lock); 158 } 159 } 160 161 static void rt6_uncached_list_flush_dev(struct net_device *dev) 162 { 163 int cpu; 164 165 for_each_possible_cpu(cpu) { 166 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 167 struct rt6_info *rt, *safe; 168 169 if (list_empty(&ul->head)) 170 continue; 171 172 spin_lock_bh(&ul->lock); 173 list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { 174 struct inet6_dev *rt_idev = rt->rt6i_idev; 175 struct net_device *rt_dev = rt->dst.dev; 176 bool handled = false; 177 178 if (rt_idev && rt_idev->dev == dev) { 179 rt->rt6i_idev = in6_dev_get(blackhole_netdev); 180 in6_dev_put(rt_idev); 181 handled = true; 182 } 183 184 if (rt_dev == dev) { 185 rt->dst.dev = blackhole_netdev; 186 netdev_ref_replace(rt_dev, blackhole_netdev, 187 &rt->dst.dev_tracker, 188 GFP_ATOMIC); 189 handled = true; 190 } 191 if (handled) 192 list_move(&rt->dst.rt_uncached, 193 &ul->quarantine); 194 } 195 spin_unlock_bh(&ul->lock); 196 } 197 } 198 199 static inline const void *choose_neigh_daddr(const struct in6_addr *p, 200 struct sk_buff *skb, 201 const void *daddr) 202 { 203 if (!ipv6_addr_any(p)) 204 return (const void *) p; 205 else if (skb) 206 return &ipv6_hdr(skb)->daddr; 207 return daddr; 208 } 209 210 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, 211 struct net_device *dev, 212 struct sk_buff *skb, 213 const void *daddr) 214 { 215 struct neighbour *n; 216 217 daddr = choose_neigh_daddr(gw, skb, daddr); 218 n = __ipv6_neigh_lookup(dev, daddr); 219 if (n) 220 return n; 221 222 n = neigh_create(&nd_tbl, daddr, dev); 223 return IS_ERR(n) ? NULL : n; 224 } 225 226 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, 227 struct sk_buff *skb, 228 const void *daddr) 229 { 230 const struct rt6_info *rt = dst_rt6_info(dst); 231 232 return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any), 233 dst->dev, skb, daddr); 234 } 235 236 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) 237 { 238 const struct rt6_info *rt = dst_rt6_info(dst); 239 struct net_device *dev = dst->dev; 240 241 daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); 242 if (!daddr) 243 return; 244 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) 245 return; 246 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) 247 return; 248 __ipv6_confirm_neigh(dev, daddr); 249 } 250 251 static struct dst_ops ip6_dst_ops_template = { 252 .family = AF_INET6, 253 .gc = ip6_dst_gc, 254 .gc_thresh = 1024, 255 .check = ip6_dst_check, 256 .default_advmss = ip6_default_advmss, 257 .mtu = ip6_mtu, 258 .cow_metrics = dst_cow_metrics_generic, 259 .destroy = ip6_dst_destroy, 260 .ifdown = ip6_dst_ifdown, 261 .negative_advice = ip6_negative_advice, 262 .link_failure = ip6_link_failure, 263 .update_pmtu = ip6_rt_update_pmtu, 264 .redirect = rt6_do_redirect, 265 .local_out = __ip6_local_out, 266 .neigh_lookup = ip6_dst_neigh_lookup, 267 .confirm_neigh = ip6_confirm_neigh, 268 }; 269 270 static struct dst_ops ip6_dst_blackhole_ops = { 271 .family = AF_INET6, 272 .default_advmss = ip6_default_advmss, 273 .neigh_lookup = ip6_dst_neigh_lookup, 274 .check = ip6_dst_check, 275 .destroy = ip6_dst_destroy, 276 .cow_metrics = dst_cow_metrics_generic, 277 .update_pmtu = dst_blackhole_update_pmtu, 278 .redirect = dst_blackhole_redirect, 279 .mtu = dst_blackhole_mtu, 280 }; 281 282 static const u32 ip6_template_metrics[RTAX_MAX] = { 283 [RTAX_HOPLIMIT - 1] = 0, 284 }; 285 286 static const struct fib6_info fib6_null_entry_template = { 287 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), 288 .fib6_protocol = RTPROT_KERNEL, 289 .fib6_metric = ~(u32)0, 290 .fib6_ref = REFCOUNT_INIT(1), 291 .fib6_type = RTN_UNREACHABLE, 292 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, 293 }; 294 295 static const struct rt6_info ip6_null_entry_template = { 296 .dst = { 297 .__rcuref = RCUREF_INIT(1), 298 .__use = 1, 299 .obsolete = DST_OBSOLETE_FORCE_CHK, 300 .error = -ENETUNREACH, 301 .input = ip6_pkt_discard, 302 .output = ip6_pkt_discard_out, 303 }, 304 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 305 }; 306 307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 308 309 static const struct rt6_info ip6_prohibit_entry_template = { 310 .dst = { 311 .__rcuref = RCUREF_INIT(1), 312 .__use = 1, 313 .obsolete = DST_OBSOLETE_FORCE_CHK, 314 .error = -EACCES, 315 .input = ip6_pkt_prohibit, 316 .output = ip6_pkt_prohibit_out, 317 }, 318 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 319 }; 320 321 static const struct rt6_info ip6_blk_hole_entry_template = { 322 .dst = { 323 .__rcuref = RCUREF_INIT(1), 324 .__use = 1, 325 .obsolete = DST_OBSOLETE_FORCE_CHK, 326 .error = -EINVAL, 327 .input = dst_discard, 328 .output = dst_discard_out, 329 }, 330 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 331 }; 332 333 #endif 334 335 static void rt6_info_init(struct rt6_info *rt) 336 { 337 memset_after(rt, 0, dst); 338 } 339 340 /* allocate dst with ip6_dst_ops */ 341 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, 342 int flags) 343 { 344 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 345 1, DST_OBSOLETE_FORCE_CHK, flags); 346 347 if (rt) { 348 rt6_info_init(rt); 349 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 350 } 351 352 return rt; 353 } 354 EXPORT_SYMBOL(ip6_dst_alloc); 355 356 static void ip6_dst_destroy(struct dst_entry *dst) 357 { 358 struct rt6_info *rt = dst_rt6_info(dst); 359 struct fib6_info *from; 360 struct inet6_dev *idev; 361 362 ip_dst_metrics_put(dst); 363 rt6_uncached_list_del(rt); 364 365 idev = rt->rt6i_idev; 366 if (idev) { 367 rt->rt6i_idev = NULL; 368 in6_dev_put(idev); 369 } 370 371 from = unrcu_pointer(xchg(&rt->from, NULL)); 372 fib6_info_release(from); 373 } 374 375 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) 376 { 377 struct rt6_info *rt = dst_rt6_info(dst); 378 struct inet6_dev *idev = rt->rt6i_idev; 379 struct fib6_info *from; 380 381 if (idev && idev->dev != blackhole_netdev) { 382 struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev); 383 384 if (blackhole_idev) { 385 rt->rt6i_idev = blackhole_idev; 386 in6_dev_put(idev); 387 } 388 } 389 from = unrcu_pointer(xchg(&rt->from, NULL)); 390 fib6_info_release(from); 391 } 392 393 static bool __rt6_check_expired(const struct rt6_info *rt) 394 { 395 if (rt->rt6i_flags & RTF_EXPIRES) 396 return time_after(jiffies, rt->dst.expires); 397 else 398 return false; 399 } 400 401 static bool rt6_check_expired(const struct rt6_info *rt) 402 { 403 struct fib6_info *from; 404 405 from = rcu_dereference(rt->from); 406 407 if (rt->rt6i_flags & RTF_EXPIRES) { 408 if (time_after(jiffies, rt->dst.expires)) 409 return true; 410 } else if (from) { 411 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || 412 fib6_check_expired(from); 413 } 414 return false; 415 } 416 417 static struct fib6_info * 418 rt6_multipath_first_sibling_rcu(const struct fib6_info *rt) 419 { 420 struct fib6_info *iter; 421 struct fib6_node *fn; 422 423 fn = rcu_dereference(rt->fib6_node); 424 if (!fn) 425 goto out; 426 iter = rcu_dereference(fn->leaf); 427 if (!iter) 428 goto out; 429 430 while (iter) { 431 if (iter->fib6_metric == rt->fib6_metric && 432 rt6_qualify_for_ecmp(iter)) 433 return iter; 434 iter = rcu_dereference(iter->fib6_next); 435 } 436 437 out: 438 return NULL; 439 } 440 441 void fib6_select_path(const struct net *net, struct fib6_result *res, 442 struct flowi6 *fl6, int oif, bool have_oif_match, 443 const struct sk_buff *skb, int strict) 444 { 445 struct fib6_info *first, *match = res->f6i; 446 struct fib6_info *sibling; 447 int hash; 448 449 if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) 450 goto out; 451 452 if (match->nh && have_oif_match && res->nh) 453 return; 454 455 if (skb) 456 IP6CB(skb)->flags |= IP6SKB_MULTIPATH; 457 458 /* We might have already computed the hash for ICMPv6 errors. In such 459 * case it will always be non-zero. Otherwise now is the time to do it. 460 */ 461 if (!fl6->mp_hash && 462 (!match->nh || nexthop_is_multipath(match->nh))) 463 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); 464 465 if (unlikely(match->nh)) { 466 nexthop_path_fib6_result(res, fl6->mp_hash); 467 return; 468 } 469 470 first = rt6_multipath_first_sibling_rcu(match); 471 if (!first) 472 goto out; 473 474 hash = fl6->mp_hash; 475 if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) { 476 if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif, 477 strict) >= 0) 478 match = first; 479 goto out; 480 } 481 482 list_for_each_entry_rcu(sibling, &first->fib6_siblings, 483 fib6_siblings) { 484 const struct fib6_nh *nh = sibling->fib6_nh; 485 int nh_upper_bound; 486 487 nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); 488 if (hash > nh_upper_bound) 489 continue; 490 if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) 491 break; 492 match = sibling; 493 break; 494 } 495 496 out: 497 res->f6i = match; 498 res->nh = match->fib6_nh; 499 } 500 501 /* 502 * Route lookup. rcu_read_lock() should be held. 503 */ 504 505 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, 506 const struct in6_addr *saddr, int oif, int flags) 507 { 508 const struct net_device *dev; 509 510 if (nh->fib_nh_flags & RTNH_F_DEAD) 511 return false; 512 513 dev = nh->fib_nh_dev; 514 if (oif) { 515 if (dev->ifindex == oif) 516 return true; 517 } else { 518 if (ipv6_chk_addr(net, saddr, dev, 519 flags & RT6_LOOKUP_F_IFACE)) 520 return true; 521 } 522 523 return false; 524 } 525 526 struct fib6_nh_dm_arg { 527 struct net *net; 528 const struct in6_addr *saddr; 529 int oif; 530 int flags; 531 struct fib6_nh *nh; 532 }; 533 534 static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg) 535 { 536 struct fib6_nh_dm_arg *arg = _arg; 537 538 arg->nh = nh; 539 return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif, 540 arg->flags); 541 } 542 543 /* returns fib6_nh from nexthop or NULL */ 544 static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh, 545 struct fib6_result *res, 546 const struct in6_addr *saddr, 547 int oif, int flags) 548 { 549 struct fib6_nh_dm_arg arg = { 550 .net = net, 551 .saddr = saddr, 552 .oif = oif, 553 .flags = flags, 554 }; 555 556 if (nexthop_is_blackhole(nh)) 557 return NULL; 558 559 if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg)) 560 return arg.nh; 561 562 return NULL; 563 } 564 565 static void rt6_device_match(struct net *net, struct fib6_result *res, 566 const struct in6_addr *saddr, int oif, int flags) 567 { 568 struct fib6_info *f6i = res->f6i; 569 struct fib6_info *spf6i; 570 struct fib6_nh *nh; 571 572 if (!oif && ipv6_addr_any(saddr)) { 573 if (unlikely(f6i->nh)) { 574 nh = nexthop_fib6_nh(f6i->nh); 575 if (nexthop_is_blackhole(f6i->nh)) 576 goto out_blackhole; 577 } else { 578 nh = f6i->fib6_nh; 579 } 580 if (!(nh->fib_nh_flags & RTNH_F_DEAD)) 581 goto out; 582 } 583 584 for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { 585 bool matched = false; 586 587 if (unlikely(spf6i->nh)) { 588 nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr, 589 oif, flags); 590 if (nh) 591 matched = true; 592 } else { 593 nh = spf6i->fib6_nh; 594 if (__rt6_device_match(net, nh, saddr, oif, flags)) 595 matched = true; 596 } 597 if (matched) { 598 res->f6i = spf6i; 599 goto out; 600 } 601 } 602 603 if (oif && flags & RT6_LOOKUP_F_IFACE) { 604 res->f6i = net->ipv6.fib6_null_entry; 605 nh = res->f6i->fib6_nh; 606 goto out; 607 } 608 609 if (unlikely(f6i->nh)) { 610 nh = nexthop_fib6_nh(f6i->nh); 611 if (nexthop_is_blackhole(f6i->nh)) 612 goto out_blackhole; 613 } else { 614 nh = f6i->fib6_nh; 615 } 616 617 if (nh->fib_nh_flags & RTNH_F_DEAD) { 618 res->f6i = net->ipv6.fib6_null_entry; 619 nh = res->f6i->fib6_nh; 620 } 621 out: 622 res->nh = nh; 623 res->fib6_type = res->f6i->fib6_type; 624 res->fib6_flags = res->f6i->fib6_flags; 625 return; 626 627 out_blackhole: 628 res->fib6_flags |= RTF_REJECT; 629 res->fib6_type = RTN_BLACKHOLE; 630 res->nh = nh; 631 } 632 633 #ifdef CONFIG_IPV6_ROUTER_PREF 634 struct __rt6_probe_work { 635 struct work_struct work; 636 struct in6_addr target; 637 struct net_device *dev; 638 netdevice_tracker dev_tracker; 639 }; 640 641 static void rt6_probe_deferred(struct work_struct *w) 642 { 643 struct in6_addr mcaddr; 644 struct __rt6_probe_work *work = 645 container_of(w, struct __rt6_probe_work, work); 646 647 addrconf_addr_solict_mult(&work->target, &mcaddr); 648 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); 649 netdev_put(work->dev, &work->dev_tracker); 650 kfree(work); 651 } 652 653 static void rt6_probe(struct fib6_nh *fib6_nh) 654 { 655 struct __rt6_probe_work *work = NULL; 656 const struct in6_addr *nh_gw; 657 unsigned long last_probe; 658 struct neighbour *neigh; 659 struct net_device *dev; 660 struct inet6_dev *idev; 661 662 /* 663 * Okay, this does not seem to be appropriate 664 * for now, however, we need to check if it 665 * is really so; aka Router Reachability Probing. 666 * 667 * Router Reachability Probe MUST be rate-limited 668 * to no more than one per minute. 669 */ 670 if (!fib6_nh->fib_nh_gw_family) 671 return; 672 673 nh_gw = &fib6_nh->fib_nh_gw6; 674 dev = fib6_nh->fib_nh_dev; 675 rcu_read_lock(); 676 last_probe = READ_ONCE(fib6_nh->last_probe); 677 idev = __in6_dev_get(dev); 678 if (!idev) 679 goto out; 680 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); 681 if (neigh) { 682 if (READ_ONCE(neigh->nud_state) & NUD_VALID) 683 goto out; 684 685 write_lock_bh(&neigh->lock); 686 if (!(neigh->nud_state & NUD_VALID) && 687 time_after(jiffies, 688 neigh->updated + idev->cnf.rtr_probe_interval)) { 689 work = kmalloc(sizeof(*work), GFP_ATOMIC); 690 if (work) 691 __neigh_set_probe_once(neigh); 692 } 693 write_unlock_bh(&neigh->lock); 694 } else if (time_after(jiffies, last_probe + 695 idev->cnf.rtr_probe_interval)) { 696 work = kmalloc(sizeof(*work), GFP_ATOMIC); 697 } 698 699 if (!work || cmpxchg(&fib6_nh->last_probe, 700 last_probe, jiffies) != last_probe) { 701 kfree(work); 702 } else { 703 INIT_WORK(&work->work, rt6_probe_deferred); 704 work->target = *nh_gw; 705 netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC); 706 work->dev = dev; 707 schedule_work(&work->work); 708 } 709 710 out: 711 rcu_read_unlock(); 712 } 713 #else 714 static inline void rt6_probe(struct fib6_nh *fib6_nh) 715 { 716 } 717 #endif 718 719 /* 720 * Default Router Selection (RFC 2461 6.3.6) 721 */ 722 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh) 723 { 724 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; 725 struct neighbour *neigh; 726 727 rcu_read_lock(); 728 neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev, 729 &fib6_nh->fib_nh_gw6); 730 if (neigh) { 731 u8 nud_state = READ_ONCE(neigh->nud_state); 732 733 if (nud_state & NUD_VALID) 734 ret = RT6_NUD_SUCCEED; 735 #ifdef CONFIG_IPV6_ROUTER_PREF 736 else if (!(nud_state & NUD_FAILED)) 737 ret = RT6_NUD_SUCCEED; 738 else 739 ret = RT6_NUD_FAIL_PROBE; 740 #endif 741 } else { 742 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? 743 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; 744 } 745 rcu_read_unlock(); 746 747 return ret; 748 } 749 750 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, 751 int strict) 752 { 753 int m = 0; 754 755 if (!oif || nh->fib_nh_dev->ifindex == oif) 756 m = 2; 757 758 if (!m && (strict & RT6_LOOKUP_F_IFACE)) 759 return RT6_NUD_FAIL_HARD; 760 #ifdef CONFIG_IPV6_ROUTER_PREF 761 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2; 762 #endif 763 if ((strict & RT6_LOOKUP_F_REACHABLE) && 764 !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) { 765 int n = rt6_check_neigh(nh); 766 if (n < 0) 767 return n; 768 } 769 return m; 770 } 771 772 static bool find_match(struct fib6_nh *nh, u32 fib6_flags, 773 int oif, int strict, int *mpri, bool *do_rr) 774 { 775 bool match_do_rr = false; 776 bool rc = false; 777 int m; 778 779 if (nh->fib_nh_flags & RTNH_F_DEAD) 780 goto out; 781 782 if (ip6_ignore_linkdown(nh->fib_nh_dev) && 783 nh->fib_nh_flags & RTNH_F_LINKDOWN && 784 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) 785 goto out; 786 787 m = rt6_score_route(nh, fib6_flags, oif, strict); 788 if (m == RT6_NUD_FAIL_DO_RR) { 789 match_do_rr = true; 790 m = 0; /* lowest valid score */ 791 } else if (m == RT6_NUD_FAIL_HARD) { 792 goto out; 793 } 794 795 if (strict & RT6_LOOKUP_F_REACHABLE) 796 rt6_probe(nh); 797 798 /* note that m can be RT6_NUD_FAIL_PROBE at this point */ 799 if (m > *mpri) { 800 *do_rr = match_do_rr; 801 *mpri = m; 802 rc = true; 803 } 804 out: 805 return rc; 806 } 807 808 struct fib6_nh_frl_arg { 809 u32 flags; 810 int oif; 811 int strict; 812 int *mpri; 813 bool *do_rr; 814 struct fib6_nh *nh; 815 }; 816 817 static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg) 818 { 819 struct fib6_nh_frl_arg *arg = _arg; 820 821 arg->nh = nh; 822 return find_match(nh, arg->flags, arg->oif, arg->strict, 823 arg->mpri, arg->do_rr); 824 } 825 826 static void __find_rr_leaf(struct fib6_info *f6i_start, 827 struct fib6_info *nomatch, u32 metric, 828 struct fib6_result *res, struct fib6_info **cont, 829 int oif, int strict, bool *do_rr, int *mpri) 830 { 831 struct fib6_info *f6i; 832 833 for (f6i = f6i_start; 834 f6i && f6i != nomatch; 835 f6i = rcu_dereference(f6i->fib6_next)) { 836 bool matched = false; 837 struct fib6_nh *nh; 838 839 if (cont && f6i->fib6_metric != metric) { 840 *cont = f6i; 841 return; 842 } 843 844 if (fib6_check_expired(f6i)) 845 continue; 846 847 if (unlikely(f6i->nh)) { 848 struct fib6_nh_frl_arg arg = { 849 .flags = f6i->fib6_flags, 850 .oif = oif, 851 .strict = strict, 852 .mpri = mpri, 853 .do_rr = do_rr 854 }; 855 856 if (nexthop_is_blackhole(f6i->nh)) { 857 res->fib6_flags = RTF_REJECT; 858 res->fib6_type = RTN_BLACKHOLE; 859 res->f6i = f6i; 860 res->nh = nexthop_fib6_nh(f6i->nh); 861 return; 862 } 863 if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match, 864 &arg)) { 865 matched = true; 866 nh = arg.nh; 867 } 868 } else { 869 nh = f6i->fib6_nh; 870 if (find_match(nh, f6i->fib6_flags, oif, strict, 871 mpri, do_rr)) 872 matched = true; 873 } 874 if (matched) { 875 res->f6i = f6i; 876 res->nh = nh; 877 res->fib6_flags = f6i->fib6_flags; 878 res->fib6_type = f6i->fib6_type; 879 } 880 } 881 } 882 883 static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf, 884 struct fib6_info *rr_head, int oif, int strict, 885 bool *do_rr, struct fib6_result *res) 886 { 887 u32 metric = rr_head->fib6_metric; 888 struct fib6_info *cont = NULL; 889 int mpri = -1; 890 891 __find_rr_leaf(rr_head, NULL, metric, res, &cont, 892 oif, strict, do_rr, &mpri); 893 894 __find_rr_leaf(leaf, rr_head, metric, res, &cont, 895 oif, strict, do_rr, &mpri); 896 897 if (res->f6i || !cont) 898 return; 899 900 __find_rr_leaf(cont, NULL, metric, res, NULL, 901 oif, strict, do_rr, &mpri); 902 } 903 904 static void rt6_select(struct net *net, struct fib6_node *fn, int oif, 905 struct fib6_result *res, int strict) 906 { 907 struct fib6_info *leaf = rcu_dereference(fn->leaf); 908 struct fib6_info *rt0; 909 bool do_rr = false; 910 int key_plen; 911 912 /* make sure this function or its helpers sets f6i */ 913 res->f6i = NULL; 914 915 if (!leaf || leaf == net->ipv6.fib6_null_entry) 916 goto out; 917 918 rt0 = rcu_dereference(fn->rr_ptr); 919 if (!rt0) 920 rt0 = leaf; 921 922 /* Double check to make sure fn is not an intermediate node 923 * and fn->leaf does not points to its child's leaf 924 * (This might happen if all routes under fn are deleted from 925 * the tree and fib6_repair_tree() is called on the node.) 926 */ 927 key_plen = rt0->fib6_dst.plen; 928 #ifdef CONFIG_IPV6_SUBTREES 929 if (rt0->fib6_src.plen) 930 key_plen = rt0->fib6_src.plen; 931 #endif 932 if (fn->fn_bit != key_plen) 933 goto out; 934 935 find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res); 936 if (do_rr) { 937 struct fib6_info *next = rcu_dereference(rt0->fib6_next); 938 939 /* no entries matched; do round-robin */ 940 if (!next || next->fib6_metric != rt0->fib6_metric) 941 next = leaf; 942 943 if (next != rt0) { 944 spin_lock_bh(&leaf->fib6_table->tb6_lock); 945 /* make sure next is not being deleted from the tree */ 946 if (next->fib6_node) 947 rcu_assign_pointer(fn->rr_ptr, next); 948 spin_unlock_bh(&leaf->fib6_table->tb6_lock); 949 } 950 } 951 952 out: 953 if (!res->f6i) { 954 res->f6i = net->ipv6.fib6_null_entry; 955 res->nh = res->f6i->fib6_nh; 956 res->fib6_flags = res->f6i->fib6_flags; 957 res->fib6_type = res->f6i->fib6_type; 958 } 959 } 960 961 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res) 962 { 963 return (res->f6i->fib6_flags & RTF_NONEXTHOP) || 964 res->nh->fib_nh_gw_family; 965 } 966 967 #ifdef CONFIG_IPV6_ROUTE_INFO 968 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, 969 const struct in6_addr *gwaddr) 970 { 971 struct net *net = dev_net(dev); 972 struct route_info *rinfo = (struct route_info *) opt; 973 struct in6_addr prefix_buf, *prefix; 974 unsigned int pref; 975 unsigned long lifetime; 976 struct fib6_info *rt; 977 978 if (len < sizeof(struct route_info)) { 979 return -EINVAL; 980 } 981 982 /* Sanity check for prefix_len and length */ 983 if (rinfo->length > 3) { 984 return -EINVAL; 985 } else if (rinfo->prefix_len > 128) { 986 return -EINVAL; 987 } else if (rinfo->prefix_len > 64) { 988 if (rinfo->length < 2) { 989 return -EINVAL; 990 } 991 } else if (rinfo->prefix_len > 0) { 992 if (rinfo->length < 1) { 993 return -EINVAL; 994 } 995 } 996 997 pref = rinfo->route_pref; 998 if (pref == ICMPV6_ROUTER_PREF_INVALID) 999 return -EINVAL; 1000 1001 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); 1002 1003 if (rinfo->length == 3) 1004 prefix = (struct in6_addr *)rinfo->prefix; 1005 else { 1006 /* this function is safe */ 1007 ipv6_addr_prefix(&prefix_buf, 1008 (struct in6_addr *)rinfo->prefix, 1009 rinfo->prefix_len); 1010 prefix = &prefix_buf; 1011 } 1012 1013 if (rinfo->prefix_len == 0) 1014 rt = rt6_get_dflt_router(net, gwaddr, dev); 1015 else 1016 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, 1017 gwaddr, dev); 1018 1019 if (rt && !lifetime) { 1020 ip6_del_rt(net, rt, false); 1021 rt = NULL; 1022 } 1023 1024 if (!rt && lifetime) 1025 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, 1026 dev, pref); 1027 else if (rt) 1028 rt->fib6_flags = RTF_ROUTEINFO | 1029 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 1030 1031 if (rt) { 1032 if (!addrconf_finite_timeout(lifetime)) 1033 fib6_clean_expires(rt); 1034 else 1035 fib6_set_expires(rt, jiffies + HZ * lifetime); 1036 1037 fib6_info_release(rt); 1038 } 1039 return 0; 1040 } 1041 #endif 1042 1043 /* 1044 * Misc support functions 1045 */ 1046 1047 /* called with rcu_lock held */ 1048 static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) 1049 { 1050 struct net_device *dev = res->nh->fib_nh_dev; 1051 1052 if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { 1053 /* for copies of local routes, dst->dev needs to be the 1054 * device if it is a master device, the master device if 1055 * device is enslaved, and the loopback as the default 1056 */ 1057 if (netif_is_l3_slave(dev) && 1058 !rt6_need_strict(&res->f6i->fib6_dst.addr)) 1059 dev = l3mdev_master_dev_rcu(dev); 1060 else if (!netif_is_l3_master(dev)) 1061 dev = dev_net(dev)->loopback_dev; 1062 /* last case is netif_is_l3_master(dev) is true in which 1063 * case we want dev returned to be dev 1064 */ 1065 } 1066 1067 return dev; 1068 } 1069 1070 static const int fib6_prop[RTN_MAX + 1] = { 1071 [RTN_UNSPEC] = 0, 1072 [RTN_UNICAST] = 0, 1073 [RTN_LOCAL] = 0, 1074 [RTN_BROADCAST] = 0, 1075 [RTN_ANYCAST] = 0, 1076 [RTN_MULTICAST] = 0, 1077 [RTN_BLACKHOLE] = -EINVAL, 1078 [RTN_UNREACHABLE] = -EHOSTUNREACH, 1079 [RTN_PROHIBIT] = -EACCES, 1080 [RTN_THROW] = -EAGAIN, 1081 [RTN_NAT] = -EINVAL, 1082 [RTN_XRESOLVE] = -EINVAL, 1083 }; 1084 1085 static int ip6_rt_type_to_error(u8 fib6_type) 1086 { 1087 return fib6_prop[fib6_type]; 1088 } 1089 1090 static unsigned short fib6_info_dst_flags(struct fib6_info *rt) 1091 { 1092 unsigned short flags = 0; 1093 1094 if (rt->dst_nocount) 1095 flags |= DST_NOCOUNT; 1096 if (rt->dst_nopolicy) 1097 flags |= DST_NOPOLICY; 1098 1099 return flags; 1100 } 1101 1102 static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type) 1103 { 1104 rt->dst.error = ip6_rt_type_to_error(fib6_type); 1105 1106 switch (fib6_type) { 1107 case RTN_BLACKHOLE: 1108 rt->dst.output = dst_discard_out; 1109 rt->dst.input = dst_discard; 1110 break; 1111 case RTN_PROHIBIT: 1112 rt->dst.output = ip6_pkt_prohibit_out; 1113 rt->dst.input = ip6_pkt_prohibit; 1114 break; 1115 case RTN_THROW: 1116 case RTN_UNREACHABLE: 1117 default: 1118 rt->dst.output = ip6_pkt_discard_out; 1119 rt->dst.input = ip6_pkt_discard; 1120 break; 1121 } 1122 } 1123 1124 static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res) 1125 { 1126 struct fib6_info *f6i = res->f6i; 1127 1128 if (res->fib6_flags & RTF_REJECT) { 1129 ip6_rt_init_dst_reject(rt, res->fib6_type); 1130 return; 1131 } 1132 1133 rt->dst.error = 0; 1134 rt->dst.output = ip6_output; 1135 1136 if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) { 1137 rt->dst.input = ip6_input; 1138 } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { 1139 rt->dst.input = ip6_mc_input; 1140 } else { 1141 rt->dst.input = ip6_forward; 1142 } 1143 1144 if (res->nh->fib_nh_lws) { 1145 rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws); 1146 lwtunnel_set_redirect(&rt->dst); 1147 } 1148 1149 rt->dst.lastuse = jiffies; 1150 } 1151 1152 /* Caller must already hold reference to @from */ 1153 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) 1154 { 1155 rt->rt6i_flags &= ~RTF_EXPIRES; 1156 rcu_assign_pointer(rt->from, from); 1157 ip_dst_init_metrics(&rt->dst, from->fib6_metrics); 1158 } 1159 1160 /* Caller must already hold reference to f6i in result */ 1161 static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res) 1162 { 1163 const struct fib6_nh *nh = res->nh; 1164 const struct net_device *dev = nh->fib_nh_dev; 1165 struct fib6_info *f6i = res->f6i; 1166 1167 ip6_rt_init_dst(rt, res); 1168 1169 rt->rt6i_dst = f6i->fib6_dst; 1170 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; 1171 rt->rt6i_flags = res->fib6_flags; 1172 if (nh->fib_nh_gw_family) { 1173 rt->rt6i_gateway = nh->fib_nh_gw6; 1174 rt->rt6i_flags |= RTF_GATEWAY; 1175 } 1176 rt6_set_from(rt, f6i); 1177 #ifdef CONFIG_IPV6_SUBTREES 1178 rt->rt6i_src = f6i->fib6_src; 1179 #endif 1180 } 1181 1182 static struct fib6_node* fib6_backtrack(struct fib6_node *fn, 1183 struct in6_addr *saddr) 1184 { 1185 struct fib6_node *pn, *sn; 1186 while (1) { 1187 if (fn->fn_flags & RTN_TL_ROOT) 1188 return NULL; 1189 pn = rcu_dereference(fn->parent); 1190 sn = FIB6_SUBTREE(pn); 1191 if (sn && sn != fn) 1192 fn = fib6_node_lookup(sn, NULL, saddr); 1193 else 1194 fn = pn; 1195 if (fn->fn_flags & RTN_RTINFO) 1196 return fn; 1197 } 1198 } 1199 1200 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt) 1201 { 1202 struct rt6_info *rt = *prt; 1203 1204 if (dst_hold_safe(&rt->dst)) 1205 return true; 1206 if (net) { 1207 rt = net->ipv6.ip6_null_entry; 1208 dst_hold(&rt->dst); 1209 } else { 1210 rt = NULL; 1211 } 1212 *prt = rt; 1213 return false; 1214 } 1215 1216 /* called with rcu_lock held */ 1217 static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res) 1218 { 1219 struct net_device *dev = res->nh->fib_nh_dev; 1220 struct fib6_info *f6i = res->f6i; 1221 unsigned short flags; 1222 struct rt6_info *nrt; 1223 1224 if (!fib6_info_hold_safe(f6i)) 1225 goto fallback; 1226 1227 flags = fib6_info_dst_flags(f6i); 1228 nrt = ip6_dst_alloc(dev_net(dev), dev, flags); 1229 if (!nrt) { 1230 fib6_info_release(f6i); 1231 goto fallback; 1232 } 1233 1234 ip6_rt_copy_init(nrt, res); 1235 return nrt; 1236 1237 fallback: 1238 nrt = dev_net(dev)->ipv6.ip6_null_entry; 1239 dst_hold(&nrt->dst); 1240 return nrt; 1241 } 1242 1243 INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net, 1244 struct fib6_table *table, 1245 struct flowi6 *fl6, 1246 const struct sk_buff *skb, 1247 int flags) 1248 { 1249 struct fib6_result res = {}; 1250 struct fib6_node *fn; 1251 struct rt6_info *rt; 1252 1253 rcu_read_lock(); 1254 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1255 restart: 1256 res.f6i = rcu_dereference(fn->leaf); 1257 if (!res.f6i) 1258 res.f6i = net->ipv6.fib6_null_entry; 1259 else 1260 rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif, 1261 flags); 1262 1263 if (res.f6i == net->ipv6.fib6_null_entry) { 1264 fn = fib6_backtrack(fn, &fl6->saddr); 1265 if (fn) 1266 goto restart; 1267 1268 rt = net->ipv6.ip6_null_entry; 1269 dst_hold(&rt->dst); 1270 goto out; 1271 } else if (res.fib6_flags & RTF_REJECT) { 1272 goto do_create; 1273 } 1274 1275 fib6_select_path(net, &res, fl6, fl6->flowi6_oif, 1276 fl6->flowi6_oif != 0, skb, flags); 1277 1278 /* Search through exception table */ 1279 rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); 1280 if (rt) { 1281 if (ip6_hold_safe(net, &rt)) 1282 dst_use_noref(&rt->dst, jiffies); 1283 } else { 1284 do_create: 1285 rt = ip6_create_rt_rcu(&res); 1286 } 1287 1288 out: 1289 trace_fib6_table_lookup(net, &res, table, fl6); 1290 1291 rcu_read_unlock(); 1292 1293 return rt; 1294 } 1295 1296 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, 1297 const struct sk_buff *skb, int flags) 1298 { 1299 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); 1300 } 1301 EXPORT_SYMBOL_GPL(ip6_route_lookup); 1302 1303 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, 1304 const struct in6_addr *saddr, int oif, 1305 const struct sk_buff *skb, int strict) 1306 { 1307 struct flowi6 fl6 = { 1308 .flowi6_oif = oif, 1309 .daddr = *daddr, 1310 }; 1311 struct dst_entry *dst; 1312 int flags = strict ? RT6_LOOKUP_F_IFACE : 0; 1313 1314 if (saddr) { 1315 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 1316 flags |= RT6_LOOKUP_F_HAS_SADDR; 1317 } 1318 1319 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); 1320 if (dst->error == 0) 1321 return dst_rt6_info(dst); 1322 1323 dst_release(dst); 1324 1325 return NULL; 1326 } 1327 EXPORT_SYMBOL(rt6_lookup); 1328 1329 /* ip6_ins_rt is called with FREE table->tb6_lock. 1330 * It takes new route entry, the addition fails by any reason the 1331 * route is released. 1332 * Caller must hold dst before calling it. 1333 */ 1334 1335 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, 1336 struct netlink_ext_ack *extack) 1337 { 1338 int err; 1339 struct fib6_table *table; 1340 1341 table = rt->fib6_table; 1342 spin_lock_bh(&table->tb6_lock); 1343 err = fib6_add(&table->tb6_root, rt, info, extack); 1344 spin_unlock_bh(&table->tb6_lock); 1345 1346 return err; 1347 } 1348 1349 int ip6_ins_rt(struct net *net, struct fib6_info *rt) 1350 { 1351 struct nl_info info = { .nl_net = net, }; 1352 1353 return __ip6_ins_rt(rt, &info, NULL); 1354 } 1355 1356 static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res, 1357 const struct in6_addr *daddr, 1358 const struct in6_addr *saddr) 1359 { 1360 struct fib6_info *f6i = res->f6i; 1361 struct net_device *dev; 1362 struct rt6_info *rt; 1363 1364 /* 1365 * Clone the route. 1366 */ 1367 1368 if (!fib6_info_hold_safe(f6i)) 1369 return NULL; 1370 1371 dev = ip6_rt_get_dev_rcu(res); 1372 rt = ip6_dst_alloc(dev_net(dev), dev, 0); 1373 if (!rt) { 1374 fib6_info_release(f6i); 1375 return NULL; 1376 } 1377 1378 ip6_rt_copy_init(rt, res); 1379 rt->rt6i_flags |= RTF_CACHE; 1380 rt->rt6i_dst.addr = *daddr; 1381 rt->rt6i_dst.plen = 128; 1382 1383 if (!rt6_is_gw_or_nonexthop(res)) { 1384 if (f6i->fib6_dst.plen != 128 && 1385 ipv6_addr_equal(&f6i->fib6_dst.addr, daddr)) 1386 rt->rt6i_flags |= RTF_ANYCAST; 1387 #ifdef CONFIG_IPV6_SUBTREES 1388 if (rt->rt6i_src.plen && saddr) { 1389 rt->rt6i_src.addr = *saddr; 1390 rt->rt6i_src.plen = 128; 1391 } 1392 #endif 1393 } 1394 1395 return rt; 1396 } 1397 1398 static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) 1399 { 1400 struct fib6_info *f6i = res->f6i; 1401 unsigned short flags = fib6_info_dst_flags(f6i); 1402 struct net_device *dev; 1403 struct rt6_info *pcpu_rt; 1404 1405 if (!fib6_info_hold_safe(f6i)) 1406 return NULL; 1407 1408 rcu_read_lock(); 1409 dev = ip6_rt_get_dev_rcu(res); 1410 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT); 1411 rcu_read_unlock(); 1412 if (!pcpu_rt) { 1413 fib6_info_release(f6i); 1414 return NULL; 1415 } 1416 ip6_rt_copy_init(pcpu_rt, res); 1417 pcpu_rt->rt6i_flags |= RTF_PCPU; 1418 1419 if (f6i->nh) 1420 pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev)); 1421 1422 return pcpu_rt; 1423 } 1424 1425 static bool rt6_is_valid(const struct rt6_info *rt6) 1426 { 1427 return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev)); 1428 } 1429 1430 /* It should be called with rcu_read_lock() acquired */ 1431 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) 1432 { 1433 struct rt6_info *pcpu_rt; 1434 1435 pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); 1436 1437 if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) { 1438 struct rt6_info *prev, **p; 1439 1440 p = this_cpu_ptr(res->nh->rt6i_pcpu); 1441 /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */ 1442 prev = xchg(p, NULL); 1443 if (prev) { 1444 dst_dev_put(&prev->dst); 1445 dst_release(&prev->dst); 1446 } 1447 1448 pcpu_rt = NULL; 1449 } 1450 1451 return pcpu_rt; 1452 } 1453 1454 static struct rt6_info *rt6_make_pcpu_route(struct net *net, 1455 const struct fib6_result *res) 1456 { 1457 struct rt6_info *pcpu_rt, *prev, **p; 1458 1459 pcpu_rt = ip6_rt_pcpu_alloc(res); 1460 if (!pcpu_rt) 1461 return NULL; 1462 1463 p = this_cpu_ptr(res->nh->rt6i_pcpu); 1464 prev = cmpxchg(p, NULL, pcpu_rt); 1465 BUG_ON(prev); 1466 1467 if (res->f6i->fib6_destroying) { 1468 struct fib6_info *from; 1469 1470 from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); 1471 fib6_info_release(from); 1472 } 1473 1474 return pcpu_rt; 1475 } 1476 1477 /* exception hash table implementation 1478 */ 1479 static DEFINE_SPINLOCK(rt6_exception_lock); 1480 1481 /* Remove rt6_ex from hash table and free the memory 1482 * Caller must hold rt6_exception_lock 1483 */ 1484 static void rt6_remove_exception(struct rt6_exception_bucket *bucket, 1485 struct rt6_exception *rt6_ex) 1486 { 1487 struct net *net; 1488 1489 if (!bucket || !rt6_ex) 1490 return; 1491 1492 net = dev_net(rt6_ex->rt6i->dst.dev); 1493 net->ipv6.rt6_stats->fib_rt_cache--; 1494 1495 /* purge completely the exception to allow releasing the held resources: 1496 * some [sk] cache may keep the dst around for unlimited time 1497 */ 1498 dst_dev_put(&rt6_ex->rt6i->dst); 1499 1500 hlist_del_rcu(&rt6_ex->hlist); 1501 dst_release(&rt6_ex->rt6i->dst); 1502 kfree_rcu(rt6_ex, rcu); 1503 WARN_ON_ONCE(!bucket->depth); 1504 bucket->depth--; 1505 } 1506 1507 /* Remove oldest rt6_ex in bucket and free the memory 1508 * Caller must hold rt6_exception_lock 1509 */ 1510 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) 1511 { 1512 struct rt6_exception *rt6_ex, *oldest = NULL; 1513 1514 if (!bucket) 1515 return; 1516 1517 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1518 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) 1519 oldest = rt6_ex; 1520 } 1521 rt6_remove_exception(bucket, oldest); 1522 } 1523 1524 static u32 rt6_exception_hash(const struct in6_addr *dst, 1525 const struct in6_addr *src) 1526 { 1527 static siphash_aligned_key_t rt6_exception_key; 1528 struct { 1529 struct in6_addr dst; 1530 struct in6_addr src; 1531 } __aligned(SIPHASH_ALIGNMENT) combined = { 1532 .dst = *dst, 1533 }; 1534 u64 val; 1535 1536 net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key)); 1537 1538 #ifdef CONFIG_IPV6_SUBTREES 1539 if (src) 1540 combined.src = *src; 1541 #endif 1542 val = siphash(&combined, sizeof(combined), &rt6_exception_key); 1543 1544 return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); 1545 } 1546 1547 /* Helper function to find the cached rt in the hash table 1548 * and update bucket pointer to point to the bucket for this 1549 * (daddr, saddr) pair 1550 * Caller must hold rt6_exception_lock 1551 */ 1552 static struct rt6_exception * 1553 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, 1554 const struct in6_addr *daddr, 1555 const struct in6_addr *saddr) 1556 { 1557 struct rt6_exception *rt6_ex; 1558 u32 hval; 1559 1560 if (!(*bucket) || !daddr) 1561 return NULL; 1562 1563 hval = rt6_exception_hash(daddr, saddr); 1564 *bucket += hval; 1565 1566 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { 1567 struct rt6_info *rt6 = rt6_ex->rt6i; 1568 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1569 1570 #ifdef CONFIG_IPV6_SUBTREES 1571 if (matched && saddr) 1572 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1573 #endif 1574 if (matched) 1575 return rt6_ex; 1576 } 1577 return NULL; 1578 } 1579 1580 /* Helper function to find the cached rt in the hash table 1581 * and update bucket pointer to point to the bucket for this 1582 * (daddr, saddr) pair 1583 * Caller must hold rcu_read_lock() 1584 */ 1585 static struct rt6_exception * 1586 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, 1587 const struct in6_addr *daddr, 1588 const struct in6_addr *saddr) 1589 { 1590 struct rt6_exception *rt6_ex; 1591 u32 hval; 1592 1593 WARN_ON_ONCE(!rcu_read_lock_held()); 1594 1595 if (!(*bucket) || !daddr) 1596 return NULL; 1597 1598 hval = rt6_exception_hash(daddr, saddr); 1599 *bucket += hval; 1600 1601 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { 1602 struct rt6_info *rt6 = rt6_ex->rt6i; 1603 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1604 1605 #ifdef CONFIG_IPV6_SUBTREES 1606 if (matched && saddr) 1607 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1608 #endif 1609 if (matched) 1610 return rt6_ex; 1611 } 1612 return NULL; 1613 } 1614 1615 static unsigned int fib6_mtu(const struct fib6_result *res) 1616 { 1617 const struct fib6_nh *nh = res->nh; 1618 unsigned int mtu; 1619 1620 if (res->f6i->fib6_pmtu) { 1621 mtu = res->f6i->fib6_pmtu; 1622 } else { 1623 struct net_device *dev = nh->fib_nh_dev; 1624 struct inet6_dev *idev; 1625 1626 rcu_read_lock(); 1627 idev = __in6_dev_get(dev); 1628 mtu = idev->cnf.mtu6; 1629 rcu_read_unlock(); 1630 } 1631 1632 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 1633 1634 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); 1635 } 1636 1637 #define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL 1638 1639 /* used when the flushed bit is not relevant, only access to the bucket 1640 * (ie., all bucket users except rt6_insert_exception); 1641 * 1642 * called under rcu lock; sometimes called with rt6_exception_lock held 1643 */ 1644 static 1645 struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh, 1646 spinlock_t *lock) 1647 { 1648 struct rt6_exception_bucket *bucket; 1649 1650 if (lock) 1651 bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, 1652 lockdep_is_held(lock)); 1653 else 1654 bucket = rcu_dereference(nh->rt6i_exception_bucket); 1655 1656 /* remove bucket flushed bit if set */ 1657 if (bucket) { 1658 unsigned long p = (unsigned long)bucket; 1659 1660 p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED; 1661 bucket = (struct rt6_exception_bucket *)p; 1662 } 1663 1664 return bucket; 1665 } 1666 1667 static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket) 1668 { 1669 unsigned long p = (unsigned long)bucket; 1670 1671 return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED); 1672 } 1673 1674 /* called with rt6_exception_lock held */ 1675 static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh, 1676 spinlock_t *lock) 1677 { 1678 struct rt6_exception_bucket *bucket; 1679 unsigned long p; 1680 1681 bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, 1682 lockdep_is_held(lock)); 1683 1684 p = (unsigned long)bucket; 1685 p |= FIB6_EXCEPTION_BUCKET_FLUSHED; 1686 bucket = (struct rt6_exception_bucket *)p; 1687 rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); 1688 } 1689 1690 static int rt6_insert_exception(struct rt6_info *nrt, 1691 const struct fib6_result *res) 1692 { 1693 struct net *net = dev_net(nrt->dst.dev); 1694 struct rt6_exception_bucket *bucket; 1695 struct fib6_info *f6i = res->f6i; 1696 struct in6_addr *src_key = NULL; 1697 struct rt6_exception *rt6_ex; 1698 struct fib6_nh *nh = res->nh; 1699 int max_depth; 1700 int err = 0; 1701 1702 spin_lock_bh(&rt6_exception_lock); 1703 1704 bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, 1705 lockdep_is_held(&rt6_exception_lock)); 1706 if (!bucket) { 1707 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), 1708 GFP_ATOMIC); 1709 if (!bucket) { 1710 err = -ENOMEM; 1711 goto out; 1712 } 1713 rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); 1714 } else if (fib6_nh_excptn_bucket_flushed(bucket)) { 1715 err = -EINVAL; 1716 goto out; 1717 } 1718 1719 #ifdef CONFIG_IPV6_SUBTREES 1720 /* fib6_src.plen != 0 indicates f6i is in subtree 1721 * and exception table is indexed by a hash of 1722 * both fib6_dst and fib6_src. 1723 * Otherwise, the exception table is indexed by 1724 * a hash of only fib6_dst. 1725 */ 1726 if (f6i->fib6_src.plen) 1727 src_key = &nrt->rt6i_src.addr; 1728 #endif 1729 /* rt6_mtu_change() might lower mtu on f6i. 1730 * Only insert this exception route if its mtu 1731 * is less than f6i's mtu value. 1732 */ 1733 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) { 1734 err = -EINVAL; 1735 goto out; 1736 } 1737 1738 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, 1739 src_key); 1740 if (rt6_ex) 1741 rt6_remove_exception(bucket, rt6_ex); 1742 1743 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); 1744 if (!rt6_ex) { 1745 err = -ENOMEM; 1746 goto out; 1747 } 1748 rt6_ex->rt6i = nrt; 1749 rt6_ex->stamp = jiffies; 1750 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); 1751 bucket->depth++; 1752 net->ipv6.rt6_stats->fib_rt_cache++; 1753 1754 /* Randomize max depth to avoid some side channels attacks. */ 1755 max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH); 1756 while (bucket->depth > max_depth) 1757 rt6_exception_remove_oldest(bucket); 1758 1759 out: 1760 spin_unlock_bh(&rt6_exception_lock); 1761 1762 /* Update fn->fn_sernum to invalidate all cached dst */ 1763 if (!err) { 1764 spin_lock_bh(&f6i->fib6_table->tb6_lock); 1765 fib6_update_sernum(net, f6i); 1766 spin_unlock_bh(&f6i->fib6_table->tb6_lock); 1767 fib6_force_start_gc(net); 1768 } 1769 1770 return err; 1771 } 1772 1773 static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) 1774 { 1775 struct rt6_exception_bucket *bucket; 1776 struct rt6_exception *rt6_ex; 1777 struct hlist_node *tmp; 1778 int i; 1779 1780 spin_lock_bh(&rt6_exception_lock); 1781 1782 bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); 1783 if (!bucket) 1784 goto out; 1785 1786 /* Prevent rt6_insert_exception() to recreate the bucket list */ 1787 if (!from) 1788 fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock); 1789 1790 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1791 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { 1792 if (!from || 1793 rcu_access_pointer(rt6_ex->rt6i->from) == from) 1794 rt6_remove_exception(bucket, rt6_ex); 1795 } 1796 WARN_ON_ONCE(!from && bucket->depth); 1797 bucket++; 1798 } 1799 out: 1800 spin_unlock_bh(&rt6_exception_lock); 1801 } 1802 1803 static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) 1804 { 1805 struct fib6_info *f6i = arg; 1806 1807 fib6_nh_flush_exceptions(nh, f6i); 1808 1809 return 0; 1810 } 1811 1812 void rt6_flush_exceptions(struct fib6_info *f6i) 1813 { 1814 if (f6i->nh) 1815 nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, 1816 f6i); 1817 else 1818 fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); 1819 } 1820 1821 /* Find cached rt in the hash table inside passed in rt 1822 * Caller has to hold rcu_read_lock() 1823 */ 1824 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, 1825 const struct in6_addr *daddr, 1826 const struct in6_addr *saddr) 1827 { 1828 const struct in6_addr *src_key = NULL; 1829 struct rt6_exception_bucket *bucket; 1830 struct rt6_exception *rt6_ex; 1831 struct rt6_info *ret = NULL; 1832 1833 #ifdef CONFIG_IPV6_SUBTREES 1834 /* fib6i_src.plen != 0 indicates f6i is in subtree 1835 * and exception table is indexed by a hash of 1836 * both fib6_dst and fib6_src. 1837 * However, the src addr used to create the hash 1838 * might not be exactly the passed in saddr which 1839 * is a /128 addr from the flow. 1840 * So we need to use f6i->fib6_src to redo lookup 1841 * if the passed in saddr does not find anything. 1842 * (See the logic in ip6_rt_cache_alloc() on how 1843 * rt->rt6i_src is updated.) 1844 */ 1845 if (res->f6i->fib6_src.plen) 1846 src_key = saddr; 1847 find_ex: 1848 #endif 1849 bucket = fib6_nh_get_excptn_bucket(res->nh, NULL); 1850 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 1851 1852 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 1853 ret = rt6_ex->rt6i; 1854 1855 #ifdef CONFIG_IPV6_SUBTREES 1856 /* Use fib6_src as src_key and redo lookup */ 1857 if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) { 1858 src_key = &res->f6i->fib6_src.addr; 1859 goto find_ex; 1860 } 1861 #endif 1862 1863 return ret; 1864 } 1865 1866 /* Remove the passed in cached rt from the hash table that contains it */ 1867 static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen, 1868 const struct rt6_info *rt) 1869 { 1870 const struct in6_addr *src_key = NULL; 1871 struct rt6_exception_bucket *bucket; 1872 struct rt6_exception *rt6_ex; 1873 int err; 1874 1875 if (!rcu_access_pointer(nh->rt6i_exception_bucket)) 1876 return -ENOENT; 1877 1878 spin_lock_bh(&rt6_exception_lock); 1879 bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); 1880 1881 #ifdef CONFIG_IPV6_SUBTREES 1882 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1883 * and exception table is indexed by a hash of 1884 * both rt6i_dst and rt6i_src. 1885 * Otherwise, the exception table is indexed by 1886 * a hash of only rt6i_dst. 1887 */ 1888 if (plen) 1889 src_key = &rt->rt6i_src.addr; 1890 #endif 1891 rt6_ex = __rt6_find_exception_spinlock(&bucket, 1892 &rt->rt6i_dst.addr, 1893 src_key); 1894 if (rt6_ex) { 1895 rt6_remove_exception(bucket, rt6_ex); 1896 err = 0; 1897 } else { 1898 err = -ENOENT; 1899 } 1900 1901 spin_unlock_bh(&rt6_exception_lock); 1902 return err; 1903 } 1904 1905 struct fib6_nh_excptn_arg { 1906 struct rt6_info *rt; 1907 int plen; 1908 }; 1909 1910 static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg) 1911 { 1912 struct fib6_nh_excptn_arg *arg = _arg; 1913 int err; 1914 1915 err = fib6_nh_remove_exception(nh, arg->plen, arg->rt); 1916 if (err == 0) 1917 return 1; 1918 1919 return 0; 1920 } 1921 1922 static int rt6_remove_exception_rt(struct rt6_info *rt) 1923 { 1924 struct fib6_info *from; 1925 1926 from = rcu_dereference(rt->from); 1927 if (!from || !(rt->rt6i_flags & RTF_CACHE)) 1928 return -EINVAL; 1929 1930 if (from->nh) { 1931 struct fib6_nh_excptn_arg arg = { 1932 .rt = rt, 1933 .plen = from->fib6_src.plen 1934 }; 1935 int rc; 1936 1937 /* rc = 1 means an entry was found */ 1938 rc = nexthop_for_each_fib6_nh(from->nh, 1939 rt6_nh_remove_exception_rt, 1940 &arg); 1941 return rc ? 0 : -ENOENT; 1942 } 1943 1944 return fib6_nh_remove_exception(from->fib6_nh, 1945 from->fib6_src.plen, rt); 1946 } 1947 1948 /* Find rt6_ex which contains the passed in rt cache and 1949 * refresh its stamp 1950 */ 1951 static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen, 1952 const struct rt6_info *rt) 1953 { 1954 const struct in6_addr *src_key = NULL; 1955 struct rt6_exception_bucket *bucket; 1956 struct rt6_exception *rt6_ex; 1957 1958 bucket = fib6_nh_get_excptn_bucket(nh, NULL); 1959 #ifdef CONFIG_IPV6_SUBTREES 1960 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1961 * and exception table is indexed by a hash of 1962 * both rt6i_dst and rt6i_src. 1963 * Otherwise, the exception table is indexed by 1964 * a hash of only rt6i_dst. 1965 */ 1966 if (plen) 1967 src_key = &rt->rt6i_src.addr; 1968 #endif 1969 rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); 1970 if (rt6_ex) 1971 rt6_ex->stamp = jiffies; 1972 } 1973 1974 struct fib6_nh_match_arg { 1975 const struct net_device *dev; 1976 const struct in6_addr *gw; 1977 struct fib6_nh *match; 1978 }; 1979 1980 /* determine if fib6_nh has given device and gateway */ 1981 static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg) 1982 { 1983 struct fib6_nh_match_arg *arg = _arg; 1984 1985 if (arg->dev != nh->fib_nh_dev || 1986 (arg->gw && !nh->fib_nh_gw_family) || 1987 (!arg->gw && nh->fib_nh_gw_family) || 1988 (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6))) 1989 return 0; 1990 1991 arg->match = nh; 1992 1993 /* found a match, break the loop */ 1994 return 1; 1995 } 1996 1997 static void rt6_update_exception_stamp_rt(struct rt6_info *rt) 1998 { 1999 struct fib6_info *from; 2000 struct fib6_nh *fib6_nh; 2001 2002 rcu_read_lock(); 2003 2004 from = rcu_dereference(rt->from); 2005 if (!from || !(rt->rt6i_flags & RTF_CACHE)) 2006 goto unlock; 2007 2008 if (from->nh) { 2009 struct fib6_nh_match_arg arg = { 2010 .dev = rt->dst.dev, 2011 .gw = &rt->rt6i_gateway, 2012 }; 2013 2014 nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg); 2015 2016 if (!arg.match) 2017 goto unlock; 2018 fib6_nh = arg.match; 2019 } else { 2020 fib6_nh = from->fib6_nh; 2021 } 2022 fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt); 2023 unlock: 2024 rcu_read_unlock(); 2025 } 2026 2027 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, 2028 struct rt6_info *rt, int mtu) 2029 { 2030 /* If the new MTU is lower than the route PMTU, this new MTU will be the 2031 * lowest MTU in the path: always allow updating the route PMTU to 2032 * reflect PMTU decreases. 2033 * 2034 * If the new MTU is higher, and the route PMTU is equal to the local 2035 * MTU, this means the old MTU is the lowest in the path, so allow 2036 * updating it: if other nodes now have lower MTUs, PMTU discovery will 2037 * handle this. 2038 */ 2039 2040 if (dst_mtu(&rt->dst) >= mtu) 2041 return true; 2042 2043 if (dst_mtu(&rt->dst) == idev->cnf.mtu6) 2044 return true; 2045 2046 return false; 2047 } 2048 2049 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, 2050 const struct fib6_nh *nh, int mtu) 2051 { 2052 struct rt6_exception_bucket *bucket; 2053 struct rt6_exception *rt6_ex; 2054 int i; 2055 2056 bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); 2057 if (!bucket) 2058 return; 2059 2060 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 2061 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 2062 struct rt6_info *entry = rt6_ex->rt6i; 2063 2064 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected 2065 * route), the metrics of its rt->from have already 2066 * been updated. 2067 */ 2068 if (dst_metric_raw(&entry->dst, RTAX_MTU) && 2069 rt6_mtu_change_route_allowed(idev, entry, mtu)) 2070 dst_metric_set(&entry->dst, RTAX_MTU, mtu); 2071 } 2072 bucket++; 2073 } 2074 } 2075 2076 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 2077 2078 static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh, 2079 const struct in6_addr *gateway) 2080 { 2081 struct rt6_exception_bucket *bucket; 2082 struct rt6_exception *rt6_ex; 2083 struct hlist_node *tmp; 2084 int i; 2085 2086 if (!rcu_access_pointer(nh->rt6i_exception_bucket)) 2087 return; 2088 2089 spin_lock_bh(&rt6_exception_lock); 2090 bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); 2091 if (bucket) { 2092 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 2093 hlist_for_each_entry_safe(rt6_ex, tmp, 2094 &bucket->chain, hlist) { 2095 struct rt6_info *entry = rt6_ex->rt6i; 2096 2097 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == 2098 RTF_CACHE_GATEWAY && 2099 ipv6_addr_equal(gateway, 2100 &entry->rt6i_gateway)) { 2101 rt6_remove_exception(bucket, rt6_ex); 2102 } 2103 } 2104 bucket++; 2105 } 2106 } 2107 2108 spin_unlock_bh(&rt6_exception_lock); 2109 } 2110 2111 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, 2112 struct rt6_exception *rt6_ex, 2113 struct fib6_gc_args *gc_args, 2114 unsigned long now) 2115 { 2116 struct rt6_info *rt = rt6_ex->rt6i; 2117 2118 /* we are pruning and obsoleting aged-out and non gateway exceptions 2119 * even if others have still references to them, so that on next 2120 * dst_check() such references can be dropped. 2121 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when 2122 * expired, independently from their aging, as per RFC 8201 section 4 2123 */ 2124 if (!(rt->rt6i_flags & RTF_EXPIRES)) { 2125 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { 2126 RT6_TRACE("aging clone %p\n", rt); 2127 rt6_remove_exception(bucket, rt6_ex); 2128 return; 2129 } 2130 } else if (time_after(jiffies, rt->dst.expires)) { 2131 RT6_TRACE("purging expired route %p\n", rt); 2132 rt6_remove_exception(bucket, rt6_ex); 2133 return; 2134 } 2135 2136 if (rt->rt6i_flags & RTF_GATEWAY) { 2137 struct neighbour *neigh; 2138 2139 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); 2140 2141 if (!(neigh && (neigh->flags & NTF_ROUTER))) { 2142 RT6_TRACE("purging route %p via non-router but gateway\n", 2143 rt); 2144 rt6_remove_exception(bucket, rt6_ex); 2145 return; 2146 } 2147 } 2148 2149 gc_args->more++; 2150 } 2151 2152 static void fib6_nh_age_exceptions(const struct fib6_nh *nh, 2153 struct fib6_gc_args *gc_args, 2154 unsigned long now) 2155 { 2156 struct rt6_exception_bucket *bucket; 2157 struct rt6_exception *rt6_ex; 2158 struct hlist_node *tmp; 2159 int i; 2160 2161 if (!rcu_access_pointer(nh->rt6i_exception_bucket)) 2162 return; 2163 2164 rcu_read_lock_bh(); 2165 spin_lock(&rt6_exception_lock); 2166 bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); 2167 if (bucket) { 2168 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 2169 hlist_for_each_entry_safe(rt6_ex, tmp, 2170 &bucket->chain, hlist) { 2171 rt6_age_examine_exception(bucket, rt6_ex, 2172 gc_args, now); 2173 } 2174 bucket++; 2175 } 2176 } 2177 spin_unlock(&rt6_exception_lock); 2178 rcu_read_unlock_bh(); 2179 } 2180 2181 struct fib6_nh_age_excptn_arg { 2182 struct fib6_gc_args *gc_args; 2183 unsigned long now; 2184 }; 2185 2186 static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg) 2187 { 2188 struct fib6_nh_age_excptn_arg *arg = _arg; 2189 2190 fib6_nh_age_exceptions(nh, arg->gc_args, arg->now); 2191 return 0; 2192 } 2193 2194 void rt6_age_exceptions(struct fib6_info *f6i, 2195 struct fib6_gc_args *gc_args, 2196 unsigned long now) 2197 { 2198 if (f6i->nh) { 2199 struct fib6_nh_age_excptn_arg arg = { 2200 .gc_args = gc_args, 2201 .now = now 2202 }; 2203 2204 nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, 2205 &arg); 2206 } else { 2207 fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); 2208 } 2209 } 2210 2211 /* must be called with rcu lock held */ 2212 int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, 2213 struct flowi6 *fl6, struct fib6_result *res, int strict) 2214 { 2215 struct fib6_node *fn, *saved_fn; 2216 2217 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2218 saved_fn = fn; 2219 2220 redo_rt6_select: 2221 rt6_select(net, fn, oif, res, strict); 2222 if (res->f6i == net->ipv6.fib6_null_entry) { 2223 fn = fib6_backtrack(fn, &fl6->saddr); 2224 if (fn) 2225 goto redo_rt6_select; 2226 else if (strict & RT6_LOOKUP_F_REACHABLE) { 2227 /* also consider unreachable route */ 2228 strict &= ~RT6_LOOKUP_F_REACHABLE; 2229 fn = saved_fn; 2230 goto redo_rt6_select; 2231 } 2232 } 2233 2234 trace_fib6_table_lookup(net, res, table, fl6); 2235 2236 return 0; 2237 } 2238 2239 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, 2240 int oif, struct flowi6 *fl6, 2241 const struct sk_buff *skb, int flags) 2242 { 2243 struct fib6_result res = {}; 2244 struct rt6_info *rt = NULL; 2245 int strict = 0; 2246 2247 WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && 2248 !rcu_read_lock_held()); 2249 2250 strict |= flags & RT6_LOOKUP_F_IFACE; 2251 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; 2252 if (net->ipv6.devconf_all->forwarding == 0) 2253 strict |= RT6_LOOKUP_F_REACHABLE; 2254 2255 rcu_read_lock(); 2256 2257 fib6_table_lookup(net, table, oif, fl6, &res, strict); 2258 if (res.f6i == net->ipv6.fib6_null_entry) 2259 goto out; 2260 2261 fib6_select_path(net, &res, fl6, oif, false, skb, strict); 2262 2263 /*Search through exception table */ 2264 rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); 2265 if (rt) { 2266 goto out; 2267 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && 2268 !res.nh->fib_nh_gw_family)) { 2269 /* Create a RTF_CACHE clone which will not be 2270 * owned by the fib6 tree. It is for the special case where 2271 * the daddr in the skb during the neighbor look-up is different 2272 * from the fl6->daddr used to look-up route here. 2273 */ 2274 rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); 2275 2276 if (rt) { 2277 /* 1 refcnt is taken during ip6_rt_cache_alloc(). 2278 * As rt6_uncached_list_add() does not consume refcnt, 2279 * this refcnt is always returned to the caller even 2280 * if caller sets RT6_LOOKUP_F_DST_NOREF flag. 2281 */ 2282 rt6_uncached_list_add(rt); 2283 rcu_read_unlock(); 2284 2285 return rt; 2286 } 2287 } else { 2288 /* Get a percpu copy */ 2289 local_bh_disable(); 2290 rt = rt6_get_pcpu_route(&res); 2291 2292 if (!rt) 2293 rt = rt6_make_pcpu_route(net, &res); 2294 2295 local_bh_enable(); 2296 } 2297 out: 2298 if (!rt) 2299 rt = net->ipv6.ip6_null_entry; 2300 if (!(flags & RT6_LOOKUP_F_DST_NOREF)) 2301 ip6_hold_safe(net, &rt); 2302 rcu_read_unlock(); 2303 2304 return rt; 2305 } 2306 EXPORT_SYMBOL_GPL(ip6_pol_route); 2307 2308 INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net, 2309 struct fib6_table *table, 2310 struct flowi6 *fl6, 2311 const struct sk_buff *skb, 2312 int flags) 2313 { 2314 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); 2315 } 2316 2317 struct dst_entry *ip6_route_input_lookup(struct net *net, 2318 struct net_device *dev, 2319 struct flowi6 *fl6, 2320 const struct sk_buff *skb, 2321 int flags) 2322 { 2323 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) 2324 flags |= RT6_LOOKUP_F_IFACE; 2325 2326 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); 2327 } 2328 EXPORT_SYMBOL_GPL(ip6_route_input_lookup); 2329 2330 static void ip6_multipath_l3_keys(const struct sk_buff *skb, 2331 struct flow_keys *keys, 2332 struct flow_keys *flkeys) 2333 { 2334 const struct ipv6hdr *outer_iph = ipv6_hdr(skb); 2335 const struct ipv6hdr *key_iph = outer_iph; 2336 struct flow_keys *_flkeys = flkeys; 2337 const struct ipv6hdr *inner_iph; 2338 const struct icmp6hdr *icmph; 2339 struct ipv6hdr _inner_iph; 2340 struct icmp6hdr _icmph; 2341 2342 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) 2343 goto out; 2344 2345 icmph = skb_header_pointer(skb, skb_transport_offset(skb), 2346 sizeof(_icmph), &_icmph); 2347 if (!icmph) 2348 goto out; 2349 2350 if (!icmpv6_is_err(icmph->icmp6_type)) 2351 goto out; 2352 2353 inner_iph = skb_header_pointer(skb, 2354 skb_transport_offset(skb) + sizeof(*icmph), 2355 sizeof(_inner_iph), &_inner_iph); 2356 if (!inner_iph) 2357 goto out; 2358 2359 key_iph = inner_iph; 2360 _flkeys = NULL; 2361 out: 2362 if (_flkeys) { 2363 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; 2364 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; 2365 keys->tags.flow_label = _flkeys->tags.flow_label; 2366 keys->basic.ip_proto = _flkeys->basic.ip_proto; 2367 } else { 2368 keys->addrs.v6addrs.src = key_iph->saddr; 2369 keys->addrs.v6addrs.dst = key_iph->daddr; 2370 keys->tags.flow_label = ip6_flowlabel(key_iph); 2371 keys->basic.ip_proto = key_iph->nexthdr; 2372 } 2373 } 2374 2375 static u32 rt6_multipath_custom_hash_outer(const struct net *net, 2376 const struct sk_buff *skb, 2377 bool *p_has_inner) 2378 { 2379 u32 hash_fields = ip6_multipath_hash_fields(net); 2380 struct flow_keys keys, hash_keys; 2381 2382 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) 2383 return 0; 2384 2385 memset(&hash_keys, 0, sizeof(hash_keys)); 2386 skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); 2387 2388 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2389 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) 2390 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 2391 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) 2392 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; 2393 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) 2394 hash_keys.basic.ip_proto = keys.basic.ip_proto; 2395 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) 2396 hash_keys.tags.flow_label = keys.tags.flow_label; 2397 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) 2398 hash_keys.ports.src = keys.ports.src; 2399 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) 2400 hash_keys.ports.dst = keys.ports.dst; 2401 2402 *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); 2403 return flow_hash_from_keys(&hash_keys); 2404 } 2405 2406 static u32 rt6_multipath_custom_hash_inner(const struct net *net, 2407 const struct sk_buff *skb, 2408 bool has_inner) 2409 { 2410 u32 hash_fields = ip6_multipath_hash_fields(net); 2411 struct flow_keys keys, hash_keys; 2412 2413 /* We assume the packet carries an encapsulation, but if none was 2414 * encountered during dissection of the outer flow, then there is no 2415 * point in calling the flow dissector again. 2416 */ 2417 if (!has_inner) 2418 return 0; 2419 2420 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) 2421 return 0; 2422 2423 memset(&hash_keys, 0, sizeof(hash_keys)); 2424 skb_flow_dissect_flow_keys(skb, &keys, 0); 2425 2426 if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) 2427 return 0; 2428 2429 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { 2430 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; 2431 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) 2432 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; 2433 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) 2434 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; 2435 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { 2436 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2437 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) 2438 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 2439 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) 2440 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; 2441 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) 2442 hash_keys.tags.flow_label = keys.tags.flow_label; 2443 } 2444 2445 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) 2446 hash_keys.basic.ip_proto = keys.basic.ip_proto; 2447 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) 2448 hash_keys.ports.src = keys.ports.src; 2449 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) 2450 hash_keys.ports.dst = keys.ports.dst; 2451 2452 return flow_hash_from_keys(&hash_keys); 2453 } 2454 2455 static u32 rt6_multipath_custom_hash_skb(const struct net *net, 2456 const struct sk_buff *skb) 2457 { 2458 u32 mhash, mhash_inner; 2459 bool has_inner = true; 2460 2461 mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner); 2462 mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner); 2463 2464 return jhash_2words(mhash, mhash_inner, 0); 2465 } 2466 2467 static u32 rt6_multipath_custom_hash_fl6(const struct net *net, 2468 const struct flowi6 *fl6) 2469 { 2470 u32 hash_fields = ip6_multipath_hash_fields(net); 2471 struct flow_keys hash_keys; 2472 2473 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) 2474 return 0; 2475 2476 memset(&hash_keys, 0, sizeof(hash_keys)); 2477 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2478 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) 2479 hash_keys.addrs.v6addrs.src = fl6->saddr; 2480 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) 2481 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2482 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) 2483 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2484 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) 2485 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2486 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) 2487 hash_keys.ports.src = fl6->fl6_sport; 2488 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) 2489 hash_keys.ports.dst = fl6->fl6_dport; 2490 2491 return flow_hash_from_keys(&hash_keys); 2492 } 2493 2494 /* if skb is set it will be used and fl6 can be NULL */ 2495 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 2496 const struct sk_buff *skb, struct flow_keys *flkeys) 2497 { 2498 struct flow_keys hash_keys; 2499 u32 mhash = 0; 2500 2501 switch (ip6_multipath_hash_policy(net)) { 2502 case 0: 2503 memset(&hash_keys, 0, sizeof(hash_keys)); 2504 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2505 if (skb) { 2506 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2507 } else { 2508 hash_keys.addrs.v6addrs.src = fl6->saddr; 2509 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2510 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2511 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2512 } 2513 mhash = flow_hash_from_keys(&hash_keys); 2514 break; 2515 case 1: 2516 if (skb) { 2517 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 2518 struct flow_keys keys; 2519 2520 /* short-circuit if we already have L4 hash present */ 2521 if (skb->l4_hash) 2522 return skb_get_hash_raw(skb) >> 1; 2523 2524 memset(&hash_keys, 0, sizeof(hash_keys)); 2525 2526 if (!flkeys) { 2527 skb_flow_dissect_flow_keys(skb, &keys, flag); 2528 flkeys = &keys; 2529 } 2530 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2531 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2532 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2533 hash_keys.ports.src = flkeys->ports.src; 2534 hash_keys.ports.dst = flkeys->ports.dst; 2535 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2536 } else { 2537 memset(&hash_keys, 0, sizeof(hash_keys)); 2538 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2539 hash_keys.addrs.v6addrs.src = fl6->saddr; 2540 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2541 hash_keys.ports.src = fl6->fl6_sport; 2542 hash_keys.ports.dst = fl6->fl6_dport; 2543 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2544 } 2545 mhash = flow_hash_from_keys(&hash_keys); 2546 break; 2547 case 2: 2548 memset(&hash_keys, 0, sizeof(hash_keys)); 2549 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2550 if (skb) { 2551 struct flow_keys keys; 2552 2553 if (!flkeys) { 2554 skb_flow_dissect_flow_keys(skb, &keys, 0); 2555 flkeys = &keys; 2556 } 2557 2558 /* Inner can be v4 or v6 */ 2559 if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { 2560 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; 2561 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; 2562 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; 2563 } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { 2564 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2565 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2566 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2567 hash_keys.tags.flow_label = flkeys->tags.flow_label; 2568 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2569 } else { 2570 /* Same as case 0 */ 2571 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2572 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2573 } 2574 } else { 2575 /* Same as case 0 */ 2576 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2577 hash_keys.addrs.v6addrs.src = fl6->saddr; 2578 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2579 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2580 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2581 } 2582 mhash = flow_hash_from_keys(&hash_keys); 2583 break; 2584 case 3: 2585 if (skb) 2586 mhash = rt6_multipath_custom_hash_skb(net, skb); 2587 else 2588 mhash = rt6_multipath_custom_hash_fl6(net, fl6); 2589 break; 2590 } 2591 2592 return mhash >> 1; 2593 } 2594 2595 /* Called with rcu held */ 2596 void ip6_route_input(struct sk_buff *skb) 2597 { 2598 const struct ipv6hdr *iph = ipv6_hdr(skb); 2599 struct net *net = dev_net(skb->dev); 2600 int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; 2601 struct ip_tunnel_info *tun_info; 2602 struct flowi6 fl6 = { 2603 .flowi6_iif = skb->dev->ifindex, 2604 .daddr = iph->daddr, 2605 .saddr = iph->saddr, 2606 .flowlabel = ip6_flowinfo(iph), 2607 .flowi6_mark = skb->mark, 2608 .flowi6_proto = iph->nexthdr, 2609 }; 2610 struct flow_keys *flkeys = NULL, _flkeys; 2611 2612 tun_info = skb_tunnel_info(skb); 2613 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2614 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2615 2616 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2617 flkeys = &_flkeys; 2618 2619 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2620 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2621 skb_dst_drop(skb); 2622 skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, 2623 &fl6, skb, flags)); 2624 } 2625 2626 INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, 2627 struct fib6_table *table, 2628 struct flowi6 *fl6, 2629 const struct sk_buff *skb, 2630 int flags) 2631 { 2632 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2633 } 2634 2635 static struct dst_entry *ip6_route_output_flags_noref(struct net *net, 2636 const struct sock *sk, 2637 struct flowi6 *fl6, 2638 int flags) 2639 { 2640 bool any_src; 2641 2642 if (ipv6_addr_type(&fl6->daddr) & 2643 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { 2644 struct dst_entry *dst; 2645 2646 /* This function does not take refcnt on the dst */ 2647 dst = l3mdev_link_scope_lookup(net, fl6); 2648 if (dst) 2649 return dst; 2650 } 2651 2652 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2653 2654 flags |= RT6_LOOKUP_F_DST_NOREF; 2655 any_src = ipv6_addr_any(&fl6->saddr); 2656 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2657 (fl6->flowi6_oif && any_src)) 2658 flags |= RT6_LOOKUP_F_IFACE; 2659 2660 if (!any_src) 2661 flags |= RT6_LOOKUP_F_HAS_SADDR; 2662 else if (sk) 2663 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2664 2665 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2666 } 2667 2668 struct dst_entry *ip6_route_output_flags(struct net *net, 2669 const struct sock *sk, 2670 struct flowi6 *fl6, 2671 int flags) 2672 { 2673 struct dst_entry *dst; 2674 struct rt6_info *rt6; 2675 2676 rcu_read_lock(); 2677 dst = ip6_route_output_flags_noref(net, sk, fl6, flags); 2678 rt6 = dst_rt6_info(dst); 2679 /* For dst cached in uncached_list, refcnt is already taken. */ 2680 if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) { 2681 dst = &net->ipv6.ip6_null_entry->dst; 2682 dst_hold(dst); 2683 } 2684 rcu_read_unlock(); 2685 2686 return dst; 2687 } 2688 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2689 2690 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2691 { 2692 struct rt6_info *rt, *ort = dst_rt6_info(dst_orig); 2693 struct net_device *loopback_dev = net->loopback_dev; 2694 struct dst_entry *new = NULL; 2695 2696 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2697 DST_OBSOLETE_DEAD, 0); 2698 if (rt) { 2699 rt6_info_init(rt); 2700 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2701 2702 new = &rt->dst; 2703 new->__use = 1; 2704 new->input = dst_discard; 2705 new->output = dst_discard_out; 2706 2707 dst_copy_metrics(new, &ort->dst); 2708 2709 rt->rt6i_idev = in6_dev_get(loopback_dev); 2710 rt->rt6i_gateway = ort->rt6i_gateway; 2711 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2712 2713 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2714 #ifdef CONFIG_IPV6_SUBTREES 2715 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2716 #endif 2717 } 2718 2719 dst_release(dst_orig); 2720 return new ? new : ERR_PTR(-ENOMEM); 2721 } 2722 2723 /* 2724 * Destination cache support functions 2725 */ 2726 2727 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2728 { 2729 u32 rt_cookie = 0; 2730 2731 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2732 return false; 2733 2734 if (fib6_check_expired(f6i)) 2735 return false; 2736 2737 return true; 2738 } 2739 2740 static struct dst_entry *rt6_check(struct rt6_info *rt, 2741 struct fib6_info *from, 2742 u32 cookie) 2743 { 2744 u32 rt_cookie = 0; 2745 2746 if (!from || !fib6_get_cookie_safe(from, &rt_cookie) || 2747 rt_cookie != cookie) 2748 return NULL; 2749 2750 if (rt6_check_expired(rt)) 2751 return NULL; 2752 2753 return &rt->dst; 2754 } 2755 2756 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2757 struct fib6_info *from, 2758 u32 cookie) 2759 { 2760 if (!__rt6_check_expired(rt) && 2761 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2762 fib6_check(from, cookie)) 2763 return &rt->dst; 2764 else 2765 return NULL; 2766 } 2767 2768 INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, 2769 u32 cookie) 2770 { 2771 struct dst_entry *dst_ret; 2772 struct fib6_info *from; 2773 struct rt6_info *rt; 2774 2775 rt = dst_rt6_info(dst); 2776 2777 if (rt->sernum) 2778 return rt6_is_valid(rt) ? dst : NULL; 2779 2780 rcu_read_lock(); 2781 2782 /* All IPV6 dsts are created with ->obsolete set to the value 2783 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2784 * into this function always. 2785 */ 2786 2787 from = rcu_dereference(rt->from); 2788 2789 if (from && (rt->rt6i_flags & RTF_PCPU || 2790 unlikely(!list_empty(&rt->dst.rt_uncached)))) 2791 dst_ret = rt6_dst_from_check(rt, from, cookie); 2792 else 2793 dst_ret = rt6_check(rt, from, cookie); 2794 2795 rcu_read_unlock(); 2796 2797 return dst_ret; 2798 } 2799 EXPORT_INDIRECT_CALLABLE(ip6_dst_check); 2800 2801 static void ip6_negative_advice(struct sock *sk, 2802 struct dst_entry *dst) 2803 { 2804 struct rt6_info *rt = dst_rt6_info(dst); 2805 2806 if (rt->rt6i_flags & RTF_CACHE) { 2807 rcu_read_lock(); 2808 if (rt6_check_expired(rt)) { 2809 /* rt/dst can not be destroyed yet, 2810 * because of rcu_read_lock() 2811 */ 2812 sk_dst_reset(sk); 2813 rt6_remove_exception_rt(rt); 2814 } 2815 rcu_read_unlock(); 2816 return; 2817 } 2818 sk_dst_reset(sk); 2819 } 2820 2821 static void ip6_link_failure(struct sk_buff *skb) 2822 { 2823 struct rt6_info *rt; 2824 2825 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2826 2827 rt = dst_rt6_info(skb_dst(skb)); 2828 if (rt) { 2829 rcu_read_lock(); 2830 if (rt->rt6i_flags & RTF_CACHE) { 2831 rt6_remove_exception_rt(rt); 2832 } else { 2833 struct fib6_info *from; 2834 struct fib6_node *fn; 2835 2836 from = rcu_dereference(rt->from); 2837 if (from) { 2838 fn = rcu_dereference(from->fib6_node); 2839 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2840 WRITE_ONCE(fn->fn_sernum, -1); 2841 } 2842 } 2843 rcu_read_unlock(); 2844 } 2845 } 2846 2847 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2848 { 2849 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2850 struct fib6_info *from; 2851 2852 rcu_read_lock(); 2853 from = rcu_dereference(rt0->from); 2854 if (from) 2855 rt0->dst.expires = from->expires; 2856 rcu_read_unlock(); 2857 } 2858 2859 dst_set_expires(&rt0->dst, timeout); 2860 rt0->rt6i_flags |= RTF_EXPIRES; 2861 } 2862 2863 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2864 { 2865 struct net *net = dev_net(rt->dst.dev); 2866 2867 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2868 rt->rt6i_flags |= RTF_MODIFIED; 2869 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2870 } 2871 2872 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2873 { 2874 return !(rt->rt6i_flags & RTF_CACHE) && 2875 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2876 } 2877 2878 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2879 const struct ipv6hdr *iph, u32 mtu, 2880 bool confirm_neigh) 2881 { 2882 const struct in6_addr *daddr, *saddr; 2883 struct rt6_info *rt6 = dst_rt6_info(dst); 2884 2885 /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU) 2886 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it. 2887 * [see also comment in rt6_mtu_change_route()] 2888 */ 2889 2890 if (iph) { 2891 daddr = &iph->daddr; 2892 saddr = &iph->saddr; 2893 } else if (sk) { 2894 daddr = &sk->sk_v6_daddr; 2895 saddr = &inet6_sk(sk)->saddr; 2896 } else { 2897 daddr = NULL; 2898 saddr = NULL; 2899 } 2900 2901 if (confirm_neigh) 2902 dst_confirm_neigh(dst, daddr); 2903 2904 if (mtu < IPV6_MIN_MTU) 2905 return; 2906 if (mtu >= dst_mtu(dst)) 2907 return; 2908 2909 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2910 rt6_do_update_pmtu(rt6, mtu); 2911 /* update rt6_ex->stamp for cache */ 2912 if (rt6->rt6i_flags & RTF_CACHE) 2913 rt6_update_exception_stamp_rt(rt6); 2914 } else if (daddr) { 2915 struct fib6_result res = {}; 2916 struct rt6_info *nrt6; 2917 2918 rcu_read_lock(); 2919 res.f6i = rcu_dereference(rt6->from); 2920 if (!res.f6i) 2921 goto out_unlock; 2922 2923 res.fib6_flags = res.f6i->fib6_flags; 2924 res.fib6_type = res.f6i->fib6_type; 2925 2926 if (res.f6i->nh) { 2927 struct fib6_nh_match_arg arg = { 2928 .dev = dst->dev, 2929 .gw = &rt6->rt6i_gateway, 2930 }; 2931 2932 nexthop_for_each_fib6_nh(res.f6i->nh, 2933 fib6_nh_find_match, &arg); 2934 2935 /* fib6_info uses a nexthop that does not have fib6_nh 2936 * using the dst->dev + gw. Should be impossible. 2937 */ 2938 if (!arg.match) 2939 goto out_unlock; 2940 2941 res.nh = arg.match; 2942 } else { 2943 res.nh = res.f6i->fib6_nh; 2944 } 2945 2946 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); 2947 if (nrt6) { 2948 rt6_do_update_pmtu(nrt6, mtu); 2949 if (rt6_insert_exception(nrt6, &res)) 2950 dst_release_immediate(&nrt6->dst); 2951 } 2952 out_unlock: 2953 rcu_read_unlock(); 2954 } 2955 } 2956 2957 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2958 struct sk_buff *skb, u32 mtu, 2959 bool confirm_neigh) 2960 { 2961 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu, 2962 confirm_neigh); 2963 } 2964 2965 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2966 int oif, u32 mark, kuid_t uid) 2967 { 2968 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2969 struct dst_entry *dst; 2970 struct flowi6 fl6 = { 2971 .flowi6_oif = oif, 2972 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2973 .daddr = iph->daddr, 2974 .saddr = iph->saddr, 2975 .flowlabel = ip6_flowinfo(iph), 2976 .flowi6_uid = uid, 2977 }; 2978 2979 dst = ip6_route_output(net, NULL, &fl6); 2980 if (!dst->error) 2981 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true); 2982 dst_release(dst); 2983 } 2984 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2985 2986 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2987 { 2988 int oif = sk->sk_bound_dev_if; 2989 struct dst_entry *dst; 2990 2991 if (!oif && skb->dev) 2992 oif = l3mdev_master_ifindex(skb->dev); 2993 2994 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark), 2995 sk->sk_uid); 2996 2997 dst = __sk_dst_get(sk); 2998 if (!dst || !dst->obsolete || 2999 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 3000 return; 3001 3002 bh_lock_sock(sk); 3003 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 3004 ip6_datagram_dst_update(sk, false); 3005 bh_unlock_sock(sk); 3006 } 3007 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 3008 3009 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 3010 const struct flowi6 *fl6) 3011 { 3012 #ifdef CONFIG_IPV6_SUBTREES 3013 struct ipv6_pinfo *np = inet6_sk(sk); 3014 #endif 3015 3016 ip6_dst_store(sk, dst, 3017 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 3018 &sk->sk_v6_daddr : NULL, 3019 #ifdef CONFIG_IPV6_SUBTREES 3020 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 3021 &np->saddr : 3022 #endif 3023 NULL); 3024 } 3025 3026 static bool ip6_redirect_nh_match(const struct fib6_result *res, 3027 struct flowi6 *fl6, 3028 const struct in6_addr *gw, 3029 struct rt6_info **ret) 3030 { 3031 const struct fib6_nh *nh = res->nh; 3032 3033 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || 3034 fl6->flowi6_oif != nh->fib_nh_dev->ifindex) 3035 return false; 3036 3037 /* rt_cache's gateway might be different from its 'parent' 3038 * in the case of an ip redirect. 3039 * So we keep searching in the exception table if the gateway 3040 * is different. 3041 */ 3042 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { 3043 struct rt6_info *rt_cache; 3044 3045 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); 3046 if (rt_cache && 3047 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { 3048 *ret = rt_cache; 3049 return true; 3050 } 3051 return false; 3052 } 3053 return true; 3054 } 3055 3056 struct fib6_nh_rd_arg { 3057 struct fib6_result *res; 3058 struct flowi6 *fl6; 3059 const struct in6_addr *gw; 3060 struct rt6_info **ret; 3061 }; 3062 3063 static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) 3064 { 3065 struct fib6_nh_rd_arg *arg = _arg; 3066 3067 arg->res->nh = nh; 3068 return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); 3069 } 3070 3071 /* Handle redirects */ 3072 struct ip6rd_flowi { 3073 struct flowi6 fl6; 3074 struct in6_addr gateway; 3075 }; 3076 3077 INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net, 3078 struct fib6_table *table, 3079 struct flowi6 *fl6, 3080 const struct sk_buff *skb, 3081 int flags) 3082 { 3083 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 3084 struct rt6_info *ret = NULL; 3085 struct fib6_result res = {}; 3086 struct fib6_nh_rd_arg arg = { 3087 .res = &res, 3088 .fl6 = fl6, 3089 .gw = &rdfl->gateway, 3090 .ret = &ret 3091 }; 3092 struct fib6_info *rt; 3093 struct fib6_node *fn; 3094 3095 /* Get the "current" route for this destination and 3096 * check if the redirect has come from appropriate router. 3097 * 3098 * RFC 4861 specifies that redirects should only be 3099 * accepted if they come from the nexthop to the target. 3100 * Due to the way the routes are chosen, this notion 3101 * is a bit fuzzy and one might need to check all possible 3102 * routes. 3103 */ 3104 3105 rcu_read_lock(); 3106 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 3107 restart: 3108 for_each_fib6_node_rt_rcu(fn) { 3109 res.f6i = rt; 3110 if (fib6_check_expired(rt)) 3111 continue; 3112 if (rt->fib6_flags & RTF_REJECT) 3113 break; 3114 if (unlikely(rt->nh)) { 3115 if (nexthop_is_blackhole(rt->nh)) 3116 continue; 3117 /* on match, res->nh is filled in and potentially ret */ 3118 if (nexthop_for_each_fib6_nh(rt->nh, 3119 fib6_nh_redirect_match, 3120 &arg)) 3121 goto out; 3122 } else { 3123 res.nh = rt->fib6_nh; 3124 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, 3125 &ret)) 3126 goto out; 3127 } 3128 } 3129 3130 if (!rt) 3131 rt = net->ipv6.fib6_null_entry; 3132 else if (rt->fib6_flags & RTF_REJECT) { 3133 ret = net->ipv6.ip6_null_entry; 3134 goto out; 3135 } 3136 3137 if (rt == net->ipv6.fib6_null_entry) { 3138 fn = fib6_backtrack(fn, &fl6->saddr); 3139 if (fn) 3140 goto restart; 3141 } 3142 3143 res.f6i = rt; 3144 res.nh = rt->fib6_nh; 3145 out: 3146 if (ret) { 3147 ip6_hold_safe(net, &ret); 3148 } else { 3149 res.fib6_flags = res.f6i->fib6_flags; 3150 res.fib6_type = res.f6i->fib6_type; 3151 ret = ip6_create_rt_rcu(&res); 3152 } 3153 3154 rcu_read_unlock(); 3155 3156 trace_fib6_table_lookup(net, &res, table, fl6); 3157 return ret; 3158 }; 3159 3160 static struct dst_entry *ip6_route_redirect(struct net *net, 3161 const struct flowi6 *fl6, 3162 const struct sk_buff *skb, 3163 const struct in6_addr *gateway) 3164 { 3165 int flags = RT6_LOOKUP_F_HAS_SADDR; 3166 struct ip6rd_flowi rdfl; 3167 3168 rdfl.fl6 = *fl6; 3169 rdfl.gateway = *gateway; 3170 3171 return fib6_rule_lookup(net, &rdfl.fl6, skb, 3172 flags, __ip6_route_redirect); 3173 } 3174 3175 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 3176 kuid_t uid) 3177 { 3178 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 3179 struct dst_entry *dst; 3180 struct flowi6 fl6 = { 3181 .flowi6_iif = LOOPBACK_IFINDEX, 3182 .flowi6_oif = oif, 3183 .flowi6_mark = mark, 3184 .daddr = iph->daddr, 3185 .saddr = iph->saddr, 3186 .flowlabel = ip6_flowinfo(iph), 3187 .flowi6_uid = uid, 3188 }; 3189 3190 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 3191 rt6_do_redirect(dst, NULL, skb); 3192 dst_release(dst); 3193 } 3194 EXPORT_SYMBOL_GPL(ip6_redirect); 3195 3196 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 3197 { 3198 const struct ipv6hdr *iph = ipv6_hdr(skb); 3199 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 3200 struct dst_entry *dst; 3201 struct flowi6 fl6 = { 3202 .flowi6_iif = LOOPBACK_IFINDEX, 3203 .flowi6_oif = oif, 3204 .daddr = msg->dest, 3205 .saddr = iph->daddr, 3206 .flowi6_uid = sock_net_uid(net, NULL), 3207 }; 3208 3209 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 3210 rt6_do_redirect(dst, NULL, skb); 3211 dst_release(dst); 3212 } 3213 3214 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 3215 { 3216 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, 3217 READ_ONCE(sk->sk_mark), sk->sk_uid); 3218 } 3219 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 3220 3221 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 3222 { 3223 struct net_device *dev = dst->dev; 3224 unsigned int mtu = dst_mtu(dst); 3225 struct net *net; 3226 3227 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 3228 3229 rcu_read_lock(); 3230 3231 net = dev_net_rcu(dev); 3232 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 3233 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 3234 3235 rcu_read_unlock(); 3236 3237 /* 3238 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 3239 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 3240 * IPV6_MAXPLEN is also valid and means: "any MSS, 3241 * rely only on pmtu discovery" 3242 */ 3243 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 3244 mtu = IPV6_MAXPLEN; 3245 return mtu; 3246 } 3247 3248 INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) 3249 { 3250 return ip6_dst_mtu_maybe_forward(dst, false); 3251 } 3252 EXPORT_INDIRECT_CALLABLE(ip6_mtu); 3253 3254 /* MTU selection: 3255 * 1. mtu on route is locked - use it 3256 * 2. mtu from nexthop exception 3257 * 3. mtu from egress device 3258 * 3259 * based on ip6_dst_mtu_forward and exception logic of 3260 * rt6_find_cached_rt; called with rcu_read_lock 3261 */ 3262 u32 ip6_mtu_from_fib6(const struct fib6_result *res, 3263 const struct in6_addr *daddr, 3264 const struct in6_addr *saddr) 3265 { 3266 const struct fib6_nh *nh = res->nh; 3267 struct fib6_info *f6i = res->f6i; 3268 struct inet6_dev *idev; 3269 struct rt6_info *rt; 3270 u32 mtu = 0; 3271 3272 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 3273 mtu = f6i->fib6_pmtu; 3274 if (mtu) 3275 goto out; 3276 } 3277 3278 rt = rt6_find_cached_rt(res, daddr, saddr); 3279 if (unlikely(rt)) { 3280 mtu = dst_metric_raw(&rt->dst, RTAX_MTU); 3281 } else { 3282 struct net_device *dev = nh->fib_nh_dev; 3283 3284 mtu = IPV6_MIN_MTU; 3285 idev = __in6_dev_get(dev); 3286 if (idev && idev->cnf.mtu6 > mtu) 3287 mtu = idev->cnf.mtu6; 3288 } 3289 3290 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 3291 out: 3292 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); 3293 } 3294 3295 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 3296 struct flowi6 *fl6) 3297 { 3298 struct dst_entry *dst; 3299 struct rt6_info *rt; 3300 struct inet6_dev *idev = in6_dev_get(dev); 3301 struct net *net = dev_net(dev); 3302 3303 if (unlikely(!idev)) 3304 return ERR_PTR(-ENODEV); 3305 3306 rt = ip6_dst_alloc(net, dev, 0); 3307 if (unlikely(!rt)) { 3308 in6_dev_put(idev); 3309 dst = ERR_PTR(-ENOMEM); 3310 goto out; 3311 } 3312 3313 rt->dst.input = ip6_input; 3314 rt->dst.output = ip6_output; 3315 rt->rt6i_gateway = fl6->daddr; 3316 rt->rt6i_dst.addr = fl6->daddr; 3317 rt->rt6i_dst.plen = 128; 3318 rt->rt6i_idev = idev; 3319 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 3320 3321 /* Add this dst into uncached_list so that rt6_disable_ip() can 3322 * do proper release of the net_device 3323 */ 3324 rt6_uncached_list_add(rt); 3325 3326 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 3327 3328 out: 3329 return dst; 3330 } 3331 3332 static void ip6_dst_gc(struct dst_ops *ops) 3333 { 3334 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 3335 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 3336 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 3337 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 3338 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 3339 unsigned int val; 3340 int entries; 3341 3342 if (time_after(rt_last_gc + rt_min_interval, jiffies)) 3343 goto out; 3344 3345 fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); 3346 entries = dst_entries_get_slow(ops); 3347 if (entries < ops->gc_thresh) 3348 atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); 3349 out: 3350 val = atomic_read(&net->ipv6.ip6_rt_gc_expire); 3351 atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); 3352 } 3353 3354 static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, 3355 const struct in6_addr *gw_addr, u32 tbid, 3356 int flags, struct fib6_result *res) 3357 { 3358 struct flowi6 fl6 = { 3359 .flowi6_oif = cfg->fc_ifindex, 3360 .daddr = *gw_addr, 3361 .saddr = cfg->fc_prefsrc, 3362 }; 3363 struct fib6_table *table; 3364 int err; 3365 3366 table = fib6_get_table(net, tbid); 3367 if (!table) 3368 return -EINVAL; 3369 3370 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 3371 flags |= RT6_LOOKUP_F_HAS_SADDR; 3372 3373 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 3374 3375 err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); 3376 if (!err && res->f6i != net->ipv6.fib6_null_entry) 3377 fib6_select_path(net, res, &fl6, cfg->fc_ifindex, 3378 cfg->fc_ifindex != 0, NULL, flags); 3379 3380 return err; 3381 } 3382 3383 static int ip6_route_check_nh_onlink(struct net *net, 3384 struct fib6_config *cfg, 3385 const struct net_device *dev, 3386 struct netlink_ext_ack *extack) 3387 { 3388 u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; 3389 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3390 struct fib6_result res = {}; 3391 int err; 3392 3393 err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); 3394 if (!err && !(res.fib6_flags & RTF_REJECT) && 3395 /* ignore match if it is the default route */ 3396 !ipv6_addr_any(&res.f6i->fib6_dst.addr) && 3397 (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { 3398 NL_SET_ERR_MSG(extack, 3399 "Nexthop has invalid gateway or device mismatch"); 3400 err = -EINVAL; 3401 } 3402 3403 return err; 3404 } 3405 3406 static int ip6_route_check_nh(struct net *net, 3407 struct fib6_config *cfg, 3408 struct net_device **_dev, 3409 netdevice_tracker *dev_tracker, 3410 struct inet6_dev **idev) 3411 { 3412 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3413 struct net_device *dev = _dev ? *_dev : NULL; 3414 int flags = RT6_LOOKUP_F_IFACE; 3415 struct fib6_result res = {}; 3416 int err = -EHOSTUNREACH; 3417 3418 if (cfg->fc_table) { 3419 err = ip6_nh_lookup_table(net, cfg, gw_addr, 3420 cfg->fc_table, flags, &res); 3421 /* gw_addr can not require a gateway or resolve to a reject 3422 * route. If a device is given, it must match the result. 3423 */ 3424 if (err || res.fib6_flags & RTF_REJECT || 3425 res.nh->fib_nh_gw_family || 3426 (dev && dev != res.nh->fib_nh_dev)) 3427 err = -EHOSTUNREACH; 3428 } 3429 3430 if (err < 0) { 3431 struct flowi6 fl6 = { 3432 .flowi6_oif = cfg->fc_ifindex, 3433 .daddr = *gw_addr, 3434 }; 3435 3436 err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); 3437 if (err || res.fib6_flags & RTF_REJECT || 3438 res.nh->fib_nh_gw_family) 3439 err = -EHOSTUNREACH; 3440 3441 if (err) 3442 return err; 3443 3444 fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, 3445 cfg->fc_ifindex != 0, NULL, flags); 3446 } 3447 3448 err = 0; 3449 if (dev) { 3450 if (dev != res.nh->fib_nh_dev) 3451 err = -EHOSTUNREACH; 3452 } else { 3453 *_dev = dev = res.nh->fib_nh_dev; 3454 netdev_hold(dev, dev_tracker, GFP_ATOMIC); 3455 *idev = in6_dev_get(dev); 3456 } 3457 3458 return err; 3459 } 3460 3461 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 3462 struct net_device **_dev, 3463 netdevice_tracker *dev_tracker, 3464 struct inet6_dev **idev, 3465 struct netlink_ext_ack *extack) 3466 { 3467 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3468 int gwa_type = ipv6_addr_type(gw_addr); 3469 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 3470 const struct net_device *dev = *_dev; 3471 bool need_addr_check = !dev; 3472 int err = -EINVAL; 3473 3474 /* if gw_addr is local we will fail to detect this in case 3475 * address is still TENTATIVE (DAD in progress). rt6_lookup() 3476 * will return already-added prefix route via interface that 3477 * prefix route was assigned to, which might be non-loopback. 3478 */ 3479 if (dev && 3480 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 3481 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 3482 goto out; 3483 } 3484 3485 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 3486 /* IPv6 strictly inhibits using not link-local 3487 * addresses as nexthop address. 3488 * Otherwise, router will not able to send redirects. 3489 * It is very good, but in some (rare!) circumstances 3490 * (SIT, PtP, NBMA NOARP links) it is handy to allow 3491 * some exceptions. --ANK 3492 * We allow IPv4-mapped nexthops to support RFC4798-type 3493 * addressing 3494 */ 3495 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 3496 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 3497 goto out; 3498 } 3499 3500 rcu_read_lock(); 3501 3502 if (cfg->fc_flags & RTNH_F_ONLINK) 3503 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 3504 else 3505 err = ip6_route_check_nh(net, cfg, _dev, dev_tracker, 3506 idev); 3507 3508 rcu_read_unlock(); 3509 3510 if (err) 3511 goto out; 3512 } 3513 3514 /* reload in case device was changed */ 3515 dev = *_dev; 3516 3517 err = -EINVAL; 3518 if (!dev) { 3519 NL_SET_ERR_MSG(extack, "Egress device not specified"); 3520 goto out; 3521 } else if (dev->flags & IFF_LOOPBACK) { 3522 NL_SET_ERR_MSG(extack, 3523 "Egress device can not be loopback device for this route"); 3524 goto out; 3525 } 3526 3527 /* if we did not check gw_addr above, do so now that the 3528 * egress device has been resolved. 3529 */ 3530 if (need_addr_check && 3531 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 3532 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 3533 goto out; 3534 } 3535 3536 err = 0; 3537 out: 3538 return err; 3539 } 3540 3541 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) 3542 { 3543 if ((flags & RTF_REJECT) || 3544 (dev && (dev->flags & IFF_LOOPBACK) && 3545 !(addr_type & IPV6_ADDR_LOOPBACK) && 3546 !(flags & (RTF_ANYCAST | RTF_LOCAL)))) 3547 return true; 3548 3549 return false; 3550 } 3551 3552 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, 3553 struct fib6_config *cfg, gfp_t gfp_flags, 3554 struct netlink_ext_ack *extack) 3555 { 3556 netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker; 3557 struct net_device *dev = NULL; 3558 struct inet6_dev *idev = NULL; 3559 int addr_type; 3560 int err; 3561 3562 fib6_nh->fib_nh_family = AF_INET6; 3563 #ifdef CONFIG_IPV6_ROUTER_PREF 3564 fib6_nh->last_probe = jiffies; 3565 #endif 3566 if (cfg->fc_is_fdb) { 3567 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3568 fib6_nh->fib_nh_gw_family = AF_INET6; 3569 return 0; 3570 } 3571 3572 err = -ENODEV; 3573 if (cfg->fc_ifindex) { 3574 dev = netdev_get_by_index(net, cfg->fc_ifindex, 3575 dev_tracker, gfp_flags); 3576 if (!dev) 3577 goto out; 3578 idev = in6_dev_get(dev); 3579 if (!idev) 3580 goto out; 3581 } 3582 3583 if (cfg->fc_flags & RTNH_F_ONLINK) { 3584 if (!dev) { 3585 NL_SET_ERR_MSG(extack, 3586 "Nexthop device required for onlink"); 3587 goto out; 3588 } 3589 3590 if (!(dev->flags & IFF_UP)) { 3591 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3592 err = -ENETDOWN; 3593 goto out; 3594 } 3595 3596 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; 3597 } 3598 3599 fib6_nh->fib_nh_weight = 1; 3600 3601 /* We cannot add true routes via loopback here, 3602 * they would result in kernel looping; promote them to reject routes 3603 */ 3604 addr_type = ipv6_addr_type(&cfg->fc_dst); 3605 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { 3606 /* hold loopback dev/idev if we haven't done so. */ 3607 if (dev != net->loopback_dev) { 3608 if (dev) { 3609 netdev_put(dev, dev_tracker); 3610 in6_dev_put(idev); 3611 } 3612 dev = net->loopback_dev; 3613 netdev_hold(dev, dev_tracker, gfp_flags); 3614 idev = in6_dev_get(dev); 3615 if (!idev) { 3616 err = -ENODEV; 3617 goto out; 3618 } 3619 } 3620 goto pcpu_alloc; 3621 } 3622 3623 if (cfg->fc_flags & RTF_GATEWAY) { 3624 err = ip6_validate_gw(net, cfg, &dev, dev_tracker, 3625 &idev, extack); 3626 if (err) 3627 goto out; 3628 3629 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3630 fib6_nh->fib_nh_gw_family = AF_INET6; 3631 } 3632 3633 err = -ENODEV; 3634 if (!dev) 3635 goto out; 3636 3637 if (!idev || idev->cnf.disable_ipv6) { 3638 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3639 err = -EACCES; 3640 goto out; 3641 } 3642 3643 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { 3644 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3645 err = -ENETDOWN; 3646 goto out; 3647 } 3648 3649 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3650 !netif_carrier_ok(dev)) 3651 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 3652 3653 err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap, 3654 cfg->fc_encap_type, cfg, gfp_flags, extack); 3655 if (err) 3656 goto out; 3657 3658 pcpu_alloc: 3659 fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); 3660 if (!fib6_nh->rt6i_pcpu) { 3661 err = -ENOMEM; 3662 goto out; 3663 } 3664 3665 fib6_nh->fib_nh_dev = dev; 3666 fib6_nh->fib_nh_oif = dev->ifindex; 3667 err = 0; 3668 out: 3669 if (idev) 3670 in6_dev_put(idev); 3671 3672 if (err) { 3673 fib_nh_common_release(&fib6_nh->nh_common); 3674 fib6_nh->nh_common.nhc_pcpu_rth_output = NULL; 3675 fib6_nh->fib_nh_lws = NULL; 3676 netdev_put(dev, dev_tracker); 3677 } 3678 3679 return err; 3680 } 3681 3682 void fib6_nh_release(struct fib6_nh *fib6_nh) 3683 { 3684 struct rt6_exception_bucket *bucket; 3685 3686 rcu_read_lock(); 3687 3688 fib6_nh_flush_exceptions(fib6_nh, NULL); 3689 bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); 3690 if (bucket) { 3691 rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); 3692 kfree(bucket); 3693 } 3694 3695 rcu_read_unlock(); 3696 3697 fib6_nh_release_dsts(fib6_nh); 3698 free_percpu(fib6_nh->rt6i_pcpu); 3699 3700 fib_nh_common_release(&fib6_nh->nh_common); 3701 } 3702 3703 void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) 3704 { 3705 int cpu; 3706 3707 if (!fib6_nh->rt6i_pcpu) 3708 return; 3709 3710 for_each_possible_cpu(cpu) { 3711 struct rt6_info *pcpu_rt, **ppcpu_rt; 3712 3713 ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); 3714 pcpu_rt = xchg(ppcpu_rt, NULL); 3715 if (pcpu_rt) { 3716 dst_dev_put(&pcpu_rt->dst); 3717 dst_release(&pcpu_rt->dst); 3718 } 3719 } 3720 } 3721 3722 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3723 gfp_t gfp_flags, 3724 struct netlink_ext_ack *extack) 3725 { 3726 struct net *net = cfg->fc_nlinfo.nl_net; 3727 struct fib6_info *rt = NULL; 3728 struct nexthop *nh = NULL; 3729 struct fib6_table *table; 3730 struct fib6_nh *fib6_nh; 3731 int err = -EINVAL; 3732 int addr_type; 3733 3734 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3735 if (cfg->fc_flags & RTF_PCPU) { 3736 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3737 goto out; 3738 } 3739 3740 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3741 if (cfg->fc_flags & RTF_CACHE) { 3742 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3743 goto out; 3744 } 3745 3746 if (cfg->fc_type > RTN_MAX) { 3747 NL_SET_ERR_MSG(extack, "Invalid route type"); 3748 goto out; 3749 } 3750 3751 if (cfg->fc_dst_len > 128) { 3752 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3753 goto out; 3754 } 3755 if (cfg->fc_src_len > 128) { 3756 NL_SET_ERR_MSG(extack, "Invalid source address length"); 3757 goto out; 3758 } 3759 #ifndef CONFIG_IPV6_SUBTREES 3760 if (cfg->fc_src_len) { 3761 NL_SET_ERR_MSG(extack, 3762 "Specifying source address requires IPV6_SUBTREES to be enabled"); 3763 goto out; 3764 } 3765 #endif 3766 if (cfg->fc_nh_id) { 3767 nh = nexthop_find_by_id(net, cfg->fc_nh_id); 3768 if (!nh) { 3769 NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); 3770 goto out; 3771 } 3772 err = fib6_check_nexthop(nh, cfg, extack); 3773 if (err) 3774 goto out; 3775 } 3776 3777 err = -ENOBUFS; 3778 if (cfg->fc_nlinfo.nlh && 3779 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3780 table = fib6_get_table(net, cfg->fc_table); 3781 if (!table) { 3782 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3783 table = fib6_new_table(net, cfg->fc_table); 3784 } 3785 } else { 3786 table = fib6_new_table(net, cfg->fc_table); 3787 } 3788 3789 if (!table) 3790 goto out; 3791 3792 err = -ENOMEM; 3793 rt = fib6_info_alloc(gfp_flags, !nh); 3794 if (!rt) 3795 goto out; 3796 3797 rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, 3798 extack); 3799 if (IS_ERR(rt->fib6_metrics)) { 3800 err = PTR_ERR(rt->fib6_metrics); 3801 /* Do not leave garbage there. */ 3802 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3803 goto out_free; 3804 } 3805 3806 if (cfg->fc_flags & RTF_ADDRCONF) 3807 rt->dst_nocount = true; 3808 3809 if (cfg->fc_flags & RTF_EXPIRES) 3810 fib6_set_expires(rt, jiffies + 3811 clock_t_to_jiffies(cfg->fc_expires)); 3812 else 3813 fib6_clean_expires(rt); 3814 3815 if (cfg->fc_protocol == RTPROT_UNSPEC) 3816 cfg->fc_protocol = RTPROT_BOOT; 3817 rt->fib6_protocol = cfg->fc_protocol; 3818 3819 rt->fib6_table = table; 3820 rt->fib6_metric = cfg->fc_metric; 3821 rt->fib6_type = cfg->fc_type ? : RTN_UNICAST; 3822 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3823 3824 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3825 rt->fib6_dst.plen = cfg->fc_dst_len; 3826 3827 #ifdef CONFIG_IPV6_SUBTREES 3828 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3829 rt->fib6_src.plen = cfg->fc_src_len; 3830 #endif 3831 if (nh) { 3832 if (rt->fib6_src.plen) { 3833 NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); 3834 err = -EINVAL; 3835 goto out_free; 3836 } 3837 if (!nexthop_get(nh)) { 3838 NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); 3839 err = -ENOENT; 3840 goto out_free; 3841 } 3842 rt->nh = nh; 3843 fib6_nh = nexthop_fib6_nh(rt->nh); 3844 } else { 3845 err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); 3846 if (err) 3847 goto out; 3848 3849 fib6_nh = rt->fib6_nh; 3850 3851 /* We cannot add true routes via loopback here, they would 3852 * result in kernel looping; promote them to reject routes 3853 */ 3854 addr_type = ipv6_addr_type(&cfg->fc_dst); 3855 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, 3856 addr_type)) 3857 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3858 } 3859 3860 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3861 struct net_device *dev = fib6_nh->fib_nh_dev; 3862 3863 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3864 NL_SET_ERR_MSG(extack, "Invalid source address"); 3865 err = -EINVAL; 3866 goto out; 3867 } 3868 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3869 rt->fib6_prefsrc.plen = 128; 3870 } else 3871 rt->fib6_prefsrc.plen = 0; 3872 3873 return rt; 3874 out: 3875 fib6_info_release(rt); 3876 return ERR_PTR(err); 3877 out_free: 3878 ip_fib_metrics_put(rt->fib6_metrics); 3879 kfree(rt); 3880 return ERR_PTR(err); 3881 } 3882 3883 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3884 struct netlink_ext_ack *extack) 3885 { 3886 struct fib6_info *rt; 3887 int err; 3888 3889 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3890 if (IS_ERR(rt)) 3891 return PTR_ERR(rt); 3892 3893 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3894 fib6_info_release(rt); 3895 3896 return err; 3897 } 3898 3899 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3900 { 3901 struct net *net = info->nl_net; 3902 struct fib6_table *table; 3903 int err; 3904 3905 if (rt == net->ipv6.fib6_null_entry) { 3906 err = -ENOENT; 3907 goto out; 3908 } 3909 3910 table = rt->fib6_table; 3911 spin_lock_bh(&table->tb6_lock); 3912 err = fib6_del(rt, info); 3913 spin_unlock_bh(&table->tb6_lock); 3914 3915 out: 3916 fib6_info_release(rt); 3917 return err; 3918 } 3919 3920 int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) 3921 { 3922 struct nl_info info = { 3923 .nl_net = net, 3924 .skip_notify = skip_notify 3925 }; 3926 3927 return __ip6_del_rt(rt, &info); 3928 } 3929 3930 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3931 { 3932 struct nl_info *info = &cfg->fc_nlinfo; 3933 struct net *net = info->nl_net; 3934 struct sk_buff *skb = NULL; 3935 struct fib6_table *table; 3936 int err = -ENOENT; 3937 3938 if (rt == net->ipv6.fib6_null_entry) 3939 goto out_put; 3940 table = rt->fib6_table; 3941 spin_lock_bh(&table->tb6_lock); 3942 3943 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3944 struct fib6_info *sibling, *next_sibling; 3945 struct fib6_node *fn; 3946 3947 /* prefer to send a single notification with all hops */ 3948 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3949 if (skb) { 3950 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3951 3952 if (rt6_fill_node(net, skb, rt, NULL, 3953 NULL, NULL, 0, RTM_DELROUTE, 3954 info->portid, seq, 0) < 0) { 3955 kfree_skb(skb); 3956 skb = NULL; 3957 } else 3958 info->skip_notify = 1; 3959 } 3960 3961 /* 'rt' points to the first sibling route. If it is not the 3962 * leaf, then we do not need to send a notification. Otherwise, 3963 * we need to check if the last sibling has a next route or not 3964 * and emit a replace or delete notification, respectively. 3965 */ 3966 info->skip_notify_kernel = 1; 3967 fn = rcu_dereference_protected(rt->fib6_node, 3968 lockdep_is_held(&table->tb6_lock)); 3969 if (rcu_access_pointer(fn->leaf) == rt) { 3970 struct fib6_info *last_sibling, *replace_rt; 3971 3972 last_sibling = list_last_entry(&rt->fib6_siblings, 3973 struct fib6_info, 3974 fib6_siblings); 3975 replace_rt = rcu_dereference_protected( 3976 last_sibling->fib6_next, 3977 lockdep_is_held(&table->tb6_lock)); 3978 if (replace_rt) 3979 call_fib6_entry_notifiers_replace(net, 3980 replace_rt); 3981 else 3982 call_fib6_multipath_entry_notifiers(net, 3983 FIB_EVENT_ENTRY_DEL, 3984 rt, rt->fib6_nsiblings, 3985 NULL); 3986 } 3987 list_for_each_entry_safe(sibling, next_sibling, 3988 &rt->fib6_siblings, 3989 fib6_siblings) { 3990 err = fib6_del(sibling, info); 3991 if (err) 3992 goto out_unlock; 3993 } 3994 } 3995 3996 err = fib6_del(rt, info); 3997 out_unlock: 3998 spin_unlock_bh(&table->tb6_lock); 3999 out_put: 4000 fib6_info_release(rt); 4001 4002 if (skb) { 4003 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 4004 info->nlh, gfp_any()); 4005 } 4006 return err; 4007 } 4008 4009 static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 4010 { 4011 int rc = -ESRCH; 4012 4013 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 4014 goto out; 4015 4016 if (cfg->fc_flags & RTF_GATEWAY && 4017 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 4018 goto out; 4019 4020 rc = rt6_remove_exception_rt(rt); 4021 out: 4022 return rc; 4023 } 4024 4025 static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, 4026 struct fib6_nh *nh) 4027 { 4028 struct fib6_result res = { 4029 .f6i = rt, 4030 .nh = nh, 4031 }; 4032 struct rt6_info *rt_cache; 4033 4034 rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); 4035 if (rt_cache) 4036 return __ip6_del_cached_rt(rt_cache, cfg); 4037 4038 return 0; 4039 } 4040 4041 struct fib6_nh_del_cached_rt_arg { 4042 struct fib6_config *cfg; 4043 struct fib6_info *f6i; 4044 }; 4045 4046 static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) 4047 { 4048 struct fib6_nh_del_cached_rt_arg *arg = _arg; 4049 int rc; 4050 4051 rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); 4052 return rc != -ESRCH ? rc : 0; 4053 } 4054 4055 static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) 4056 { 4057 struct fib6_nh_del_cached_rt_arg arg = { 4058 .cfg = cfg, 4059 .f6i = f6i 4060 }; 4061 4062 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); 4063 } 4064 4065 static int ip6_route_del(struct fib6_config *cfg, 4066 struct netlink_ext_ack *extack) 4067 { 4068 struct fib6_table *table; 4069 struct fib6_info *rt; 4070 struct fib6_node *fn; 4071 int err = -ESRCH; 4072 4073 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 4074 if (!table) { 4075 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 4076 return err; 4077 } 4078 4079 rcu_read_lock(); 4080 4081 fn = fib6_locate(&table->tb6_root, 4082 &cfg->fc_dst, cfg->fc_dst_len, 4083 &cfg->fc_src, cfg->fc_src_len, 4084 !(cfg->fc_flags & RTF_CACHE)); 4085 4086 if (fn) { 4087 for_each_fib6_node_rt_rcu(fn) { 4088 struct fib6_nh *nh; 4089 4090 if (rt->nh && cfg->fc_nh_id && 4091 rt->nh->id != cfg->fc_nh_id) 4092 continue; 4093 4094 if (cfg->fc_flags & RTF_CACHE) { 4095 int rc = 0; 4096 4097 if (rt->nh) { 4098 rc = ip6_del_cached_rt_nh(cfg, rt); 4099 } else if (cfg->fc_nh_id) { 4100 continue; 4101 } else { 4102 nh = rt->fib6_nh; 4103 rc = ip6_del_cached_rt(cfg, rt, nh); 4104 } 4105 if (rc != -ESRCH) { 4106 rcu_read_unlock(); 4107 return rc; 4108 } 4109 continue; 4110 } 4111 4112 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 4113 continue; 4114 if (cfg->fc_protocol && 4115 cfg->fc_protocol != rt->fib6_protocol) 4116 continue; 4117 4118 if (rt->nh) { 4119 if (!fib6_info_hold_safe(rt)) 4120 continue; 4121 rcu_read_unlock(); 4122 4123 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 4124 } 4125 if (cfg->fc_nh_id) 4126 continue; 4127 4128 nh = rt->fib6_nh; 4129 if (cfg->fc_ifindex && 4130 (!nh->fib_nh_dev || 4131 nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) 4132 continue; 4133 if (cfg->fc_flags & RTF_GATEWAY && 4134 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) 4135 continue; 4136 if (!fib6_info_hold_safe(rt)) 4137 continue; 4138 rcu_read_unlock(); 4139 4140 /* if gateway was specified only delete the one hop */ 4141 if (cfg->fc_flags & RTF_GATEWAY) 4142 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 4143 4144 return __ip6_del_rt_siblings(rt, cfg); 4145 } 4146 } 4147 rcu_read_unlock(); 4148 4149 return err; 4150 } 4151 4152 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 4153 { 4154 struct netevent_redirect netevent; 4155 struct rt6_info *rt, *nrt = NULL; 4156 struct fib6_result res = {}; 4157 struct ndisc_options ndopts; 4158 struct inet6_dev *in6_dev; 4159 struct neighbour *neigh; 4160 struct rd_msg *msg; 4161 int optlen, on_link; 4162 u8 *lladdr; 4163 4164 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 4165 optlen -= sizeof(*msg); 4166 4167 if (optlen < 0) { 4168 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 4169 return; 4170 } 4171 4172 msg = (struct rd_msg *)icmp6_hdr(skb); 4173 4174 if (ipv6_addr_is_multicast(&msg->dest)) { 4175 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 4176 return; 4177 } 4178 4179 on_link = 0; 4180 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 4181 on_link = 1; 4182 } else if (ipv6_addr_type(&msg->target) != 4183 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 4184 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 4185 return; 4186 } 4187 4188 in6_dev = __in6_dev_get(skb->dev); 4189 if (!in6_dev) 4190 return; 4191 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 4192 return; 4193 4194 /* RFC2461 8.1: 4195 * The IP source address of the Redirect MUST be the same as the current 4196 * first-hop router for the specified ICMP Destination Address. 4197 */ 4198 4199 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 4200 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 4201 return; 4202 } 4203 4204 lladdr = NULL; 4205 if (ndopts.nd_opts_tgt_lladdr) { 4206 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 4207 skb->dev); 4208 if (!lladdr) { 4209 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 4210 return; 4211 } 4212 } 4213 4214 rt = dst_rt6_info(dst); 4215 if (rt->rt6i_flags & RTF_REJECT) { 4216 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 4217 return; 4218 } 4219 4220 /* Redirect received -> path was valid. 4221 * Look, redirects are sent only in response to data packets, 4222 * so that this nexthop apparently is reachable. --ANK 4223 */ 4224 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 4225 4226 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 4227 if (!neigh) 4228 return; 4229 4230 /* 4231 * We have finally decided to accept it. 4232 */ 4233 4234 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 4235 NEIGH_UPDATE_F_WEAK_OVERRIDE| 4236 NEIGH_UPDATE_F_OVERRIDE| 4237 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 4238 NEIGH_UPDATE_F_ISROUTER)), 4239 NDISC_REDIRECT, &ndopts); 4240 4241 rcu_read_lock(); 4242 res.f6i = rcu_dereference(rt->from); 4243 if (!res.f6i) 4244 goto out; 4245 4246 if (res.f6i->nh) { 4247 struct fib6_nh_match_arg arg = { 4248 .dev = dst->dev, 4249 .gw = &rt->rt6i_gateway, 4250 }; 4251 4252 nexthop_for_each_fib6_nh(res.f6i->nh, 4253 fib6_nh_find_match, &arg); 4254 4255 /* fib6_info uses a nexthop that does not have fib6_nh 4256 * using the dst->dev. Should be impossible 4257 */ 4258 if (!arg.match) 4259 goto out; 4260 res.nh = arg.match; 4261 } else { 4262 res.nh = res.f6i->fib6_nh; 4263 } 4264 4265 res.fib6_flags = res.f6i->fib6_flags; 4266 res.fib6_type = res.f6i->fib6_type; 4267 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); 4268 if (!nrt) 4269 goto out; 4270 4271 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 4272 if (on_link) 4273 nrt->rt6i_flags &= ~RTF_GATEWAY; 4274 4275 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 4276 4277 /* rt6_insert_exception() will take care of duplicated exceptions */ 4278 if (rt6_insert_exception(nrt, &res)) { 4279 dst_release_immediate(&nrt->dst); 4280 goto out; 4281 } 4282 4283 netevent.old = &rt->dst; 4284 netevent.new = &nrt->dst; 4285 netevent.daddr = &msg->dest; 4286 netevent.neigh = neigh; 4287 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 4288 4289 out: 4290 rcu_read_unlock(); 4291 neigh_release(neigh); 4292 } 4293 4294 #ifdef CONFIG_IPV6_ROUTE_INFO 4295 static struct fib6_info *rt6_get_route_info(struct net *net, 4296 const struct in6_addr *prefix, int prefixlen, 4297 const struct in6_addr *gwaddr, 4298 struct net_device *dev) 4299 { 4300 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 4301 int ifindex = dev->ifindex; 4302 struct fib6_node *fn; 4303 struct fib6_info *rt = NULL; 4304 struct fib6_table *table; 4305 4306 table = fib6_get_table(net, tb_id); 4307 if (!table) 4308 return NULL; 4309 4310 rcu_read_lock(); 4311 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 4312 if (!fn) 4313 goto out; 4314 4315 for_each_fib6_node_rt_rcu(fn) { 4316 /* these routes do not use nexthops */ 4317 if (rt->nh) 4318 continue; 4319 if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) 4320 continue; 4321 if (!(rt->fib6_flags & RTF_ROUTEINFO) || 4322 !rt->fib6_nh->fib_nh_gw_family) 4323 continue; 4324 if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) 4325 continue; 4326 if (!fib6_info_hold_safe(rt)) 4327 continue; 4328 break; 4329 } 4330 out: 4331 rcu_read_unlock(); 4332 return rt; 4333 } 4334 4335 static struct fib6_info *rt6_add_route_info(struct net *net, 4336 const struct in6_addr *prefix, int prefixlen, 4337 const struct in6_addr *gwaddr, 4338 struct net_device *dev, 4339 unsigned int pref) 4340 { 4341 struct fib6_config cfg = { 4342 .fc_metric = IP6_RT_PRIO_USER, 4343 .fc_ifindex = dev->ifindex, 4344 .fc_dst_len = prefixlen, 4345 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 4346 RTF_UP | RTF_PREF(pref), 4347 .fc_protocol = RTPROT_RA, 4348 .fc_type = RTN_UNICAST, 4349 .fc_nlinfo.portid = 0, 4350 .fc_nlinfo.nlh = NULL, 4351 .fc_nlinfo.nl_net = net, 4352 }; 4353 4354 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 4355 cfg.fc_dst = *prefix; 4356 cfg.fc_gateway = *gwaddr; 4357 4358 /* We should treat it as a default route if prefix length is 0. */ 4359 if (!prefixlen) 4360 cfg.fc_flags |= RTF_DEFAULT; 4361 4362 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 4363 4364 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 4365 } 4366 #endif 4367 4368 struct fib6_info *rt6_get_dflt_router(struct net *net, 4369 const struct in6_addr *addr, 4370 struct net_device *dev) 4371 { 4372 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 4373 struct fib6_info *rt; 4374 struct fib6_table *table; 4375 4376 table = fib6_get_table(net, tb_id); 4377 if (!table) 4378 return NULL; 4379 4380 rcu_read_lock(); 4381 for_each_fib6_node_rt_rcu(&table->tb6_root) { 4382 struct fib6_nh *nh; 4383 4384 /* RA routes do not use nexthops */ 4385 if (rt->nh) 4386 continue; 4387 4388 nh = rt->fib6_nh; 4389 if (dev == nh->fib_nh_dev && 4390 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 4391 ipv6_addr_equal(&nh->fib_nh_gw6, addr)) 4392 break; 4393 } 4394 if (rt && !fib6_info_hold_safe(rt)) 4395 rt = NULL; 4396 rcu_read_unlock(); 4397 return rt; 4398 } 4399 4400 struct fib6_info *rt6_add_dflt_router(struct net *net, 4401 const struct in6_addr *gwaddr, 4402 struct net_device *dev, 4403 unsigned int pref, 4404 u32 defrtr_usr_metric) 4405 { 4406 struct fib6_config cfg = { 4407 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 4408 .fc_metric = defrtr_usr_metric, 4409 .fc_ifindex = dev->ifindex, 4410 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 4411 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 4412 .fc_protocol = RTPROT_RA, 4413 .fc_type = RTN_UNICAST, 4414 .fc_nlinfo.portid = 0, 4415 .fc_nlinfo.nlh = NULL, 4416 .fc_nlinfo.nl_net = net, 4417 }; 4418 4419 cfg.fc_gateway = *gwaddr; 4420 4421 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 4422 struct fib6_table *table; 4423 4424 table = fib6_get_table(dev_net(dev), cfg.fc_table); 4425 if (table) 4426 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 4427 } 4428 4429 return rt6_get_dflt_router(net, gwaddr, dev); 4430 } 4431 4432 static void __rt6_purge_dflt_routers(struct net *net, 4433 struct fib6_table *table) 4434 { 4435 struct fib6_info *rt; 4436 4437 restart: 4438 rcu_read_lock(); 4439 for_each_fib6_node_rt_rcu(&table->tb6_root) { 4440 struct net_device *dev = fib6_info_nh_dev(rt); 4441 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 4442 4443 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 4444 (!idev || idev->cnf.accept_ra != 2) && 4445 fib6_info_hold_safe(rt)) { 4446 rcu_read_unlock(); 4447 ip6_del_rt(net, rt, false); 4448 goto restart; 4449 } 4450 } 4451 rcu_read_unlock(); 4452 4453 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 4454 } 4455 4456 void rt6_purge_dflt_routers(struct net *net) 4457 { 4458 struct fib6_table *table; 4459 struct hlist_head *head; 4460 unsigned int h; 4461 4462 rcu_read_lock(); 4463 4464 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 4465 head = &net->ipv6.fib_table_hash[h]; 4466 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 4467 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 4468 __rt6_purge_dflt_routers(net, table); 4469 } 4470 } 4471 4472 rcu_read_unlock(); 4473 } 4474 4475 static void rtmsg_to_fib6_config(struct net *net, 4476 struct in6_rtmsg *rtmsg, 4477 struct fib6_config *cfg) 4478 { 4479 *cfg = (struct fib6_config){ 4480 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 4481 : RT6_TABLE_MAIN, 4482 .fc_ifindex = rtmsg->rtmsg_ifindex, 4483 .fc_metric = rtmsg->rtmsg_metric, 4484 .fc_expires = rtmsg->rtmsg_info, 4485 .fc_dst_len = rtmsg->rtmsg_dst_len, 4486 .fc_src_len = rtmsg->rtmsg_src_len, 4487 .fc_flags = rtmsg->rtmsg_flags, 4488 .fc_type = rtmsg->rtmsg_type, 4489 4490 .fc_nlinfo.nl_net = net, 4491 4492 .fc_dst = rtmsg->rtmsg_dst, 4493 .fc_src = rtmsg->rtmsg_src, 4494 .fc_gateway = rtmsg->rtmsg_gateway, 4495 }; 4496 } 4497 4498 int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) 4499 { 4500 struct fib6_config cfg; 4501 int err; 4502 4503 if (cmd != SIOCADDRT && cmd != SIOCDELRT) 4504 return -EINVAL; 4505 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 4506 return -EPERM; 4507 4508 rtmsg_to_fib6_config(net, rtmsg, &cfg); 4509 4510 rtnl_lock(); 4511 switch (cmd) { 4512 case SIOCADDRT: 4513 /* Only do the default setting of fc_metric in route adding */ 4514 if (cfg.fc_metric == 0) 4515 cfg.fc_metric = IP6_RT_PRIO_USER; 4516 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 4517 break; 4518 case SIOCDELRT: 4519 err = ip6_route_del(&cfg, NULL); 4520 break; 4521 } 4522 rtnl_unlock(); 4523 return err; 4524 } 4525 4526 /* 4527 * Drop the packet on the floor 4528 */ 4529 4530 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 4531 { 4532 struct dst_entry *dst = skb_dst(skb); 4533 struct net *net = dev_net(dst->dev); 4534 struct inet6_dev *idev; 4535 SKB_DR(reason); 4536 int type; 4537 4538 if (netif_is_l3_master(skb->dev) || 4539 dst->dev == net->loopback_dev) 4540 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); 4541 else 4542 idev = ip6_dst_idev(dst); 4543 4544 switch (ipstats_mib_noroutes) { 4545 case IPSTATS_MIB_INNOROUTES: 4546 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 4547 if (type == IPV6_ADDR_ANY) { 4548 SKB_DR_SET(reason, IP_INADDRERRORS); 4549 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 4550 break; 4551 } 4552 SKB_DR_SET(reason, IP_INNOROUTES); 4553 fallthrough; 4554 case IPSTATS_MIB_OUTNOROUTES: 4555 SKB_DR_OR(reason, IP_OUTNOROUTES); 4556 IP6_INC_STATS(net, idev, ipstats_mib_noroutes); 4557 break; 4558 } 4559 4560 /* Start over by dropping the dst for l3mdev case */ 4561 if (netif_is_l3_master(skb->dev)) 4562 skb_dst_drop(skb); 4563 4564 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 4565 kfree_skb_reason(skb, reason); 4566 return 0; 4567 } 4568 4569 static int ip6_pkt_discard(struct sk_buff *skb) 4570 { 4571 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 4572 } 4573 4574 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 4575 { 4576 skb->dev = skb_dst(skb)->dev; 4577 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 4578 } 4579 4580 static int ip6_pkt_prohibit(struct sk_buff *skb) 4581 { 4582 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 4583 } 4584 4585 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 4586 { 4587 skb->dev = skb_dst(skb)->dev; 4588 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 4589 } 4590 4591 /* 4592 * Allocate a dst for local (unicast / anycast) address. 4593 */ 4594 4595 struct fib6_info *addrconf_f6i_alloc(struct net *net, 4596 struct inet6_dev *idev, 4597 const struct in6_addr *addr, 4598 bool anycast, gfp_t gfp_flags, 4599 struct netlink_ext_ack *extack) 4600 { 4601 struct fib6_config cfg = { 4602 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL, 4603 .fc_ifindex = idev->dev->ifindex, 4604 .fc_flags = RTF_UP | RTF_NONEXTHOP, 4605 .fc_dst = *addr, 4606 .fc_dst_len = 128, 4607 .fc_protocol = RTPROT_KERNEL, 4608 .fc_nlinfo.nl_net = net, 4609 .fc_ignore_dev_down = true, 4610 }; 4611 struct fib6_info *f6i; 4612 4613 if (anycast) { 4614 cfg.fc_type = RTN_ANYCAST; 4615 cfg.fc_flags |= RTF_ANYCAST; 4616 } else { 4617 cfg.fc_type = RTN_LOCAL; 4618 cfg.fc_flags |= RTF_LOCAL; 4619 } 4620 4621 f6i = ip6_route_info_create(&cfg, gfp_flags, extack); 4622 if (!IS_ERR(f6i)) { 4623 f6i->dst_nocount = true; 4624 4625 if (!anycast && 4626 (net->ipv6.devconf_all->disable_policy || 4627 idev->cnf.disable_policy)) 4628 f6i->dst_nopolicy = true; 4629 } 4630 4631 return f6i; 4632 } 4633 4634 /* remove deleted ip from prefsrc entries */ 4635 struct arg_dev_net_ip { 4636 struct net *net; 4637 struct in6_addr *addr; 4638 }; 4639 4640 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 4641 { 4642 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 4643 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 4644 4645 if (!rt->nh && 4646 rt != net->ipv6.fib6_null_entry && 4647 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) && 4648 !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) { 4649 spin_lock_bh(&rt6_exception_lock); 4650 /* remove prefsrc entry */ 4651 rt->fib6_prefsrc.plen = 0; 4652 spin_unlock_bh(&rt6_exception_lock); 4653 } 4654 return 0; 4655 } 4656 4657 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 4658 { 4659 struct net *net = dev_net(ifp->idev->dev); 4660 struct arg_dev_net_ip adni = { 4661 .net = net, 4662 .addr = &ifp->addr, 4663 }; 4664 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 4665 } 4666 4667 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) 4668 4669 /* Remove routers and update dst entries when gateway turn into host. */ 4670 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 4671 { 4672 struct in6_addr *gateway = (struct in6_addr *)arg; 4673 struct fib6_nh *nh; 4674 4675 /* RA routes do not use nexthops */ 4676 if (rt->nh) 4677 return 0; 4678 4679 nh = rt->fib6_nh; 4680 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 4681 nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) 4682 return -1; 4683 4684 /* Further clean up cached routes in exception table. 4685 * This is needed because cached route may have a different 4686 * gateway than its 'parent' in the case of an ip redirect. 4687 */ 4688 fib6_nh_exceptions_clean_tohost(nh, gateway); 4689 4690 return 0; 4691 } 4692 4693 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 4694 { 4695 fib6_clean_all(net, fib6_clean_tohost, gateway); 4696 } 4697 4698 struct arg_netdev_event { 4699 const struct net_device *dev; 4700 union { 4701 unsigned char nh_flags; 4702 unsigned long event; 4703 }; 4704 }; 4705 4706 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 4707 { 4708 struct fib6_info *iter; 4709 struct fib6_node *fn; 4710 4711 fn = rcu_dereference_protected(rt->fib6_node, 4712 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4713 iter = rcu_dereference_protected(fn->leaf, 4714 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4715 while (iter) { 4716 if (iter->fib6_metric == rt->fib6_metric && 4717 rt6_qualify_for_ecmp(iter)) 4718 return iter; 4719 iter = rcu_dereference_protected(iter->fib6_next, 4720 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4721 } 4722 4723 return NULL; 4724 } 4725 4726 /* only called for fib entries with builtin fib6_nh */ 4727 static bool rt6_is_dead(const struct fib6_info *rt) 4728 { 4729 if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || 4730 (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && 4731 ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) 4732 return true; 4733 4734 return false; 4735 } 4736 4737 static int rt6_multipath_total_weight(const struct fib6_info *rt) 4738 { 4739 struct fib6_info *iter; 4740 int total = 0; 4741 4742 if (!rt6_is_dead(rt)) 4743 total += rt->fib6_nh->fib_nh_weight; 4744 4745 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 4746 if (!rt6_is_dead(iter)) 4747 total += iter->fib6_nh->fib_nh_weight; 4748 } 4749 4750 return total; 4751 } 4752 4753 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 4754 { 4755 int upper_bound = -1; 4756 4757 if (!rt6_is_dead(rt)) { 4758 *weight += rt->fib6_nh->fib_nh_weight; 4759 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 4760 total) - 1; 4761 } 4762 atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); 4763 } 4764 4765 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 4766 { 4767 struct fib6_info *iter; 4768 int weight = 0; 4769 4770 rt6_upper_bound_set(rt, &weight, total); 4771 4772 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4773 rt6_upper_bound_set(iter, &weight, total); 4774 } 4775 4776 void rt6_multipath_rebalance(struct fib6_info *rt) 4777 { 4778 struct fib6_info *first; 4779 int total; 4780 4781 /* In case the entire multipath route was marked for flushing, 4782 * then there is no need to rebalance upon the removal of every 4783 * sibling route. 4784 */ 4785 if (!rt->fib6_nsiblings || rt->should_flush) 4786 return; 4787 4788 /* During lookup routes are evaluated in order, so we need to 4789 * make sure upper bounds are assigned from the first sibling 4790 * onwards. 4791 */ 4792 first = rt6_multipath_first_sibling(rt); 4793 if (WARN_ON_ONCE(!first)) 4794 return; 4795 4796 total = rt6_multipath_total_weight(first); 4797 rt6_multipath_upper_bound_set(first, total); 4798 } 4799 4800 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 4801 { 4802 const struct arg_netdev_event *arg = p_arg; 4803 struct net *net = dev_net(arg->dev); 4804 4805 if (rt != net->ipv6.fib6_null_entry && !rt->nh && 4806 rt->fib6_nh->fib_nh_dev == arg->dev) { 4807 rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; 4808 fib6_update_sernum_upto_root(net, rt); 4809 rt6_multipath_rebalance(rt); 4810 } 4811 4812 return 0; 4813 } 4814 4815 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) 4816 { 4817 struct arg_netdev_event arg = { 4818 .dev = dev, 4819 { 4820 .nh_flags = nh_flags, 4821 }, 4822 }; 4823 4824 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 4825 arg.nh_flags |= RTNH_F_LINKDOWN; 4826 4827 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 4828 } 4829 4830 /* only called for fib entries with inline fib6_nh */ 4831 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 4832 const struct net_device *dev) 4833 { 4834 struct fib6_info *iter; 4835 4836 if (rt->fib6_nh->fib_nh_dev == dev) 4837 return true; 4838 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4839 if (iter->fib6_nh->fib_nh_dev == dev) 4840 return true; 4841 4842 return false; 4843 } 4844 4845 static void rt6_multipath_flush(struct fib6_info *rt) 4846 { 4847 struct fib6_info *iter; 4848 4849 rt->should_flush = 1; 4850 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4851 iter->should_flush = 1; 4852 } 4853 4854 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 4855 const struct net_device *down_dev) 4856 { 4857 struct fib6_info *iter; 4858 unsigned int dead = 0; 4859 4860 if (rt->fib6_nh->fib_nh_dev == down_dev || 4861 rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4862 dead++; 4863 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4864 if (iter->fib6_nh->fib_nh_dev == down_dev || 4865 iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4866 dead++; 4867 4868 return dead; 4869 } 4870 4871 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4872 const struct net_device *dev, 4873 unsigned char nh_flags) 4874 { 4875 struct fib6_info *iter; 4876 4877 if (rt->fib6_nh->fib_nh_dev == dev) 4878 rt->fib6_nh->fib_nh_flags |= nh_flags; 4879 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4880 if (iter->fib6_nh->fib_nh_dev == dev) 4881 iter->fib6_nh->fib_nh_flags |= nh_flags; 4882 } 4883 4884 /* called with write lock held for table with rt */ 4885 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4886 { 4887 const struct arg_netdev_event *arg = p_arg; 4888 const struct net_device *dev = arg->dev; 4889 struct net *net = dev_net(dev); 4890 4891 if (rt == net->ipv6.fib6_null_entry || rt->nh) 4892 return 0; 4893 4894 switch (arg->event) { 4895 case NETDEV_UNREGISTER: 4896 return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; 4897 case NETDEV_DOWN: 4898 if (rt->should_flush) 4899 return -1; 4900 if (!rt->fib6_nsiblings) 4901 return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; 4902 if (rt6_multipath_uses_dev(rt, dev)) { 4903 unsigned int count; 4904 4905 count = rt6_multipath_dead_count(rt, dev); 4906 if (rt->fib6_nsiblings + 1 == count) { 4907 rt6_multipath_flush(rt); 4908 return -1; 4909 } 4910 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4911 RTNH_F_LINKDOWN); 4912 fib6_update_sernum(net, rt); 4913 rt6_multipath_rebalance(rt); 4914 } 4915 return -2; 4916 case NETDEV_CHANGE: 4917 if (rt->fib6_nh->fib_nh_dev != dev || 4918 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4919 break; 4920 rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 4921 rt6_multipath_rebalance(rt); 4922 break; 4923 } 4924 4925 return 0; 4926 } 4927 4928 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4929 { 4930 struct arg_netdev_event arg = { 4931 .dev = dev, 4932 { 4933 .event = event, 4934 }, 4935 }; 4936 struct net *net = dev_net(dev); 4937 4938 if (net->ipv6.sysctl.skip_notify_on_dev_down) 4939 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); 4940 else 4941 fib6_clean_all(net, fib6_ifdown, &arg); 4942 } 4943 4944 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4945 { 4946 rt6_sync_down_dev(dev, event); 4947 rt6_uncached_list_flush_dev(dev); 4948 neigh_ifdown(&nd_tbl, dev); 4949 } 4950 4951 struct rt6_mtu_change_arg { 4952 struct net_device *dev; 4953 unsigned int mtu; 4954 struct fib6_info *f6i; 4955 }; 4956 4957 static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) 4958 { 4959 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; 4960 struct fib6_info *f6i = arg->f6i; 4961 4962 /* For administrative MTU increase, there is no way to discover 4963 * IPv6 PMTU increase, so PMTU increase should be updated here. 4964 * Since RFC 1981 doesn't include administrative MTU increase 4965 * update PMTU increase is a MUST. (i.e. jumbo frame) 4966 */ 4967 if (nh->fib_nh_dev == arg->dev) { 4968 struct inet6_dev *idev = __in6_dev_get(arg->dev); 4969 u32 mtu = f6i->fib6_pmtu; 4970 4971 if (mtu >= arg->mtu || 4972 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4973 fib6_metric_set(f6i, RTAX_MTU, arg->mtu); 4974 4975 spin_lock_bh(&rt6_exception_lock); 4976 rt6_exceptions_update_pmtu(idev, nh, arg->mtu); 4977 spin_unlock_bh(&rt6_exception_lock); 4978 } 4979 4980 return 0; 4981 } 4982 4983 static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) 4984 { 4985 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4986 struct inet6_dev *idev; 4987 4988 /* In IPv6 pmtu discovery is not optional, 4989 so that RTAX_MTU lock cannot disable it. 4990 We still use this lock to block changes 4991 caused by addrconf/ndisc. 4992 */ 4993 4994 idev = __in6_dev_get(arg->dev); 4995 if (!idev) 4996 return 0; 4997 4998 if (fib6_metric_locked(f6i, RTAX_MTU)) 4999 return 0; 5000 5001 arg->f6i = f6i; 5002 if (f6i->nh) { 5003 /* fib6_nh_mtu_change only returns 0, so this is safe */ 5004 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, 5005 arg); 5006 } 5007 5008 return fib6_nh_mtu_change(f6i->fib6_nh, arg); 5009 } 5010 5011 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 5012 { 5013 struct rt6_mtu_change_arg arg = { 5014 .dev = dev, 5015 .mtu = mtu, 5016 }; 5017 5018 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 5019 } 5020 5021 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 5022 [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, 5023 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 5024 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 5025 [RTA_OIF] = { .type = NLA_U32 }, 5026 [RTA_IIF] = { .type = NLA_U32 }, 5027 [RTA_PRIORITY] = { .type = NLA_U32 }, 5028 [RTA_METRICS] = { .type = NLA_NESTED }, 5029 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 5030 [RTA_PREF] = { .type = NLA_U8 }, 5031 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 5032 [RTA_ENCAP] = { .type = NLA_NESTED }, 5033 [RTA_EXPIRES] = { .type = NLA_U32 }, 5034 [RTA_UID] = { .type = NLA_U32 }, 5035 [RTA_MARK] = { .type = NLA_U32 }, 5036 [RTA_TABLE] = { .type = NLA_U32 }, 5037 [RTA_IP_PROTO] = { .type = NLA_U8 }, 5038 [RTA_SPORT] = { .type = NLA_U16 }, 5039 [RTA_DPORT] = { .type = NLA_U16 }, 5040 [RTA_NH_ID] = { .type = NLA_U32 }, 5041 }; 5042 5043 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 5044 struct fib6_config *cfg, 5045 struct netlink_ext_ack *extack) 5046 { 5047 struct rtmsg *rtm; 5048 struct nlattr *tb[RTA_MAX+1]; 5049 unsigned int pref; 5050 int err; 5051 5052 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 5053 rtm_ipv6_policy, extack); 5054 if (err < 0) 5055 goto errout; 5056 5057 err = -EINVAL; 5058 rtm = nlmsg_data(nlh); 5059 5060 if (rtm->rtm_tos) { 5061 NL_SET_ERR_MSG(extack, 5062 "Invalid dsfield (tos): option not available for IPv6"); 5063 goto errout; 5064 } 5065 5066 *cfg = (struct fib6_config){ 5067 .fc_table = rtm->rtm_table, 5068 .fc_dst_len = rtm->rtm_dst_len, 5069 .fc_src_len = rtm->rtm_src_len, 5070 .fc_flags = RTF_UP, 5071 .fc_protocol = rtm->rtm_protocol, 5072 .fc_type = rtm->rtm_type, 5073 5074 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 5075 .fc_nlinfo.nlh = nlh, 5076 .fc_nlinfo.nl_net = sock_net(skb->sk), 5077 }; 5078 5079 if (rtm->rtm_type == RTN_UNREACHABLE || 5080 rtm->rtm_type == RTN_BLACKHOLE || 5081 rtm->rtm_type == RTN_PROHIBIT || 5082 rtm->rtm_type == RTN_THROW) 5083 cfg->fc_flags |= RTF_REJECT; 5084 5085 if (rtm->rtm_type == RTN_LOCAL) 5086 cfg->fc_flags |= RTF_LOCAL; 5087 5088 if (rtm->rtm_flags & RTM_F_CLONED) 5089 cfg->fc_flags |= RTF_CACHE; 5090 5091 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 5092 5093 if (tb[RTA_NH_ID]) { 5094 if (tb[RTA_GATEWAY] || tb[RTA_OIF] || 5095 tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { 5096 NL_SET_ERR_MSG(extack, 5097 "Nexthop specification and nexthop id are mutually exclusive"); 5098 goto errout; 5099 } 5100 cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); 5101 } 5102 5103 if (tb[RTA_GATEWAY]) { 5104 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 5105 cfg->fc_flags |= RTF_GATEWAY; 5106 } 5107 if (tb[RTA_VIA]) { 5108 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 5109 goto errout; 5110 } 5111 5112 if (tb[RTA_DST]) { 5113 int plen = (rtm->rtm_dst_len + 7) >> 3; 5114 5115 if (nla_len(tb[RTA_DST]) < plen) 5116 goto errout; 5117 5118 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 5119 } 5120 5121 if (tb[RTA_SRC]) { 5122 int plen = (rtm->rtm_src_len + 7) >> 3; 5123 5124 if (nla_len(tb[RTA_SRC]) < plen) 5125 goto errout; 5126 5127 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 5128 } 5129 5130 if (tb[RTA_PREFSRC]) 5131 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 5132 5133 if (tb[RTA_OIF]) 5134 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 5135 5136 if (tb[RTA_PRIORITY]) 5137 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 5138 5139 if (tb[RTA_METRICS]) { 5140 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 5141 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 5142 } 5143 5144 if (tb[RTA_TABLE]) 5145 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 5146 5147 if (tb[RTA_MULTIPATH]) { 5148 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 5149 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 5150 5151 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 5152 cfg->fc_mp_len, extack); 5153 if (err < 0) 5154 goto errout; 5155 } 5156 5157 if (tb[RTA_PREF]) { 5158 pref = nla_get_u8(tb[RTA_PREF]); 5159 if (pref != ICMPV6_ROUTER_PREF_LOW && 5160 pref != ICMPV6_ROUTER_PREF_HIGH) 5161 pref = ICMPV6_ROUTER_PREF_MEDIUM; 5162 cfg->fc_flags |= RTF_PREF(pref); 5163 } 5164 5165 if (tb[RTA_ENCAP]) 5166 cfg->fc_encap = tb[RTA_ENCAP]; 5167 5168 if (tb[RTA_ENCAP_TYPE]) { 5169 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 5170 5171 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 5172 if (err < 0) 5173 goto errout; 5174 } 5175 5176 if (tb[RTA_EXPIRES]) { 5177 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 5178 5179 if (addrconf_finite_timeout(timeout)) { 5180 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 5181 cfg->fc_flags |= RTF_EXPIRES; 5182 } 5183 } 5184 5185 err = 0; 5186 errout: 5187 return err; 5188 } 5189 5190 struct rt6_nh { 5191 struct fib6_info *fib6_info; 5192 struct fib6_config r_cfg; 5193 struct list_head next; 5194 }; 5195 5196 static int ip6_route_info_append(struct net *net, 5197 struct list_head *rt6_nh_list, 5198 struct fib6_info *rt, 5199 struct fib6_config *r_cfg) 5200 { 5201 struct rt6_nh *nh; 5202 int err = -EEXIST; 5203 5204 list_for_each_entry(nh, rt6_nh_list, next) { 5205 /* check if fib6_info already exists */ 5206 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 5207 return err; 5208 } 5209 5210 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 5211 if (!nh) 5212 return -ENOMEM; 5213 nh->fib6_info = rt; 5214 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 5215 list_add_tail(&nh->next, rt6_nh_list); 5216 5217 return 0; 5218 } 5219 5220 static void ip6_route_mpath_notify(struct fib6_info *rt, 5221 struct fib6_info *rt_last, 5222 struct nl_info *info, 5223 __u16 nlflags) 5224 { 5225 /* if this is an APPEND route, then rt points to the first route 5226 * inserted and rt_last points to last route inserted. Userspace 5227 * wants a consistent dump of the route which starts at the first 5228 * nexthop. Since sibling routes are always added at the end of 5229 * the list, find the first sibling of the last route appended 5230 */ 5231 rcu_read_lock(); 5232 5233 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 5234 rt = list_first_or_null_rcu(&rt_last->fib6_siblings, 5235 struct fib6_info, 5236 fib6_siblings); 5237 } 5238 5239 if (rt) 5240 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 5241 5242 rcu_read_unlock(); 5243 } 5244 5245 static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) 5246 { 5247 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); 5248 bool should_notify = false; 5249 struct fib6_info *leaf; 5250 struct fib6_node *fn; 5251 5252 rcu_read_lock(); 5253 fn = rcu_dereference(rt->fib6_node); 5254 if (!fn) 5255 goto out; 5256 5257 leaf = rcu_dereference(fn->leaf); 5258 if (!leaf) 5259 goto out; 5260 5261 if (rt == leaf || 5262 (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric && 5263 rt6_qualify_for_ecmp(leaf))) 5264 should_notify = true; 5265 out: 5266 rcu_read_unlock(); 5267 5268 return should_notify; 5269 } 5270 5271 static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla, 5272 struct netlink_ext_ack *extack) 5273 { 5274 if (nla_len(nla) < sizeof(*gw)) { 5275 NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); 5276 return -EINVAL; 5277 } 5278 5279 *gw = nla_get_in6_addr(nla); 5280 5281 return 0; 5282 } 5283 5284 static int ip6_route_multipath_add(struct fib6_config *cfg, 5285 struct netlink_ext_ack *extack) 5286 { 5287 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 5288 struct nl_info *info = &cfg->fc_nlinfo; 5289 struct fib6_config r_cfg; 5290 struct rtnexthop *rtnh; 5291 struct fib6_info *rt; 5292 struct rt6_nh *err_nh; 5293 struct rt6_nh *nh, *nh_safe; 5294 __u16 nlflags; 5295 int remaining; 5296 int attrlen; 5297 int err = 1; 5298 int nhn = 0; 5299 int replace = (cfg->fc_nlinfo.nlh && 5300 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 5301 LIST_HEAD(rt6_nh_list); 5302 5303 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 5304 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 5305 nlflags |= NLM_F_APPEND; 5306 5307 remaining = cfg->fc_mp_len; 5308 rtnh = (struct rtnexthop *)cfg->fc_mp; 5309 5310 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 5311 * fib6_info structs per nexthop 5312 */ 5313 while (rtnh_ok(rtnh, remaining)) { 5314 memcpy(&r_cfg, cfg, sizeof(*cfg)); 5315 if (rtnh->rtnh_ifindex) 5316 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 5317 5318 attrlen = rtnh_attrlen(rtnh); 5319 if (attrlen > 0) { 5320 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 5321 5322 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 5323 if (nla) { 5324 err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, 5325 extack); 5326 if (err) 5327 goto cleanup; 5328 5329 r_cfg.fc_flags |= RTF_GATEWAY; 5330 } 5331 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 5332 5333 /* RTA_ENCAP_TYPE length checked in 5334 * lwtunnel_valid_encap_type_attr 5335 */ 5336 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 5337 if (nla) 5338 r_cfg.fc_encap_type = nla_get_u16(nla); 5339 } 5340 5341 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 5342 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 5343 if (IS_ERR(rt)) { 5344 err = PTR_ERR(rt); 5345 rt = NULL; 5346 goto cleanup; 5347 } 5348 if (!rt6_qualify_for_ecmp(rt)) { 5349 err = -EINVAL; 5350 NL_SET_ERR_MSG(extack, 5351 "Device only routes can not be added for IPv6 using the multipath API."); 5352 fib6_info_release(rt); 5353 goto cleanup; 5354 } 5355 5356 rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; 5357 5358 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 5359 rt, &r_cfg); 5360 if (err) { 5361 fib6_info_release(rt); 5362 goto cleanup; 5363 } 5364 5365 rtnh = rtnh_next(rtnh, &remaining); 5366 } 5367 5368 if (list_empty(&rt6_nh_list)) { 5369 NL_SET_ERR_MSG(extack, 5370 "Invalid nexthop configuration - no valid nexthops"); 5371 return -EINVAL; 5372 } 5373 5374 /* for add and replace send one notification with all nexthops. 5375 * Skip the notification in fib6_add_rt2node and send one with 5376 * the full route when done 5377 */ 5378 info->skip_notify = 1; 5379 5380 /* For add and replace, send one notification with all nexthops. For 5381 * append, send one notification with all appended nexthops. 5382 */ 5383 info->skip_notify_kernel = 1; 5384 5385 err_nh = NULL; 5386 list_for_each_entry(nh, &rt6_nh_list, next) { 5387 err = __ip6_ins_rt(nh->fib6_info, info, extack); 5388 5389 if (err) { 5390 if (replace && nhn) 5391 NL_SET_ERR_MSG_MOD(extack, 5392 "multipath route replace failed (check consistency of installed routes)"); 5393 err_nh = nh; 5394 goto add_errout; 5395 } 5396 /* save reference to last route successfully inserted */ 5397 rt_last = nh->fib6_info; 5398 5399 /* save reference to first route for notification */ 5400 if (!rt_notif) 5401 rt_notif = nh->fib6_info; 5402 5403 /* Because each route is added like a single route we remove 5404 * these flags after the first nexthop: if there is a collision, 5405 * we have already failed to add the first nexthop: 5406 * fib6_add_rt2node() has rejected it; when replacing, old 5407 * nexthops have been replaced by first new, the rest should 5408 * be added to it. 5409 */ 5410 if (cfg->fc_nlinfo.nlh) { 5411 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 5412 NLM_F_REPLACE); 5413 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE; 5414 } 5415 nhn++; 5416 } 5417 5418 /* An in-kernel notification should only be sent in case the new 5419 * multipath route is added as the first route in the node, or if 5420 * it was appended to it. We pass 'rt_notif' since it is the first 5421 * sibling and might allow us to skip some checks in the replace case. 5422 */ 5423 if (ip6_route_mpath_should_notify(rt_notif)) { 5424 enum fib_event_type fib_event; 5425 5426 if (rt_notif->fib6_nsiblings != nhn - 1) 5427 fib_event = FIB_EVENT_ENTRY_APPEND; 5428 else 5429 fib_event = FIB_EVENT_ENTRY_REPLACE; 5430 5431 err = call_fib6_multipath_entry_notifiers(info->nl_net, 5432 fib_event, rt_notif, 5433 nhn - 1, extack); 5434 if (err) { 5435 /* Delete all the siblings that were just added */ 5436 err_nh = NULL; 5437 goto add_errout; 5438 } 5439 } 5440 5441 /* success ... tell user about new route */ 5442 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 5443 goto cleanup; 5444 5445 add_errout: 5446 /* send notification for routes that were added so that 5447 * the delete notifications sent by ip6_route_del are 5448 * coherent 5449 */ 5450 if (rt_notif) 5451 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 5452 5453 /* Delete routes that were already added */ 5454 list_for_each_entry(nh, &rt6_nh_list, next) { 5455 if (err_nh == nh) 5456 break; 5457 ip6_route_del(&nh->r_cfg, extack); 5458 } 5459 5460 cleanup: 5461 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 5462 fib6_info_release(nh->fib6_info); 5463 list_del(&nh->next); 5464 kfree(nh); 5465 } 5466 5467 return err; 5468 } 5469 5470 static int ip6_route_multipath_del(struct fib6_config *cfg, 5471 struct netlink_ext_ack *extack) 5472 { 5473 struct fib6_config r_cfg; 5474 struct rtnexthop *rtnh; 5475 int last_err = 0; 5476 int remaining; 5477 int attrlen; 5478 int err; 5479 5480 remaining = cfg->fc_mp_len; 5481 rtnh = (struct rtnexthop *)cfg->fc_mp; 5482 5483 /* Parse a Multipath Entry */ 5484 while (rtnh_ok(rtnh, remaining)) { 5485 memcpy(&r_cfg, cfg, sizeof(*cfg)); 5486 if (rtnh->rtnh_ifindex) 5487 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 5488 5489 attrlen = rtnh_attrlen(rtnh); 5490 if (attrlen > 0) { 5491 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 5492 5493 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 5494 if (nla) { 5495 err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, 5496 extack); 5497 if (err) { 5498 last_err = err; 5499 goto next_rtnh; 5500 } 5501 5502 r_cfg.fc_flags |= RTF_GATEWAY; 5503 } 5504 } 5505 err = ip6_route_del(&r_cfg, extack); 5506 if (err) 5507 last_err = err; 5508 5509 next_rtnh: 5510 rtnh = rtnh_next(rtnh, &remaining); 5511 } 5512 5513 return last_err; 5514 } 5515 5516 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 5517 struct netlink_ext_ack *extack) 5518 { 5519 struct fib6_config cfg; 5520 int err; 5521 5522 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 5523 if (err < 0) 5524 return err; 5525 5526 if (cfg.fc_nh_id && 5527 !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { 5528 NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); 5529 return -EINVAL; 5530 } 5531 5532 if (cfg.fc_mp) 5533 return ip6_route_multipath_del(&cfg, extack); 5534 else { 5535 cfg.fc_delete_all_nh = 1; 5536 return ip6_route_del(&cfg, extack); 5537 } 5538 } 5539 5540 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 5541 struct netlink_ext_ack *extack) 5542 { 5543 struct fib6_config cfg; 5544 int err; 5545 5546 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 5547 if (err < 0) 5548 return err; 5549 5550 if (cfg.fc_metric == 0) 5551 cfg.fc_metric = IP6_RT_PRIO_USER; 5552 5553 if (cfg.fc_mp) 5554 return ip6_route_multipath_add(&cfg, extack); 5555 else 5556 return ip6_route_add(&cfg, GFP_KERNEL, extack); 5557 } 5558 5559 /* add the overhead of this fib6_nh to nexthop_len */ 5560 static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) 5561 { 5562 int *nexthop_len = arg; 5563 5564 *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ 5565 + NLA_ALIGN(sizeof(struct rtnexthop)) 5566 + nla_total_size(16); /* RTA_GATEWAY */ 5567 5568 if (nh->fib_nh_lws) { 5569 /* RTA_ENCAP_TYPE */ 5570 *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); 5571 /* RTA_ENCAP */ 5572 *nexthop_len += nla_total_size(2); 5573 } 5574 5575 return 0; 5576 } 5577 5578 static size_t rt6_nlmsg_size(struct fib6_info *f6i) 5579 { 5580 int nexthop_len; 5581 5582 if (f6i->nh) { 5583 nexthop_len = nla_total_size(4); /* RTA_NH_ID */ 5584 nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, 5585 &nexthop_len); 5586 } else { 5587 struct fib6_nh *nh = f6i->fib6_nh; 5588 struct fib6_info *sibling; 5589 5590 nexthop_len = 0; 5591 if (f6i->fib6_nsiblings) { 5592 rt6_nh_nlmsg_size(nh, &nexthop_len); 5593 5594 rcu_read_lock(); 5595 5596 list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, 5597 fib6_siblings) { 5598 rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); 5599 } 5600 5601 rcu_read_unlock(); 5602 } 5603 nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); 5604 } 5605 5606 return NLMSG_ALIGN(sizeof(struct rtmsg)) 5607 + nla_total_size(16) /* RTA_SRC */ 5608 + nla_total_size(16) /* RTA_DST */ 5609 + nla_total_size(16) /* RTA_GATEWAY */ 5610 + nla_total_size(16) /* RTA_PREFSRC */ 5611 + nla_total_size(4) /* RTA_TABLE */ 5612 + nla_total_size(4) /* RTA_IIF */ 5613 + nla_total_size(4) /* RTA_OIF */ 5614 + nla_total_size(4) /* RTA_PRIORITY */ 5615 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 5616 + nla_total_size(sizeof(struct rta_cacheinfo)) 5617 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 5618 + nla_total_size(1) /* RTA_PREF */ 5619 + nexthop_len; 5620 } 5621 5622 static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, 5623 unsigned char *flags) 5624 { 5625 if (nexthop_is_multipath(nh)) { 5626 struct nlattr *mp; 5627 5628 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 5629 if (!mp) 5630 goto nla_put_failure; 5631 5632 if (nexthop_mpath_fill_node(skb, nh, AF_INET6)) 5633 goto nla_put_failure; 5634 5635 nla_nest_end(skb, mp); 5636 } else { 5637 struct fib6_nh *fib6_nh; 5638 5639 fib6_nh = nexthop_fib6_nh(nh); 5640 if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6, 5641 flags, false) < 0) 5642 goto nla_put_failure; 5643 } 5644 5645 return 0; 5646 5647 nla_put_failure: 5648 return -EMSGSIZE; 5649 } 5650 5651 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 5652 struct fib6_info *rt, struct dst_entry *dst, 5653 struct in6_addr *dest, struct in6_addr *src, 5654 int iif, int type, u32 portid, u32 seq, 5655 unsigned int flags) 5656 { 5657 struct rt6_info *rt6 = dst_rt6_info(dst); 5658 struct rt6key *rt6_dst, *rt6_src; 5659 u32 *pmetrics, table, rt6_flags; 5660 unsigned char nh_flags = 0; 5661 struct nlmsghdr *nlh; 5662 struct rtmsg *rtm; 5663 long expires = 0; 5664 5665 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 5666 if (!nlh) 5667 return -EMSGSIZE; 5668 5669 if (rt6) { 5670 rt6_dst = &rt6->rt6i_dst; 5671 rt6_src = &rt6->rt6i_src; 5672 rt6_flags = rt6->rt6i_flags; 5673 } else { 5674 rt6_dst = &rt->fib6_dst; 5675 rt6_src = &rt->fib6_src; 5676 rt6_flags = rt->fib6_flags; 5677 } 5678 5679 rtm = nlmsg_data(nlh); 5680 rtm->rtm_family = AF_INET6; 5681 rtm->rtm_dst_len = rt6_dst->plen; 5682 rtm->rtm_src_len = rt6_src->plen; 5683 rtm->rtm_tos = 0; 5684 if (rt->fib6_table) 5685 table = rt->fib6_table->tb6_id; 5686 else 5687 table = RT6_TABLE_UNSPEC; 5688 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; 5689 if (nla_put_u32(skb, RTA_TABLE, table)) 5690 goto nla_put_failure; 5691 5692 rtm->rtm_type = rt->fib6_type; 5693 rtm->rtm_flags = 0; 5694 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 5695 rtm->rtm_protocol = rt->fib6_protocol; 5696 5697 if (rt6_flags & RTF_CACHE) 5698 rtm->rtm_flags |= RTM_F_CLONED; 5699 5700 if (dest) { 5701 if (nla_put_in6_addr(skb, RTA_DST, dest)) 5702 goto nla_put_failure; 5703 rtm->rtm_dst_len = 128; 5704 } else if (rtm->rtm_dst_len) 5705 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 5706 goto nla_put_failure; 5707 #ifdef CONFIG_IPV6_SUBTREES 5708 if (src) { 5709 if (nla_put_in6_addr(skb, RTA_SRC, src)) 5710 goto nla_put_failure; 5711 rtm->rtm_src_len = 128; 5712 } else if (rtm->rtm_src_len && 5713 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 5714 goto nla_put_failure; 5715 #endif 5716 if (iif) { 5717 #ifdef CONFIG_IPV6_MROUTE 5718 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 5719 int err = ip6mr_get_route(net, skb, rtm, portid); 5720 5721 if (err == 0) 5722 return 0; 5723 if (err < 0) 5724 goto nla_put_failure; 5725 } else 5726 #endif 5727 if (nla_put_u32(skb, RTA_IIF, iif)) 5728 goto nla_put_failure; 5729 } else if (dest) { 5730 struct in6_addr saddr_buf; 5731 if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 && 5732 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 5733 goto nla_put_failure; 5734 } 5735 5736 if (rt->fib6_prefsrc.plen) { 5737 struct in6_addr saddr_buf; 5738 saddr_buf = rt->fib6_prefsrc.addr; 5739 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 5740 goto nla_put_failure; 5741 } 5742 5743 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 5744 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 5745 goto nla_put_failure; 5746 5747 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 5748 goto nla_put_failure; 5749 5750 /* For multipath routes, walk the siblings list and add 5751 * each as a nexthop within RTA_MULTIPATH. 5752 */ 5753 if (rt6) { 5754 if (rt6_flags & RTF_GATEWAY && 5755 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 5756 goto nla_put_failure; 5757 5758 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 5759 goto nla_put_failure; 5760 5761 if (dst->lwtstate && 5762 lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) 5763 goto nla_put_failure; 5764 } else if (rt->fib6_nsiblings) { 5765 struct fib6_info *sibling; 5766 struct nlattr *mp; 5767 5768 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 5769 if (!mp) 5770 goto nla_put_failure; 5771 5772 if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, 5773 rt->fib6_nh->fib_nh_weight, AF_INET6, 5774 0) < 0) 5775 goto nla_put_failure; 5776 5777 rcu_read_lock(); 5778 5779 list_for_each_entry_rcu(sibling, &rt->fib6_siblings, 5780 fib6_siblings) { 5781 if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, 5782 sibling->fib6_nh->fib_nh_weight, 5783 AF_INET6, 0) < 0) { 5784 rcu_read_unlock(); 5785 5786 goto nla_put_failure; 5787 } 5788 } 5789 5790 rcu_read_unlock(); 5791 5792 nla_nest_end(skb, mp); 5793 } else if (rt->nh) { 5794 if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) 5795 goto nla_put_failure; 5796 5797 if (nexthop_is_blackhole(rt->nh)) 5798 rtm->rtm_type = RTN_BLACKHOLE; 5799 5800 if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) && 5801 rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) 5802 goto nla_put_failure; 5803 5804 rtm->rtm_flags |= nh_flags; 5805 } else { 5806 if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6, 5807 &nh_flags, false) < 0) 5808 goto nla_put_failure; 5809 5810 rtm->rtm_flags |= nh_flags; 5811 } 5812 5813 if (rt6_flags & RTF_EXPIRES) { 5814 expires = dst ? dst->expires : rt->expires; 5815 expires -= jiffies; 5816 } 5817 5818 if (!dst) { 5819 if (READ_ONCE(rt->offload)) 5820 rtm->rtm_flags |= RTM_F_OFFLOAD; 5821 if (READ_ONCE(rt->trap)) 5822 rtm->rtm_flags |= RTM_F_TRAP; 5823 if (READ_ONCE(rt->offload_failed)) 5824 rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; 5825 } 5826 5827 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 5828 goto nla_put_failure; 5829 5830 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 5831 goto nla_put_failure; 5832 5833 5834 nlmsg_end(skb, nlh); 5835 return 0; 5836 5837 nla_put_failure: 5838 nlmsg_cancel(skb, nlh); 5839 return -EMSGSIZE; 5840 } 5841 5842 static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) 5843 { 5844 const struct net_device *dev = arg; 5845 5846 if (nh->fib_nh_dev == dev) 5847 return 1; 5848 5849 return 0; 5850 } 5851 5852 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 5853 const struct net_device *dev) 5854 { 5855 if (f6i->nh) { 5856 struct net_device *_dev = (struct net_device *)dev; 5857 5858 return !!nexthop_for_each_fib6_nh(f6i->nh, 5859 fib6_info_nh_uses_dev, 5860 _dev); 5861 } 5862 5863 if (f6i->fib6_nh->fib_nh_dev == dev) 5864 return true; 5865 5866 if (f6i->fib6_nsiblings) { 5867 struct fib6_info *sibling, *next_sibling; 5868 5869 list_for_each_entry_safe(sibling, next_sibling, 5870 &f6i->fib6_siblings, fib6_siblings) { 5871 if (sibling->fib6_nh->fib_nh_dev == dev) 5872 return true; 5873 } 5874 } 5875 5876 return false; 5877 } 5878 5879 struct fib6_nh_exception_dump_walker { 5880 struct rt6_rtnl_dump_arg *dump; 5881 struct fib6_info *rt; 5882 unsigned int flags; 5883 unsigned int skip; 5884 unsigned int count; 5885 }; 5886 5887 static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) 5888 { 5889 struct fib6_nh_exception_dump_walker *w = arg; 5890 struct rt6_rtnl_dump_arg *dump = w->dump; 5891 struct rt6_exception_bucket *bucket; 5892 struct rt6_exception *rt6_ex; 5893 int i, err; 5894 5895 bucket = fib6_nh_get_excptn_bucket(nh, NULL); 5896 if (!bucket) 5897 return 0; 5898 5899 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 5900 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 5901 if (w->skip) { 5902 w->skip--; 5903 continue; 5904 } 5905 5906 /* Expiration of entries doesn't bump sernum, insertion 5907 * does. Removal is triggered by insertion, so we can 5908 * rely on the fact that if entries change between two 5909 * partial dumps, this node is scanned again completely, 5910 * see rt6_insert_exception() and fib6_dump_table(). 5911 * 5912 * Count expired entries we go through as handled 5913 * entries that we'll skip next time, in case of partial 5914 * node dump. Otherwise, if entries expire meanwhile, 5915 * we'll skip the wrong amount. 5916 */ 5917 if (rt6_check_expired(rt6_ex->rt6i)) { 5918 w->count++; 5919 continue; 5920 } 5921 5922 err = rt6_fill_node(dump->net, dump->skb, w->rt, 5923 &rt6_ex->rt6i->dst, NULL, NULL, 0, 5924 RTM_NEWROUTE, 5925 NETLINK_CB(dump->cb->skb).portid, 5926 dump->cb->nlh->nlmsg_seq, w->flags); 5927 if (err) 5928 return err; 5929 5930 w->count++; 5931 } 5932 bucket++; 5933 } 5934 5935 return 0; 5936 } 5937 5938 /* Return -1 if done with node, number of handled routes on partial dump */ 5939 int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) 5940 { 5941 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 5942 struct fib_dump_filter *filter = &arg->filter; 5943 unsigned int flags = NLM_F_MULTI; 5944 struct net *net = arg->net; 5945 int count = 0; 5946 5947 if (rt == net->ipv6.fib6_null_entry) 5948 return -1; 5949 5950 if ((filter->flags & RTM_F_PREFIX) && 5951 !(rt->fib6_flags & RTF_PREFIX_RT)) { 5952 /* success since this is not a prefix route */ 5953 return -1; 5954 } 5955 if (filter->filter_set && 5956 ((filter->rt_type && rt->fib6_type != filter->rt_type) || 5957 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 5958 (filter->protocol && rt->fib6_protocol != filter->protocol))) { 5959 return -1; 5960 } 5961 5962 if (filter->filter_set || 5963 !filter->dump_routes || !filter->dump_exceptions) { 5964 flags |= NLM_F_DUMP_FILTERED; 5965 } 5966 5967 if (filter->dump_routes) { 5968 if (skip) { 5969 skip--; 5970 } else { 5971 if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 5972 0, RTM_NEWROUTE, 5973 NETLINK_CB(arg->cb->skb).portid, 5974 arg->cb->nlh->nlmsg_seq, flags)) { 5975 return 0; 5976 } 5977 count++; 5978 } 5979 } 5980 5981 if (filter->dump_exceptions) { 5982 struct fib6_nh_exception_dump_walker w = { .dump = arg, 5983 .rt = rt, 5984 .flags = flags, 5985 .skip = skip, 5986 .count = 0 }; 5987 int err; 5988 5989 rcu_read_lock(); 5990 if (rt->nh) { 5991 err = nexthop_for_each_fib6_nh(rt->nh, 5992 rt6_nh_dump_exceptions, 5993 &w); 5994 } else { 5995 err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); 5996 } 5997 rcu_read_unlock(); 5998 5999 if (err) 6000 return count + w.count; 6001 } 6002 6003 return -1; 6004 } 6005 6006 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, 6007 const struct nlmsghdr *nlh, 6008 struct nlattr **tb, 6009 struct netlink_ext_ack *extack) 6010 { 6011 struct rtmsg *rtm; 6012 int i, err; 6013 6014 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 6015 NL_SET_ERR_MSG_MOD(extack, 6016 "Invalid header for get route request"); 6017 return -EINVAL; 6018 } 6019 6020 if (!netlink_strict_get_check(skb)) 6021 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 6022 rtm_ipv6_policy, extack); 6023 6024 rtm = nlmsg_data(nlh); 6025 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || 6026 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || 6027 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || 6028 rtm->rtm_type) { 6029 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); 6030 return -EINVAL; 6031 } 6032 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { 6033 NL_SET_ERR_MSG_MOD(extack, 6034 "Invalid flags for get route request"); 6035 return -EINVAL; 6036 } 6037 6038 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 6039 rtm_ipv6_policy, extack); 6040 if (err) 6041 return err; 6042 6043 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 6044 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 6045 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); 6046 return -EINVAL; 6047 } 6048 6049 for (i = 0; i <= RTA_MAX; i++) { 6050 if (!tb[i]) 6051 continue; 6052 6053 switch (i) { 6054 case RTA_SRC: 6055 case RTA_DST: 6056 case RTA_IIF: 6057 case RTA_OIF: 6058 case RTA_MARK: 6059 case RTA_UID: 6060 case RTA_SPORT: 6061 case RTA_DPORT: 6062 case RTA_IP_PROTO: 6063 break; 6064 default: 6065 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); 6066 return -EINVAL; 6067 } 6068 } 6069 6070 return 0; 6071 } 6072 6073 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 6074 struct netlink_ext_ack *extack) 6075 { 6076 struct net *net = sock_net(in_skb->sk); 6077 struct nlattr *tb[RTA_MAX+1]; 6078 int err, iif = 0, oif = 0; 6079 struct fib6_info *from; 6080 struct dst_entry *dst; 6081 struct rt6_info *rt; 6082 struct sk_buff *skb; 6083 struct rtmsg *rtm; 6084 struct flowi6 fl6 = {}; 6085 bool fibmatch; 6086 6087 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 6088 if (err < 0) 6089 goto errout; 6090 6091 err = -EINVAL; 6092 rtm = nlmsg_data(nlh); 6093 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 6094 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 6095 6096 if (tb[RTA_SRC]) { 6097 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 6098 goto errout; 6099 6100 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 6101 } 6102 6103 if (tb[RTA_DST]) { 6104 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 6105 goto errout; 6106 6107 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 6108 } 6109 6110 if (tb[RTA_IIF]) 6111 iif = nla_get_u32(tb[RTA_IIF]); 6112 6113 if (tb[RTA_OIF]) 6114 oif = nla_get_u32(tb[RTA_OIF]); 6115 6116 if (tb[RTA_MARK]) 6117 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 6118 6119 if (tb[RTA_UID]) 6120 fl6.flowi6_uid = make_kuid(current_user_ns(), 6121 nla_get_u32(tb[RTA_UID])); 6122 else 6123 fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); 6124 6125 if (tb[RTA_SPORT]) 6126 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 6127 6128 if (tb[RTA_DPORT]) 6129 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 6130 6131 if (tb[RTA_IP_PROTO]) { 6132 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 6133 &fl6.flowi6_proto, AF_INET6, 6134 extack); 6135 if (err) 6136 goto errout; 6137 } 6138 6139 if (iif) { 6140 struct net_device *dev; 6141 int flags = 0; 6142 6143 rcu_read_lock(); 6144 6145 dev = dev_get_by_index_rcu(net, iif); 6146 if (!dev) { 6147 rcu_read_unlock(); 6148 err = -ENODEV; 6149 goto errout; 6150 } 6151 6152 fl6.flowi6_iif = iif; 6153 6154 if (!ipv6_addr_any(&fl6.saddr)) 6155 flags |= RT6_LOOKUP_F_HAS_SADDR; 6156 6157 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 6158 6159 rcu_read_unlock(); 6160 } else { 6161 fl6.flowi6_oif = oif; 6162 6163 dst = ip6_route_output(net, NULL, &fl6); 6164 } 6165 6166 6167 rt = dst_rt6_info(dst); 6168 if (rt->dst.error) { 6169 err = rt->dst.error; 6170 ip6_rt_put(rt); 6171 goto errout; 6172 } 6173 6174 if (rt == net->ipv6.ip6_null_entry) { 6175 err = rt->dst.error; 6176 ip6_rt_put(rt); 6177 goto errout; 6178 } 6179 6180 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 6181 if (!skb) { 6182 ip6_rt_put(rt); 6183 err = -ENOBUFS; 6184 goto errout; 6185 } 6186 6187 skb_dst_set(skb, &rt->dst); 6188 6189 rcu_read_lock(); 6190 from = rcu_dereference(rt->from); 6191 if (from) { 6192 if (fibmatch) 6193 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, 6194 iif, RTM_NEWROUTE, 6195 NETLINK_CB(in_skb).portid, 6196 nlh->nlmsg_seq, 0); 6197 else 6198 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 6199 &fl6.saddr, iif, RTM_NEWROUTE, 6200 NETLINK_CB(in_skb).portid, 6201 nlh->nlmsg_seq, 0); 6202 } else { 6203 err = -ENETUNREACH; 6204 } 6205 rcu_read_unlock(); 6206 6207 if (err < 0) { 6208 kfree_skb(skb); 6209 goto errout; 6210 } 6211 6212 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 6213 errout: 6214 return err; 6215 } 6216 6217 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 6218 unsigned int nlm_flags) 6219 { 6220 struct sk_buff *skb; 6221 struct net *net = info->nl_net; 6222 u32 seq; 6223 int err; 6224 6225 err = -ENOBUFS; 6226 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 6227 6228 skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC); 6229 if (!skb) 6230 goto errout; 6231 6232 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 6233 event, info->portid, seq, nlm_flags); 6234 if (err < 0) { 6235 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 6236 WARN_ON(err == -EMSGSIZE); 6237 kfree_skb(skb); 6238 goto errout; 6239 } 6240 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 6241 info->nlh, GFP_ATOMIC); 6242 return; 6243 errout: 6244 if (err < 0) 6245 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 6246 } 6247 6248 void fib6_rt_update(struct net *net, struct fib6_info *rt, 6249 struct nl_info *info) 6250 { 6251 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 6252 struct sk_buff *skb; 6253 int err = -ENOBUFS; 6254 6255 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 6256 if (!skb) 6257 goto errout; 6258 6259 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 6260 RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); 6261 if (err < 0) { 6262 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 6263 WARN_ON(err == -EMSGSIZE); 6264 kfree_skb(skb); 6265 goto errout; 6266 } 6267 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 6268 info->nlh, gfp_any()); 6269 return; 6270 errout: 6271 if (err < 0) 6272 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 6273 } 6274 6275 void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, 6276 bool offload, bool trap, bool offload_failed) 6277 { 6278 struct sk_buff *skb; 6279 int err; 6280 6281 if (READ_ONCE(f6i->offload) == offload && 6282 READ_ONCE(f6i->trap) == trap && 6283 READ_ONCE(f6i->offload_failed) == offload_failed) 6284 return; 6285 6286 WRITE_ONCE(f6i->offload, offload); 6287 WRITE_ONCE(f6i->trap, trap); 6288 6289 /* 2 means send notifications only if offload_failed was changed. */ 6290 if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 && 6291 READ_ONCE(f6i->offload_failed) == offload_failed) 6292 return; 6293 6294 WRITE_ONCE(f6i->offload_failed, offload_failed); 6295 6296 if (!rcu_access_pointer(f6i->fib6_node)) 6297 /* The route was removed from the tree, do not send 6298 * notification. 6299 */ 6300 return; 6301 6302 if (!net->ipv6.sysctl.fib_notify_on_flag_change) 6303 return; 6304 6305 skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL); 6306 if (!skb) { 6307 err = -ENOBUFS; 6308 goto errout; 6309 } 6310 6311 err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0, 6312 0, 0); 6313 if (err < 0) { 6314 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 6315 WARN_ON(err == -EMSGSIZE); 6316 kfree_skb(skb); 6317 goto errout; 6318 } 6319 6320 rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL); 6321 return; 6322 6323 errout: 6324 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 6325 } 6326 EXPORT_SYMBOL(fib6_info_hw_flags_set); 6327 6328 static int ip6_route_dev_notify(struct notifier_block *this, 6329 unsigned long event, void *ptr) 6330 { 6331 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 6332 struct net *net = dev_net(dev); 6333 6334 if (!(dev->flags & IFF_LOOPBACK)) 6335 return NOTIFY_OK; 6336 6337 if (event == NETDEV_REGISTER) { 6338 net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; 6339 net->ipv6.ip6_null_entry->dst.dev = dev; 6340 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 6341 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6342 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 6343 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 6344 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 6345 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 6346 #endif 6347 } else if (event == NETDEV_UNREGISTER && 6348 dev->reg_state != NETREG_UNREGISTERED) { 6349 /* NETDEV_UNREGISTER could be fired for multiple times by 6350 * netdev_wait_allrefs(). Make sure we only call this once. 6351 */ 6352 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 6353 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6354 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 6355 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 6356 #endif 6357 } 6358 6359 return NOTIFY_OK; 6360 } 6361 6362 /* 6363 * /proc 6364 */ 6365 6366 #ifdef CONFIG_PROC_FS 6367 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 6368 { 6369 struct net *net = (struct net *)seq->private; 6370 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 6371 net->ipv6.rt6_stats->fib_nodes, 6372 net->ipv6.rt6_stats->fib_route_nodes, 6373 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 6374 net->ipv6.rt6_stats->fib_rt_entries, 6375 net->ipv6.rt6_stats->fib_rt_cache, 6376 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 6377 net->ipv6.rt6_stats->fib_discarded_routes); 6378 6379 return 0; 6380 } 6381 #endif /* CONFIG_PROC_FS */ 6382 6383 #ifdef CONFIG_SYSCTL 6384 6385 static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 6386 void *buffer, size_t *lenp, loff_t *ppos) 6387 { 6388 struct net *net; 6389 int delay; 6390 int ret; 6391 if (!write) 6392 return -EINVAL; 6393 6394 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 6395 if (ret) 6396 return ret; 6397 6398 net = (struct net *)ctl->extra1; 6399 delay = net->ipv6.sysctl.flush_delay; 6400 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); 6401 return 0; 6402 } 6403 6404 static struct ctl_table ipv6_route_table_template[] = { 6405 { 6406 .procname = "max_size", 6407 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 6408 .maxlen = sizeof(int), 6409 .mode = 0644, 6410 .proc_handler = proc_dointvec, 6411 }, 6412 { 6413 .procname = "gc_thresh", 6414 .data = &ip6_dst_ops_template.gc_thresh, 6415 .maxlen = sizeof(int), 6416 .mode = 0644, 6417 .proc_handler = proc_dointvec, 6418 }, 6419 { 6420 .procname = "flush", 6421 .data = &init_net.ipv6.sysctl.flush_delay, 6422 .maxlen = sizeof(int), 6423 .mode = 0200, 6424 .proc_handler = ipv6_sysctl_rtcache_flush 6425 }, 6426 { 6427 .procname = "gc_min_interval", 6428 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 6429 .maxlen = sizeof(int), 6430 .mode = 0644, 6431 .proc_handler = proc_dointvec_jiffies, 6432 }, 6433 { 6434 .procname = "gc_timeout", 6435 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 6436 .maxlen = sizeof(int), 6437 .mode = 0644, 6438 .proc_handler = proc_dointvec_jiffies, 6439 }, 6440 { 6441 .procname = "gc_interval", 6442 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 6443 .maxlen = sizeof(int), 6444 .mode = 0644, 6445 .proc_handler = proc_dointvec_jiffies, 6446 }, 6447 { 6448 .procname = "gc_elasticity", 6449 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 6450 .maxlen = sizeof(int), 6451 .mode = 0644, 6452 .proc_handler = proc_dointvec, 6453 }, 6454 { 6455 .procname = "mtu_expires", 6456 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 6457 .maxlen = sizeof(int), 6458 .mode = 0644, 6459 .proc_handler = proc_dointvec_jiffies, 6460 }, 6461 { 6462 .procname = "min_adv_mss", 6463 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 6464 .maxlen = sizeof(int), 6465 .mode = 0644, 6466 .proc_handler = proc_dointvec, 6467 }, 6468 { 6469 .procname = "gc_min_interval_ms", 6470 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 6471 .maxlen = sizeof(int), 6472 .mode = 0644, 6473 .proc_handler = proc_dointvec_ms_jiffies, 6474 }, 6475 { 6476 .procname = "skip_notify_on_dev_down", 6477 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, 6478 .maxlen = sizeof(u8), 6479 .mode = 0644, 6480 .proc_handler = proc_dou8vec_minmax, 6481 .extra1 = SYSCTL_ZERO, 6482 .extra2 = SYSCTL_ONE, 6483 }, 6484 { } 6485 }; 6486 6487 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 6488 { 6489 struct ctl_table *table; 6490 6491 table = kmemdup(ipv6_route_table_template, 6492 sizeof(ipv6_route_table_template), 6493 GFP_KERNEL); 6494 6495 if (table) { 6496 table[0].data = &net->ipv6.sysctl.ip6_rt_max_size; 6497 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 6498 table[2].data = &net->ipv6.sysctl.flush_delay; 6499 table[2].extra1 = net; 6500 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 6501 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 6502 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 6503 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 6504 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 6505 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 6506 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 6507 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; 6508 6509 /* Don't export sysctls to unprivileged users */ 6510 if (net->user_ns != &init_user_ns) 6511 table[1].procname = NULL; 6512 } 6513 6514 return table; 6515 } 6516 6517 size_t ipv6_route_sysctl_table_size(struct net *net) 6518 { 6519 /* Don't export sysctls to unprivileged users */ 6520 if (net->user_ns != &init_user_ns) 6521 return 1; 6522 6523 return ARRAY_SIZE(ipv6_route_table_template); 6524 } 6525 #endif 6526 6527 static int __net_init ip6_route_net_init(struct net *net) 6528 { 6529 int ret = -ENOMEM; 6530 6531 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 6532 sizeof(net->ipv6.ip6_dst_ops)); 6533 6534 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 6535 goto out_ip6_dst_ops; 6536 6537 net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); 6538 if (!net->ipv6.fib6_null_entry) 6539 goto out_ip6_dst_entries; 6540 memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, 6541 sizeof(*net->ipv6.fib6_null_entry)); 6542 6543 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 6544 sizeof(*net->ipv6.ip6_null_entry), 6545 GFP_KERNEL); 6546 if (!net->ipv6.ip6_null_entry) 6547 goto out_fib6_null_entry; 6548 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 6549 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 6550 ip6_template_metrics, true); 6551 INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached); 6552 6553 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6554 net->ipv6.fib6_has_custom_rules = false; 6555 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 6556 sizeof(*net->ipv6.ip6_prohibit_entry), 6557 GFP_KERNEL); 6558 if (!net->ipv6.ip6_prohibit_entry) 6559 goto out_ip6_null_entry; 6560 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 6561 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 6562 ip6_template_metrics, true); 6563 INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached); 6564 6565 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 6566 sizeof(*net->ipv6.ip6_blk_hole_entry), 6567 GFP_KERNEL); 6568 if (!net->ipv6.ip6_blk_hole_entry) 6569 goto out_ip6_prohibit_entry; 6570 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 6571 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 6572 ip6_template_metrics, true); 6573 INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached); 6574 #ifdef CONFIG_IPV6_SUBTREES 6575 net->ipv6.fib6_routes_require_src = 0; 6576 #endif 6577 #endif 6578 6579 net->ipv6.sysctl.flush_delay = 0; 6580 net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; 6581 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 6582 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 6583 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 6584 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 6585 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 6586 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 6587 net->ipv6.sysctl.skip_notify_on_dev_down = 0; 6588 6589 atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); 6590 6591 ret = 0; 6592 out: 6593 return ret; 6594 6595 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6596 out_ip6_prohibit_entry: 6597 kfree(net->ipv6.ip6_prohibit_entry); 6598 out_ip6_null_entry: 6599 kfree(net->ipv6.ip6_null_entry); 6600 #endif 6601 out_fib6_null_entry: 6602 kfree(net->ipv6.fib6_null_entry); 6603 out_ip6_dst_entries: 6604 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 6605 out_ip6_dst_ops: 6606 goto out; 6607 } 6608 6609 static void __net_exit ip6_route_net_exit(struct net *net) 6610 { 6611 kfree(net->ipv6.fib6_null_entry); 6612 kfree(net->ipv6.ip6_null_entry); 6613 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6614 kfree(net->ipv6.ip6_prohibit_entry); 6615 kfree(net->ipv6.ip6_blk_hole_entry); 6616 #endif 6617 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 6618 } 6619 6620 static int __net_init ip6_route_net_init_late(struct net *net) 6621 { 6622 #ifdef CONFIG_PROC_FS 6623 if (!proc_create_net("ipv6_route", 0, net->proc_net, 6624 &ipv6_route_seq_ops, 6625 sizeof(struct ipv6_route_iter))) 6626 return -ENOMEM; 6627 6628 if (!proc_create_net_single("rt6_stats", 0444, net->proc_net, 6629 rt6_stats_seq_show, NULL)) { 6630 remove_proc_entry("ipv6_route", net->proc_net); 6631 return -ENOMEM; 6632 } 6633 #endif 6634 return 0; 6635 } 6636 6637 static void __net_exit ip6_route_net_exit_late(struct net *net) 6638 { 6639 #ifdef CONFIG_PROC_FS 6640 remove_proc_entry("ipv6_route", net->proc_net); 6641 remove_proc_entry("rt6_stats", net->proc_net); 6642 #endif 6643 } 6644 6645 static struct pernet_operations ip6_route_net_ops = { 6646 .init = ip6_route_net_init, 6647 .exit = ip6_route_net_exit, 6648 }; 6649 6650 static int __net_init ipv6_inetpeer_init(struct net *net) 6651 { 6652 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 6653 6654 if (!bp) 6655 return -ENOMEM; 6656 inet_peer_base_init(bp); 6657 net->ipv6.peers = bp; 6658 return 0; 6659 } 6660 6661 static void __net_exit ipv6_inetpeer_exit(struct net *net) 6662 { 6663 struct inet_peer_base *bp = net->ipv6.peers; 6664 6665 net->ipv6.peers = NULL; 6666 inetpeer_invalidate_tree(bp); 6667 kfree(bp); 6668 } 6669 6670 static struct pernet_operations ipv6_inetpeer_ops = { 6671 .init = ipv6_inetpeer_init, 6672 .exit = ipv6_inetpeer_exit, 6673 }; 6674 6675 static struct pernet_operations ip6_route_net_late_ops = { 6676 .init = ip6_route_net_init_late, 6677 .exit = ip6_route_net_exit_late, 6678 }; 6679 6680 static struct notifier_block ip6_route_dev_notifier = { 6681 .notifier_call = ip6_route_dev_notify, 6682 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 6683 }; 6684 6685 void __init ip6_route_init_special_entries(void) 6686 { 6687 /* Registering of the loopback is done before this portion of code, 6688 * the loopback reference in rt6_info will not be taken, do it 6689 * manually for init_net */ 6690 init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; 6691 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 6692 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 6693 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6694 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 6695 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 6696 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 6697 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 6698 #endif 6699 } 6700 6701 #if IS_BUILTIN(CONFIG_IPV6) 6702 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6703 DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) 6704 6705 BTF_ID_LIST(btf_fib6_info_id) 6706 BTF_ID(struct, fib6_info) 6707 6708 static const struct bpf_iter_seq_info ipv6_route_seq_info = { 6709 .seq_ops = &ipv6_route_seq_ops, 6710 .init_seq_private = bpf_iter_init_seq_net, 6711 .fini_seq_private = bpf_iter_fini_seq_net, 6712 .seq_priv_size = sizeof(struct ipv6_route_iter), 6713 }; 6714 6715 static struct bpf_iter_reg ipv6_route_reg_info = { 6716 .target = "ipv6_route", 6717 .ctx_arg_info_size = 1, 6718 .ctx_arg_info = { 6719 { offsetof(struct bpf_iter__ipv6_route, rt), 6720 PTR_TO_BTF_ID_OR_NULL }, 6721 }, 6722 .seq_info = &ipv6_route_seq_info, 6723 }; 6724 6725 static int __init bpf_iter_register(void) 6726 { 6727 ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id; 6728 return bpf_iter_reg_target(&ipv6_route_reg_info); 6729 } 6730 6731 static void bpf_iter_unregister(void) 6732 { 6733 bpf_iter_unreg_target(&ipv6_route_reg_info); 6734 } 6735 #endif 6736 #endif 6737 6738 int __init ip6_route_init(void) 6739 { 6740 int ret; 6741 int cpu; 6742 6743 ret = -ENOMEM; 6744 ip6_dst_ops_template.kmem_cachep = 6745 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 6746 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); 6747 if (!ip6_dst_ops_template.kmem_cachep) 6748 goto out; 6749 6750 ret = dst_entries_init(&ip6_dst_blackhole_ops); 6751 if (ret) 6752 goto out_kmem_cache; 6753 6754 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 6755 if (ret) 6756 goto out_dst_entries; 6757 6758 ret = register_pernet_subsys(&ip6_route_net_ops); 6759 if (ret) 6760 goto out_register_inetpeer; 6761 6762 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 6763 6764 ret = fib6_init(); 6765 if (ret) 6766 goto out_register_subsys; 6767 6768 ret = xfrm6_init(); 6769 if (ret) 6770 goto out_fib6_init; 6771 6772 ret = fib6_rules_init(); 6773 if (ret) 6774 goto xfrm6_init; 6775 6776 ret = register_pernet_subsys(&ip6_route_net_late_ops); 6777 if (ret) 6778 goto fib6_rules_init; 6779 6780 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 6781 inet6_rtm_newroute, NULL, 0); 6782 if (ret < 0) 6783 goto out_register_late_subsys; 6784 6785 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 6786 inet6_rtm_delroute, NULL, 0); 6787 if (ret < 0) 6788 goto out_register_late_subsys; 6789 6790 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 6791 inet6_rtm_getroute, NULL, 6792 RTNL_FLAG_DOIT_UNLOCKED); 6793 if (ret < 0) 6794 goto out_register_late_subsys; 6795 6796 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 6797 if (ret) 6798 goto out_register_late_subsys; 6799 6800 #if IS_BUILTIN(CONFIG_IPV6) 6801 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6802 ret = bpf_iter_register(); 6803 if (ret) 6804 goto out_register_late_subsys; 6805 #endif 6806 #endif 6807 6808 for_each_possible_cpu(cpu) { 6809 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 6810 6811 INIT_LIST_HEAD(&ul->head); 6812 INIT_LIST_HEAD(&ul->quarantine); 6813 spin_lock_init(&ul->lock); 6814 } 6815 6816 out: 6817 return ret; 6818 6819 out_register_late_subsys: 6820 rtnl_unregister_all(PF_INET6); 6821 unregister_pernet_subsys(&ip6_route_net_late_ops); 6822 fib6_rules_init: 6823 fib6_rules_cleanup(); 6824 xfrm6_init: 6825 xfrm6_fini(); 6826 out_fib6_init: 6827 fib6_gc_cleanup(); 6828 out_register_subsys: 6829 unregister_pernet_subsys(&ip6_route_net_ops); 6830 out_register_inetpeer: 6831 unregister_pernet_subsys(&ipv6_inetpeer_ops); 6832 out_dst_entries: 6833 dst_entries_destroy(&ip6_dst_blackhole_ops); 6834 out_kmem_cache: 6835 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 6836 goto out; 6837 } 6838 6839 void ip6_route_cleanup(void) 6840 { 6841 #if IS_BUILTIN(CONFIG_IPV6) 6842 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6843 bpf_iter_unregister(); 6844 #endif 6845 #endif 6846 unregister_netdevice_notifier(&ip6_route_dev_notifier); 6847 unregister_pernet_subsys(&ip6_route_net_late_ops); 6848 fib6_rules_cleanup(); 6849 xfrm6_fini(); 6850 fib6_gc_cleanup(); 6851 unregister_pernet_subsys(&ipv6_inetpeer_ops); 6852 unregister_pernet_subsys(&ip6_route_net_ops); 6853 dst_entries_destroy(&ip6_dst_blackhole_ops); 6854 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 6855 } 6856