1 /* 2 * Linux INET6 implementation 3 * FIB front-end. 4 * 5 * Authors: 6 * Pedro Roque <roque@di.fc.ul.pt> 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public License 10 * as published by the Free Software Foundation; either version 11 * 2 of the License, or (at your option) any later version. 12 */ 13 14 /* Changes: 15 * 16 * YOSHIFUJI Hideaki @USAGI 17 * reworked default router selection. 18 * - respect outgoing interface 19 * - select from (probably) reachable routers (i.e. 20 * routers in REACHABLE, STALE, DELAY or PROBE states). 21 * - always select the same router if it is (probably) 22 * reachable. otherwise, round-robin the list. 23 * Ville Nuorvala 24 * Fixed routing subtrees. 25 */ 26 27 #define pr_fmt(fmt) "IPv6: " fmt 28 29 #include <linux/capability.h> 30 #include <linux/errno.h> 31 #include <linux/export.h> 32 #include <linux/types.h> 33 #include <linux/times.h> 34 #include <linux/socket.h> 35 #include <linux/sockios.h> 36 #include <linux/net.h> 37 #include <linux/route.h> 38 #include <linux/netdevice.h> 39 #include <linux/in6.h> 40 #include <linux/mroute6.h> 41 #include <linux/init.h> 42 #include <linux/if_arp.h> 43 #include <linux/proc_fs.h> 44 #include <linux/seq_file.h> 45 #include <linux/nsproxy.h> 46 #include <linux/slab.h> 47 #include <linux/jhash.h> 48 #include <net/net_namespace.h> 49 #include <net/snmp.h> 50 #include <net/ipv6.h> 51 #include <net/ip6_fib.h> 52 #include <net/ip6_route.h> 53 #include <net/ndisc.h> 54 #include <net/addrconf.h> 55 #include <net/tcp.h> 56 #include <linux/rtnetlink.h> 57 #include <net/dst.h> 58 #include <net/dst_metadata.h> 59 #include <net/xfrm.h> 60 #include <net/netevent.h> 61 #include <net/netlink.h> 62 #include <net/nexthop.h> 63 #include <net/lwtunnel.h> 64 #include <net/ip_tunnels.h> 65 #include <net/l3mdev.h> 66 #include <net/ip.h> 67 #include <linux/uaccess.h> 68 69 #ifdef CONFIG_SYSCTL 70 #include <linux/sysctl.h> 71 #endif 72 73 static int ip6_rt_type_to_error(u8 fib6_type); 74 75 #define CREATE_TRACE_POINTS 76 #include <trace/events/fib6.h> 77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); 78 #undef CREATE_TRACE_POINTS 79 80 enum rt6_nud_state { 81 RT6_NUD_FAIL_HARD = -3, 82 RT6_NUD_FAIL_PROBE = -2, 83 RT6_NUD_FAIL_DO_RR = -1, 84 RT6_NUD_SUCCEED = 1 85 }; 86 87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); 88 static unsigned int ip6_default_advmss(const struct dst_entry *dst); 89 static unsigned int ip6_mtu(const struct dst_entry *dst); 90 static struct dst_entry *ip6_negative_advice(struct dst_entry *); 91 static void ip6_dst_destroy(struct dst_entry *); 92 static void ip6_dst_ifdown(struct dst_entry *, 93 struct net_device *dev, int how); 94 static int ip6_dst_gc(struct dst_ops *ops); 95 96 static int ip6_pkt_discard(struct sk_buff *skb); 97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); 98 static int ip6_pkt_prohibit(struct sk_buff *skb); 99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); 100 static void ip6_link_failure(struct sk_buff *skb); 101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 102 struct sk_buff *skb, u32 mtu); 103 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, 104 struct sk_buff *skb); 105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict); 106 static size_t rt6_nlmsg_size(struct fib6_info *rt); 107 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 108 struct fib6_info *rt, struct dst_entry *dst, 109 struct in6_addr *dest, struct in6_addr *src, 110 int iif, int type, u32 portid, u32 seq, 111 unsigned int flags); 112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 113 struct in6_addr *daddr, 114 struct in6_addr *saddr); 115 116 #ifdef CONFIG_IPV6_ROUTE_INFO 117 static struct fib6_info *rt6_add_route_info(struct net *net, 118 const struct in6_addr *prefix, int prefixlen, 119 const struct in6_addr *gwaddr, 120 struct net_device *dev, 121 unsigned int pref); 122 static struct fib6_info *rt6_get_route_info(struct net *net, 123 const struct in6_addr *prefix, int prefixlen, 124 const struct in6_addr *gwaddr, 125 struct net_device *dev); 126 #endif 127 128 struct uncached_list { 129 spinlock_t lock; 130 struct list_head head; 131 }; 132 133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); 134 135 void rt6_uncached_list_add(struct rt6_info *rt) 136 { 137 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); 138 139 rt->rt6i_uncached_list = ul; 140 141 spin_lock_bh(&ul->lock); 142 list_add_tail(&rt->rt6i_uncached, &ul->head); 143 spin_unlock_bh(&ul->lock); 144 } 145 146 void rt6_uncached_list_del(struct rt6_info *rt) 147 { 148 if (!list_empty(&rt->rt6i_uncached)) { 149 struct uncached_list *ul = rt->rt6i_uncached_list; 150 struct net *net = dev_net(rt->dst.dev); 151 152 spin_lock_bh(&ul->lock); 153 list_del(&rt->rt6i_uncached); 154 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); 155 spin_unlock_bh(&ul->lock); 156 } 157 } 158 159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) 160 { 161 struct net_device *loopback_dev = net->loopback_dev; 162 int cpu; 163 164 if (dev == loopback_dev) 165 return; 166 167 for_each_possible_cpu(cpu) { 168 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 169 struct rt6_info *rt; 170 171 spin_lock_bh(&ul->lock); 172 list_for_each_entry(rt, &ul->head, rt6i_uncached) { 173 struct inet6_dev *rt_idev = rt->rt6i_idev; 174 struct net_device *rt_dev = rt->dst.dev; 175 176 if (rt_idev->dev == dev) { 177 rt->rt6i_idev = in6_dev_get(loopback_dev); 178 in6_dev_put(rt_idev); 179 } 180 181 if (rt_dev == dev) { 182 rt->dst.dev = loopback_dev; 183 dev_hold(rt->dst.dev); 184 dev_put(rt_dev); 185 } 186 } 187 spin_unlock_bh(&ul->lock); 188 } 189 } 190 191 static inline const void *choose_neigh_daddr(const struct in6_addr *p, 192 struct sk_buff *skb, 193 const void *daddr) 194 { 195 if (!ipv6_addr_any(p)) 196 return (const void *) p; 197 else if (skb) 198 return &ipv6_hdr(skb)->daddr; 199 return daddr; 200 } 201 202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, 203 struct net_device *dev, 204 struct sk_buff *skb, 205 const void *daddr) 206 { 207 struct neighbour *n; 208 209 daddr = choose_neigh_daddr(gw, skb, daddr); 210 n = __ipv6_neigh_lookup(dev, daddr); 211 if (n) 212 return n; 213 return neigh_create(&nd_tbl, daddr, dev); 214 } 215 216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, 217 struct sk_buff *skb, 218 const void *daddr) 219 { 220 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); 221 222 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr); 223 } 224 225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) 226 { 227 struct net_device *dev = dst->dev; 228 struct rt6_info *rt = (struct rt6_info *)dst; 229 230 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); 231 if (!daddr) 232 return; 233 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) 234 return; 235 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) 236 return; 237 __ipv6_confirm_neigh(dev, daddr); 238 } 239 240 static struct dst_ops ip6_dst_ops_template = { 241 .family = AF_INET6, 242 .gc = ip6_dst_gc, 243 .gc_thresh = 1024, 244 .check = ip6_dst_check, 245 .default_advmss = ip6_default_advmss, 246 .mtu = ip6_mtu, 247 .cow_metrics = dst_cow_metrics_generic, 248 .destroy = ip6_dst_destroy, 249 .ifdown = ip6_dst_ifdown, 250 .negative_advice = ip6_negative_advice, 251 .link_failure = ip6_link_failure, 252 .update_pmtu = ip6_rt_update_pmtu, 253 .redirect = rt6_do_redirect, 254 .local_out = __ip6_local_out, 255 .neigh_lookup = ip6_dst_neigh_lookup, 256 .confirm_neigh = ip6_confirm_neigh, 257 }; 258 259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) 260 { 261 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 262 263 return mtu ? : dst->dev->mtu; 264 } 265 266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 267 struct sk_buff *skb, u32 mtu) 268 { 269 } 270 271 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 272 struct sk_buff *skb) 273 { 274 } 275 276 static struct dst_ops ip6_dst_blackhole_ops = { 277 .family = AF_INET6, 278 .destroy = ip6_dst_destroy, 279 .check = ip6_dst_check, 280 .mtu = ip6_blackhole_mtu, 281 .default_advmss = ip6_default_advmss, 282 .update_pmtu = ip6_rt_blackhole_update_pmtu, 283 .redirect = ip6_rt_blackhole_redirect, 284 .cow_metrics = dst_cow_metrics_generic, 285 .neigh_lookup = ip6_dst_neigh_lookup, 286 }; 287 288 static const u32 ip6_template_metrics[RTAX_MAX] = { 289 [RTAX_HOPLIMIT - 1] = 0, 290 }; 291 292 static const struct fib6_info fib6_null_entry_template = { 293 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), 294 .fib6_protocol = RTPROT_KERNEL, 295 .fib6_metric = ~(u32)0, 296 .fib6_ref = ATOMIC_INIT(1), 297 .fib6_type = RTN_UNREACHABLE, 298 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, 299 }; 300 301 static const struct rt6_info ip6_null_entry_template = { 302 .dst = { 303 .__refcnt = ATOMIC_INIT(1), 304 .__use = 1, 305 .obsolete = DST_OBSOLETE_FORCE_CHK, 306 .error = -ENETUNREACH, 307 .input = ip6_pkt_discard, 308 .output = ip6_pkt_discard_out, 309 }, 310 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 311 }; 312 313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 314 315 static const struct rt6_info ip6_prohibit_entry_template = { 316 .dst = { 317 .__refcnt = ATOMIC_INIT(1), 318 .__use = 1, 319 .obsolete = DST_OBSOLETE_FORCE_CHK, 320 .error = -EACCES, 321 .input = ip6_pkt_prohibit, 322 .output = ip6_pkt_prohibit_out, 323 }, 324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 325 }; 326 327 static const struct rt6_info ip6_blk_hole_entry_template = { 328 .dst = { 329 .__refcnt = ATOMIC_INIT(1), 330 .__use = 1, 331 .obsolete = DST_OBSOLETE_FORCE_CHK, 332 .error = -EINVAL, 333 .input = dst_discard, 334 .output = dst_discard_out, 335 }, 336 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 337 }; 338 339 #endif 340 341 static void rt6_info_init(struct rt6_info *rt) 342 { 343 struct dst_entry *dst = &rt->dst; 344 345 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); 346 INIT_LIST_HEAD(&rt->rt6i_uncached); 347 } 348 349 /* allocate dst with ip6_dst_ops */ 350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, 351 int flags) 352 { 353 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 354 1, DST_OBSOLETE_FORCE_CHK, flags); 355 356 if (rt) { 357 rt6_info_init(rt); 358 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 359 } 360 361 return rt; 362 } 363 EXPORT_SYMBOL(ip6_dst_alloc); 364 365 static void ip6_dst_destroy(struct dst_entry *dst) 366 { 367 struct rt6_info *rt = (struct rt6_info *)dst; 368 struct fib6_info *from; 369 struct inet6_dev *idev; 370 371 dst_destroy_metrics_generic(dst); 372 rt6_uncached_list_del(rt); 373 374 idev = rt->rt6i_idev; 375 if (idev) { 376 rt->rt6i_idev = NULL; 377 in6_dev_put(idev); 378 } 379 380 rcu_read_lock(); 381 from = rcu_dereference(rt->from); 382 rcu_assign_pointer(rt->from, NULL); 383 fib6_info_release(from); 384 rcu_read_unlock(); 385 } 386 387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 388 int how) 389 { 390 struct rt6_info *rt = (struct rt6_info *)dst; 391 struct inet6_dev *idev = rt->rt6i_idev; 392 struct net_device *loopback_dev = 393 dev_net(dev)->loopback_dev; 394 395 if (idev && idev->dev != loopback_dev) { 396 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); 397 if (loopback_idev) { 398 rt->rt6i_idev = loopback_idev; 399 in6_dev_put(idev); 400 } 401 } 402 } 403 404 static bool __rt6_check_expired(const struct rt6_info *rt) 405 { 406 if (rt->rt6i_flags & RTF_EXPIRES) 407 return time_after(jiffies, rt->dst.expires); 408 else 409 return false; 410 } 411 412 static bool rt6_check_expired(const struct rt6_info *rt) 413 { 414 struct fib6_info *from; 415 416 from = rcu_dereference(rt->from); 417 418 if (rt->rt6i_flags & RTF_EXPIRES) { 419 if (time_after(jiffies, rt->dst.expires)) 420 return true; 421 } else if (from) { 422 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || 423 fib6_check_expired(from); 424 } 425 return false; 426 } 427 428 struct fib6_info *fib6_multipath_select(const struct net *net, 429 struct fib6_info *match, 430 struct flowi6 *fl6, int oif, 431 const struct sk_buff *skb, 432 int strict) 433 { 434 struct fib6_info *sibling, *next_sibling; 435 436 /* We might have already computed the hash for ICMPv6 errors. In such 437 * case it will always be non-zero. Otherwise now is the time to do it. 438 */ 439 if (!fl6->mp_hash) 440 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); 441 442 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) 443 return match; 444 445 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, 446 fib6_siblings) { 447 int nh_upper_bound; 448 449 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); 450 if (fl6->mp_hash > nh_upper_bound) 451 continue; 452 if (rt6_score_route(sibling, oif, strict) < 0) 453 break; 454 match = sibling; 455 break; 456 } 457 458 return match; 459 } 460 461 /* 462 * Route lookup. rcu_read_lock() should be held. 463 */ 464 465 static inline struct fib6_info *rt6_device_match(struct net *net, 466 struct fib6_info *rt, 467 const struct in6_addr *saddr, 468 int oif, 469 int flags) 470 { 471 struct fib6_info *sprt; 472 473 if (!oif && ipv6_addr_any(saddr) && 474 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) 475 return rt; 476 477 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) { 478 const struct net_device *dev = sprt->fib6_nh.nh_dev; 479 480 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) 481 continue; 482 483 if (oif) { 484 if (dev->ifindex == oif) 485 return sprt; 486 } else { 487 if (ipv6_chk_addr(net, saddr, dev, 488 flags & RT6_LOOKUP_F_IFACE)) 489 return sprt; 490 } 491 } 492 493 if (oif && flags & RT6_LOOKUP_F_IFACE) 494 return net->ipv6.fib6_null_entry; 495 496 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt; 497 } 498 499 #ifdef CONFIG_IPV6_ROUTER_PREF 500 struct __rt6_probe_work { 501 struct work_struct work; 502 struct in6_addr target; 503 struct net_device *dev; 504 }; 505 506 static void rt6_probe_deferred(struct work_struct *w) 507 { 508 struct in6_addr mcaddr; 509 struct __rt6_probe_work *work = 510 container_of(w, struct __rt6_probe_work, work); 511 512 addrconf_addr_solict_mult(&work->target, &mcaddr); 513 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); 514 dev_put(work->dev); 515 kfree(work); 516 } 517 518 static void rt6_probe(struct fib6_info *rt) 519 { 520 struct __rt6_probe_work *work; 521 const struct in6_addr *nh_gw; 522 struct neighbour *neigh; 523 struct net_device *dev; 524 525 /* 526 * Okay, this does not seem to be appropriate 527 * for now, however, we need to check if it 528 * is really so; aka Router Reachability Probing. 529 * 530 * Router Reachability Probe MUST be rate-limited 531 * to no more than one per minute. 532 */ 533 if (!rt || !(rt->fib6_flags & RTF_GATEWAY)) 534 return; 535 536 nh_gw = &rt->fib6_nh.nh_gw; 537 dev = rt->fib6_nh.nh_dev; 538 rcu_read_lock_bh(); 539 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); 540 if (neigh) { 541 struct inet6_dev *idev; 542 543 if (neigh->nud_state & NUD_VALID) 544 goto out; 545 546 idev = __in6_dev_get(dev); 547 work = NULL; 548 write_lock(&neigh->lock); 549 if (!(neigh->nud_state & NUD_VALID) && 550 time_after(jiffies, 551 neigh->updated + idev->cnf.rtr_probe_interval)) { 552 work = kmalloc(sizeof(*work), GFP_ATOMIC); 553 if (work) 554 __neigh_set_probe_once(neigh); 555 } 556 write_unlock(&neigh->lock); 557 } else { 558 work = kmalloc(sizeof(*work), GFP_ATOMIC); 559 } 560 561 if (work) { 562 INIT_WORK(&work->work, rt6_probe_deferred); 563 work->target = *nh_gw; 564 dev_hold(dev); 565 work->dev = dev; 566 schedule_work(&work->work); 567 } 568 569 out: 570 rcu_read_unlock_bh(); 571 } 572 #else 573 static inline void rt6_probe(struct fib6_info *rt) 574 { 575 } 576 #endif 577 578 /* 579 * Default Router Selection (RFC 2461 6.3.6) 580 */ 581 static inline int rt6_check_dev(struct fib6_info *rt, int oif) 582 { 583 const struct net_device *dev = rt->fib6_nh.nh_dev; 584 585 if (!oif || dev->ifindex == oif) 586 return 2; 587 return 0; 588 } 589 590 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) 591 { 592 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; 593 struct neighbour *neigh; 594 595 if (rt->fib6_flags & RTF_NONEXTHOP || 596 !(rt->fib6_flags & RTF_GATEWAY)) 597 return RT6_NUD_SUCCEED; 598 599 rcu_read_lock_bh(); 600 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev, 601 &rt->fib6_nh.nh_gw); 602 if (neigh) { 603 read_lock(&neigh->lock); 604 if (neigh->nud_state & NUD_VALID) 605 ret = RT6_NUD_SUCCEED; 606 #ifdef CONFIG_IPV6_ROUTER_PREF 607 else if (!(neigh->nud_state & NUD_FAILED)) 608 ret = RT6_NUD_SUCCEED; 609 else 610 ret = RT6_NUD_FAIL_PROBE; 611 #endif 612 read_unlock(&neigh->lock); 613 } else { 614 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? 615 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; 616 } 617 rcu_read_unlock_bh(); 618 619 return ret; 620 } 621 622 static int rt6_score_route(struct fib6_info *rt, int oif, int strict) 623 { 624 int m; 625 626 m = rt6_check_dev(rt, oif); 627 if (!m && (strict & RT6_LOOKUP_F_IFACE)) 628 return RT6_NUD_FAIL_HARD; 629 #ifdef CONFIG_IPV6_ROUTER_PREF 630 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; 631 #endif 632 if (strict & RT6_LOOKUP_F_REACHABLE) { 633 int n = rt6_check_neigh(rt); 634 if (n < 0) 635 return n; 636 } 637 return m; 638 } 639 640 /* called with rc_read_lock held */ 641 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i) 642 { 643 const struct net_device *dev = fib6_info_nh_dev(f6i); 644 bool rc = false; 645 646 if (dev) { 647 const struct inet6_dev *idev = __in6_dev_get(dev); 648 649 rc = !!idev->cnf.ignore_routes_with_linkdown; 650 } 651 652 return rc; 653 } 654 655 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, 656 int *mpri, struct fib6_info *match, 657 bool *do_rr) 658 { 659 int m; 660 bool match_do_rr = false; 661 662 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 663 goto out; 664 665 if (fib6_ignore_linkdown(rt) && 666 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 667 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) 668 goto out; 669 670 if (fib6_check_expired(rt)) 671 goto out; 672 673 m = rt6_score_route(rt, oif, strict); 674 if (m == RT6_NUD_FAIL_DO_RR) { 675 match_do_rr = true; 676 m = 0; /* lowest valid score */ 677 } else if (m == RT6_NUD_FAIL_HARD) { 678 goto out; 679 } 680 681 if (strict & RT6_LOOKUP_F_REACHABLE) 682 rt6_probe(rt); 683 684 /* note that m can be RT6_NUD_FAIL_PROBE at this point */ 685 if (m > *mpri) { 686 *do_rr = match_do_rr; 687 *mpri = m; 688 match = rt; 689 } 690 out: 691 return match; 692 } 693 694 static struct fib6_info *find_rr_leaf(struct fib6_node *fn, 695 struct fib6_info *leaf, 696 struct fib6_info *rr_head, 697 u32 metric, int oif, int strict, 698 bool *do_rr) 699 { 700 struct fib6_info *rt, *match, *cont; 701 int mpri = -1; 702 703 match = NULL; 704 cont = NULL; 705 for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) { 706 if (rt->fib6_metric != metric) { 707 cont = rt; 708 break; 709 } 710 711 match = find_match(rt, oif, strict, &mpri, match, do_rr); 712 } 713 714 for (rt = leaf; rt && rt != rr_head; 715 rt = rcu_dereference(rt->fib6_next)) { 716 if (rt->fib6_metric != metric) { 717 cont = rt; 718 break; 719 } 720 721 match = find_match(rt, oif, strict, &mpri, match, do_rr); 722 } 723 724 if (match || !cont) 725 return match; 726 727 for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next)) 728 match = find_match(rt, oif, strict, &mpri, match, do_rr); 729 730 return match; 731 } 732 733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, 734 int oif, int strict) 735 { 736 struct fib6_info *leaf = rcu_dereference(fn->leaf); 737 struct fib6_info *match, *rt0; 738 bool do_rr = false; 739 int key_plen; 740 741 if (!leaf || leaf == net->ipv6.fib6_null_entry) 742 return net->ipv6.fib6_null_entry; 743 744 rt0 = rcu_dereference(fn->rr_ptr); 745 if (!rt0) 746 rt0 = leaf; 747 748 /* Double check to make sure fn is not an intermediate node 749 * and fn->leaf does not points to its child's leaf 750 * (This might happen if all routes under fn are deleted from 751 * the tree and fib6_repair_tree() is called on the node.) 752 */ 753 key_plen = rt0->fib6_dst.plen; 754 #ifdef CONFIG_IPV6_SUBTREES 755 if (rt0->fib6_src.plen) 756 key_plen = rt0->fib6_src.plen; 757 #endif 758 if (fn->fn_bit != key_plen) 759 return net->ipv6.fib6_null_entry; 760 761 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, 762 &do_rr); 763 764 if (do_rr) { 765 struct fib6_info *next = rcu_dereference(rt0->fib6_next); 766 767 /* no entries matched; do round-robin */ 768 if (!next || next->fib6_metric != rt0->fib6_metric) 769 next = leaf; 770 771 if (next != rt0) { 772 spin_lock_bh(&leaf->fib6_table->tb6_lock); 773 /* make sure next is not being deleted from the tree */ 774 if (next->fib6_node) 775 rcu_assign_pointer(fn->rr_ptr, next); 776 spin_unlock_bh(&leaf->fib6_table->tb6_lock); 777 } 778 } 779 780 return match ? match : net->ipv6.fib6_null_entry; 781 } 782 783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) 784 { 785 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); 786 } 787 788 #ifdef CONFIG_IPV6_ROUTE_INFO 789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, 790 const struct in6_addr *gwaddr) 791 { 792 struct net *net = dev_net(dev); 793 struct route_info *rinfo = (struct route_info *) opt; 794 struct in6_addr prefix_buf, *prefix; 795 unsigned int pref; 796 unsigned long lifetime; 797 struct fib6_info *rt; 798 799 if (len < sizeof(struct route_info)) { 800 return -EINVAL; 801 } 802 803 /* Sanity check for prefix_len and length */ 804 if (rinfo->length > 3) { 805 return -EINVAL; 806 } else if (rinfo->prefix_len > 128) { 807 return -EINVAL; 808 } else if (rinfo->prefix_len > 64) { 809 if (rinfo->length < 2) { 810 return -EINVAL; 811 } 812 } else if (rinfo->prefix_len > 0) { 813 if (rinfo->length < 1) { 814 return -EINVAL; 815 } 816 } 817 818 pref = rinfo->route_pref; 819 if (pref == ICMPV6_ROUTER_PREF_INVALID) 820 return -EINVAL; 821 822 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); 823 824 if (rinfo->length == 3) 825 prefix = (struct in6_addr *)rinfo->prefix; 826 else { 827 /* this function is safe */ 828 ipv6_addr_prefix(&prefix_buf, 829 (struct in6_addr *)rinfo->prefix, 830 rinfo->prefix_len); 831 prefix = &prefix_buf; 832 } 833 834 if (rinfo->prefix_len == 0) 835 rt = rt6_get_dflt_router(net, gwaddr, dev); 836 else 837 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, 838 gwaddr, dev); 839 840 if (rt && !lifetime) { 841 ip6_del_rt(net, rt); 842 rt = NULL; 843 } 844 845 if (!rt && lifetime) 846 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, 847 dev, pref); 848 else if (rt) 849 rt->fib6_flags = RTF_ROUTEINFO | 850 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 851 852 if (rt) { 853 if (!addrconf_finite_timeout(lifetime)) 854 fib6_clean_expires(rt); 855 else 856 fib6_set_expires(rt, jiffies + HZ * lifetime); 857 858 fib6_info_release(rt); 859 } 860 return 0; 861 } 862 #endif 863 864 /* 865 * Misc support functions 866 */ 867 868 /* called with rcu_lock held */ 869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) 870 { 871 struct net_device *dev = rt->fib6_nh.nh_dev; 872 873 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { 874 /* for copies of local routes, dst->dev needs to be the 875 * device if it is a master device, the master device if 876 * device is enslaved, and the loopback as the default 877 */ 878 if (netif_is_l3_slave(dev) && 879 !rt6_need_strict(&rt->fib6_dst.addr)) 880 dev = l3mdev_master_dev_rcu(dev); 881 else if (!netif_is_l3_master(dev)) 882 dev = dev_net(dev)->loopback_dev; 883 /* last case is netif_is_l3_master(dev) is true in which 884 * case we want dev returned to be dev 885 */ 886 } 887 888 return dev; 889 } 890 891 static const int fib6_prop[RTN_MAX + 1] = { 892 [RTN_UNSPEC] = 0, 893 [RTN_UNICAST] = 0, 894 [RTN_LOCAL] = 0, 895 [RTN_BROADCAST] = 0, 896 [RTN_ANYCAST] = 0, 897 [RTN_MULTICAST] = 0, 898 [RTN_BLACKHOLE] = -EINVAL, 899 [RTN_UNREACHABLE] = -EHOSTUNREACH, 900 [RTN_PROHIBIT] = -EACCES, 901 [RTN_THROW] = -EAGAIN, 902 [RTN_NAT] = -EINVAL, 903 [RTN_XRESOLVE] = -EINVAL, 904 }; 905 906 static int ip6_rt_type_to_error(u8 fib6_type) 907 { 908 return fib6_prop[fib6_type]; 909 } 910 911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt) 912 { 913 unsigned short flags = 0; 914 915 if (rt->dst_nocount) 916 flags |= DST_NOCOUNT; 917 if (rt->dst_nopolicy) 918 flags |= DST_NOPOLICY; 919 if (rt->dst_host) 920 flags |= DST_HOST; 921 922 return flags; 923 } 924 925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) 926 { 927 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); 928 929 switch (ort->fib6_type) { 930 case RTN_BLACKHOLE: 931 rt->dst.output = dst_discard_out; 932 rt->dst.input = dst_discard; 933 break; 934 case RTN_PROHIBIT: 935 rt->dst.output = ip6_pkt_prohibit_out; 936 rt->dst.input = ip6_pkt_prohibit; 937 break; 938 case RTN_THROW: 939 case RTN_UNREACHABLE: 940 default: 941 rt->dst.output = ip6_pkt_discard_out; 942 rt->dst.input = ip6_pkt_discard; 943 break; 944 } 945 } 946 947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) 948 { 949 rt->dst.flags |= fib6_info_dst_flags(ort); 950 951 if (ort->fib6_flags & RTF_REJECT) { 952 ip6_rt_init_dst_reject(rt, ort); 953 return; 954 } 955 956 rt->dst.error = 0; 957 rt->dst.output = ip6_output; 958 959 if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) { 960 rt->dst.input = ip6_input; 961 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { 962 rt->dst.input = ip6_mc_input; 963 } else { 964 rt->dst.input = ip6_forward; 965 } 966 967 if (ort->fib6_nh.nh_lwtstate) { 968 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); 969 lwtunnel_set_redirect(&rt->dst); 970 } 971 972 rt->dst.lastuse = jiffies; 973 } 974 975 /* Caller must already hold reference to @from */ 976 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) 977 { 978 rt->rt6i_flags &= ~RTF_EXPIRES; 979 rcu_assign_pointer(rt->from, from); 980 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); 981 } 982 983 /* Caller must already hold reference to @ort */ 984 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) 985 { 986 struct net_device *dev = fib6_info_nh_dev(ort); 987 988 ip6_rt_init_dst(rt, ort); 989 990 rt->rt6i_dst = ort->fib6_dst; 991 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; 992 rt->rt6i_gateway = ort->fib6_nh.nh_gw; 993 rt->rt6i_flags = ort->fib6_flags; 994 rt6_set_from(rt, ort); 995 #ifdef CONFIG_IPV6_SUBTREES 996 rt->rt6i_src = ort->fib6_src; 997 #endif 998 } 999 1000 static struct fib6_node* fib6_backtrack(struct fib6_node *fn, 1001 struct in6_addr *saddr) 1002 { 1003 struct fib6_node *pn, *sn; 1004 while (1) { 1005 if (fn->fn_flags & RTN_TL_ROOT) 1006 return NULL; 1007 pn = rcu_dereference(fn->parent); 1008 sn = FIB6_SUBTREE(pn); 1009 if (sn && sn != fn) 1010 fn = fib6_node_lookup(sn, NULL, saddr); 1011 else 1012 fn = pn; 1013 if (fn->fn_flags & RTN_RTINFO) 1014 return fn; 1015 } 1016 } 1017 1018 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, 1019 bool null_fallback) 1020 { 1021 struct rt6_info *rt = *prt; 1022 1023 if (dst_hold_safe(&rt->dst)) 1024 return true; 1025 if (null_fallback) { 1026 rt = net->ipv6.ip6_null_entry; 1027 dst_hold(&rt->dst); 1028 } else { 1029 rt = NULL; 1030 } 1031 *prt = rt; 1032 return false; 1033 } 1034 1035 /* called with rcu_lock held */ 1036 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) 1037 { 1038 unsigned short flags = fib6_info_dst_flags(rt); 1039 struct net_device *dev = rt->fib6_nh.nh_dev; 1040 struct rt6_info *nrt; 1041 1042 if (!fib6_info_hold_safe(rt)) 1043 return NULL; 1044 1045 nrt = ip6_dst_alloc(dev_net(dev), dev, flags); 1046 if (nrt) 1047 ip6_rt_copy_init(nrt, rt); 1048 else 1049 fib6_info_release(rt); 1050 1051 return nrt; 1052 } 1053 1054 static struct rt6_info *ip6_pol_route_lookup(struct net *net, 1055 struct fib6_table *table, 1056 struct flowi6 *fl6, 1057 const struct sk_buff *skb, 1058 int flags) 1059 { 1060 struct fib6_info *f6i; 1061 struct fib6_node *fn; 1062 struct rt6_info *rt; 1063 1064 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1065 flags &= ~RT6_LOOKUP_F_IFACE; 1066 1067 rcu_read_lock(); 1068 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1069 restart: 1070 f6i = rcu_dereference(fn->leaf); 1071 if (!f6i) { 1072 f6i = net->ipv6.fib6_null_entry; 1073 } else { 1074 f6i = rt6_device_match(net, f6i, &fl6->saddr, 1075 fl6->flowi6_oif, flags); 1076 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) 1077 f6i = fib6_multipath_select(net, f6i, fl6, 1078 fl6->flowi6_oif, skb, 1079 flags); 1080 } 1081 if (f6i == net->ipv6.fib6_null_entry) { 1082 fn = fib6_backtrack(fn, &fl6->saddr); 1083 if (fn) 1084 goto restart; 1085 } 1086 1087 trace_fib6_table_lookup(net, f6i, table, fl6); 1088 1089 /* Search through exception table */ 1090 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1091 if (rt) { 1092 if (ip6_hold_safe(net, &rt, true)) 1093 dst_use_noref(&rt->dst, jiffies); 1094 } else if (f6i == net->ipv6.fib6_null_entry) { 1095 rt = net->ipv6.ip6_null_entry; 1096 dst_hold(&rt->dst); 1097 } else { 1098 rt = ip6_create_rt_rcu(f6i); 1099 if (!rt) { 1100 rt = net->ipv6.ip6_null_entry; 1101 dst_hold(&rt->dst); 1102 } 1103 } 1104 1105 rcu_read_unlock(); 1106 1107 return rt; 1108 } 1109 1110 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, 1111 const struct sk_buff *skb, int flags) 1112 { 1113 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); 1114 } 1115 EXPORT_SYMBOL_GPL(ip6_route_lookup); 1116 1117 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, 1118 const struct in6_addr *saddr, int oif, 1119 const struct sk_buff *skb, int strict) 1120 { 1121 struct flowi6 fl6 = { 1122 .flowi6_oif = oif, 1123 .daddr = *daddr, 1124 }; 1125 struct dst_entry *dst; 1126 int flags = strict ? RT6_LOOKUP_F_IFACE : 0; 1127 1128 if (saddr) { 1129 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 1130 flags |= RT6_LOOKUP_F_HAS_SADDR; 1131 } 1132 1133 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); 1134 if (dst->error == 0) 1135 return (struct rt6_info *) dst; 1136 1137 dst_release(dst); 1138 1139 return NULL; 1140 } 1141 EXPORT_SYMBOL(rt6_lookup); 1142 1143 /* ip6_ins_rt is called with FREE table->tb6_lock. 1144 * It takes new route entry, the addition fails by any reason the 1145 * route is released. 1146 * Caller must hold dst before calling it. 1147 */ 1148 1149 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, 1150 struct netlink_ext_ack *extack) 1151 { 1152 int err; 1153 struct fib6_table *table; 1154 1155 table = rt->fib6_table; 1156 spin_lock_bh(&table->tb6_lock); 1157 err = fib6_add(&table->tb6_root, rt, info, extack); 1158 spin_unlock_bh(&table->tb6_lock); 1159 1160 return err; 1161 } 1162 1163 int ip6_ins_rt(struct net *net, struct fib6_info *rt) 1164 { 1165 struct nl_info info = { .nl_net = net, }; 1166 1167 return __ip6_ins_rt(rt, &info, NULL); 1168 } 1169 1170 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, 1171 const struct in6_addr *daddr, 1172 const struct in6_addr *saddr) 1173 { 1174 struct net_device *dev; 1175 struct rt6_info *rt; 1176 1177 /* 1178 * Clone the route. 1179 */ 1180 1181 if (!fib6_info_hold_safe(ort)) 1182 return NULL; 1183 1184 dev = ip6_rt_get_dev_rcu(ort); 1185 rt = ip6_dst_alloc(dev_net(dev), dev, 0); 1186 if (!rt) { 1187 fib6_info_release(ort); 1188 return NULL; 1189 } 1190 1191 ip6_rt_copy_init(rt, ort); 1192 rt->rt6i_flags |= RTF_CACHE; 1193 rt->dst.flags |= DST_HOST; 1194 rt->rt6i_dst.addr = *daddr; 1195 rt->rt6i_dst.plen = 128; 1196 1197 if (!rt6_is_gw_or_nonexthop(ort)) { 1198 if (ort->fib6_dst.plen != 128 && 1199 ipv6_addr_equal(&ort->fib6_dst.addr, daddr)) 1200 rt->rt6i_flags |= RTF_ANYCAST; 1201 #ifdef CONFIG_IPV6_SUBTREES 1202 if (rt->rt6i_src.plen && saddr) { 1203 rt->rt6i_src.addr = *saddr; 1204 rt->rt6i_src.plen = 128; 1205 } 1206 #endif 1207 } 1208 1209 return rt; 1210 } 1211 1212 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) 1213 { 1214 unsigned short flags = fib6_info_dst_flags(rt); 1215 struct net_device *dev; 1216 struct rt6_info *pcpu_rt; 1217 1218 if (!fib6_info_hold_safe(rt)) 1219 return NULL; 1220 1221 rcu_read_lock(); 1222 dev = ip6_rt_get_dev_rcu(rt); 1223 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); 1224 rcu_read_unlock(); 1225 if (!pcpu_rt) { 1226 fib6_info_release(rt); 1227 return NULL; 1228 } 1229 ip6_rt_copy_init(pcpu_rt, rt); 1230 pcpu_rt->rt6i_flags |= RTF_PCPU; 1231 return pcpu_rt; 1232 } 1233 1234 /* It should be called with rcu_read_lock() acquired */ 1235 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) 1236 { 1237 struct rt6_info *pcpu_rt, **p; 1238 1239 p = this_cpu_ptr(rt->rt6i_pcpu); 1240 pcpu_rt = *p; 1241 1242 if (pcpu_rt) 1243 ip6_hold_safe(NULL, &pcpu_rt, false); 1244 1245 return pcpu_rt; 1246 } 1247 1248 static struct rt6_info *rt6_make_pcpu_route(struct net *net, 1249 struct fib6_info *rt) 1250 { 1251 struct rt6_info *pcpu_rt, *prev, **p; 1252 1253 pcpu_rt = ip6_rt_pcpu_alloc(rt); 1254 if (!pcpu_rt) { 1255 dst_hold(&net->ipv6.ip6_null_entry->dst); 1256 return net->ipv6.ip6_null_entry; 1257 } 1258 1259 dst_hold(&pcpu_rt->dst); 1260 p = this_cpu_ptr(rt->rt6i_pcpu); 1261 prev = cmpxchg(p, NULL, pcpu_rt); 1262 BUG_ON(prev); 1263 1264 return pcpu_rt; 1265 } 1266 1267 /* exception hash table implementation 1268 */ 1269 static DEFINE_SPINLOCK(rt6_exception_lock); 1270 1271 /* Remove rt6_ex from hash table and free the memory 1272 * Caller must hold rt6_exception_lock 1273 */ 1274 static void rt6_remove_exception(struct rt6_exception_bucket *bucket, 1275 struct rt6_exception *rt6_ex) 1276 { 1277 struct net *net; 1278 1279 if (!bucket || !rt6_ex) 1280 return; 1281 1282 net = dev_net(rt6_ex->rt6i->dst.dev); 1283 hlist_del_rcu(&rt6_ex->hlist); 1284 dst_release(&rt6_ex->rt6i->dst); 1285 kfree_rcu(rt6_ex, rcu); 1286 WARN_ON_ONCE(!bucket->depth); 1287 bucket->depth--; 1288 net->ipv6.rt6_stats->fib_rt_cache--; 1289 } 1290 1291 /* Remove oldest rt6_ex in bucket and free the memory 1292 * Caller must hold rt6_exception_lock 1293 */ 1294 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) 1295 { 1296 struct rt6_exception *rt6_ex, *oldest = NULL; 1297 1298 if (!bucket) 1299 return; 1300 1301 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1302 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) 1303 oldest = rt6_ex; 1304 } 1305 rt6_remove_exception(bucket, oldest); 1306 } 1307 1308 static u32 rt6_exception_hash(const struct in6_addr *dst, 1309 const struct in6_addr *src) 1310 { 1311 static u32 seed __read_mostly; 1312 u32 val; 1313 1314 net_get_random_once(&seed, sizeof(seed)); 1315 val = jhash(dst, sizeof(*dst), seed); 1316 1317 #ifdef CONFIG_IPV6_SUBTREES 1318 if (src) 1319 val = jhash(src, sizeof(*src), val); 1320 #endif 1321 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); 1322 } 1323 1324 /* Helper function to find the cached rt in the hash table 1325 * and update bucket pointer to point to the bucket for this 1326 * (daddr, saddr) pair 1327 * Caller must hold rt6_exception_lock 1328 */ 1329 static struct rt6_exception * 1330 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, 1331 const struct in6_addr *daddr, 1332 const struct in6_addr *saddr) 1333 { 1334 struct rt6_exception *rt6_ex; 1335 u32 hval; 1336 1337 if (!(*bucket) || !daddr) 1338 return NULL; 1339 1340 hval = rt6_exception_hash(daddr, saddr); 1341 *bucket += hval; 1342 1343 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { 1344 struct rt6_info *rt6 = rt6_ex->rt6i; 1345 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1346 1347 #ifdef CONFIG_IPV6_SUBTREES 1348 if (matched && saddr) 1349 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1350 #endif 1351 if (matched) 1352 return rt6_ex; 1353 } 1354 return NULL; 1355 } 1356 1357 /* Helper function to find the cached rt in the hash table 1358 * and update bucket pointer to point to the bucket for this 1359 * (daddr, saddr) pair 1360 * Caller must hold rcu_read_lock() 1361 */ 1362 static struct rt6_exception * 1363 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, 1364 const struct in6_addr *daddr, 1365 const struct in6_addr *saddr) 1366 { 1367 struct rt6_exception *rt6_ex; 1368 u32 hval; 1369 1370 WARN_ON_ONCE(!rcu_read_lock_held()); 1371 1372 if (!(*bucket) || !daddr) 1373 return NULL; 1374 1375 hval = rt6_exception_hash(daddr, saddr); 1376 *bucket += hval; 1377 1378 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { 1379 struct rt6_info *rt6 = rt6_ex->rt6i; 1380 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1381 1382 #ifdef CONFIG_IPV6_SUBTREES 1383 if (matched && saddr) 1384 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1385 #endif 1386 if (matched) 1387 return rt6_ex; 1388 } 1389 return NULL; 1390 } 1391 1392 static unsigned int fib6_mtu(const struct fib6_info *rt) 1393 { 1394 unsigned int mtu; 1395 1396 if (rt->fib6_pmtu) { 1397 mtu = rt->fib6_pmtu; 1398 } else { 1399 struct net_device *dev = fib6_info_nh_dev(rt); 1400 struct inet6_dev *idev; 1401 1402 rcu_read_lock(); 1403 idev = __in6_dev_get(dev); 1404 mtu = idev->cnf.mtu6; 1405 rcu_read_unlock(); 1406 } 1407 1408 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 1409 1410 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); 1411 } 1412 1413 static int rt6_insert_exception(struct rt6_info *nrt, 1414 struct fib6_info *ort) 1415 { 1416 struct net *net = dev_net(nrt->dst.dev); 1417 struct rt6_exception_bucket *bucket; 1418 struct in6_addr *src_key = NULL; 1419 struct rt6_exception *rt6_ex; 1420 int err = 0; 1421 1422 spin_lock_bh(&rt6_exception_lock); 1423 1424 if (ort->exception_bucket_flushed) { 1425 err = -EINVAL; 1426 goto out; 1427 } 1428 1429 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, 1430 lockdep_is_held(&rt6_exception_lock)); 1431 if (!bucket) { 1432 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), 1433 GFP_ATOMIC); 1434 if (!bucket) { 1435 err = -ENOMEM; 1436 goto out; 1437 } 1438 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); 1439 } 1440 1441 #ifdef CONFIG_IPV6_SUBTREES 1442 /* rt6i_src.plen != 0 indicates ort is in subtree 1443 * and exception table is indexed by a hash of 1444 * both rt6i_dst and rt6i_src. 1445 * Otherwise, the exception table is indexed by 1446 * a hash of only rt6i_dst. 1447 */ 1448 if (ort->fib6_src.plen) 1449 src_key = &nrt->rt6i_src.addr; 1450 #endif 1451 /* rt6_mtu_change() might lower mtu on ort. 1452 * Only insert this exception route if its mtu 1453 * is less than ort's mtu value. 1454 */ 1455 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { 1456 err = -EINVAL; 1457 goto out; 1458 } 1459 1460 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, 1461 src_key); 1462 if (rt6_ex) 1463 rt6_remove_exception(bucket, rt6_ex); 1464 1465 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); 1466 if (!rt6_ex) { 1467 err = -ENOMEM; 1468 goto out; 1469 } 1470 rt6_ex->rt6i = nrt; 1471 rt6_ex->stamp = jiffies; 1472 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); 1473 bucket->depth++; 1474 net->ipv6.rt6_stats->fib_rt_cache++; 1475 1476 if (bucket->depth > FIB6_MAX_DEPTH) 1477 rt6_exception_remove_oldest(bucket); 1478 1479 out: 1480 spin_unlock_bh(&rt6_exception_lock); 1481 1482 /* Update fn->fn_sernum to invalidate all cached dst */ 1483 if (!err) { 1484 spin_lock_bh(&ort->fib6_table->tb6_lock); 1485 fib6_update_sernum(net, ort); 1486 spin_unlock_bh(&ort->fib6_table->tb6_lock); 1487 fib6_force_start_gc(net); 1488 } 1489 1490 return err; 1491 } 1492 1493 void rt6_flush_exceptions(struct fib6_info *rt) 1494 { 1495 struct rt6_exception_bucket *bucket; 1496 struct rt6_exception *rt6_ex; 1497 struct hlist_node *tmp; 1498 int i; 1499 1500 spin_lock_bh(&rt6_exception_lock); 1501 /* Prevent rt6_insert_exception() to recreate the bucket list */ 1502 rt->exception_bucket_flushed = 1; 1503 1504 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1505 lockdep_is_held(&rt6_exception_lock)); 1506 if (!bucket) 1507 goto out; 1508 1509 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1510 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) 1511 rt6_remove_exception(bucket, rt6_ex); 1512 WARN_ON_ONCE(bucket->depth); 1513 bucket++; 1514 } 1515 1516 out: 1517 spin_unlock_bh(&rt6_exception_lock); 1518 } 1519 1520 /* Find cached rt in the hash table inside passed in rt 1521 * Caller has to hold rcu_read_lock() 1522 */ 1523 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 1524 struct in6_addr *daddr, 1525 struct in6_addr *saddr) 1526 { 1527 struct rt6_exception_bucket *bucket; 1528 struct in6_addr *src_key = NULL; 1529 struct rt6_exception *rt6_ex; 1530 struct rt6_info *res = NULL; 1531 1532 bucket = rcu_dereference(rt->rt6i_exception_bucket); 1533 1534 #ifdef CONFIG_IPV6_SUBTREES 1535 /* rt6i_src.plen != 0 indicates rt is in subtree 1536 * and exception table is indexed by a hash of 1537 * both rt6i_dst and rt6i_src. 1538 * Otherwise, the exception table is indexed by 1539 * a hash of only rt6i_dst. 1540 */ 1541 if (rt->fib6_src.plen) 1542 src_key = saddr; 1543 #endif 1544 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 1545 1546 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 1547 res = rt6_ex->rt6i; 1548 1549 return res; 1550 } 1551 1552 /* Remove the passed in cached rt from the hash table that contains it */ 1553 static int rt6_remove_exception_rt(struct rt6_info *rt) 1554 { 1555 struct rt6_exception_bucket *bucket; 1556 struct in6_addr *src_key = NULL; 1557 struct rt6_exception *rt6_ex; 1558 struct fib6_info *from; 1559 int err; 1560 1561 from = rcu_dereference(rt->from); 1562 if (!from || 1563 !(rt->rt6i_flags & RTF_CACHE)) 1564 return -EINVAL; 1565 1566 if (!rcu_access_pointer(from->rt6i_exception_bucket)) 1567 return -ENOENT; 1568 1569 spin_lock_bh(&rt6_exception_lock); 1570 bucket = rcu_dereference_protected(from->rt6i_exception_bucket, 1571 lockdep_is_held(&rt6_exception_lock)); 1572 #ifdef CONFIG_IPV6_SUBTREES 1573 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1574 * and exception table is indexed by a hash of 1575 * both rt6i_dst and rt6i_src. 1576 * Otherwise, the exception table is indexed by 1577 * a hash of only rt6i_dst. 1578 */ 1579 if (from->fib6_src.plen) 1580 src_key = &rt->rt6i_src.addr; 1581 #endif 1582 rt6_ex = __rt6_find_exception_spinlock(&bucket, 1583 &rt->rt6i_dst.addr, 1584 src_key); 1585 if (rt6_ex) { 1586 rt6_remove_exception(bucket, rt6_ex); 1587 err = 0; 1588 } else { 1589 err = -ENOENT; 1590 } 1591 1592 spin_unlock_bh(&rt6_exception_lock); 1593 return err; 1594 } 1595 1596 /* Find rt6_ex which contains the passed in rt cache and 1597 * refresh its stamp 1598 */ 1599 static void rt6_update_exception_stamp_rt(struct rt6_info *rt) 1600 { 1601 struct rt6_exception_bucket *bucket; 1602 struct fib6_info *from = rt->from; 1603 struct in6_addr *src_key = NULL; 1604 struct rt6_exception *rt6_ex; 1605 1606 if (!from || 1607 !(rt->rt6i_flags & RTF_CACHE)) 1608 return; 1609 1610 rcu_read_lock(); 1611 bucket = rcu_dereference(from->rt6i_exception_bucket); 1612 1613 #ifdef CONFIG_IPV6_SUBTREES 1614 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1615 * and exception table is indexed by a hash of 1616 * both rt6i_dst and rt6i_src. 1617 * Otherwise, the exception table is indexed by 1618 * a hash of only rt6i_dst. 1619 */ 1620 if (from->fib6_src.plen) 1621 src_key = &rt->rt6i_src.addr; 1622 #endif 1623 rt6_ex = __rt6_find_exception_rcu(&bucket, 1624 &rt->rt6i_dst.addr, 1625 src_key); 1626 if (rt6_ex) 1627 rt6_ex->stamp = jiffies; 1628 1629 rcu_read_unlock(); 1630 } 1631 1632 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, 1633 struct rt6_info *rt, int mtu) 1634 { 1635 /* If the new MTU is lower than the route PMTU, this new MTU will be the 1636 * lowest MTU in the path: always allow updating the route PMTU to 1637 * reflect PMTU decreases. 1638 * 1639 * If the new MTU is higher, and the route PMTU is equal to the local 1640 * MTU, this means the old MTU is the lowest in the path, so allow 1641 * updating it: if other nodes now have lower MTUs, PMTU discovery will 1642 * handle this. 1643 */ 1644 1645 if (dst_mtu(&rt->dst) >= mtu) 1646 return true; 1647 1648 if (dst_mtu(&rt->dst) == idev->cnf.mtu6) 1649 return true; 1650 1651 return false; 1652 } 1653 1654 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, 1655 struct fib6_info *rt, int mtu) 1656 { 1657 struct rt6_exception_bucket *bucket; 1658 struct rt6_exception *rt6_ex; 1659 int i; 1660 1661 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1662 lockdep_is_held(&rt6_exception_lock)); 1663 1664 if (!bucket) 1665 return; 1666 1667 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1668 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1669 struct rt6_info *entry = rt6_ex->rt6i; 1670 1671 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected 1672 * route), the metrics of its rt->from have already 1673 * been updated. 1674 */ 1675 if (dst_metric_raw(&entry->dst, RTAX_MTU) && 1676 rt6_mtu_change_route_allowed(idev, entry, mtu)) 1677 dst_metric_set(&entry->dst, RTAX_MTU, mtu); 1678 } 1679 bucket++; 1680 } 1681 } 1682 1683 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 1684 1685 static void rt6_exceptions_clean_tohost(struct fib6_info *rt, 1686 struct in6_addr *gateway) 1687 { 1688 struct rt6_exception_bucket *bucket; 1689 struct rt6_exception *rt6_ex; 1690 struct hlist_node *tmp; 1691 int i; 1692 1693 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1694 return; 1695 1696 spin_lock_bh(&rt6_exception_lock); 1697 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1698 lockdep_is_held(&rt6_exception_lock)); 1699 1700 if (bucket) { 1701 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1702 hlist_for_each_entry_safe(rt6_ex, tmp, 1703 &bucket->chain, hlist) { 1704 struct rt6_info *entry = rt6_ex->rt6i; 1705 1706 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == 1707 RTF_CACHE_GATEWAY && 1708 ipv6_addr_equal(gateway, 1709 &entry->rt6i_gateway)) { 1710 rt6_remove_exception(bucket, rt6_ex); 1711 } 1712 } 1713 bucket++; 1714 } 1715 } 1716 1717 spin_unlock_bh(&rt6_exception_lock); 1718 } 1719 1720 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, 1721 struct rt6_exception *rt6_ex, 1722 struct fib6_gc_args *gc_args, 1723 unsigned long now) 1724 { 1725 struct rt6_info *rt = rt6_ex->rt6i; 1726 1727 /* we are pruning and obsoleting aged-out and non gateway exceptions 1728 * even if others have still references to them, so that on next 1729 * dst_check() such references can be dropped. 1730 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when 1731 * expired, independently from their aging, as per RFC 8201 section 4 1732 */ 1733 if (!(rt->rt6i_flags & RTF_EXPIRES)) { 1734 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { 1735 RT6_TRACE("aging clone %p\n", rt); 1736 rt6_remove_exception(bucket, rt6_ex); 1737 return; 1738 } 1739 } else if (time_after(jiffies, rt->dst.expires)) { 1740 RT6_TRACE("purging expired route %p\n", rt); 1741 rt6_remove_exception(bucket, rt6_ex); 1742 return; 1743 } 1744 1745 if (rt->rt6i_flags & RTF_GATEWAY) { 1746 struct neighbour *neigh; 1747 __u8 neigh_flags = 0; 1748 1749 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); 1750 if (neigh) 1751 neigh_flags = neigh->flags; 1752 1753 if (!(neigh_flags & NTF_ROUTER)) { 1754 RT6_TRACE("purging route %p via non-router but gateway\n", 1755 rt); 1756 rt6_remove_exception(bucket, rt6_ex); 1757 return; 1758 } 1759 } 1760 1761 gc_args->more++; 1762 } 1763 1764 void rt6_age_exceptions(struct fib6_info *rt, 1765 struct fib6_gc_args *gc_args, 1766 unsigned long now) 1767 { 1768 struct rt6_exception_bucket *bucket; 1769 struct rt6_exception *rt6_ex; 1770 struct hlist_node *tmp; 1771 int i; 1772 1773 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1774 return; 1775 1776 rcu_read_lock_bh(); 1777 spin_lock(&rt6_exception_lock); 1778 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1779 lockdep_is_held(&rt6_exception_lock)); 1780 1781 if (bucket) { 1782 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1783 hlist_for_each_entry_safe(rt6_ex, tmp, 1784 &bucket->chain, hlist) { 1785 rt6_age_examine_exception(bucket, rt6_ex, 1786 gc_args, now); 1787 } 1788 bucket++; 1789 } 1790 } 1791 spin_unlock(&rt6_exception_lock); 1792 rcu_read_unlock_bh(); 1793 } 1794 1795 /* must be called with rcu lock held */ 1796 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, 1797 int oif, struct flowi6 *fl6, int strict) 1798 { 1799 struct fib6_node *fn, *saved_fn; 1800 struct fib6_info *f6i; 1801 1802 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1803 saved_fn = fn; 1804 1805 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1806 oif = 0; 1807 1808 redo_rt6_select: 1809 f6i = rt6_select(net, fn, oif, strict); 1810 if (f6i == net->ipv6.fib6_null_entry) { 1811 fn = fib6_backtrack(fn, &fl6->saddr); 1812 if (fn) 1813 goto redo_rt6_select; 1814 else if (strict & RT6_LOOKUP_F_REACHABLE) { 1815 /* also consider unreachable route */ 1816 strict &= ~RT6_LOOKUP_F_REACHABLE; 1817 fn = saved_fn; 1818 goto redo_rt6_select; 1819 } 1820 } 1821 1822 trace_fib6_table_lookup(net, f6i, table, fl6); 1823 1824 return f6i; 1825 } 1826 1827 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, 1828 int oif, struct flowi6 *fl6, 1829 const struct sk_buff *skb, int flags) 1830 { 1831 struct fib6_info *f6i; 1832 struct rt6_info *rt; 1833 int strict = 0; 1834 1835 strict |= flags & RT6_LOOKUP_F_IFACE; 1836 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; 1837 if (net->ipv6.devconf_all->forwarding == 0) 1838 strict |= RT6_LOOKUP_F_REACHABLE; 1839 1840 rcu_read_lock(); 1841 1842 f6i = fib6_table_lookup(net, table, oif, fl6, strict); 1843 if (f6i->fib6_nsiblings) 1844 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); 1845 1846 if (f6i == net->ipv6.fib6_null_entry) { 1847 rt = net->ipv6.ip6_null_entry; 1848 rcu_read_unlock(); 1849 dst_hold(&rt->dst); 1850 return rt; 1851 } 1852 1853 /*Search through exception table */ 1854 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1855 if (rt) { 1856 if (ip6_hold_safe(net, &rt, true)) 1857 dst_use_noref(&rt->dst, jiffies); 1858 1859 rcu_read_unlock(); 1860 return rt; 1861 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && 1862 !(f6i->fib6_flags & RTF_GATEWAY))) { 1863 /* Create a RTF_CACHE clone which will not be 1864 * owned by the fib6 tree. It is for the special case where 1865 * the daddr in the skb during the neighbor look-up is different 1866 * from the fl6->daddr used to look-up route here. 1867 */ 1868 struct rt6_info *uncached_rt; 1869 1870 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); 1871 1872 rcu_read_unlock(); 1873 1874 if (uncached_rt) { 1875 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() 1876 * No need for another dst_hold() 1877 */ 1878 rt6_uncached_list_add(uncached_rt); 1879 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 1880 } else { 1881 uncached_rt = net->ipv6.ip6_null_entry; 1882 dst_hold(&uncached_rt->dst); 1883 } 1884 1885 return uncached_rt; 1886 } else { 1887 /* Get a percpu copy */ 1888 1889 struct rt6_info *pcpu_rt; 1890 1891 local_bh_disable(); 1892 pcpu_rt = rt6_get_pcpu_route(f6i); 1893 1894 if (!pcpu_rt) 1895 pcpu_rt = rt6_make_pcpu_route(net, f6i); 1896 1897 local_bh_enable(); 1898 rcu_read_unlock(); 1899 1900 return pcpu_rt; 1901 } 1902 } 1903 EXPORT_SYMBOL_GPL(ip6_pol_route); 1904 1905 static struct rt6_info *ip6_pol_route_input(struct net *net, 1906 struct fib6_table *table, 1907 struct flowi6 *fl6, 1908 const struct sk_buff *skb, 1909 int flags) 1910 { 1911 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); 1912 } 1913 1914 struct dst_entry *ip6_route_input_lookup(struct net *net, 1915 struct net_device *dev, 1916 struct flowi6 *fl6, 1917 const struct sk_buff *skb, 1918 int flags) 1919 { 1920 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) 1921 flags |= RT6_LOOKUP_F_IFACE; 1922 1923 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); 1924 } 1925 EXPORT_SYMBOL_GPL(ip6_route_input_lookup); 1926 1927 static void ip6_multipath_l3_keys(const struct sk_buff *skb, 1928 struct flow_keys *keys, 1929 struct flow_keys *flkeys) 1930 { 1931 const struct ipv6hdr *outer_iph = ipv6_hdr(skb); 1932 const struct ipv6hdr *key_iph = outer_iph; 1933 struct flow_keys *_flkeys = flkeys; 1934 const struct ipv6hdr *inner_iph; 1935 const struct icmp6hdr *icmph; 1936 struct ipv6hdr _inner_iph; 1937 struct icmp6hdr _icmph; 1938 1939 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) 1940 goto out; 1941 1942 icmph = skb_header_pointer(skb, skb_transport_offset(skb), 1943 sizeof(_icmph), &_icmph); 1944 if (!icmph) 1945 goto out; 1946 1947 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && 1948 icmph->icmp6_type != ICMPV6_PKT_TOOBIG && 1949 icmph->icmp6_type != ICMPV6_TIME_EXCEED && 1950 icmph->icmp6_type != ICMPV6_PARAMPROB) 1951 goto out; 1952 1953 inner_iph = skb_header_pointer(skb, 1954 skb_transport_offset(skb) + sizeof(*icmph), 1955 sizeof(_inner_iph), &_inner_iph); 1956 if (!inner_iph) 1957 goto out; 1958 1959 key_iph = inner_iph; 1960 _flkeys = NULL; 1961 out: 1962 if (_flkeys) { 1963 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; 1964 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; 1965 keys->tags.flow_label = _flkeys->tags.flow_label; 1966 keys->basic.ip_proto = _flkeys->basic.ip_proto; 1967 } else { 1968 keys->addrs.v6addrs.src = key_iph->saddr; 1969 keys->addrs.v6addrs.dst = key_iph->daddr; 1970 keys->tags.flow_label = ip6_flowlabel(key_iph); 1971 keys->basic.ip_proto = key_iph->nexthdr; 1972 } 1973 } 1974 1975 /* if skb is set it will be used and fl6 can be NULL */ 1976 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 1977 const struct sk_buff *skb, struct flow_keys *flkeys) 1978 { 1979 struct flow_keys hash_keys; 1980 u32 mhash; 1981 1982 switch (ip6_multipath_hash_policy(net)) { 1983 case 0: 1984 memset(&hash_keys, 0, sizeof(hash_keys)); 1985 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 1986 if (skb) { 1987 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 1988 } else { 1989 hash_keys.addrs.v6addrs.src = fl6->saddr; 1990 hash_keys.addrs.v6addrs.dst = fl6->daddr; 1991 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 1992 hash_keys.basic.ip_proto = fl6->flowi6_proto; 1993 } 1994 break; 1995 case 1: 1996 if (skb) { 1997 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 1998 struct flow_keys keys; 1999 2000 /* short-circuit if we already have L4 hash present */ 2001 if (skb->l4_hash) 2002 return skb_get_hash_raw(skb) >> 1; 2003 2004 memset(&hash_keys, 0, sizeof(hash_keys)); 2005 2006 if (!flkeys) { 2007 skb_flow_dissect_flow_keys(skb, &keys, flag); 2008 flkeys = &keys; 2009 } 2010 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2011 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2012 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2013 hash_keys.ports.src = flkeys->ports.src; 2014 hash_keys.ports.dst = flkeys->ports.dst; 2015 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2016 } else { 2017 memset(&hash_keys, 0, sizeof(hash_keys)); 2018 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2019 hash_keys.addrs.v6addrs.src = fl6->saddr; 2020 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2021 hash_keys.ports.src = fl6->fl6_sport; 2022 hash_keys.ports.dst = fl6->fl6_dport; 2023 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2024 } 2025 break; 2026 } 2027 mhash = flow_hash_from_keys(&hash_keys); 2028 2029 return mhash >> 1; 2030 } 2031 2032 void ip6_route_input(struct sk_buff *skb) 2033 { 2034 const struct ipv6hdr *iph = ipv6_hdr(skb); 2035 struct net *net = dev_net(skb->dev); 2036 int flags = RT6_LOOKUP_F_HAS_SADDR; 2037 struct ip_tunnel_info *tun_info; 2038 struct flowi6 fl6 = { 2039 .flowi6_iif = skb->dev->ifindex, 2040 .daddr = iph->daddr, 2041 .saddr = iph->saddr, 2042 .flowlabel = ip6_flowinfo(iph), 2043 .flowi6_mark = skb->mark, 2044 .flowi6_proto = iph->nexthdr, 2045 }; 2046 struct flow_keys *flkeys = NULL, _flkeys; 2047 2048 tun_info = skb_tunnel_info(skb); 2049 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2050 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2051 2052 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2053 flkeys = &_flkeys; 2054 2055 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2056 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2057 skb_dst_drop(skb); 2058 skb_dst_set(skb, 2059 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); 2060 } 2061 2062 static struct rt6_info *ip6_pol_route_output(struct net *net, 2063 struct fib6_table *table, 2064 struct flowi6 *fl6, 2065 const struct sk_buff *skb, 2066 int flags) 2067 { 2068 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2069 } 2070 2071 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, 2072 struct flowi6 *fl6, int flags) 2073 { 2074 bool any_src; 2075 2076 if (rt6_need_strict(&fl6->daddr)) { 2077 struct dst_entry *dst; 2078 2079 dst = l3mdev_link_scope_lookup(net, fl6); 2080 if (dst) 2081 return dst; 2082 } 2083 2084 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2085 2086 any_src = ipv6_addr_any(&fl6->saddr); 2087 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2088 (fl6->flowi6_oif && any_src)) 2089 flags |= RT6_LOOKUP_F_IFACE; 2090 2091 if (!any_src) 2092 flags |= RT6_LOOKUP_F_HAS_SADDR; 2093 else if (sk) 2094 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2095 2096 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2097 } 2098 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2099 2100 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2101 { 2102 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2103 struct net_device *loopback_dev = net->loopback_dev; 2104 struct dst_entry *new = NULL; 2105 2106 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2107 DST_OBSOLETE_DEAD, 0); 2108 if (rt) { 2109 rt6_info_init(rt); 2110 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2111 2112 new = &rt->dst; 2113 new->__use = 1; 2114 new->input = dst_discard; 2115 new->output = dst_discard_out; 2116 2117 dst_copy_metrics(new, &ort->dst); 2118 2119 rt->rt6i_idev = in6_dev_get(loopback_dev); 2120 rt->rt6i_gateway = ort->rt6i_gateway; 2121 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2122 2123 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2124 #ifdef CONFIG_IPV6_SUBTREES 2125 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2126 #endif 2127 } 2128 2129 dst_release(dst_orig); 2130 return new ? new : ERR_PTR(-ENOMEM); 2131 } 2132 2133 /* 2134 * Destination cache support functions 2135 */ 2136 2137 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2138 { 2139 u32 rt_cookie = 0; 2140 2141 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2142 return false; 2143 2144 if (fib6_check_expired(f6i)) 2145 return false; 2146 2147 return true; 2148 } 2149 2150 static struct dst_entry *rt6_check(struct rt6_info *rt, 2151 struct fib6_info *from, 2152 u32 cookie) 2153 { 2154 u32 rt_cookie = 0; 2155 2156 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2157 rt_cookie != cookie) 2158 return NULL; 2159 2160 if (rt6_check_expired(rt)) 2161 return NULL; 2162 2163 return &rt->dst; 2164 } 2165 2166 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2167 struct fib6_info *from, 2168 u32 cookie) 2169 { 2170 if (!__rt6_check_expired(rt) && 2171 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2172 fib6_check(from, cookie)) 2173 return &rt->dst; 2174 else 2175 return NULL; 2176 } 2177 2178 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2179 { 2180 struct dst_entry *dst_ret; 2181 struct fib6_info *from; 2182 struct rt6_info *rt; 2183 2184 rt = container_of(dst, struct rt6_info, dst); 2185 2186 rcu_read_lock(); 2187 2188 /* All IPV6 dsts are created with ->obsolete set to the value 2189 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2190 * into this function always. 2191 */ 2192 2193 from = rcu_dereference(rt->from); 2194 2195 if (from && (rt->rt6i_flags & RTF_PCPU || 2196 unlikely(!list_empty(&rt->rt6i_uncached)))) 2197 dst_ret = rt6_dst_from_check(rt, from, cookie); 2198 else 2199 dst_ret = rt6_check(rt, from, cookie); 2200 2201 rcu_read_unlock(); 2202 2203 return dst_ret; 2204 } 2205 2206 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2207 { 2208 struct rt6_info *rt = (struct rt6_info *) dst; 2209 2210 if (rt) { 2211 if (rt->rt6i_flags & RTF_CACHE) { 2212 rcu_read_lock(); 2213 if (rt6_check_expired(rt)) { 2214 rt6_remove_exception_rt(rt); 2215 dst = NULL; 2216 } 2217 rcu_read_unlock(); 2218 } else { 2219 dst_release(dst); 2220 dst = NULL; 2221 } 2222 } 2223 return dst; 2224 } 2225 2226 static void ip6_link_failure(struct sk_buff *skb) 2227 { 2228 struct rt6_info *rt; 2229 2230 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2231 2232 rt = (struct rt6_info *) skb_dst(skb); 2233 if (rt) { 2234 rcu_read_lock(); 2235 if (rt->rt6i_flags & RTF_CACHE) { 2236 if (dst_hold_safe(&rt->dst)) 2237 rt6_remove_exception_rt(rt); 2238 } else { 2239 struct fib6_info *from; 2240 struct fib6_node *fn; 2241 2242 from = rcu_dereference(rt->from); 2243 if (from) { 2244 fn = rcu_dereference(from->fib6_node); 2245 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2246 fn->fn_sernum = -1; 2247 } 2248 } 2249 rcu_read_unlock(); 2250 } 2251 } 2252 2253 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2254 { 2255 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2256 struct fib6_info *from; 2257 2258 rcu_read_lock(); 2259 from = rcu_dereference(rt0->from); 2260 if (from) 2261 rt0->dst.expires = from->expires; 2262 rcu_read_unlock(); 2263 } 2264 2265 dst_set_expires(&rt0->dst, timeout); 2266 rt0->rt6i_flags |= RTF_EXPIRES; 2267 } 2268 2269 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2270 { 2271 struct net *net = dev_net(rt->dst.dev); 2272 2273 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2274 rt->rt6i_flags |= RTF_MODIFIED; 2275 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2276 } 2277 2278 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2279 { 2280 bool from_set; 2281 2282 rcu_read_lock(); 2283 from_set = !!rcu_dereference(rt->from); 2284 rcu_read_unlock(); 2285 2286 return !(rt->rt6i_flags & RTF_CACHE) && 2287 (rt->rt6i_flags & RTF_PCPU || from_set); 2288 } 2289 2290 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2291 const struct ipv6hdr *iph, u32 mtu) 2292 { 2293 const struct in6_addr *daddr, *saddr; 2294 struct rt6_info *rt6 = (struct rt6_info *)dst; 2295 2296 if (dst_metric_locked(dst, RTAX_MTU)) 2297 return; 2298 2299 if (iph) { 2300 daddr = &iph->daddr; 2301 saddr = &iph->saddr; 2302 } else if (sk) { 2303 daddr = &sk->sk_v6_daddr; 2304 saddr = &inet6_sk(sk)->saddr; 2305 } else { 2306 daddr = NULL; 2307 saddr = NULL; 2308 } 2309 dst_confirm_neigh(dst, daddr); 2310 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2311 if (mtu >= dst_mtu(dst)) 2312 return; 2313 2314 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2315 rt6_do_update_pmtu(rt6, mtu); 2316 /* update rt6_ex->stamp for cache */ 2317 if (rt6->rt6i_flags & RTF_CACHE) 2318 rt6_update_exception_stamp_rt(rt6); 2319 } else if (daddr) { 2320 struct fib6_info *from; 2321 struct rt6_info *nrt6; 2322 2323 rcu_read_lock(); 2324 from = rcu_dereference(rt6->from); 2325 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2326 if (nrt6) { 2327 rt6_do_update_pmtu(nrt6, mtu); 2328 if (rt6_insert_exception(nrt6, from)) 2329 dst_release_immediate(&nrt6->dst); 2330 } 2331 rcu_read_unlock(); 2332 } 2333 } 2334 2335 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2336 struct sk_buff *skb, u32 mtu) 2337 { 2338 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2339 } 2340 2341 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2342 int oif, u32 mark, kuid_t uid) 2343 { 2344 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2345 struct dst_entry *dst; 2346 struct flowi6 fl6; 2347 2348 memset(&fl6, 0, sizeof(fl6)); 2349 fl6.flowi6_oif = oif; 2350 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2351 fl6.daddr = iph->daddr; 2352 fl6.saddr = iph->saddr; 2353 fl6.flowlabel = ip6_flowinfo(iph); 2354 fl6.flowi6_uid = uid; 2355 2356 dst = ip6_route_output(net, NULL, &fl6); 2357 if (!dst->error) 2358 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2359 dst_release(dst); 2360 } 2361 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2362 2363 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2364 { 2365 struct dst_entry *dst; 2366 2367 ip6_update_pmtu(skb, sock_net(sk), mtu, 2368 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2369 2370 dst = __sk_dst_get(sk); 2371 if (!dst || !dst->obsolete || 2372 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2373 return; 2374 2375 bh_lock_sock(sk); 2376 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2377 ip6_datagram_dst_update(sk, false); 2378 bh_unlock_sock(sk); 2379 } 2380 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2381 2382 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2383 const struct flowi6 *fl6) 2384 { 2385 #ifdef CONFIG_IPV6_SUBTREES 2386 struct ipv6_pinfo *np = inet6_sk(sk); 2387 #endif 2388 2389 ip6_dst_store(sk, dst, 2390 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2391 &sk->sk_v6_daddr : NULL, 2392 #ifdef CONFIG_IPV6_SUBTREES 2393 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2394 &np->saddr : 2395 #endif 2396 NULL); 2397 } 2398 2399 /* Handle redirects */ 2400 struct ip6rd_flowi { 2401 struct flowi6 fl6; 2402 struct in6_addr gateway; 2403 }; 2404 2405 static struct rt6_info *__ip6_route_redirect(struct net *net, 2406 struct fib6_table *table, 2407 struct flowi6 *fl6, 2408 const struct sk_buff *skb, 2409 int flags) 2410 { 2411 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2412 struct rt6_info *ret = NULL, *rt_cache; 2413 struct fib6_info *rt; 2414 struct fib6_node *fn; 2415 2416 /* Get the "current" route for this destination and 2417 * check if the redirect has come from appropriate router. 2418 * 2419 * RFC 4861 specifies that redirects should only be 2420 * accepted if they come from the nexthop to the target. 2421 * Due to the way the routes are chosen, this notion 2422 * is a bit fuzzy and one might need to check all possible 2423 * routes. 2424 */ 2425 2426 rcu_read_lock(); 2427 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2428 restart: 2429 for_each_fib6_node_rt_rcu(fn) { 2430 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2431 continue; 2432 if (fib6_check_expired(rt)) 2433 continue; 2434 if (rt->fib6_flags & RTF_REJECT) 2435 break; 2436 if (!(rt->fib6_flags & RTF_GATEWAY)) 2437 continue; 2438 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2439 continue; 2440 /* rt_cache's gateway might be different from its 'parent' 2441 * in the case of an ip redirect. 2442 * So we keep searching in the exception table if the gateway 2443 * is different. 2444 */ 2445 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2446 rt_cache = rt6_find_cached_rt(rt, 2447 &fl6->daddr, 2448 &fl6->saddr); 2449 if (rt_cache && 2450 ipv6_addr_equal(&rdfl->gateway, 2451 &rt_cache->rt6i_gateway)) { 2452 ret = rt_cache; 2453 break; 2454 } 2455 continue; 2456 } 2457 break; 2458 } 2459 2460 if (!rt) 2461 rt = net->ipv6.fib6_null_entry; 2462 else if (rt->fib6_flags & RTF_REJECT) { 2463 ret = net->ipv6.ip6_null_entry; 2464 goto out; 2465 } 2466 2467 if (rt == net->ipv6.fib6_null_entry) { 2468 fn = fib6_backtrack(fn, &fl6->saddr); 2469 if (fn) 2470 goto restart; 2471 } 2472 2473 out: 2474 if (ret) 2475 ip6_hold_safe(net, &ret, true); 2476 else 2477 ret = ip6_create_rt_rcu(rt); 2478 2479 rcu_read_unlock(); 2480 2481 trace_fib6_table_lookup(net, rt, table, fl6); 2482 return ret; 2483 }; 2484 2485 static struct dst_entry *ip6_route_redirect(struct net *net, 2486 const struct flowi6 *fl6, 2487 const struct sk_buff *skb, 2488 const struct in6_addr *gateway) 2489 { 2490 int flags = RT6_LOOKUP_F_HAS_SADDR; 2491 struct ip6rd_flowi rdfl; 2492 2493 rdfl.fl6 = *fl6; 2494 rdfl.gateway = *gateway; 2495 2496 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2497 flags, __ip6_route_redirect); 2498 } 2499 2500 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2501 kuid_t uid) 2502 { 2503 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2504 struct dst_entry *dst; 2505 struct flowi6 fl6; 2506 2507 memset(&fl6, 0, sizeof(fl6)); 2508 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2509 fl6.flowi6_oif = oif; 2510 fl6.flowi6_mark = mark; 2511 fl6.daddr = iph->daddr; 2512 fl6.saddr = iph->saddr; 2513 fl6.flowlabel = ip6_flowinfo(iph); 2514 fl6.flowi6_uid = uid; 2515 2516 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2517 rt6_do_redirect(dst, NULL, skb); 2518 dst_release(dst); 2519 } 2520 EXPORT_SYMBOL_GPL(ip6_redirect); 2521 2522 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2523 u32 mark) 2524 { 2525 const struct ipv6hdr *iph = ipv6_hdr(skb); 2526 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2527 struct dst_entry *dst; 2528 struct flowi6 fl6; 2529 2530 memset(&fl6, 0, sizeof(fl6)); 2531 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2532 fl6.flowi6_oif = oif; 2533 fl6.flowi6_mark = mark; 2534 fl6.daddr = msg->dest; 2535 fl6.saddr = iph->daddr; 2536 fl6.flowi6_uid = sock_net_uid(net, NULL); 2537 2538 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2539 rt6_do_redirect(dst, NULL, skb); 2540 dst_release(dst); 2541 } 2542 2543 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2544 { 2545 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2546 sk->sk_uid); 2547 } 2548 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2549 2550 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2551 { 2552 struct net_device *dev = dst->dev; 2553 unsigned int mtu = dst_mtu(dst); 2554 struct net *net = dev_net(dev); 2555 2556 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2557 2558 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2559 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2560 2561 /* 2562 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2563 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 2564 * IPV6_MAXPLEN is also valid and means: "any MSS, 2565 * rely only on pmtu discovery" 2566 */ 2567 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2568 mtu = IPV6_MAXPLEN; 2569 return mtu; 2570 } 2571 2572 static unsigned int ip6_mtu(const struct dst_entry *dst) 2573 { 2574 struct inet6_dev *idev; 2575 unsigned int mtu; 2576 2577 mtu = dst_metric_raw(dst, RTAX_MTU); 2578 if (mtu) 2579 goto out; 2580 2581 mtu = IPV6_MIN_MTU; 2582 2583 rcu_read_lock(); 2584 idev = __in6_dev_get(dst->dev); 2585 if (idev) 2586 mtu = idev->cnf.mtu6; 2587 rcu_read_unlock(); 2588 2589 out: 2590 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2591 2592 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2593 } 2594 2595 /* MTU selection: 2596 * 1. mtu on route is locked - use it 2597 * 2. mtu from nexthop exception 2598 * 3. mtu from egress device 2599 * 2600 * based on ip6_dst_mtu_forward and exception logic of 2601 * rt6_find_cached_rt; called with rcu_read_lock 2602 */ 2603 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2604 struct in6_addr *saddr) 2605 { 2606 struct rt6_exception_bucket *bucket; 2607 struct rt6_exception *rt6_ex; 2608 struct in6_addr *src_key; 2609 struct inet6_dev *idev; 2610 u32 mtu = 0; 2611 2612 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2613 mtu = f6i->fib6_pmtu; 2614 if (mtu) 2615 goto out; 2616 } 2617 2618 src_key = NULL; 2619 #ifdef CONFIG_IPV6_SUBTREES 2620 if (f6i->fib6_src.plen) 2621 src_key = saddr; 2622 #endif 2623 2624 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2625 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2626 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2627 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2628 2629 if (likely(!mtu)) { 2630 struct net_device *dev = fib6_info_nh_dev(f6i); 2631 2632 mtu = IPV6_MIN_MTU; 2633 idev = __in6_dev_get(dev); 2634 if (idev && idev->cnf.mtu6 > mtu) 2635 mtu = idev->cnf.mtu6; 2636 } 2637 2638 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2639 out: 2640 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2641 } 2642 2643 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2644 struct flowi6 *fl6) 2645 { 2646 struct dst_entry *dst; 2647 struct rt6_info *rt; 2648 struct inet6_dev *idev = in6_dev_get(dev); 2649 struct net *net = dev_net(dev); 2650 2651 if (unlikely(!idev)) 2652 return ERR_PTR(-ENODEV); 2653 2654 rt = ip6_dst_alloc(net, dev, 0); 2655 if (unlikely(!rt)) { 2656 in6_dev_put(idev); 2657 dst = ERR_PTR(-ENOMEM); 2658 goto out; 2659 } 2660 2661 rt->dst.flags |= DST_HOST; 2662 rt->dst.input = ip6_input; 2663 rt->dst.output = ip6_output; 2664 rt->rt6i_gateway = fl6->daddr; 2665 rt->rt6i_dst.addr = fl6->daddr; 2666 rt->rt6i_dst.plen = 128; 2667 rt->rt6i_idev = idev; 2668 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2669 2670 /* Add this dst into uncached_list so that rt6_disable_ip() can 2671 * do proper release of the net_device 2672 */ 2673 rt6_uncached_list_add(rt); 2674 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2675 2676 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2677 2678 out: 2679 return dst; 2680 } 2681 2682 static int ip6_dst_gc(struct dst_ops *ops) 2683 { 2684 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2685 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2686 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2687 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2688 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2689 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2690 int entries; 2691 2692 entries = dst_entries_get_fast(ops); 2693 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2694 entries <= rt_max_size) 2695 goto out; 2696 2697 net->ipv6.ip6_rt_gc_expire++; 2698 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2699 entries = dst_entries_get_slow(ops); 2700 if (entries < ops->gc_thresh) 2701 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2702 out: 2703 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2704 return entries > rt_max_size; 2705 } 2706 2707 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, 2708 struct fib6_config *cfg) 2709 { 2710 struct dst_metrics *p; 2711 2712 if (!cfg->fc_mx) 2713 return 0; 2714 2715 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); 2716 if (unlikely(!p)) 2717 return -ENOMEM; 2718 2719 refcount_set(&p->refcnt, 1); 2720 rt->fib6_metrics = p; 2721 2722 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics); 2723 } 2724 2725 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2726 struct fib6_config *cfg, 2727 const struct in6_addr *gw_addr, 2728 u32 tbid, int flags) 2729 { 2730 struct flowi6 fl6 = { 2731 .flowi6_oif = cfg->fc_ifindex, 2732 .daddr = *gw_addr, 2733 .saddr = cfg->fc_prefsrc, 2734 }; 2735 struct fib6_table *table; 2736 struct rt6_info *rt; 2737 2738 table = fib6_get_table(net, tbid); 2739 if (!table) 2740 return NULL; 2741 2742 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2743 flags |= RT6_LOOKUP_F_HAS_SADDR; 2744 2745 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2746 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2747 2748 /* if table lookup failed, fall back to full lookup */ 2749 if (rt == net->ipv6.ip6_null_entry) { 2750 ip6_rt_put(rt); 2751 rt = NULL; 2752 } 2753 2754 return rt; 2755 } 2756 2757 static int ip6_route_check_nh_onlink(struct net *net, 2758 struct fib6_config *cfg, 2759 const struct net_device *dev, 2760 struct netlink_ext_ack *extack) 2761 { 2762 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2763 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2764 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2765 struct rt6_info *grt; 2766 int err; 2767 2768 err = 0; 2769 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2770 if (grt) { 2771 if (!grt->dst.error && 2772 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2773 NL_SET_ERR_MSG(extack, 2774 "Nexthop has invalid gateway or device mismatch"); 2775 err = -EINVAL; 2776 } 2777 2778 ip6_rt_put(grt); 2779 } 2780 2781 return err; 2782 } 2783 2784 static int ip6_route_check_nh(struct net *net, 2785 struct fib6_config *cfg, 2786 struct net_device **_dev, 2787 struct inet6_dev **idev) 2788 { 2789 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2790 struct net_device *dev = _dev ? *_dev : NULL; 2791 struct rt6_info *grt = NULL; 2792 int err = -EHOSTUNREACH; 2793 2794 if (cfg->fc_table) { 2795 int flags = RT6_LOOKUP_F_IFACE; 2796 2797 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2798 cfg->fc_table, flags); 2799 if (grt) { 2800 if (grt->rt6i_flags & RTF_GATEWAY || 2801 (dev && dev != grt->dst.dev)) { 2802 ip6_rt_put(grt); 2803 grt = NULL; 2804 } 2805 } 2806 } 2807 2808 if (!grt) 2809 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2810 2811 if (!grt) 2812 goto out; 2813 2814 if (dev) { 2815 if (dev != grt->dst.dev) { 2816 ip6_rt_put(grt); 2817 goto out; 2818 } 2819 } else { 2820 *_dev = dev = grt->dst.dev; 2821 *idev = grt->rt6i_idev; 2822 dev_hold(dev); 2823 in6_dev_hold(grt->rt6i_idev); 2824 } 2825 2826 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2827 err = 0; 2828 2829 ip6_rt_put(grt); 2830 2831 out: 2832 return err; 2833 } 2834 2835 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2836 struct net_device **_dev, struct inet6_dev **idev, 2837 struct netlink_ext_ack *extack) 2838 { 2839 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2840 int gwa_type = ipv6_addr_type(gw_addr); 2841 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2842 const struct net_device *dev = *_dev; 2843 bool need_addr_check = !dev; 2844 int err = -EINVAL; 2845 2846 /* if gw_addr is local we will fail to detect this in case 2847 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2848 * will return already-added prefix route via interface that 2849 * prefix route was assigned to, which might be non-loopback. 2850 */ 2851 if (dev && 2852 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2853 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2854 goto out; 2855 } 2856 2857 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2858 /* IPv6 strictly inhibits using not link-local 2859 * addresses as nexthop address. 2860 * Otherwise, router will not able to send redirects. 2861 * It is very good, but in some (rare!) circumstances 2862 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2863 * some exceptions. --ANK 2864 * We allow IPv4-mapped nexthops to support RFC4798-type 2865 * addressing 2866 */ 2867 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2868 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2869 goto out; 2870 } 2871 2872 if (cfg->fc_flags & RTNH_F_ONLINK) 2873 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2874 else 2875 err = ip6_route_check_nh(net, cfg, _dev, idev); 2876 2877 if (err) 2878 goto out; 2879 } 2880 2881 /* reload in case device was changed */ 2882 dev = *_dev; 2883 2884 err = -EINVAL; 2885 if (!dev) { 2886 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2887 goto out; 2888 } else if (dev->flags & IFF_LOOPBACK) { 2889 NL_SET_ERR_MSG(extack, 2890 "Egress device can not be loopback device for this route"); 2891 goto out; 2892 } 2893 2894 /* if we did not check gw_addr above, do so now that the 2895 * egress device has been resolved. 2896 */ 2897 if (need_addr_check && 2898 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2899 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2900 goto out; 2901 } 2902 2903 err = 0; 2904 out: 2905 return err; 2906 } 2907 2908 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2909 gfp_t gfp_flags, 2910 struct netlink_ext_ack *extack) 2911 { 2912 struct net *net = cfg->fc_nlinfo.nl_net; 2913 struct fib6_info *rt = NULL; 2914 struct net_device *dev = NULL; 2915 struct inet6_dev *idev = NULL; 2916 struct fib6_table *table; 2917 int addr_type; 2918 int err = -EINVAL; 2919 2920 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2921 if (cfg->fc_flags & RTF_PCPU) { 2922 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2923 goto out; 2924 } 2925 2926 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2927 if (cfg->fc_flags & RTF_CACHE) { 2928 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2929 goto out; 2930 } 2931 2932 if (cfg->fc_type > RTN_MAX) { 2933 NL_SET_ERR_MSG(extack, "Invalid route type"); 2934 goto out; 2935 } 2936 2937 if (cfg->fc_dst_len > 128) { 2938 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2939 goto out; 2940 } 2941 if (cfg->fc_src_len > 128) { 2942 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2943 goto out; 2944 } 2945 #ifndef CONFIG_IPV6_SUBTREES 2946 if (cfg->fc_src_len) { 2947 NL_SET_ERR_MSG(extack, 2948 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2949 goto out; 2950 } 2951 #endif 2952 if (cfg->fc_ifindex) { 2953 err = -ENODEV; 2954 dev = dev_get_by_index(net, cfg->fc_ifindex); 2955 if (!dev) 2956 goto out; 2957 idev = in6_dev_get(dev); 2958 if (!idev) 2959 goto out; 2960 } 2961 2962 if (cfg->fc_metric == 0) 2963 cfg->fc_metric = IP6_RT_PRIO_USER; 2964 2965 if (cfg->fc_flags & RTNH_F_ONLINK) { 2966 if (!dev) { 2967 NL_SET_ERR_MSG(extack, 2968 "Nexthop device required for onlink"); 2969 err = -ENODEV; 2970 goto out; 2971 } 2972 2973 if (!(dev->flags & IFF_UP)) { 2974 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2975 err = -ENETDOWN; 2976 goto out; 2977 } 2978 } 2979 2980 err = -ENOBUFS; 2981 if (cfg->fc_nlinfo.nlh && 2982 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2983 table = fib6_get_table(net, cfg->fc_table); 2984 if (!table) { 2985 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2986 table = fib6_new_table(net, cfg->fc_table); 2987 } 2988 } else { 2989 table = fib6_new_table(net, cfg->fc_table); 2990 } 2991 2992 if (!table) 2993 goto out; 2994 2995 err = -ENOMEM; 2996 rt = fib6_info_alloc(gfp_flags); 2997 if (!rt) 2998 goto out; 2999 3000 if (cfg->fc_flags & RTF_ADDRCONF) 3001 rt->dst_nocount = true; 3002 3003 err = ip6_convert_metrics(net, rt, cfg); 3004 if (err < 0) 3005 goto out; 3006 3007 if (cfg->fc_flags & RTF_EXPIRES) 3008 fib6_set_expires(rt, jiffies + 3009 clock_t_to_jiffies(cfg->fc_expires)); 3010 else 3011 fib6_clean_expires(rt); 3012 3013 if (cfg->fc_protocol == RTPROT_UNSPEC) 3014 cfg->fc_protocol = RTPROT_BOOT; 3015 rt->fib6_protocol = cfg->fc_protocol; 3016 3017 addr_type = ipv6_addr_type(&cfg->fc_dst); 3018 3019 if (cfg->fc_encap) { 3020 struct lwtunnel_state *lwtstate; 3021 3022 err = lwtunnel_build_state(cfg->fc_encap_type, 3023 cfg->fc_encap, AF_INET6, cfg, 3024 &lwtstate, extack); 3025 if (err) 3026 goto out; 3027 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 3028 } 3029 3030 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3031 rt->fib6_dst.plen = cfg->fc_dst_len; 3032 if (rt->fib6_dst.plen == 128) 3033 rt->dst_host = true; 3034 3035 #ifdef CONFIG_IPV6_SUBTREES 3036 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3037 rt->fib6_src.plen = cfg->fc_src_len; 3038 #endif 3039 3040 rt->fib6_metric = cfg->fc_metric; 3041 rt->fib6_nh.nh_weight = 1; 3042 3043 rt->fib6_type = cfg->fc_type; 3044 3045 /* We cannot add true routes via loopback here, 3046 they would result in kernel looping; promote them to reject routes 3047 */ 3048 if ((cfg->fc_flags & RTF_REJECT) || 3049 (dev && (dev->flags & IFF_LOOPBACK) && 3050 !(addr_type & IPV6_ADDR_LOOPBACK) && 3051 !(cfg->fc_flags & RTF_LOCAL))) { 3052 /* hold loopback dev/idev if we haven't done so. */ 3053 if (dev != net->loopback_dev) { 3054 if (dev) { 3055 dev_put(dev); 3056 in6_dev_put(idev); 3057 } 3058 dev = net->loopback_dev; 3059 dev_hold(dev); 3060 idev = in6_dev_get(dev); 3061 if (!idev) { 3062 err = -ENODEV; 3063 goto out; 3064 } 3065 } 3066 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3067 goto install_route; 3068 } 3069 3070 if (cfg->fc_flags & RTF_GATEWAY) { 3071 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3072 if (err) 3073 goto out; 3074 3075 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3076 } 3077 3078 err = -ENODEV; 3079 if (!dev) 3080 goto out; 3081 3082 if (idev->cnf.disable_ipv6) { 3083 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3084 err = -EACCES; 3085 goto out; 3086 } 3087 3088 if (!(dev->flags & IFF_UP)) { 3089 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3090 err = -ENETDOWN; 3091 goto out; 3092 } 3093 3094 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3095 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3096 NL_SET_ERR_MSG(extack, "Invalid source address"); 3097 err = -EINVAL; 3098 goto out; 3099 } 3100 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3101 rt->fib6_prefsrc.plen = 128; 3102 } else 3103 rt->fib6_prefsrc.plen = 0; 3104 3105 rt->fib6_flags = cfg->fc_flags; 3106 3107 install_route: 3108 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3109 !netif_carrier_ok(dev)) 3110 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3111 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3112 rt->fib6_nh.nh_dev = dev; 3113 rt->fib6_table = table; 3114 3115 if (idev) 3116 in6_dev_put(idev); 3117 3118 return rt; 3119 out: 3120 if (dev) 3121 dev_put(dev); 3122 if (idev) 3123 in6_dev_put(idev); 3124 3125 fib6_info_release(rt); 3126 return ERR_PTR(err); 3127 } 3128 3129 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3130 struct netlink_ext_ack *extack) 3131 { 3132 struct fib6_info *rt; 3133 int err; 3134 3135 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3136 if (IS_ERR(rt)) 3137 return PTR_ERR(rt); 3138 3139 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3140 fib6_info_release(rt); 3141 3142 return err; 3143 } 3144 3145 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3146 { 3147 struct net *net = info->nl_net; 3148 struct fib6_table *table; 3149 int err; 3150 3151 if (rt == net->ipv6.fib6_null_entry) { 3152 err = -ENOENT; 3153 goto out; 3154 } 3155 3156 table = rt->fib6_table; 3157 spin_lock_bh(&table->tb6_lock); 3158 err = fib6_del(rt, info); 3159 spin_unlock_bh(&table->tb6_lock); 3160 3161 out: 3162 fib6_info_release(rt); 3163 return err; 3164 } 3165 3166 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3167 { 3168 struct nl_info info = { .nl_net = net }; 3169 3170 return __ip6_del_rt(rt, &info); 3171 } 3172 3173 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3174 { 3175 struct nl_info *info = &cfg->fc_nlinfo; 3176 struct net *net = info->nl_net; 3177 struct sk_buff *skb = NULL; 3178 struct fib6_table *table; 3179 int err = -ENOENT; 3180 3181 if (rt == net->ipv6.fib6_null_entry) 3182 goto out_put; 3183 table = rt->fib6_table; 3184 spin_lock_bh(&table->tb6_lock); 3185 3186 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3187 struct fib6_info *sibling, *next_sibling; 3188 3189 /* prefer to send a single notification with all hops */ 3190 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3191 if (skb) { 3192 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3193 3194 if (rt6_fill_node(net, skb, rt, NULL, 3195 NULL, NULL, 0, RTM_DELROUTE, 3196 info->portid, seq, 0) < 0) { 3197 kfree_skb(skb); 3198 skb = NULL; 3199 } else 3200 info->skip_notify = 1; 3201 } 3202 3203 list_for_each_entry_safe(sibling, next_sibling, 3204 &rt->fib6_siblings, 3205 fib6_siblings) { 3206 err = fib6_del(sibling, info); 3207 if (err) 3208 goto out_unlock; 3209 } 3210 } 3211 3212 err = fib6_del(rt, info); 3213 out_unlock: 3214 spin_unlock_bh(&table->tb6_lock); 3215 out_put: 3216 fib6_info_release(rt); 3217 3218 if (skb) { 3219 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3220 info->nlh, gfp_any()); 3221 } 3222 return err; 3223 } 3224 3225 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3226 { 3227 int rc = -ESRCH; 3228 3229 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3230 goto out; 3231 3232 if (cfg->fc_flags & RTF_GATEWAY && 3233 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3234 goto out; 3235 if (dst_hold_safe(&rt->dst)) 3236 rc = rt6_remove_exception_rt(rt); 3237 out: 3238 return rc; 3239 } 3240 3241 static int ip6_route_del(struct fib6_config *cfg, 3242 struct netlink_ext_ack *extack) 3243 { 3244 struct rt6_info *rt_cache; 3245 struct fib6_table *table; 3246 struct fib6_info *rt; 3247 struct fib6_node *fn; 3248 int err = -ESRCH; 3249 3250 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3251 if (!table) { 3252 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3253 return err; 3254 } 3255 3256 rcu_read_lock(); 3257 3258 fn = fib6_locate(&table->tb6_root, 3259 &cfg->fc_dst, cfg->fc_dst_len, 3260 &cfg->fc_src, cfg->fc_src_len, 3261 !(cfg->fc_flags & RTF_CACHE)); 3262 3263 if (fn) { 3264 for_each_fib6_node_rt_rcu(fn) { 3265 if (cfg->fc_flags & RTF_CACHE) { 3266 int rc; 3267 3268 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3269 &cfg->fc_src); 3270 if (rt_cache) { 3271 rc = ip6_del_cached_rt(rt_cache, cfg); 3272 if (rc != -ESRCH) { 3273 rcu_read_unlock(); 3274 return rc; 3275 } 3276 } 3277 continue; 3278 } 3279 if (cfg->fc_ifindex && 3280 (!rt->fib6_nh.nh_dev || 3281 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3282 continue; 3283 if (cfg->fc_flags & RTF_GATEWAY && 3284 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3285 continue; 3286 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3287 continue; 3288 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3289 continue; 3290 if (!fib6_info_hold_safe(rt)) 3291 continue; 3292 rcu_read_unlock(); 3293 3294 /* if gateway was specified only delete the one hop */ 3295 if (cfg->fc_flags & RTF_GATEWAY) 3296 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3297 3298 return __ip6_del_rt_siblings(rt, cfg); 3299 } 3300 } 3301 rcu_read_unlock(); 3302 3303 return err; 3304 } 3305 3306 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3307 { 3308 struct netevent_redirect netevent; 3309 struct rt6_info *rt, *nrt = NULL; 3310 struct ndisc_options ndopts; 3311 struct inet6_dev *in6_dev; 3312 struct neighbour *neigh; 3313 struct fib6_info *from; 3314 struct rd_msg *msg; 3315 int optlen, on_link; 3316 u8 *lladdr; 3317 3318 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3319 optlen -= sizeof(*msg); 3320 3321 if (optlen < 0) { 3322 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3323 return; 3324 } 3325 3326 msg = (struct rd_msg *)icmp6_hdr(skb); 3327 3328 if (ipv6_addr_is_multicast(&msg->dest)) { 3329 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3330 return; 3331 } 3332 3333 on_link = 0; 3334 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3335 on_link = 1; 3336 } else if (ipv6_addr_type(&msg->target) != 3337 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3338 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3339 return; 3340 } 3341 3342 in6_dev = __in6_dev_get(skb->dev); 3343 if (!in6_dev) 3344 return; 3345 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3346 return; 3347 3348 /* RFC2461 8.1: 3349 * The IP source address of the Redirect MUST be the same as the current 3350 * first-hop router for the specified ICMP Destination Address. 3351 */ 3352 3353 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3354 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3355 return; 3356 } 3357 3358 lladdr = NULL; 3359 if (ndopts.nd_opts_tgt_lladdr) { 3360 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3361 skb->dev); 3362 if (!lladdr) { 3363 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3364 return; 3365 } 3366 } 3367 3368 rt = (struct rt6_info *) dst; 3369 if (rt->rt6i_flags & RTF_REJECT) { 3370 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3371 return; 3372 } 3373 3374 /* Redirect received -> path was valid. 3375 * Look, redirects are sent only in response to data packets, 3376 * so that this nexthop apparently is reachable. --ANK 3377 */ 3378 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3379 3380 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3381 if (!neigh) 3382 return; 3383 3384 /* 3385 * We have finally decided to accept it. 3386 */ 3387 3388 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3389 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3390 NEIGH_UPDATE_F_OVERRIDE| 3391 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3392 NEIGH_UPDATE_F_ISROUTER)), 3393 NDISC_REDIRECT, &ndopts); 3394 3395 rcu_read_lock(); 3396 from = rcu_dereference(rt->from); 3397 /* This fib6_info_hold() is safe here because we hold reference to rt 3398 * and rt already holds reference to fib6_info. 3399 */ 3400 fib6_info_hold(from); 3401 rcu_read_unlock(); 3402 3403 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3404 if (!nrt) 3405 goto out; 3406 3407 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3408 if (on_link) 3409 nrt->rt6i_flags &= ~RTF_GATEWAY; 3410 3411 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3412 3413 /* No need to remove rt from the exception table if rt is 3414 * a cached route because rt6_insert_exception() will 3415 * takes care of it 3416 */ 3417 if (rt6_insert_exception(nrt, from)) { 3418 dst_release_immediate(&nrt->dst); 3419 goto out; 3420 } 3421 3422 netevent.old = &rt->dst; 3423 netevent.new = &nrt->dst; 3424 netevent.daddr = &msg->dest; 3425 netevent.neigh = neigh; 3426 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3427 3428 out: 3429 fib6_info_release(from); 3430 neigh_release(neigh); 3431 } 3432 3433 #ifdef CONFIG_IPV6_ROUTE_INFO 3434 static struct fib6_info *rt6_get_route_info(struct net *net, 3435 const struct in6_addr *prefix, int prefixlen, 3436 const struct in6_addr *gwaddr, 3437 struct net_device *dev) 3438 { 3439 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3440 int ifindex = dev->ifindex; 3441 struct fib6_node *fn; 3442 struct fib6_info *rt = NULL; 3443 struct fib6_table *table; 3444 3445 table = fib6_get_table(net, tb_id); 3446 if (!table) 3447 return NULL; 3448 3449 rcu_read_lock(); 3450 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3451 if (!fn) 3452 goto out; 3453 3454 for_each_fib6_node_rt_rcu(fn) { 3455 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3456 continue; 3457 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3458 continue; 3459 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3460 continue; 3461 if (!fib6_info_hold_safe(rt)) 3462 continue; 3463 break; 3464 } 3465 out: 3466 rcu_read_unlock(); 3467 return rt; 3468 } 3469 3470 static struct fib6_info *rt6_add_route_info(struct net *net, 3471 const struct in6_addr *prefix, int prefixlen, 3472 const struct in6_addr *gwaddr, 3473 struct net_device *dev, 3474 unsigned int pref) 3475 { 3476 struct fib6_config cfg = { 3477 .fc_metric = IP6_RT_PRIO_USER, 3478 .fc_ifindex = dev->ifindex, 3479 .fc_dst_len = prefixlen, 3480 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3481 RTF_UP | RTF_PREF(pref), 3482 .fc_protocol = RTPROT_RA, 3483 .fc_type = RTN_UNICAST, 3484 .fc_nlinfo.portid = 0, 3485 .fc_nlinfo.nlh = NULL, 3486 .fc_nlinfo.nl_net = net, 3487 }; 3488 3489 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3490 cfg.fc_dst = *prefix; 3491 cfg.fc_gateway = *gwaddr; 3492 3493 /* We should treat it as a default route if prefix length is 0. */ 3494 if (!prefixlen) 3495 cfg.fc_flags |= RTF_DEFAULT; 3496 3497 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3498 3499 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3500 } 3501 #endif 3502 3503 struct fib6_info *rt6_get_dflt_router(struct net *net, 3504 const struct in6_addr *addr, 3505 struct net_device *dev) 3506 { 3507 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3508 struct fib6_info *rt; 3509 struct fib6_table *table; 3510 3511 table = fib6_get_table(net, tb_id); 3512 if (!table) 3513 return NULL; 3514 3515 rcu_read_lock(); 3516 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3517 if (dev == rt->fib6_nh.nh_dev && 3518 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3519 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3520 break; 3521 } 3522 if (rt && !fib6_info_hold_safe(rt)) 3523 rt = NULL; 3524 rcu_read_unlock(); 3525 return rt; 3526 } 3527 3528 struct fib6_info *rt6_add_dflt_router(struct net *net, 3529 const struct in6_addr *gwaddr, 3530 struct net_device *dev, 3531 unsigned int pref) 3532 { 3533 struct fib6_config cfg = { 3534 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3535 .fc_metric = IP6_RT_PRIO_USER, 3536 .fc_ifindex = dev->ifindex, 3537 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3538 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3539 .fc_protocol = RTPROT_RA, 3540 .fc_type = RTN_UNICAST, 3541 .fc_nlinfo.portid = 0, 3542 .fc_nlinfo.nlh = NULL, 3543 .fc_nlinfo.nl_net = net, 3544 }; 3545 3546 cfg.fc_gateway = *gwaddr; 3547 3548 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3549 struct fib6_table *table; 3550 3551 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3552 if (table) 3553 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3554 } 3555 3556 return rt6_get_dflt_router(net, gwaddr, dev); 3557 } 3558 3559 static void __rt6_purge_dflt_routers(struct net *net, 3560 struct fib6_table *table) 3561 { 3562 struct fib6_info *rt; 3563 3564 restart: 3565 rcu_read_lock(); 3566 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3567 struct net_device *dev = fib6_info_nh_dev(rt); 3568 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3569 3570 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3571 (!idev || idev->cnf.accept_ra != 2) && 3572 fib6_info_hold_safe(rt)) { 3573 rcu_read_unlock(); 3574 ip6_del_rt(net, rt); 3575 goto restart; 3576 } 3577 } 3578 rcu_read_unlock(); 3579 3580 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3581 } 3582 3583 void rt6_purge_dflt_routers(struct net *net) 3584 { 3585 struct fib6_table *table; 3586 struct hlist_head *head; 3587 unsigned int h; 3588 3589 rcu_read_lock(); 3590 3591 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3592 head = &net->ipv6.fib_table_hash[h]; 3593 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3594 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3595 __rt6_purge_dflt_routers(net, table); 3596 } 3597 } 3598 3599 rcu_read_unlock(); 3600 } 3601 3602 static void rtmsg_to_fib6_config(struct net *net, 3603 struct in6_rtmsg *rtmsg, 3604 struct fib6_config *cfg) 3605 { 3606 memset(cfg, 0, sizeof(*cfg)); 3607 3608 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3609 : RT6_TABLE_MAIN; 3610 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3611 cfg->fc_metric = rtmsg->rtmsg_metric; 3612 cfg->fc_expires = rtmsg->rtmsg_info; 3613 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3614 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3615 cfg->fc_flags = rtmsg->rtmsg_flags; 3616 cfg->fc_type = rtmsg->rtmsg_type; 3617 3618 cfg->fc_nlinfo.nl_net = net; 3619 3620 cfg->fc_dst = rtmsg->rtmsg_dst; 3621 cfg->fc_src = rtmsg->rtmsg_src; 3622 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3623 } 3624 3625 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3626 { 3627 struct fib6_config cfg; 3628 struct in6_rtmsg rtmsg; 3629 int err; 3630 3631 switch (cmd) { 3632 case SIOCADDRT: /* Add a route */ 3633 case SIOCDELRT: /* Delete a route */ 3634 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3635 return -EPERM; 3636 err = copy_from_user(&rtmsg, arg, 3637 sizeof(struct in6_rtmsg)); 3638 if (err) 3639 return -EFAULT; 3640 3641 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3642 3643 rtnl_lock(); 3644 switch (cmd) { 3645 case SIOCADDRT: 3646 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3647 break; 3648 case SIOCDELRT: 3649 err = ip6_route_del(&cfg, NULL); 3650 break; 3651 default: 3652 err = -EINVAL; 3653 } 3654 rtnl_unlock(); 3655 3656 return err; 3657 } 3658 3659 return -EINVAL; 3660 } 3661 3662 /* 3663 * Drop the packet on the floor 3664 */ 3665 3666 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3667 { 3668 int type; 3669 struct dst_entry *dst = skb_dst(skb); 3670 switch (ipstats_mib_noroutes) { 3671 case IPSTATS_MIB_INNOROUTES: 3672 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3673 if (type == IPV6_ADDR_ANY) { 3674 IP6_INC_STATS(dev_net(dst->dev), 3675 __in6_dev_get_safely(skb->dev), 3676 IPSTATS_MIB_INADDRERRORS); 3677 break; 3678 } 3679 /* FALLTHROUGH */ 3680 case IPSTATS_MIB_OUTNOROUTES: 3681 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3682 ipstats_mib_noroutes); 3683 break; 3684 } 3685 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3686 kfree_skb(skb); 3687 return 0; 3688 } 3689 3690 static int ip6_pkt_discard(struct sk_buff *skb) 3691 { 3692 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3693 } 3694 3695 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3696 { 3697 skb->dev = skb_dst(skb)->dev; 3698 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3699 } 3700 3701 static int ip6_pkt_prohibit(struct sk_buff *skb) 3702 { 3703 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3704 } 3705 3706 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3707 { 3708 skb->dev = skb_dst(skb)->dev; 3709 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3710 } 3711 3712 /* 3713 * Allocate a dst for local (unicast / anycast) address. 3714 */ 3715 3716 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3717 struct inet6_dev *idev, 3718 const struct in6_addr *addr, 3719 bool anycast, gfp_t gfp_flags) 3720 { 3721 u32 tb_id; 3722 struct net_device *dev = idev->dev; 3723 struct fib6_info *f6i; 3724 3725 f6i = fib6_info_alloc(gfp_flags); 3726 if (!f6i) 3727 return ERR_PTR(-ENOMEM); 3728 3729 f6i->dst_nocount = true; 3730 f6i->dst_host = true; 3731 f6i->fib6_protocol = RTPROT_KERNEL; 3732 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3733 if (anycast) { 3734 f6i->fib6_type = RTN_ANYCAST; 3735 f6i->fib6_flags |= RTF_ANYCAST; 3736 } else { 3737 f6i->fib6_type = RTN_LOCAL; 3738 f6i->fib6_flags |= RTF_LOCAL; 3739 } 3740 3741 f6i->fib6_nh.nh_gw = *addr; 3742 dev_hold(dev); 3743 f6i->fib6_nh.nh_dev = dev; 3744 f6i->fib6_dst.addr = *addr; 3745 f6i->fib6_dst.plen = 128; 3746 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3747 f6i->fib6_table = fib6_get_table(net, tb_id); 3748 3749 return f6i; 3750 } 3751 3752 /* remove deleted ip from prefsrc entries */ 3753 struct arg_dev_net_ip { 3754 struct net_device *dev; 3755 struct net *net; 3756 struct in6_addr *addr; 3757 }; 3758 3759 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3760 { 3761 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3762 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3763 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3764 3765 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3766 rt != net->ipv6.fib6_null_entry && 3767 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3768 spin_lock_bh(&rt6_exception_lock); 3769 /* remove prefsrc entry */ 3770 rt->fib6_prefsrc.plen = 0; 3771 spin_unlock_bh(&rt6_exception_lock); 3772 } 3773 return 0; 3774 } 3775 3776 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3777 { 3778 struct net *net = dev_net(ifp->idev->dev); 3779 struct arg_dev_net_ip adni = { 3780 .dev = ifp->idev->dev, 3781 .net = net, 3782 .addr = &ifp->addr, 3783 }; 3784 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3785 } 3786 3787 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3788 3789 /* Remove routers and update dst entries when gateway turn into host. */ 3790 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3791 { 3792 struct in6_addr *gateway = (struct in6_addr *)arg; 3793 3794 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3795 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3796 return -1; 3797 } 3798 3799 /* Further clean up cached routes in exception table. 3800 * This is needed because cached route may have a different 3801 * gateway than its 'parent' in the case of an ip redirect. 3802 */ 3803 rt6_exceptions_clean_tohost(rt, gateway); 3804 3805 return 0; 3806 } 3807 3808 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3809 { 3810 fib6_clean_all(net, fib6_clean_tohost, gateway); 3811 } 3812 3813 struct arg_netdev_event { 3814 const struct net_device *dev; 3815 union { 3816 unsigned int nh_flags; 3817 unsigned long event; 3818 }; 3819 }; 3820 3821 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3822 { 3823 struct fib6_info *iter; 3824 struct fib6_node *fn; 3825 3826 fn = rcu_dereference_protected(rt->fib6_node, 3827 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3828 iter = rcu_dereference_protected(fn->leaf, 3829 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3830 while (iter) { 3831 if (iter->fib6_metric == rt->fib6_metric && 3832 rt6_qualify_for_ecmp(iter)) 3833 return iter; 3834 iter = rcu_dereference_protected(iter->fib6_next, 3835 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3836 } 3837 3838 return NULL; 3839 } 3840 3841 static bool rt6_is_dead(const struct fib6_info *rt) 3842 { 3843 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3844 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3845 fib6_ignore_linkdown(rt))) 3846 return true; 3847 3848 return false; 3849 } 3850 3851 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3852 { 3853 struct fib6_info *iter; 3854 int total = 0; 3855 3856 if (!rt6_is_dead(rt)) 3857 total += rt->fib6_nh.nh_weight; 3858 3859 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3860 if (!rt6_is_dead(iter)) 3861 total += iter->fib6_nh.nh_weight; 3862 } 3863 3864 return total; 3865 } 3866 3867 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3868 { 3869 int upper_bound = -1; 3870 3871 if (!rt6_is_dead(rt)) { 3872 *weight += rt->fib6_nh.nh_weight; 3873 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3874 total) - 1; 3875 } 3876 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3877 } 3878 3879 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3880 { 3881 struct fib6_info *iter; 3882 int weight = 0; 3883 3884 rt6_upper_bound_set(rt, &weight, total); 3885 3886 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3887 rt6_upper_bound_set(iter, &weight, total); 3888 } 3889 3890 void rt6_multipath_rebalance(struct fib6_info *rt) 3891 { 3892 struct fib6_info *first; 3893 int total; 3894 3895 /* In case the entire multipath route was marked for flushing, 3896 * then there is no need to rebalance upon the removal of every 3897 * sibling route. 3898 */ 3899 if (!rt->fib6_nsiblings || rt->should_flush) 3900 return; 3901 3902 /* During lookup routes are evaluated in order, so we need to 3903 * make sure upper bounds are assigned from the first sibling 3904 * onwards. 3905 */ 3906 first = rt6_multipath_first_sibling(rt); 3907 if (WARN_ON_ONCE(!first)) 3908 return; 3909 3910 total = rt6_multipath_total_weight(first); 3911 rt6_multipath_upper_bound_set(first, total); 3912 } 3913 3914 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3915 { 3916 const struct arg_netdev_event *arg = p_arg; 3917 struct net *net = dev_net(arg->dev); 3918 3919 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3920 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3921 fib6_update_sernum_upto_root(net, rt); 3922 rt6_multipath_rebalance(rt); 3923 } 3924 3925 return 0; 3926 } 3927 3928 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3929 { 3930 struct arg_netdev_event arg = { 3931 .dev = dev, 3932 { 3933 .nh_flags = nh_flags, 3934 }, 3935 }; 3936 3937 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3938 arg.nh_flags |= RTNH_F_LINKDOWN; 3939 3940 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3941 } 3942 3943 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3944 const struct net_device *dev) 3945 { 3946 struct fib6_info *iter; 3947 3948 if (rt->fib6_nh.nh_dev == dev) 3949 return true; 3950 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3951 if (iter->fib6_nh.nh_dev == dev) 3952 return true; 3953 3954 return false; 3955 } 3956 3957 static void rt6_multipath_flush(struct fib6_info *rt) 3958 { 3959 struct fib6_info *iter; 3960 3961 rt->should_flush = 1; 3962 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3963 iter->should_flush = 1; 3964 } 3965 3966 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3967 const struct net_device *down_dev) 3968 { 3969 struct fib6_info *iter; 3970 unsigned int dead = 0; 3971 3972 if (rt->fib6_nh.nh_dev == down_dev || 3973 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3974 dead++; 3975 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3976 if (iter->fib6_nh.nh_dev == down_dev || 3977 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3978 dead++; 3979 3980 return dead; 3981 } 3982 3983 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 3984 const struct net_device *dev, 3985 unsigned int nh_flags) 3986 { 3987 struct fib6_info *iter; 3988 3989 if (rt->fib6_nh.nh_dev == dev) 3990 rt->fib6_nh.nh_flags |= nh_flags; 3991 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3992 if (iter->fib6_nh.nh_dev == dev) 3993 iter->fib6_nh.nh_flags |= nh_flags; 3994 } 3995 3996 /* called with write lock held for table with rt */ 3997 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 3998 { 3999 const struct arg_netdev_event *arg = p_arg; 4000 const struct net_device *dev = arg->dev; 4001 struct net *net = dev_net(dev); 4002 4003 if (rt == net->ipv6.fib6_null_entry) 4004 return 0; 4005 4006 switch (arg->event) { 4007 case NETDEV_UNREGISTER: 4008 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4009 case NETDEV_DOWN: 4010 if (rt->should_flush) 4011 return -1; 4012 if (!rt->fib6_nsiblings) 4013 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4014 if (rt6_multipath_uses_dev(rt, dev)) { 4015 unsigned int count; 4016 4017 count = rt6_multipath_dead_count(rt, dev); 4018 if (rt->fib6_nsiblings + 1 == count) { 4019 rt6_multipath_flush(rt); 4020 return -1; 4021 } 4022 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4023 RTNH_F_LINKDOWN); 4024 fib6_update_sernum(net, rt); 4025 rt6_multipath_rebalance(rt); 4026 } 4027 return -2; 4028 case NETDEV_CHANGE: 4029 if (rt->fib6_nh.nh_dev != dev || 4030 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4031 break; 4032 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 4033 rt6_multipath_rebalance(rt); 4034 break; 4035 } 4036 4037 return 0; 4038 } 4039 4040 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4041 { 4042 struct arg_netdev_event arg = { 4043 .dev = dev, 4044 { 4045 .event = event, 4046 }, 4047 }; 4048 4049 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 4050 } 4051 4052 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4053 { 4054 rt6_sync_down_dev(dev, event); 4055 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4056 neigh_ifdown(&nd_tbl, dev); 4057 } 4058 4059 struct rt6_mtu_change_arg { 4060 struct net_device *dev; 4061 unsigned int mtu; 4062 }; 4063 4064 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4065 { 4066 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4067 struct inet6_dev *idev; 4068 4069 /* In IPv6 pmtu discovery is not optional, 4070 so that RTAX_MTU lock cannot disable it. 4071 We still use this lock to block changes 4072 caused by addrconf/ndisc. 4073 */ 4074 4075 idev = __in6_dev_get(arg->dev); 4076 if (!idev) 4077 return 0; 4078 4079 /* For administrative MTU increase, there is no way to discover 4080 IPv6 PMTU increase, so PMTU increase should be updated here. 4081 Since RFC 1981 doesn't include administrative MTU increase 4082 update PMTU increase is a MUST. (i.e. jumbo frame) 4083 */ 4084 if (rt->fib6_nh.nh_dev == arg->dev && 4085 !fib6_metric_locked(rt, RTAX_MTU)) { 4086 u32 mtu = rt->fib6_pmtu; 4087 4088 if (mtu >= arg->mtu || 4089 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4090 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4091 4092 spin_lock_bh(&rt6_exception_lock); 4093 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4094 spin_unlock_bh(&rt6_exception_lock); 4095 } 4096 return 0; 4097 } 4098 4099 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4100 { 4101 struct rt6_mtu_change_arg arg = { 4102 .dev = dev, 4103 .mtu = mtu, 4104 }; 4105 4106 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4107 } 4108 4109 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4110 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4111 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4112 [RTA_OIF] = { .type = NLA_U32 }, 4113 [RTA_IIF] = { .type = NLA_U32 }, 4114 [RTA_PRIORITY] = { .type = NLA_U32 }, 4115 [RTA_METRICS] = { .type = NLA_NESTED }, 4116 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4117 [RTA_PREF] = { .type = NLA_U8 }, 4118 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4119 [RTA_ENCAP] = { .type = NLA_NESTED }, 4120 [RTA_EXPIRES] = { .type = NLA_U32 }, 4121 [RTA_UID] = { .type = NLA_U32 }, 4122 [RTA_MARK] = { .type = NLA_U32 }, 4123 [RTA_TABLE] = { .type = NLA_U32 }, 4124 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4125 [RTA_SPORT] = { .type = NLA_U16 }, 4126 [RTA_DPORT] = { .type = NLA_U16 }, 4127 }; 4128 4129 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4130 struct fib6_config *cfg, 4131 struct netlink_ext_ack *extack) 4132 { 4133 struct rtmsg *rtm; 4134 struct nlattr *tb[RTA_MAX+1]; 4135 unsigned int pref; 4136 int err; 4137 4138 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4139 NULL); 4140 if (err < 0) 4141 goto errout; 4142 4143 err = -EINVAL; 4144 rtm = nlmsg_data(nlh); 4145 memset(cfg, 0, sizeof(*cfg)); 4146 4147 cfg->fc_table = rtm->rtm_table; 4148 cfg->fc_dst_len = rtm->rtm_dst_len; 4149 cfg->fc_src_len = rtm->rtm_src_len; 4150 cfg->fc_flags = RTF_UP; 4151 cfg->fc_protocol = rtm->rtm_protocol; 4152 cfg->fc_type = rtm->rtm_type; 4153 4154 if (rtm->rtm_type == RTN_UNREACHABLE || 4155 rtm->rtm_type == RTN_BLACKHOLE || 4156 rtm->rtm_type == RTN_PROHIBIT || 4157 rtm->rtm_type == RTN_THROW) 4158 cfg->fc_flags |= RTF_REJECT; 4159 4160 if (rtm->rtm_type == RTN_LOCAL) 4161 cfg->fc_flags |= RTF_LOCAL; 4162 4163 if (rtm->rtm_flags & RTM_F_CLONED) 4164 cfg->fc_flags |= RTF_CACHE; 4165 4166 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4167 4168 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4169 cfg->fc_nlinfo.nlh = nlh; 4170 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4171 4172 if (tb[RTA_GATEWAY]) { 4173 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4174 cfg->fc_flags |= RTF_GATEWAY; 4175 } 4176 4177 if (tb[RTA_DST]) { 4178 int plen = (rtm->rtm_dst_len + 7) >> 3; 4179 4180 if (nla_len(tb[RTA_DST]) < plen) 4181 goto errout; 4182 4183 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4184 } 4185 4186 if (tb[RTA_SRC]) { 4187 int plen = (rtm->rtm_src_len + 7) >> 3; 4188 4189 if (nla_len(tb[RTA_SRC]) < plen) 4190 goto errout; 4191 4192 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4193 } 4194 4195 if (tb[RTA_PREFSRC]) 4196 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4197 4198 if (tb[RTA_OIF]) 4199 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4200 4201 if (tb[RTA_PRIORITY]) 4202 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4203 4204 if (tb[RTA_METRICS]) { 4205 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4206 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4207 } 4208 4209 if (tb[RTA_TABLE]) 4210 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4211 4212 if (tb[RTA_MULTIPATH]) { 4213 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4214 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4215 4216 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4217 cfg->fc_mp_len, extack); 4218 if (err < 0) 4219 goto errout; 4220 } 4221 4222 if (tb[RTA_PREF]) { 4223 pref = nla_get_u8(tb[RTA_PREF]); 4224 if (pref != ICMPV6_ROUTER_PREF_LOW && 4225 pref != ICMPV6_ROUTER_PREF_HIGH) 4226 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4227 cfg->fc_flags |= RTF_PREF(pref); 4228 } 4229 4230 if (tb[RTA_ENCAP]) 4231 cfg->fc_encap = tb[RTA_ENCAP]; 4232 4233 if (tb[RTA_ENCAP_TYPE]) { 4234 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4235 4236 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4237 if (err < 0) 4238 goto errout; 4239 } 4240 4241 if (tb[RTA_EXPIRES]) { 4242 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4243 4244 if (addrconf_finite_timeout(timeout)) { 4245 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4246 cfg->fc_flags |= RTF_EXPIRES; 4247 } 4248 } 4249 4250 err = 0; 4251 errout: 4252 return err; 4253 } 4254 4255 struct rt6_nh { 4256 struct fib6_info *fib6_info; 4257 struct fib6_config r_cfg; 4258 struct list_head next; 4259 }; 4260 4261 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 4262 { 4263 struct rt6_nh *nh; 4264 4265 list_for_each_entry(nh, rt6_nh_list, next) { 4266 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 4267 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 4268 nh->r_cfg.fc_ifindex); 4269 } 4270 } 4271 4272 static int ip6_route_info_append(struct net *net, 4273 struct list_head *rt6_nh_list, 4274 struct fib6_info *rt, 4275 struct fib6_config *r_cfg) 4276 { 4277 struct rt6_nh *nh; 4278 int err = -EEXIST; 4279 4280 list_for_each_entry(nh, rt6_nh_list, next) { 4281 /* check if fib6_info already exists */ 4282 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4283 return err; 4284 } 4285 4286 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4287 if (!nh) 4288 return -ENOMEM; 4289 nh->fib6_info = rt; 4290 err = ip6_convert_metrics(net, rt, r_cfg); 4291 if (err) { 4292 kfree(nh); 4293 return err; 4294 } 4295 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4296 list_add_tail(&nh->next, rt6_nh_list); 4297 4298 return 0; 4299 } 4300 4301 static void ip6_route_mpath_notify(struct fib6_info *rt, 4302 struct fib6_info *rt_last, 4303 struct nl_info *info, 4304 __u16 nlflags) 4305 { 4306 /* if this is an APPEND route, then rt points to the first route 4307 * inserted and rt_last points to last route inserted. Userspace 4308 * wants a consistent dump of the route which starts at the first 4309 * nexthop. Since sibling routes are always added at the end of 4310 * the list, find the first sibling of the last route appended 4311 */ 4312 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4313 rt = list_first_entry(&rt_last->fib6_siblings, 4314 struct fib6_info, 4315 fib6_siblings); 4316 } 4317 4318 if (rt) 4319 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4320 } 4321 4322 static int ip6_route_multipath_add(struct fib6_config *cfg, 4323 struct netlink_ext_ack *extack) 4324 { 4325 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4326 struct nl_info *info = &cfg->fc_nlinfo; 4327 struct fib6_config r_cfg; 4328 struct rtnexthop *rtnh; 4329 struct fib6_info *rt; 4330 struct rt6_nh *err_nh; 4331 struct rt6_nh *nh, *nh_safe; 4332 __u16 nlflags; 4333 int remaining; 4334 int attrlen; 4335 int err = 1; 4336 int nhn = 0; 4337 int replace = (cfg->fc_nlinfo.nlh && 4338 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4339 LIST_HEAD(rt6_nh_list); 4340 4341 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 4342 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4343 nlflags |= NLM_F_APPEND; 4344 4345 remaining = cfg->fc_mp_len; 4346 rtnh = (struct rtnexthop *)cfg->fc_mp; 4347 4348 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4349 * fib6_info structs per nexthop 4350 */ 4351 while (rtnh_ok(rtnh, remaining)) { 4352 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4353 if (rtnh->rtnh_ifindex) 4354 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4355 4356 attrlen = rtnh_attrlen(rtnh); 4357 if (attrlen > 0) { 4358 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4359 4360 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4361 if (nla) { 4362 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4363 r_cfg.fc_flags |= RTF_GATEWAY; 4364 } 4365 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4366 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4367 if (nla) 4368 r_cfg.fc_encap_type = nla_get_u16(nla); 4369 } 4370 4371 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4372 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4373 if (IS_ERR(rt)) { 4374 err = PTR_ERR(rt); 4375 rt = NULL; 4376 goto cleanup; 4377 } 4378 if (!rt6_qualify_for_ecmp(rt)) { 4379 err = -EINVAL; 4380 NL_SET_ERR_MSG(extack, 4381 "Device only routes can not be added for IPv6 using the multipath API."); 4382 fib6_info_release(rt); 4383 goto cleanup; 4384 } 4385 4386 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; 4387 4388 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4389 rt, &r_cfg); 4390 if (err) { 4391 fib6_info_release(rt); 4392 goto cleanup; 4393 } 4394 4395 rtnh = rtnh_next(rtnh, &remaining); 4396 } 4397 4398 /* for add and replace send one notification with all nexthops. 4399 * Skip the notification in fib6_add_rt2node and send one with 4400 * the full route when done 4401 */ 4402 info->skip_notify = 1; 4403 4404 err_nh = NULL; 4405 list_for_each_entry(nh, &rt6_nh_list, next) { 4406 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4407 fib6_info_release(nh->fib6_info); 4408 4409 if (!err) { 4410 /* save reference to last route successfully inserted */ 4411 rt_last = nh->fib6_info; 4412 4413 /* save reference to first route for notification */ 4414 if (!rt_notif) 4415 rt_notif = nh->fib6_info; 4416 } 4417 4418 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 4419 nh->fib6_info = NULL; 4420 if (err) { 4421 if (replace && nhn) 4422 ip6_print_replace_route_err(&rt6_nh_list); 4423 err_nh = nh; 4424 goto add_errout; 4425 } 4426 4427 /* Because each route is added like a single route we remove 4428 * these flags after the first nexthop: if there is a collision, 4429 * we have already failed to add the first nexthop: 4430 * fib6_add_rt2node() has rejected it; when replacing, old 4431 * nexthops have been replaced by first new, the rest should 4432 * be added to it. 4433 */ 4434 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4435 NLM_F_REPLACE); 4436 nhn++; 4437 } 4438 4439 /* success ... tell user about new route */ 4440 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4441 goto cleanup; 4442 4443 add_errout: 4444 /* send notification for routes that were added so that 4445 * the delete notifications sent by ip6_route_del are 4446 * coherent 4447 */ 4448 if (rt_notif) 4449 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4450 4451 /* Delete routes that were already added */ 4452 list_for_each_entry(nh, &rt6_nh_list, next) { 4453 if (err_nh == nh) 4454 break; 4455 ip6_route_del(&nh->r_cfg, extack); 4456 } 4457 4458 cleanup: 4459 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4460 if (nh->fib6_info) 4461 fib6_info_release(nh->fib6_info); 4462 list_del(&nh->next); 4463 kfree(nh); 4464 } 4465 4466 return err; 4467 } 4468 4469 static int ip6_route_multipath_del(struct fib6_config *cfg, 4470 struct netlink_ext_ack *extack) 4471 { 4472 struct fib6_config r_cfg; 4473 struct rtnexthop *rtnh; 4474 int remaining; 4475 int attrlen; 4476 int err = 1, last_err = 0; 4477 4478 remaining = cfg->fc_mp_len; 4479 rtnh = (struct rtnexthop *)cfg->fc_mp; 4480 4481 /* Parse a Multipath Entry */ 4482 while (rtnh_ok(rtnh, remaining)) { 4483 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4484 if (rtnh->rtnh_ifindex) 4485 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4486 4487 attrlen = rtnh_attrlen(rtnh); 4488 if (attrlen > 0) { 4489 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4490 4491 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4492 if (nla) { 4493 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4494 r_cfg.fc_flags |= RTF_GATEWAY; 4495 } 4496 } 4497 err = ip6_route_del(&r_cfg, extack); 4498 if (err) 4499 last_err = err; 4500 4501 rtnh = rtnh_next(rtnh, &remaining); 4502 } 4503 4504 return last_err; 4505 } 4506 4507 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4508 struct netlink_ext_ack *extack) 4509 { 4510 struct fib6_config cfg; 4511 int err; 4512 4513 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4514 if (err < 0) 4515 return err; 4516 4517 if (cfg.fc_mp) 4518 return ip6_route_multipath_del(&cfg, extack); 4519 else { 4520 cfg.fc_delete_all_nh = 1; 4521 return ip6_route_del(&cfg, extack); 4522 } 4523 } 4524 4525 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4526 struct netlink_ext_ack *extack) 4527 { 4528 struct fib6_config cfg; 4529 int err; 4530 4531 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4532 if (err < 0) 4533 return err; 4534 4535 if (cfg.fc_mp) 4536 return ip6_route_multipath_add(&cfg, extack); 4537 else 4538 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4539 } 4540 4541 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4542 { 4543 int nexthop_len = 0; 4544 4545 if (rt->fib6_nsiblings) { 4546 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4547 + NLA_ALIGN(sizeof(struct rtnexthop)) 4548 + nla_total_size(16) /* RTA_GATEWAY */ 4549 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); 4550 4551 nexthop_len *= rt->fib6_nsiblings; 4552 } 4553 4554 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4555 + nla_total_size(16) /* RTA_SRC */ 4556 + nla_total_size(16) /* RTA_DST */ 4557 + nla_total_size(16) /* RTA_GATEWAY */ 4558 + nla_total_size(16) /* RTA_PREFSRC */ 4559 + nla_total_size(4) /* RTA_TABLE */ 4560 + nla_total_size(4) /* RTA_IIF */ 4561 + nla_total_size(4) /* RTA_OIF */ 4562 + nla_total_size(4) /* RTA_PRIORITY */ 4563 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4564 + nla_total_size(sizeof(struct rta_cacheinfo)) 4565 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4566 + nla_total_size(1) /* RTA_PREF */ 4567 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) 4568 + nexthop_len; 4569 } 4570 4571 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, 4572 unsigned int *flags, bool skip_oif) 4573 { 4574 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4575 *flags |= RTNH_F_DEAD; 4576 4577 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { 4578 *flags |= RTNH_F_LINKDOWN; 4579 4580 rcu_read_lock(); 4581 if (fib6_ignore_linkdown(rt)) 4582 *flags |= RTNH_F_DEAD; 4583 rcu_read_unlock(); 4584 } 4585 4586 if (rt->fib6_flags & RTF_GATEWAY) { 4587 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) 4588 goto nla_put_failure; 4589 } 4590 4591 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); 4592 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) 4593 *flags |= RTNH_F_OFFLOAD; 4594 4595 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4596 if (!skip_oif && rt->fib6_nh.nh_dev && 4597 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) 4598 goto nla_put_failure; 4599 4600 if (rt->fib6_nh.nh_lwtstate && 4601 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) 4602 goto nla_put_failure; 4603 4604 return 0; 4605 4606 nla_put_failure: 4607 return -EMSGSIZE; 4608 } 4609 4610 /* add multipath next hop */ 4611 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) 4612 { 4613 const struct net_device *dev = rt->fib6_nh.nh_dev; 4614 struct rtnexthop *rtnh; 4615 unsigned int flags = 0; 4616 4617 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4618 if (!rtnh) 4619 goto nla_put_failure; 4620 4621 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; 4622 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; 4623 4624 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4625 goto nla_put_failure; 4626 4627 rtnh->rtnh_flags = flags; 4628 4629 /* length of rtnetlink header + attributes */ 4630 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4631 4632 return 0; 4633 4634 nla_put_failure: 4635 return -EMSGSIZE; 4636 } 4637 4638 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4639 struct fib6_info *rt, struct dst_entry *dst, 4640 struct in6_addr *dest, struct in6_addr *src, 4641 int iif, int type, u32 portid, u32 seq, 4642 unsigned int flags) 4643 { 4644 struct rtmsg *rtm; 4645 struct nlmsghdr *nlh; 4646 long expires = 0; 4647 u32 *pmetrics; 4648 u32 table; 4649 4650 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4651 if (!nlh) 4652 return -EMSGSIZE; 4653 4654 rtm = nlmsg_data(nlh); 4655 rtm->rtm_family = AF_INET6; 4656 rtm->rtm_dst_len = rt->fib6_dst.plen; 4657 rtm->rtm_src_len = rt->fib6_src.plen; 4658 rtm->rtm_tos = 0; 4659 if (rt->fib6_table) 4660 table = rt->fib6_table->tb6_id; 4661 else 4662 table = RT6_TABLE_UNSPEC; 4663 rtm->rtm_table = table; 4664 if (nla_put_u32(skb, RTA_TABLE, table)) 4665 goto nla_put_failure; 4666 4667 rtm->rtm_type = rt->fib6_type; 4668 rtm->rtm_flags = 0; 4669 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4670 rtm->rtm_protocol = rt->fib6_protocol; 4671 4672 if (rt->fib6_flags & RTF_CACHE) 4673 rtm->rtm_flags |= RTM_F_CLONED; 4674 4675 if (dest) { 4676 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4677 goto nla_put_failure; 4678 rtm->rtm_dst_len = 128; 4679 } else if (rtm->rtm_dst_len) 4680 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) 4681 goto nla_put_failure; 4682 #ifdef CONFIG_IPV6_SUBTREES 4683 if (src) { 4684 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4685 goto nla_put_failure; 4686 rtm->rtm_src_len = 128; 4687 } else if (rtm->rtm_src_len && 4688 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) 4689 goto nla_put_failure; 4690 #endif 4691 if (iif) { 4692 #ifdef CONFIG_IPV6_MROUTE 4693 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { 4694 int err = ip6mr_get_route(net, skb, rtm, portid); 4695 4696 if (err == 0) 4697 return 0; 4698 if (err < 0) 4699 goto nla_put_failure; 4700 } else 4701 #endif 4702 if (nla_put_u32(skb, RTA_IIF, iif)) 4703 goto nla_put_failure; 4704 } else if (dest) { 4705 struct in6_addr saddr_buf; 4706 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4707 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4708 goto nla_put_failure; 4709 } 4710 4711 if (rt->fib6_prefsrc.plen) { 4712 struct in6_addr saddr_buf; 4713 saddr_buf = rt->fib6_prefsrc.addr; 4714 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4715 goto nla_put_failure; 4716 } 4717 4718 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4719 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4720 goto nla_put_failure; 4721 4722 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4723 goto nla_put_failure; 4724 4725 /* For multipath routes, walk the siblings list and add 4726 * each as a nexthop within RTA_MULTIPATH. 4727 */ 4728 if (rt->fib6_nsiblings) { 4729 struct fib6_info *sibling, *next_sibling; 4730 struct nlattr *mp; 4731 4732 mp = nla_nest_start(skb, RTA_MULTIPATH); 4733 if (!mp) 4734 goto nla_put_failure; 4735 4736 if (rt6_add_nexthop(skb, rt) < 0) 4737 goto nla_put_failure; 4738 4739 list_for_each_entry_safe(sibling, next_sibling, 4740 &rt->fib6_siblings, fib6_siblings) { 4741 if (rt6_add_nexthop(skb, sibling) < 0) 4742 goto nla_put_failure; 4743 } 4744 4745 nla_nest_end(skb, mp); 4746 } else { 4747 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4748 goto nla_put_failure; 4749 } 4750 4751 if (rt->fib6_flags & RTF_EXPIRES) { 4752 expires = dst ? dst->expires : rt->expires; 4753 expires -= jiffies; 4754 } 4755 4756 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4757 goto nla_put_failure; 4758 4759 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) 4760 goto nla_put_failure; 4761 4762 4763 nlmsg_end(skb, nlh); 4764 return 0; 4765 4766 nla_put_failure: 4767 nlmsg_cancel(skb, nlh); 4768 return -EMSGSIZE; 4769 } 4770 4771 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4772 { 4773 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4774 struct net *net = arg->net; 4775 4776 if (rt == net->ipv6.fib6_null_entry) 4777 return 0; 4778 4779 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4780 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 4781 4782 /* user wants prefix routes only */ 4783 if (rtm->rtm_flags & RTM_F_PREFIX && 4784 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4785 /* success since this is not a prefix route */ 4786 return 1; 4787 } 4788 } 4789 4790 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4791 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4792 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); 4793 } 4794 4795 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4796 struct netlink_ext_ack *extack) 4797 { 4798 struct net *net = sock_net(in_skb->sk); 4799 struct nlattr *tb[RTA_MAX+1]; 4800 int err, iif = 0, oif = 0; 4801 struct fib6_info *from; 4802 struct dst_entry *dst; 4803 struct rt6_info *rt; 4804 struct sk_buff *skb; 4805 struct rtmsg *rtm; 4806 struct flowi6 fl6; 4807 bool fibmatch; 4808 4809 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4810 extack); 4811 if (err < 0) 4812 goto errout; 4813 4814 err = -EINVAL; 4815 memset(&fl6, 0, sizeof(fl6)); 4816 rtm = nlmsg_data(nlh); 4817 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4818 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4819 4820 if (tb[RTA_SRC]) { 4821 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4822 goto errout; 4823 4824 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4825 } 4826 4827 if (tb[RTA_DST]) { 4828 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4829 goto errout; 4830 4831 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4832 } 4833 4834 if (tb[RTA_IIF]) 4835 iif = nla_get_u32(tb[RTA_IIF]); 4836 4837 if (tb[RTA_OIF]) 4838 oif = nla_get_u32(tb[RTA_OIF]); 4839 4840 if (tb[RTA_MARK]) 4841 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4842 4843 if (tb[RTA_UID]) 4844 fl6.flowi6_uid = make_kuid(current_user_ns(), 4845 nla_get_u32(tb[RTA_UID])); 4846 else 4847 fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); 4848 4849 if (tb[RTA_SPORT]) 4850 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 4851 4852 if (tb[RTA_DPORT]) 4853 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 4854 4855 if (tb[RTA_IP_PROTO]) { 4856 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 4857 &fl6.flowi6_proto, extack); 4858 if (err) 4859 goto errout; 4860 } 4861 4862 if (iif) { 4863 struct net_device *dev; 4864 int flags = 0; 4865 4866 rcu_read_lock(); 4867 4868 dev = dev_get_by_index_rcu(net, iif); 4869 if (!dev) { 4870 rcu_read_unlock(); 4871 err = -ENODEV; 4872 goto errout; 4873 } 4874 4875 fl6.flowi6_iif = iif; 4876 4877 if (!ipv6_addr_any(&fl6.saddr)) 4878 flags |= RT6_LOOKUP_F_HAS_SADDR; 4879 4880 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 4881 4882 rcu_read_unlock(); 4883 } else { 4884 fl6.flowi6_oif = oif; 4885 4886 dst = ip6_route_output(net, NULL, &fl6); 4887 } 4888 4889 4890 rt = container_of(dst, struct rt6_info, dst); 4891 if (rt->dst.error) { 4892 err = rt->dst.error; 4893 ip6_rt_put(rt); 4894 goto errout; 4895 } 4896 4897 if (rt == net->ipv6.ip6_null_entry) { 4898 err = rt->dst.error; 4899 ip6_rt_put(rt); 4900 goto errout; 4901 } 4902 4903 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 4904 if (!skb) { 4905 ip6_rt_put(rt); 4906 err = -ENOBUFS; 4907 goto errout; 4908 } 4909 4910 skb_dst_set(skb, &rt->dst); 4911 4912 rcu_read_lock(); 4913 from = rcu_dereference(rt->from); 4914 4915 if (fibmatch) 4916 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 4917 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4918 nlh->nlmsg_seq, 0); 4919 else 4920 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 4921 &fl6.saddr, iif, RTM_NEWROUTE, 4922 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 4923 0); 4924 rcu_read_unlock(); 4925 4926 if (err < 0) { 4927 kfree_skb(skb); 4928 goto errout; 4929 } 4930 4931 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 4932 errout: 4933 return err; 4934 } 4935 4936 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 4937 unsigned int nlm_flags) 4938 { 4939 struct sk_buff *skb; 4940 struct net *net = info->nl_net; 4941 u32 seq; 4942 int err; 4943 4944 err = -ENOBUFS; 4945 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 4946 4947 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 4948 if (!skb) 4949 goto errout; 4950 4951 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 4952 event, info->portid, seq, nlm_flags); 4953 if (err < 0) { 4954 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 4955 WARN_ON(err == -EMSGSIZE); 4956 kfree_skb(skb); 4957 goto errout; 4958 } 4959 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 4960 info->nlh, gfp_any()); 4961 return; 4962 errout: 4963 if (err < 0) 4964 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 4965 } 4966 4967 static int ip6_route_dev_notify(struct notifier_block *this, 4968 unsigned long event, void *ptr) 4969 { 4970 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 4971 struct net *net = dev_net(dev); 4972 4973 if (!(dev->flags & IFF_LOOPBACK)) 4974 return NOTIFY_OK; 4975 4976 if (event == NETDEV_REGISTER) { 4977 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; 4978 net->ipv6.ip6_null_entry->dst.dev = dev; 4979 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 4980 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4981 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 4982 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 4983 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 4984 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 4985 #endif 4986 } else if (event == NETDEV_UNREGISTER && 4987 dev->reg_state != NETREG_UNREGISTERED) { 4988 /* NETDEV_UNREGISTER could be fired for multiple times by 4989 * netdev_wait_allrefs(). Make sure we only call this once. 4990 */ 4991 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 4992 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4993 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 4994 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 4995 #endif 4996 } 4997 4998 return NOTIFY_OK; 4999 } 5000 5001 /* 5002 * /proc 5003 */ 5004 5005 #ifdef CONFIG_PROC_FS 5006 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 5007 { 5008 struct net *net = (struct net *)seq->private; 5009 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 5010 net->ipv6.rt6_stats->fib_nodes, 5011 net->ipv6.rt6_stats->fib_route_nodes, 5012 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 5013 net->ipv6.rt6_stats->fib_rt_entries, 5014 net->ipv6.rt6_stats->fib_rt_cache, 5015 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 5016 net->ipv6.rt6_stats->fib_discarded_routes); 5017 5018 return 0; 5019 } 5020 #endif /* CONFIG_PROC_FS */ 5021 5022 #ifdef CONFIG_SYSCTL 5023 5024 static 5025 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 5026 void __user *buffer, size_t *lenp, loff_t *ppos) 5027 { 5028 struct net *net; 5029 int delay; 5030 if (!write) 5031 return -EINVAL; 5032 5033 net = (struct net *)ctl->extra1; 5034 delay = net->ipv6.sysctl.flush_delay; 5035 proc_dointvec(ctl, write, buffer, lenp, ppos); 5036 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); 5037 return 0; 5038 } 5039 5040 struct ctl_table ipv6_route_table_template[] = { 5041 { 5042 .procname = "flush", 5043 .data = &init_net.ipv6.sysctl.flush_delay, 5044 .maxlen = sizeof(int), 5045 .mode = 0200, 5046 .proc_handler = ipv6_sysctl_rtcache_flush 5047 }, 5048 { 5049 .procname = "gc_thresh", 5050 .data = &ip6_dst_ops_template.gc_thresh, 5051 .maxlen = sizeof(int), 5052 .mode = 0644, 5053 .proc_handler = proc_dointvec, 5054 }, 5055 { 5056 .procname = "max_size", 5057 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5058 .maxlen = sizeof(int), 5059 .mode = 0644, 5060 .proc_handler = proc_dointvec, 5061 }, 5062 { 5063 .procname = "gc_min_interval", 5064 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5065 .maxlen = sizeof(int), 5066 .mode = 0644, 5067 .proc_handler = proc_dointvec_jiffies, 5068 }, 5069 { 5070 .procname = "gc_timeout", 5071 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5072 .maxlen = sizeof(int), 5073 .mode = 0644, 5074 .proc_handler = proc_dointvec_jiffies, 5075 }, 5076 { 5077 .procname = "gc_interval", 5078 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5079 .maxlen = sizeof(int), 5080 .mode = 0644, 5081 .proc_handler = proc_dointvec_jiffies, 5082 }, 5083 { 5084 .procname = "gc_elasticity", 5085 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5086 .maxlen = sizeof(int), 5087 .mode = 0644, 5088 .proc_handler = proc_dointvec, 5089 }, 5090 { 5091 .procname = "mtu_expires", 5092 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5093 .maxlen = sizeof(int), 5094 .mode = 0644, 5095 .proc_handler = proc_dointvec_jiffies, 5096 }, 5097 { 5098 .procname = "min_adv_mss", 5099 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5100 .maxlen = sizeof(int), 5101 .mode = 0644, 5102 .proc_handler = proc_dointvec, 5103 }, 5104 { 5105 .procname = "gc_min_interval_ms", 5106 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5107 .maxlen = sizeof(int), 5108 .mode = 0644, 5109 .proc_handler = proc_dointvec_ms_jiffies, 5110 }, 5111 { } 5112 }; 5113 5114 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5115 { 5116 struct ctl_table *table; 5117 5118 table = kmemdup(ipv6_route_table_template, 5119 sizeof(ipv6_route_table_template), 5120 GFP_KERNEL); 5121 5122 if (table) { 5123 table[0].data = &net->ipv6.sysctl.flush_delay; 5124 table[0].extra1 = net; 5125 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5126 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5127 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5128 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5129 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5130 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5131 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5132 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5133 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5134 5135 /* Don't export sysctls to unprivileged users */ 5136 if (net->user_ns != &init_user_ns) 5137 table[0].procname = NULL; 5138 } 5139 5140 return table; 5141 } 5142 #endif 5143 5144 static int __net_init ip6_route_net_init(struct net *net) 5145 { 5146 int ret = -ENOMEM; 5147 5148 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5149 sizeof(net->ipv6.ip6_dst_ops)); 5150 5151 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5152 goto out_ip6_dst_ops; 5153 5154 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5155 sizeof(*net->ipv6.fib6_null_entry), 5156 GFP_KERNEL); 5157 if (!net->ipv6.fib6_null_entry) 5158 goto out_ip6_dst_entries; 5159 5160 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5161 sizeof(*net->ipv6.ip6_null_entry), 5162 GFP_KERNEL); 5163 if (!net->ipv6.ip6_null_entry) 5164 goto out_fib6_null_entry; 5165 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5166 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5167 ip6_template_metrics, true); 5168 5169 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5170 net->ipv6.fib6_has_custom_rules = false; 5171 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5172 sizeof(*net->ipv6.ip6_prohibit_entry), 5173 GFP_KERNEL); 5174 if (!net->ipv6.ip6_prohibit_entry) 5175 goto out_ip6_null_entry; 5176 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5177 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5178 ip6_template_metrics, true); 5179 5180 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5181 sizeof(*net->ipv6.ip6_blk_hole_entry), 5182 GFP_KERNEL); 5183 if (!net->ipv6.ip6_blk_hole_entry) 5184 goto out_ip6_prohibit_entry; 5185 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5186 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5187 ip6_template_metrics, true); 5188 #endif 5189 5190 net->ipv6.sysctl.flush_delay = 0; 5191 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5192 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5193 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5194 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5195 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5196 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5197 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5198 5199 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5200 5201 ret = 0; 5202 out: 5203 return ret; 5204 5205 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5206 out_ip6_prohibit_entry: 5207 kfree(net->ipv6.ip6_prohibit_entry); 5208 out_ip6_null_entry: 5209 kfree(net->ipv6.ip6_null_entry); 5210 #endif 5211 out_fib6_null_entry: 5212 kfree(net->ipv6.fib6_null_entry); 5213 out_ip6_dst_entries: 5214 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5215 out_ip6_dst_ops: 5216 goto out; 5217 } 5218 5219 static void __net_exit ip6_route_net_exit(struct net *net) 5220 { 5221 kfree(net->ipv6.fib6_null_entry); 5222 kfree(net->ipv6.ip6_null_entry); 5223 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5224 kfree(net->ipv6.ip6_prohibit_entry); 5225 kfree(net->ipv6.ip6_blk_hole_entry); 5226 #endif 5227 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5228 } 5229 5230 static int __net_init ip6_route_net_init_late(struct net *net) 5231 { 5232 #ifdef CONFIG_PROC_FS 5233 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5234 sizeof(struct ipv6_route_iter)); 5235 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5236 rt6_stats_seq_show, NULL); 5237 #endif 5238 return 0; 5239 } 5240 5241 static void __net_exit ip6_route_net_exit_late(struct net *net) 5242 { 5243 #ifdef CONFIG_PROC_FS 5244 remove_proc_entry("ipv6_route", net->proc_net); 5245 remove_proc_entry("rt6_stats", net->proc_net); 5246 #endif 5247 } 5248 5249 static struct pernet_operations ip6_route_net_ops = { 5250 .init = ip6_route_net_init, 5251 .exit = ip6_route_net_exit, 5252 }; 5253 5254 static int __net_init ipv6_inetpeer_init(struct net *net) 5255 { 5256 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5257 5258 if (!bp) 5259 return -ENOMEM; 5260 inet_peer_base_init(bp); 5261 net->ipv6.peers = bp; 5262 return 0; 5263 } 5264 5265 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5266 { 5267 struct inet_peer_base *bp = net->ipv6.peers; 5268 5269 net->ipv6.peers = NULL; 5270 inetpeer_invalidate_tree(bp); 5271 kfree(bp); 5272 } 5273 5274 static struct pernet_operations ipv6_inetpeer_ops = { 5275 .init = ipv6_inetpeer_init, 5276 .exit = ipv6_inetpeer_exit, 5277 }; 5278 5279 static struct pernet_operations ip6_route_net_late_ops = { 5280 .init = ip6_route_net_init_late, 5281 .exit = ip6_route_net_exit_late, 5282 }; 5283 5284 static struct notifier_block ip6_route_dev_notifier = { 5285 .notifier_call = ip6_route_dev_notify, 5286 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5287 }; 5288 5289 void __init ip6_route_init_special_entries(void) 5290 { 5291 /* Registering of the loopback is done before this portion of code, 5292 * the loopback reference in rt6_info will not be taken, do it 5293 * manually for init_net */ 5294 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; 5295 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5296 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5297 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5298 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5299 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5300 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5301 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5302 #endif 5303 } 5304 5305 int __init ip6_route_init(void) 5306 { 5307 int ret; 5308 int cpu; 5309 5310 ret = -ENOMEM; 5311 ip6_dst_ops_template.kmem_cachep = 5312 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5313 SLAB_HWCACHE_ALIGN, NULL); 5314 if (!ip6_dst_ops_template.kmem_cachep) 5315 goto out; 5316 5317 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5318 if (ret) 5319 goto out_kmem_cache; 5320 5321 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5322 if (ret) 5323 goto out_dst_entries; 5324 5325 ret = register_pernet_subsys(&ip6_route_net_ops); 5326 if (ret) 5327 goto out_register_inetpeer; 5328 5329 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5330 5331 ret = fib6_init(); 5332 if (ret) 5333 goto out_register_subsys; 5334 5335 ret = xfrm6_init(); 5336 if (ret) 5337 goto out_fib6_init; 5338 5339 ret = fib6_rules_init(); 5340 if (ret) 5341 goto xfrm6_init; 5342 5343 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5344 if (ret) 5345 goto fib6_rules_init; 5346 5347 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5348 inet6_rtm_newroute, NULL, 0); 5349 if (ret < 0) 5350 goto out_register_late_subsys; 5351 5352 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5353 inet6_rtm_delroute, NULL, 0); 5354 if (ret < 0) 5355 goto out_register_late_subsys; 5356 5357 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5358 inet6_rtm_getroute, NULL, 5359 RTNL_FLAG_DOIT_UNLOCKED); 5360 if (ret < 0) 5361 goto out_register_late_subsys; 5362 5363 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5364 if (ret) 5365 goto out_register_late_subsys; 5366 5367 for_each_possible_cpu(cpu) { 5368 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 5369 5370 INIT_LIST_HEAD(&ul->head); 5371 spin_lock_init(&ul->lock); 5372 } 5373 5374 out: 5375 return ret; 5376 5377 out_register_late_subsys: 5378 rtnl_unregister_all(PF_INET6); 5379 unregister_pernet_subsys(&ip6_route_net_late_ops); 5380 fib6_rules_init: 5381 fib6_rules_cleanup(); 5382 xfrm6_init: 5383 xfrm6_fini(); 5384 out_fib6_init: 5385 fib6_gc_cleanup(); 5386 out_register_subsys: 5387 unregister_pernet_subsys(&ip6_route_net_ops); 5388 out_register_inetpeer: 5389 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5390 out_dst_entries: 5391 dst_entries_destroy(&ip6_dst_blackhole_ops); 5392 out_kmem_cache: 5393 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5394 goto out; 5395 } 5396 5397 void ip6_route_cleanup(void) 5398 { 5399 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5400 unregister_pernet_subsys(&ip6_route_net_late_ops); 5401 fib6_rules_cleanup(); 5402 xfrm6_fini(); 5403 fib6_gc_cleanup(); 5404 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5405 unregister_pernet_subsys(&ip6_route_net_ops); 5406 dst_entries_destroy(&ip6_dst_blackhole_ops); 5407 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5408 } 5409