1 /* 2 * Linux INET6 implementation 3 * FIB front-end. 4 * 5 * Authors: 6 * Pedro Roque <roque@di.fc.ul.pt> 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public License 10 * as published by the Free Software Foundation; either version 11 * 2 of the License, or (at your option) any later version. 12 */ 13 14 /* Changes: 15 * 16 * YOSHIFUJI Hideaki @USAGI 17 * reworked default router selection. 18 * - respect outgoing interface 19 * - select from (probably) reachable routers (i.e. 20 * routers in REACHABLE, STALE, DELAY or PROBE states). 21 * - always select the same router if it is (probably) 22 * reachable. otherwise, round-robin the list. 23 * Ville Nuorvala 24 * Fixed routing subtrees. 25 */ 26 27 #define pr_fmt(fmt) "IPv6: " fmt 28 29 #include <linux/capability.h> 30 #include <linux/errno.h> 31 #include <linux/export.h> 32 #include <linux/types.h> 33 #include <linux/times.h> 34 #include <linux/socket.h> 35 #include <linux/sockios.h> 36 #include <linux/net.h> 37 #include <linux/route.h> 38 #include <linux/netdevice.h> 39 #include <linux/in6.h> 40 #include <linux/mroute6.h> 41 #include <linux/init.h> 42 #include <linux/if_arp.h> 43 #include <linux/proc_fs.h> 44 #include <linux/seq_file.h> 45 #include <linux/nsproxy.h> 46 #include <linux/slab.h> 47 #include <linux/jhash.h> 48 #include <net/net_namespace.h> 49 #include <net/snmp.h> 50 #include <net/ipv6.h> 51 #include <net/ip6_fib.h> 52 #include <net/ip6_route.h> 53 #include <net/ndisc.h> 54 #include <net/addrconf.h> 55 #include <net/tcp.h> 56 #include <linux/rtnetlink.h> 57 #include <net/dst.h> 58 #include <net/dst_metadata.h> 59 #include <net/xfrm.h> 60 #include <net/netevent.h> 61 #include <net/netlink.h> 62 #include <net/nexthop.h> 63 #include <net/lwtunnel.h> 64 #include <net/ip_tunnels.h> 65 #include <net/l3mdev.h> 66 #include <net/ip.h> 67 #include <linux/uaccess.h> 68 69 #ifdef CONFIG_SYSCTL 70 #include <linux/sysctl.h> 71 #endif 72 73 static int ip6_rt_type_to_error(u8 fib6_type); 74 75 #define CREATE_TRACE_POINTS 76 #include <trace/events/fib6.h> 77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); 78 #undef CREATE_TRACE_POINTS 79 80 enum rt6_nud_state { 81 RT6_NUD_FAIL_HARD = -3, 82 RT6_NUD_FAIL_PROBE = -2, 83 RT6_NUD_FAIL_DO_RR = -1, 84 RT6_NUD_SUCCEED = 1 85 }; 86 87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); 88 static unsigned int ip6_default_advmss(const struct dst_entry *dst); 89 static unsigned int ip6_mtu(const struct dst_entry *dst); 90 static struct dst_entry *ip6_negative_advice(struct dst_entry *); 91 static void ip6_dst_destroy(struct dst_entry *); 92 static void ip6_dst_ifdown(struct dst_entry *, 93 struct net_device *dev, int how); 94 static int ip6_dst_gc(struct dst_ops *ops); 95 96 static int ip6_pkt_discard(struct sk_buff *skb); 97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); 98 static int ip6_pkt_prohibit(struct sk_buff *skb); 99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); 100 static void ip6_link_failure(struct sk_buff *skb); 101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 102 struct sk_buff *skb, u32 mtu); 103 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, 104 struct sk_buff *skb); 105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict); 106 static size_t rt6_nlmsg_size(struct fib6_info *rt); 107 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 108 struct fib6_info *rt, struct dst_entry *dst, 109 struct in6_addr *dest, struct in6_addr *src, 110 int iif, int type, u32 portid, u32 seq, 111 unsigned int flags); 112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 113 struct in6_addr *daddr, 114 struct in6_addr *saddr); 115 116 #ifdef CONFIG_IPV6_ROUTE_INFO 117 static struct fib6_info *rt6_add_route_info(struct net *net, 118 const struct in6_addr *prefix, int prefixlen, 119 const struct in6_addr *gwaddr, 120 struct net_device *dev, 121 unsigned int pref); 122 static struct fib6_info *rt6_get_route_info(struct net *net, 123 const struct in6_addr *prefix, int prefixlen, 124 const struct in6_addr *gwaddr, 125 struct net_device *dev); 126 #endif 127 128 struct uncached_list { 129 spinlock_t lock; 130 struct list_head head; 131 }; 132 133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); 134 135 void rt6_uncached_list_add(struct rt6_info *rt) 136 { 137 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); 138 139 rt->rt6i_uncached_list = ul; 140 141 spin_lock_bh(&ul->lock); 142 list_add_tail(&rt->rt6i_uncached, &ul->head); 143 spin_unlock_bh(&ul->lock); 144 } 145 146 void rt6_uncached_list_del(struct rt6_info *rt) 147 { 148 if (!list_empty(&rt->rt6i_uncached)) { 149 struct uncached_list *ul = rt->rt6i_uncached_list; 150 struct net *net = dev_net(rt->dst.dev); 151 152 spin_lock_bh(&ul->lock); 153 list_del(&rt->rt6i_uncached); 154 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); 155 spin_unlock_bh(&ul->lock); 156 } 157 } 158 159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) 160 { 161 struct net_device *loopback_dev = net->loopback_dev; 162 int cpu; 163 164 if (dev == loopback_dev) 165 return; 166 167 for_each_possible_cpu(cpu) { 168 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 169 struct rt6_info *rt; 170 171 spin_lock_bh(&ul->lock); 172 list_for_each_entry(rt, &ul->head, rt6i_uncached) { 173 struct inet6_dev *rt_idev = rt->rt6i_idev; 174 struct net_device *rt_dev = rt->dst.dev; 175 176 if (rt_idev->dev == dev) { 177 rt->rt6i_idev = in6_dev_get(loopback_dev); 178 in6_dev_put(rt_idev); 179 } 180 181 if (rt_dev == dev) { 182 rt->dst.dev = loopback_dev; 183 dev_hold(rt->dst.dev); 184 dev_put(rt_dev); 185 } 186 } 187 spin_unlock_bh(&ul->lock); 188 } 189 } 190 191 static inline const void *choose_neigh_daddr(const struct in6_addr *p, 192 struct sk_buff *skb, 193 const void *daddr) 194 { 195 if (!ipv6_addr_any(p)) 196 return (const void *) p; 197 else if (skb) 198 return &ipv6_hdr(skb)->daddr; 199 return daddr; 200 } 201 202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, 203 struct net_device *dev, 204 struct sk_buff *skb, 205 const void *daddr) 206 { 207 struct neighbour *n; 208 209 daddr = choose_neigh_daddr(gw, skb, daddr); 210 n = __ipv6_neigh_lookup(dev, daddr); 211 if (n) 212 return n; 213 return neigh_create(&nd_tbl, daddr, dev); 214 } 215 216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, 217 struct sk_buff *skb, 218 const void *daddr) 219 { 220 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); 221 222 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr); 223 } 224 225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) 226 { 227 struct net_device *dev = dst->dev; 228 struct rt6_info *rt = (struct rt6_info *)dst; 229 230 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); 231 if (!daddr) 232 return; 233 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) 234 return; 235 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) 236 return; 237 __ipv6_confirm_neigh(dev, daddr); 238 } 239 240 static struct dst_ops ip6_dst_ops_template = { 241 .family = AF_INET6, 242 .gc = ip6_dst_gc, 243 .gc_thresh = 1024, 244 .check = ip6_dst_check, 245 .default_advmss = ip6_default_advmss, 246 .mtu = ip6_mtu, 247 .cow_metrics = dst_cow_metrics_generic, 248 .destroy = ip6_dst_destroy, 249 .ifdown = ip6_dst_ifdown, 250 .negative_advice = ip6_negative_advice, 251 .link_failure = ip6_link_failure, 252 .update_pmtu = ip6_rt_update_pmtu, 253 .redirect = rt6_do_redirect, 254 .local_out = __ip6_local_out, 255 .neigh_lookup = ip6_dst_neigh_lookup, 256 .confirm_neigh = ip6_confirm_neigh, 257 }; 258 259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) 260 { 261 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 262 263 return mtu ? : dst->dev->mtu; 264 } 265 266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 267 struct sk_buff *skb, u32 mtu) 268 { 269 } 270 271 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 272 struct sk_buff *skb) 273 { 274 } 275 276 static struct dst_ops ip6_dst_blackhole_ops = { 277 .family = AF_INET6, 278 .destroy = ip6_dst_destroy, 279 .check = ip6_dst_check, 280 .mtu = ip6_blackhole_mtu, 281 .default_advmss = ip6_default_advmss, 282 .update_pmtu = ip6_rt_blackhole_update_pmtu, 283 .redirect = ip6_rt_blackhole_redirect, 284 .cow_metrics = dst_cow_metrics_generic, 285 .neigh_lookup = ip6_dst_neigh_lookup, 286 }; 287 288 static const u32 ip6_template_metrics[RTAX_MAX] = { 289 [RTAX_HOPLIMIT - 1] = 0, 290 }; 291 292 static const struct fib6_info fib6_null_entry_template = { 293 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), 294 .fib6_protocol = RTPROT_KERNEL, 295 .fib6_metric = ~(u32)0, 296 .fib6_ref = ATOMIC_INIT(1), 297 .fib6_type = RTN_UNREACHABLE, 298 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, 299 }; 300 301 static const struct rt6_info ip6_null_entry_template = { 302 .dst = { 303 .__refcnt = ATOMIC_INIT(1), 304 .__use = 1, 305 .obsolete = DST_OBSOLETE_FORCE_CHK, 306 .error = -ENETUNREACH, 307 .input = ip6_pkt_discard, 308 .output = ip6_pkt_discard_out, 309 }, 310 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 311 }; 312 313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 314 315 static const struct rt6_info ip6_prohibit_entry_template = { 316 .dst = { 317 .__refcnt = ATOMIC_INIT(1), 318 .__use = 1, 319 .obsolete = DST_OBSOLETE_FORCE_CHK, 320 .error = -EACCES, 321 .input = ip6_pkt_prohibit, 322 .output = ip6_pkt_prohibit_out, 323 }, 324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 325 }; 326 327 static const struct rt6_info ip6_blk_hole_entry_template = { 328 .dst = { 329 .__refcnt = ATOMIC_INIT(1), 330 .__use = 1, 331 .obsolete = DST_OBSOLETE_FORCE_CHK, 332 .error = -EINVAL, 333 .input = dst_discard, 334 .output = dst_discard_out, 335 }, 336 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 337 }; 338 339 #endif 340 341 static void rt6_info_init(struct rt6_info *rt) 342 { 343 struct dst_entry *dst = &rt->dst; 344 345 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); 346 INIT_LIST_HEAD(&rt->rt6i_uncached); 347 } 348 349 /* allocate dst with ip6_dst_ops */ 350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, 351 int flags) 352 { 353 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 354 1, DST_OBSOLETE_FORCE_CHK, flags); 355 356 if (rt) { 357 rt6_info_init(rt); 358 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 359 } 360 361 return rt; 362 } 363 EXPORT_SYMBOL(ip6_dst_alloc); 364 365 static void ip6_dst_destroy(struct dst_entry *dst) 366 { 367 struct rt6_info *rt = (struct rt6_info *)dst; 368 struct fib6_info *from; 369 struct inet6_dev *idev; 370 371 dst_destroy_metrics_generic(dst); 372 rt6_uncached_list_del(rt); 373 374 idev = rt->rt6i_idev; 375 if (idev) { 376 rt->rt6i_idev = NULL; 377 in6_dev_put(idev); 378 } 379 380 rcu_read_lock(); 381 from = rcu_dereference(rt->from); 382 rcu_assign_pointer(rt->from, NULL); 383 fib6_info_release(from); 384 rcu_read_unlock(); 385 } 386 387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 388 int how) 389 { 390 struct rt6_info *rt = (struct rt6_info *)dst; 391 struct inet6_dev *idev = rt->rt6i_idev; 392 struct net_device *loopback_dev = 393 dev_net(dev)->loopback_dev; 394 395 if (idev && idev->dev != loopback_dev) { 396 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); 397 if (loopback_idev) { 398 rt->rt6i_idev = loopback_idev; 399 in6_dev_put(idev); 400 } 401 } 402 } 403 404 static bool __rt6_check_expired(const struct rt6_info *rt) 405 { 406 if (rt->rt6i_flags & RTF_EXPIRES) 407 return time_after(jiffies, rt->dst.expires); 408 else 409 return false; 410 } 411 412 static bool rt6_check_expired(const struct rt6_info *rt) 413 { 414 struct fib6_info *from; 415 416 from = rcu_dereference(rt->from); 417 418 if (rt->rt6i_flags & RTF_EXPIRES) { 419 if (time_after(jiffies, rt->dst.expires)) 420 return true; 421 } else if (from) { 422 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || 423 fib6_check_expired(from); 424 } 425 return false; 426 } 427 428 struct fib6_info *fib6_multipath_select(const struct net *net, 429 struct fib6_info *match, 430 struct flowi6 *fl6, int oif, 431 const struct sk_buff *skb, 432 int strict) 433 { 434 struct fib6_info *sibling, *next_sibling; 435 436 /* We might have already computed the hash for ICMPv6 errors. In such 437 * case it will always be non-zero. Otherwise now is the time to do it. 438 */ 439 if (!fl6->mp_hash) 440 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); 441 442 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) 443 return match; 444 445 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, 446 fib6_siblings) { 447 int nh_upper_bound; 448 449 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); 450 if (fl6->mp_hash > nh_upper_bound) 451 continue; 452 if (rt6_score_route(sibling, oif, strict) < 0) 453 break; 454 match = sibling; 455 break; 456 } 457 458 return match; 459 } 460 461 /* 462 * Route lookup. rcu_read_lock() should be held. 463 */ 464 465 static inline struct fib6_info *rt6_device_match(struct net *net, 466 struct fib6_info *rt, 467 const struct in6_addr *saddr, 468 int oif, 469 int flags) 470 { 471 struct fib6_info *sprt; 472 473 if (!oif && ipv6_addr_any(saddr) && 474 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) 475 return rt; 476 477 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) { 478 const struct net_device *dev = sprt->fib6_nh.nh_dev; 479 480 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) 481 continue; 482 483 if (oif) { 484 if (dev->ifindex == oif) 485 return sprt; 486 } else { 487 if (ipv6_chk_addr(net, saddr, dev, 488 flags & RT6_LOOKUP_F_IFACE)) 489 return sprt; 490 } 491 } 492 493 if (oif && flags & RT6_LOOKUP_F_IFACE) 494 return net->ipv6.fib6_null_entry; 495 496 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt; 497 } 498 499 #ifdef CONFIG_IPV6_ROUTER_PREF 500 struct __rt6_probe_work { 501 struct work_struct work; 502 struct in6_addr target; 503 struct net_device *dev; 504 }; 505 506 static void rt6_probe_deferred(struct work_struct *w) 507 { 508 struct in6_addr mcaddr; 509 struct __rt6_probe_work *work = 510 container_of(w, struct __rt6_probe_work, work); 511 512 addrconf_addr_solict_mult(&work->target, &mcaddr); 513 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); 514 dev_put(work->dev); 515 kfree(work); 516 } 517 518 static void rt6_probe(struct fib6_info *rt) 519 { 520 struct __rt6_probe_work *work; 521 const struct in6_addr *nh_gw; 522 struct neighbour *neigh; 523 struct net_device *dev; 524 525 /* 526 * Okay, this does not seem to be appropriate 527 * for now, however, we need to check if it 528 * is really so; aka Router Reachability Probing. 529 * 530 * Router Reachability Probe MUST be rate-limited 531 * to no more than one per minute. 532 */ 533 if (!rt || !(rt->fib6_flags & RTF_GATEWAY)) 534 return; 535 536 nh_gw = &rt->fib6_nh.nh_gw; 537 dev = rt->fib6_nh.nh_dev; 538 rcu_read_lock_bh(); 539 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); 540 if (neigh) { 541 struct inet6_dev *idev; 542 543 if (neigh->nud_state & NUD_VALID) 544 goto out; 545 546 idev = __in6_dev_get(dev); 547 work = NULL; 548 write_lock(&neigh->lock); 549 if (!(neigh->nud_state & NUD_VALID) && 550 time_after(jiffies, 551 neigh->updated + idev->cnf.rtr_probe_interval)) { 552 work = kmalloc(sizeof(*work), GFP_ATOMIC); 553 if (work) 554 __neigh_set_probe_once(neigh); 555 } 556 write_unlock(&neigh->lock); 557 } else { 558 work = kmalloc(sizeof(*work), GFP_ATOMIC); 559 } 560 561 if (work) { 562 INIT_WORK(&work->work, rt6_probe_deferred); 563 work->target = *nh_gw; 564 dev_hold(dev); 565 work->dev = dev; 566 schedule_work(&work->work); 567 } 568 569 out: 570 rcu_read_unlock_bh(); 571 } 572 #else 573 static inline void rt6_probe(struct fib6_info *rt) 574 { 575 } 576 #endif 577 578 /* 579 * Default Router Selection (RFC 2461 6.3.6) 580 */ 581 static inline int rt6_check_dev(struct fib6_info *rt, int oif) 582 { 583 const struct net_device *dev = rt->fib6_nh.nh_dev; 584 585 if (!oif || dev->ifindex == oif) 586 return 2; 587 return 0; 588 } 589 590 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) 591 { 592 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; 593 struct neighbour *neigh; 594 595 if (rt->fib6_flags & RTF_NONEXTHOP || 596 !(rt->fib6_flags & RTF_GATEWAY)) 597 return RT6_NUD_SUCCEED; 598 599 rcu_read_lock_bh(); 600 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev, 601 &rt->fib6_nh.nh_gw); 602 if (neigh) { 603 read_lock(&neigh->lock); 604 if (neigh->nud_state & NUD_VALID) 605 ret = RT6_NUD_SUCCEED; 606 #ifdef CONFIG_IPV6_ROUTER_PREF 607 else if (!(neigh->nud_state & NUD_FAILED)) 608 ret = RT6_NUD_SUCCEED; 609 else 610 ret = RT6_NUD_FAIL_PROBE; 611 #endif 612 read_unlock(&neigh->lock); 613 } else { 614 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? 615 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; 616 } 617 rcu_read_unlock_bh(); 618 619 return ret; 620 } 621 622 static int rt6_score_route(struct fib6_info *rt, int oif, int strict) 623 { 624 int m; 625 626 m = rt6_check_dev(rt, oif); 627 if (!m && (strict & RT6_LOOKUP_F_IFACE)) 628 return RT6_NUD_FAIL_HARD; 629 #ifdef CONFIG_IPV6_ROUTER_PREF 630 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; 631 #endif 632 if (strict & RT6_LOOKUP_F_REACHABLE) { 633 int n = rt6_check_neigh(rt); 634 if (n < 0) 635 return n; 636 } 637 return m; 638 } 639 640 /* called with rc_read_lock held */ 641 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i) 642 { 643 const struct net_device *dev = fib6_info_nh_dev(f6i); 644 bool rc = false; 645 646 if (dev) { 647 const struct inet6_dev *idev = __in6_dev_get(dev); 648 649 rc = !!idev->cnf.ignore_routes_with_linkdown; 650 } 651 652 return rc; 653 } 654 655 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, 656 int *mpri, struct fib6_info *match, 657 bool *do_rr) 658 { 659 int m; 660 bool match_do_rr = false; 661 662 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 663 goto out; 664 665 if (fib6_ignore_linkdown(rt) && 666 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 667 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) 668 goto out; 669 670 if (fib6_check_expired(rt)) 671 goto out; 672 673 m = rt6_score_route(rt, oif, strict); 674 if (m == RT6_NUD_FAIL_DO_RR) { 675 match_do_rr = true; 676 m = 0; /* lowest valid score */ 677 } else if (m == RT6_NUD_FAIL_HARD) { 678 goto out; 679 } 680 681 if (strict & RT6_LOOKUP_F_REACHABLE) 682 rt6_probe(rt); 683 684 /* note that m can be RT6_NUD_FAIL_PROBE at this point */ 685 if (m > *mpri) { 686 *do_rr = match_do_rr; 687 *mpri = m; 688 match = rt; 689 } 690 out: 691 return match; 692 } 693 694 static struct fib6_info *find_rr_leaf(struct fib6_node *fn, 695 struct fib6_info *leaf, 696 struct fib6_info *rr_head, 697 u32 metric, int oif, int strict, 698 bool *do_rr) 699 { 700 struct fib6_info *rt, *match, *cont; 701 int mpri = -1; 702 703 match = NULL; 704 cont = NULL; 705 for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) { 706 if (rt->fib6_metric != metric) { 707 cont = rt; 708 break; 709 } 710 711 match = find_match(rt, oif, strict, &mpri, match, do_rr); 712 } 713 714 for (rt = leaf; rt && rt != rr_head; 715 rt = rcu_dereference(rt->fib6_next)) { 716 if (rt->fib6_metric != metric) { 717 cont = rt; 718 break; 719 } 720 721 match = find_match(rt, oif, strict, &mpri, match, do_rr); 722 } 723 724 if (match || !cont) 725 return match; 726 727 for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next)) 728 match = find_match(rt, oif, strict, &mpri, match, do_rr); 729 730 return match; 731 } 732 733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, 734 int oif, int strict) 735 { 736 struct fib6_info *leaf = rcu_dereference(fn->leaf); 737 struct fib6_info *match, *rt0; 738 bool do_rr = false; 739 int key_plen; 740 741 if (!leaf || leaf == net->ipv6.fib6_null_entry) 742 return net->ipv6.fib6_null_entry; 743 744 rt0 = rcu_dereference(fn->rr_ptr); 745 if (!rt0) 746 rt0 = leaf; 747 748 /* Double check to make sure fn is not an intermediate node 749 * and fn->leaf does not points to its child's leaf 750 * (This might happen if all routes under fn are deleted from 751 * the tree and fib6_repair_tree() is called on the node.) 752 */ 753 key_plen = rt0->fib6_dst.plen; 754 #ifdef CONFIG_IPV6_SUBTREES 755 if (rt0->fib6_src.plen) 756 key_plen = rt0->fib6_src.plen; 757 #endif 758 if (fn->fn_bit != key_plen) 759 return net->ipv6.fib6_null_entry; 760 761 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, 762 &do_rr); 763 764 if (do_rr) { 765 struct fib6_info *next = rcu_dereference(rt0->fib6_next); 766 767 /* no entries matched; do round-robin */ 768 if (!next || next->fib6_metric != rt0->fib6_metric) 769 next = leaf; 770 771 if (next != rt0) { 772 spin_lock_bh(&leaf->fib6_table->tb6_lock); 773 /* make sure next is not being deleted from the tree */ 774 if (next->fib6_node) 775 rcu_assign_pointer(fn->rr_ptr, next); 776 spin_unlock_bh(&leaf->fib6_table->tb6_lock); 777 } 778 } 779 780 return match ? match : net->ipv6.fib6_null_entry; 781 } 782 783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) 784 { 785 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); 786 } 787 788 #ifdef CONFIG_IPV6_ROUTE_INFO 789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, 790 const struct in6_addr *gwaddr) 791 { 792 struct net *net = dev_net(dev); 793 struct route_info *rinfo = (struct route_info *) opt; 794 struct in6_addr prefix_buf, *prefix; 795 unsigned int pref; 796 unsigned long lifetime; 797 struct fib6_info *rt; 798 799 if (len < sizeof(struct route_info)) { 800 return -EINVAL; 801 } 802 803 /* Sanity check for prefix_len and length */ 804 if (rinfo->length > 3) { 805 return -EINVAL; 806 } else if (rinfo->prefix_len > 128) { 807 return -EINVAL; 808 } else if (rinfo->prefix_len > 64) { 809 if (rinfo->length < 2) { 810 return -EINVAL; 811 } 812 } else if (rinfo->prefix_len > 0) { 813 if (rinfo->length < 1) { 814 return -EINVAL; 815 } 816 } 817 818 pref = rinfo->route_pref; 819 if (pref == ICMPV6_ROUTER_PREF_INVALID) 820 return -EINVAL; 821 822 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); 823 824 if (rinfo->length == 3) 825 prefix = (struct in6_addr *)rinfo->prefix; 826 else { 827 /* this function is safe */ 828 ipv6_addr_prefix(&prefix_buf, 829 (struct in6_addr *)rinfo->prefix, 830 rinfo->prefix_len); 831 prefix = &prefix_buf; 832 } 833 834 if (rinfo->prefix_len == 0) 835 rt = rt6_get_dflt_router(net, gwaddr, dev); 836 else 837 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, 838 gwaddr, dev); 839 840 if (rt && !lifetime) { 841 ip6_del_rt(net, rt); 842 rt = NULL; 843 } 844 845 if (!rt && lifetime) 846 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, 847 dev, pref); 848 else if (rt) 849 rt->fib6_flags = RTF_ROUTEINFO | 850 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 851 852 if (rt) { 853 if (!addrconf_finite_timeout(lifetime)) 854 fib6_clean_expires(rt); 855 else 856 fib6_set_expires(rt, jiffies + HZ * lifetime); 857 858 fib6_info_release(rt); 859 } 860 return 0; 861 } 862 #endif 863 864 /* 865 * Misc support functions 866 */ 867 868 /* called with rcu_lock held */ 869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) 870 { 871 struct net_device *dev = rt->fib6_nh.nh_dev; 872 873 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { 874 /* for copies of local routes, dst->dev needs to be the 875 * device if it is a master device, the master device if 876 * device is enslaved, and the loopback as the default 877 */ 878 if (netif_is_l3_slave(dev) && 879 !rt6_need_strict(&rt->fib6_dst.addr)) 880 dev = l3mdev_master_dev_rcu(dev); 881 else if (!netif_is_l3_master(dev)) 882 dev = dev_net(dev)->loopback_dev; 883 /* last case is netif_is_l3_master(dev) is true in which 884 * case we want dev returned to be dev 885 */ 886 } 887 888 return dev; 889 } 890 891 static const int fib6_prop[RTN_MAX + 1] = { 892 [RTN_UNSPEC] = 0, 893 [RTN_UNICAST] = 0, 894 [RTN_LOCAL] = 0, 895 [RTN_BROADCAST] = 0, 896 [RTN_ANYCAST] = 0, 897 [RTN_MULTICAST] = 0, 898 [RTN_BLACKHOLE] = -EINVAL, 899 [RTN_UNREACHABLE] = -EHOSTUNREACH, 900 [RTN_PROHIBIT] = -EACCES, 901 [RTN_THROW] = -EAGAIN, 902 [RTN_NAT] = -EINVAL, 903 [RTN_XRESOLVE] = -EINVAL, 904 }; 905 906 static int ip6_rt_type_to_error(u8 fib6_type) 907 { 908 return fib6_prop[fib6_type]; 909 } 910 911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt) 912 { 913 unsigned short flags = 0; 914 915 if (rt->dst_nocount) 916 flags |= DST_NOCOUNT; 917 if (rt->dst_nopolicy) 918 flags |= DST_NOPOLICY; 919 if (rt->dst_host) 920 flags |= DST_HOST; 921 922 return flags; 923 } 924 925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) 926 { 927 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); 928 929 switch (ort->fib6_type) { 930 case RTN_BLACKHOLE: 931 rt->dst.output = dst_discard_out; 932 rt->dst.input = dst_discard; 933 break; 934 case RTN_PROHIBIT: 935 rt->dst.output = ip6_pkt_prohibit_out; 936 rt->dst.input = ip6_pkt_prohibit; 937 break; 938 case RTN_THROW: 939 case RTN_UNREACHABLE: 940 default: 941 rt->dst.output = ip6_pkt_discard_out; 942 rt->dst.input = ip6_pkt_discard; 943 break; 944 } 945 } 946 947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) 948 { 949 rt->dst.flags |= fib6_info_dst_flags(ort); 950 951 if (ort->fib6_flags & RTF_REJECT) { 952 ip6_rt_init_dst_reject(rt, ort); 953 return; 954 } 955 956 rt->dst.error = 0; 957 rt->dst.output = ip6_output; 958 959 if (ort->fib6_type == RTN_LOCAL) { 960 rt->dst.input = ip6_input; 961 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { 962 rt->dst.input = ip6_mc_input; 963 } else { 964 rt->dst.input = ip6_forward; 965 } 966 967 if (ort->fib6_nh.nh_lwtstate) { 968 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); 969 lwtunnel_set_redirect(&rt->dst); 970 } 971 972 rt->dst.lastuse = jiffies; 973 } 974 975 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) 976 { 977 rt->rt6i_flags &= ~RTF_EXPIRES; 978 fib6_info_hold(from); 979 rcu_assign_pointer(rt->from, from); 980 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); 981 if (from->fib6_metrics != &dst_default_metrics) { 982 rt->dst._metrics |= DST_METRICS_REFCOUNTED; 983 refcount_inc(&from->fib6_metrics->refcnt); 984 } 985 } 986 987 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) 988 { 989 struct net_device *dev = fib6_info_nh_dev(ort); 990 991 ip6_rt_init_dst(rt, ort); 992 993 rt->rt6i_dst = ort->fib6_dst; 994 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; 995 rt->rt6i_gateway = ort->fib6_nh.nh_gw; 996 rt->rt6i_flags = ort->fib6_flags; 997 rt6_set_from(rt, ort); 998 #ifdef CONFIG_IPV6_SUBTREES 999 rt->rt6i_src = ort->fib6_src; 1000 #endif 1001 rt->rt6i_prefsrc = ort->fib6_prefsrc; 1002 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); 1003 } 1004 1005 static struct fib6_node* fib6_backtrack(struct fib6_node *fn, 1006 struct in6_addr *saddr) 1007 { 1008 struct fib6_node *pn, *sn; 1009 while (1) { 1010 if (fn->fn_flags & RTN_TL_ROOT) 1011 return NULL; 1012 pn = rcu_dereference(fn->parent); 1013 sn = FIB6_SUBTREE(pn); 1014 if (sn && sn != fn) 1015 fn = fib6_node_lookup(sn, NULL, saddr); 1016 else 1017 fn = pn; 1018 if (fn->fn_flags & RTN_RTINFO) 1019 return fn; 1020 } 1021 } 1022 1023 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, 1024 bool null_fallback) 1025 { 1026 struct rt6_info *rt = *prt; 1027 1028 if (dst_hold_safe(&rt->dst)) 1029 return true; 1030 if (null_fallback) { 1031 rt = net->ipv6.ip6_null_entry; 1032 dst_hold(&rt->dst); 1033 } else { 1034 rt = NULL; 1035 } 1036 *prt = rt; 1037 return false; 1038 } 1039 1040 /* called with rcu_lock held */ 1041 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) 1042 { 1043 unsigned short flags = fib6_info_dst_flags(rt); 1044 struct net_device *dev = rt->fib6_nh.nh_dev; 1045 struct rt6_info *nrt; 1046 1047 nrt = ip6_dst_alloc(dev_net(dev), dev, flags); 1048 if (nrt) 1049 ip6_rt_copy_init(nrt, rt); 1050 1051 return nrt; 1052 } 1053 1054 static struct rt6_info *ip6_pol_route_lookup(struct net *net, 1055 struct fib6_table *table, 1056 struct flowi6 *fl6, 1057 const struct sk_buff *skb, 1058 int flags) 1059 { 1060 struct fib6_info *f6i; 1061 struct fib6_node *fn; 1062 struct rt6_info *rt; 1063 1064 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1065 flags &= ~RT6_LOOKUP_F_IFACE; 1066 1067 rcu_read_lock(); 1068 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1069 restart: 1070 f6i = rcu_dereference(fn->leaf); 1071 if (!f6i) { 1072 f6i = net->ipv6.fib6_null_entry; 1073 } else { 1074 f6i = rt6_device_match(net, f6i, &fl6->saddr, 1075 fl6->flowi6_oif, flags); 1076 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) 1077 f6i = fib6_multipath_select(net, f6i, fl6, 1078 fl6->flowi6_oif, skb, 1079 flags); 1080 } 1081 if (f6i == net->ipv6.fib6_null_entry) { 1082 fn = fib6_backtrack(fn, &fl6->saddr); 1083 if (fn) 1084 goto restart; 1085 } 1086 1087 trace_fib6_table_lookup(net, f6i, table, fl6); 1088 1089 /* Search through exception table */ 1090 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1091 if (rt) { 1092 if (ip6_hold_safe(net, &rt, true)) 1093 dst_use_noref(&rt->dst, jiffies); 1094 } else if (f6i == net->ipv6.fib6_null_entry) { 1095 rt = net->ipv6.ip6_null_entry; 1096 dst_hold(&rt->dst); 1097 } else { 1098 rt = ip6_create_rt_rcu(f6i); 1099 if (!rt) { 1100 rt = net->ipv6.ip6_null_entry; 1101 dst_hold(&rt->dst); 1102 } 1103 } 1104 1105 rcu_read_unlock(); 1106 1107 return rt; 1108 } 1109 1110 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, 1111 const struct sk_buff *skb, int flags) 1112 { 1113 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); 1114 } 1115 EXPORT_SYMBOL_GPL(ip6_route_lookup); 1116 1117 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, 1118 const struct in6_addr *saddr, int oif, 1119 const struct sk_buff *skb, int strict) 1120 { 1121 struct flowi6 fl6 = { 1122 .flowi6_oif = oif, 1123 .daddr = *daddr, 1124 }; 1125 struct dst_entry *dst; 1126 int flags = strict ? RT6_LOOKUP_F_IFACE : 0; 1127 1128 if (saddr) { 1129 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 1130 flags |= RT6_LOOKUP_F_HAS_SADDR; 1131 } 1132 1133 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); 1134 if (dst->error == 0) 1135 return (struct rt6_info *) dst; 1136 1137 dst_release(dst); 1138 1139 return NULL; 1140 } 1141 EXPORT_SYMBOL(rt6_lookup); 1142 1143 /* ip6_ins_rt is called with FREE table->tb6_lock. 1144 * It takes new route entry, the addition fails by any reason the 1145 * route is released. 1146 * Caller must hold dst before calling it. 1147 */ 1148 1149 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, 1150 struct netlink_ext_ack *extack) 1151 { 1152 int err; 1153 struct fib6_table *table; 1154 1155 table = rt->fib6_table; 1156 spin_lock_bh(&table->tb6_lock); 1157 err = fib6_add(&table->tb6_root, rt, info, extack); 1158 spin_unlock_bh(&table->tb6_lock); 1159 1160 return err; 1161 } 1162 1163 int ip6_ins_rt(struct net *net, struct fib6_info *rt) 1164 { 1165 struct nl_info info = { .nl_net = net, }; 1166 1167 return __ip6_ins_rt(rt, &info, NULL); 1168 } 1169 1170 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, 1171 const struct in6_addr *daddr, 1172 const struct in6_addr *saddr) 1173 { 1174 struct net_device *dev; 1175 struct rt6_info *rt; 1176 1177 /* 1178 * Clone the route. 1179 */ 1180 1181 dev = ip6_rt_get_dev_rcu(ort); 1182 rt = ip6_dst_alloc(dev_net(dev), dev, 0); 1183 if (!rt) 1184 return NULL; 1185 1186 ip6_rt_copy_init(rt, ort); 1187 rt->rt6i_flags |= RTF_CACHE; 1188 rt->dst.flags |= DST_HOST; 1189 rt->rt6i_dst.addr = *daddr; 1190 rt->rt6i_dst.plen = 128; 1191 1192 if (!rt6_is_gw_or_nonexthop(ort)) { 1193 if (ort->fib6_dst.plen != 128 && 1194 ipv6_addr_equal(&ort->fib6_dst.addr, daddr)) 1195 rt->rt6i_flags |= RTF_ANYCAST; 1196 #ifdef CONFIG_IPV6_SUBTREES 1197 if (rt->rt6i_src.plen && saddr) { 1198 rt->rt6i_src.addr = *saddr; 1199 rt->rt6i_src.plen = 128; 1200 } 1201 #endif 1202 } 1203 1204 return rt; 1205 } 1206 1207 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) 1208 { 1209 unsigned short flags = fib6_info_dst_flags(rt); 1210 struct net_device *dev; 1211 struct rt6_info *pcpu_rt; 1212 1213 rcu_read_lock(); 1214 dev = ip6_rt_get_dev_rcu(rt); 1215 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); 1216 rcu_read_unlock(); 1217 if (!pcpu_rt) 1218 return NULL; 1219 ip6_rt_copy_init(pcpu_rt, rt); 1220 pcpu_rt->rt6i_flags |= RTF_PCPU; 1221 return pcpu_rt; 1222 } 1223 1224 /* It should be called with rcu_read_lock() acquired */ 1225 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) 1226 { 1227 struct rt6_info *pcpu_rt, **p; 1228 1229 p = this_cpu_ptr(rt->rt6i_pcpu); 1230 pcpu_rt = *p; 1231 1232 if (pcpu_rt) 1233 ip6_hold_safe(NULL, &pcpu_rt, false); 1234 1235 return pcpu_rt; 1236 } 1237 1238 static struct rt6_info *rt6_make_pcpu_route(struct net *net, 1239 struct fib6_info *rt) 1240 { 1241 struct rt6_info *pcpu_rt, *prev, **p; 1242 1243 pcpu_rt = ip6_rt_pcpu_alloc(rt); 1244 if (!pcpu_rt) { 1245 dst_hold(&net->ipv6.ip6_null_entry->dst); 1246 return net->ipv6.ip6_null_entry; 1247 } 1248 1249 dst_hold(&pcpu_rt->dst); 1250 p = this_cpu_ptr(rt->rt6i_pcpu); 1251 prev = cmpxchg(p, NULL, pcpu_rt); 1252 BUG_ON(prev); 1253 1254 return pcpu_rt; 1255 } 1256 1257 /* exception hash table implementation 1258 */ 1259 static DEFINE_SPINLOCK(rt6_exception_lock); 1260 1261 /* Remove rt6_ex from hash table and free the memory 1262 * Caller must hold rt6_exception_lock 1263 */ 1264 static void rt6_remove_exception(struct rt6_exception_bucket *bucket, 1265 struct rt6_exception *rt6_ex) 1266 { 1267 struct net *net; 1268 1269 if (!bucket || !rt6_ex) 1270 return; 1271 1272 net = dev_net(rt6_ex->rt6i->dst.dev); 1273 hlist_del_rcu(&rt6_ex->hlist); 1274 dst_release(&rt6_ex->rt6i->dst); 1275 kfree_rcu(rt6_ex, rcu); 1276 WARN_ON_ONCE(!bucket->depth); 1277 bucket->depth--; 1278 net->ipv6.rt6_stats->fib_rt_cache--; 1279 } 1280 1281 /* Remove oldest rt6_ex in bucket and free the memory 1282 * Caller must hold rt6_exception_lock 1283 */ 1284 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) 1285 { 1286 struct rt6_exception *rt6_ex, *oldest = NULL; 1287 1288 if (!bucket) 1289 return; 1290 1291 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1292 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) 1293 oldest = rt6_ex; 1294 } 1295 rt6_remove_exception(bucket, oldest); 1296 } 1297 1298 static u32 rt6_exception_hash(const struct in6_addr *dst, 1299 const struct in6_addr *src) 1300 { 1301 static u32 seed __read_mostly; 1302 u32 val; 1303 1304 net_get_random_once(&seed, sizeof(seed)); 1305 val = jhash(dst, sizeof(*dst), seed); 1306 1307 #ifdef CONFIG_IPV6_SUBTREES 1308 if (src) 1309 val = jhash(src, sizeof(*src), val); 1310 #endif 1311 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); 1312 } 1313 1314 /* Helper function to find the cached rt in the hash table 1315 * and update bucket pointer to point to the bucket for this 1316 * (daddr, saddr) pair 1317 * Caller must hold rt6_exception_lock 1318 */ 1319 static struct rt6_exception * 1320 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, 1321 const struct in6_addr *daddr, 1322 const struct in6_addr *saddr) 1323 { 1324 struct rt6_exception *rt6_ex; 1325 u32 hval; 1326 1327 if (!(*bucket) || !daddr) 1328 return NULL; 1329 1330 hval = rt6_exception_hash(daddr, saddr); 1331 *bucket += hval; 1332 1333 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { 1334 struct rt6_info *rt6 = rt6_ex->rt6i; 1335 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1336 1337 #ifdef CONFIG_IPV6_SUBTREES 1338 if (matched && saddr) 1339 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1340 #endif 1341 if (matched) 1342 return rt6_ex; 1343 } 1344 return NULL; 1345 } 1346 1347 /* Helper function to find the cached rt in the hash table 1348 * and update bucket pointer to point to the bucket for this 1349 * (daddr, saddr) pair 1350 * Caller must hold rcu_read_lock() 1351 */ 1352 static struct rt6_exception * 1353 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, 1354 const struct in6_addr *daddr, 1355 const struct in6_addr *saddr) 1356 { 1357 struct rt6_exception *rt6_ex; 1358 u32 hval; 1359 1360 WARN_ON_ONCE(!rcu_read_lock_held()); 1361 1362 if (!(*bucket) || !daddr) 1363 return NULL; 1364 1365 hval = rt6_exception_hash(daddr, saddr); 1366 *bucket += hval; 1367 1368 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { 1369 struct rt6_info *rt6 = rt6_ex->rt6i; 1370 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1371 1372 #ifdef CONFIG_IPV6_SUBTREES 1373 if (matched && saddr) 1374 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1375 #endif 1376 if (matched) 1377 return rt6_ex; 1378 } 1379 return NULL; 1380 } 1381 1382 static unsigned int fib6_mtu(const struct fib6_info *rt) 1383 { 1384 unsigned int mtu; 1385 1386 if (rt->fib6_pmtu) { 1387 mtu = rt->fib6_pmtu; 1388 } else { 1389 struct net_device *dev = fib6_info_nh_dev(rt); 1390 struct inet6_dev *idev; 1391 1392 rcu_read_lock(); 1393 idev = __in6_dev_get(dev); 1394 mtu = idev->cnf.mtu6; 1395 rcu_read_unlock(); 1396 } 1397 1398 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 1399 1400 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); 1401 } 1402 1403 static int rt6_insert_exception(struct rt6_info *nrt, 1404 struct fib6_info *ort) 1405 { 1406 struct net *net = dev_net(nrt->dst.dev); 1407 struct rt6_exception_bucket *bucket; 1408 struct in6_addr *src_key = NULL; 1409 struct rt6_exception *rt6_ex; 1410 int err = 0; 1411 1412 spin_lock_bh(&rt6_exception_lock); 1413 1414 if (ort->exception_bucket_flushed) { 1415 err = -EINVAL; 1416 goto out; 1417 } 1418 1419 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, 1420 lockdep_is_held(&rt6_exception_lock)); 1421 if (!bucket) { 1422 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), 1423 GFP_ATOMIC); 1424 if (!bucket) { 1425 err = -ENOMEM; 1426 goto out; 1427 } 1428 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); 1429 } 1430 1431 #ifdef CONFIG_IPV6_SUBTREES 1432 /* rt6i_src.plen != 0 indicates ort is in subtree 1433 * and exception table is indexed by a hash of 1434 * both rt6i_dst and rt6i_src. 1435 * Otherwise, the exception table is indexed by 1436 * a hash of only rt6i_dst. 1437 */ 1438 if (ort->fib6_src.plen) 1439 src_key = &nrt->rt6i_src.addr; 1440 #endif 1441 1442 /* Update rt6i_prefsrc as it could be changed 1443 * in rt6_remove_prefsrc() 1444 */ 1445 nrt->rt6i_prefsrc = ort->fib6_prefsrc; 1446 /* rt6_mtu_change() might lower mtu on ort. 1447 * Only insert this exception route if its mtu 1448 * is less than ort's mtu value. 1449 */ 1450 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { 1451 err = -EINVAL; 1452 goto out; 1453 } 1454 1455 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, 1456 src_key); 1457 if (rt6_ex) 1458 rt6_remove_exception(bucket, rt6_ex); 1459 1460 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); 1461 if (!rt6_ex) { 1462 err = -ENOMEM; 1463 goto out; 1464 } 1465 rt6_ex->rt6i = nrt; 1466 rt6_ex->stamp = jiffies; 1467 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); 1468 bucket->depth++; 1469 net->ipv6.rt6_stats->fib_rt_cache++; 1470 1471 if (bucket->depth > FIB6_MAX_DEPTH) 1472 rt6_exception_remove_oldest(bucket); 1473 1474 out: 1475 spin_unlock_bh(&rt6_exception_lock); 1476 1477 /* Update fn->fn_sernum to invalidate all cached dst */ 1478 if (!err) { 1479 spin_lock_bh(&ort->fib6_table->tb6_lock); 1480 fib6_update_sernum(net, ort); 1481 spin_unlock_bh(&ort->fib6_table->tb6_lock); 1482 fib6_force_start_gc(net); 1483 } 1484 1485 return err; 1486 } 1487 1488 void rt6_flush_exceptions(struct fib6_info *rt) 1489 { 1490 struct rt6_exception_bucket *bucket; 1491 struct rt6_exception *rt6_ex; 1492 struct hlist_node *tmp; 1493 int i; 1494 1495 spin_lock_bh(&rt6_exception_lock); 1496 /* Prevent rt6_insert_exception() to recreate the bucket list */ 1497 rt->exception_bucket_flushed = 1; 1498 1499 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1500 lockdep_is_held(&rt6_exception_lock)); 1501 if (!bucket) 1502 goto out; 1503 1504 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1505 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) 1506 rt6_remove_exception(bucket, rt6_ex); 1507 WARN_ON_ONCE(bucket->depth); 1508 bucket++; 1509 } 1510 1511 out: 1512 spin_unlock_bh(&rt6_exception_lock); 1513 } 1514 1515 /* Find cached rt in the hash table inside passed in rt 1516 * Caller has to hold rcu_read_lock() 1517 */ 1518 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 1519 struct in6_addr *daddr, 1520 struct in6_addr *saddr) 1521 { 1522 struct rt6_exception_bucket *bucket; 1523 struct in6_addr *src_key = NULL; 1524 struct rt6_exception *rt6_ex; 1525 struct rt6_info *res = NULL; 1526 1527 bucket = rcu_dereference(rt->rt6i_exception_bucket); 1528 1529 #ifdef CONFIG_IPV6_SUBTREES 1530 /* rt6i_src.plen != 0 indicates rt is in subtree 1531 * and exception table is indexed by a hash of 1532 * both rt6i_dst and rt6i_src. 1533 * Otherwise, the exception table is indexed by 1534 * a hash of only rt6i_dst. 1535 */ 1536 if (rt->fib6_src.plen) 1537 src_key = saddr; 1538 #endif 1539 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 1540 1541 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 1542 res = rt6_ex->rt6i; 1543 1544 return res; 1545 } 1546 1547 /* Remove the passed in cached rt from the hash table that contains it */ 1548 static int rt6_remove_exception_rt(struct rt6_info *rt) 1549 { 1550 struct rt6_exception_bucket *bucket; 1551 struct in6_addr *src_key = NULL; 1552 struct rt6_exception *rt6_ex; 1553 struct fib6_info *from; 1554 int err; 1555 1556 from = rcu_dereference(rt->from); 1557 if (!from || 1558 !(rt->rt6i_flags & RTF_CACHE)) 1559 return -EINVAL; 1560 1561 if (!rcu_access_pointer(from->rt6i_exception_bucket)) 1562 return -ENOENT; 1563 1564 spin_lock_bh(&rt6_exception_lock); 1565 bucket = rcu_dereference_protected(from->rt6i_exception_bucket, 1566 lockdep_is_held(&rt6_exception_lock)); 1567 #ifdef CONFIG_IPV6_SUBTREES 1568 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1569 * and exception table is indexed by a hash of 1570 * both rt6i_dst and rt6i_src. 1571 * Otherwise, the exception table is indexed by 1572 * a hash of only rt6i_dst. 1573 */ 1574 if (from->fib6_src.plen) 1575 src_key = &rt->rt6i_src.addr; 1576 #endif 1577 rt6_ex = __rt6_find_exception_spinlock(&bucket, 1578 &rt->rt6i_dst.addr, 1579 src_key); 1580 if (rt6_ex) { 1581 rt6_remove_exception(bucket, rt6_ex); 1582 err = 0; 1583 } else { 1584 err = -ENOENT; 1585 } 1586 1587 spin_unlock_bh(&rt6_exception_lock); 1588 return err; 1589 } 1590 1591 /* Find rt6_ex which contains the passed in rt cache and 1592 * refresh its stamp 1593 */ 1594 static void rt6_update_exception_stamp_rt(struct rt6_info *rt) 1595 { 1596 struct rt6_exception_bucket *bucket; 1597 struct fib6_info *from = rt->from; 1598 struct in6_addr *src_key = NULL; 1599 struct rt6_exception *rt6_ex; 1600 1601 if (!from || 1602 !(rt->rt6i_flags & RTF_CACHE)) 1603 return; 1604 1605 rcu_read_lock(); 1606 bucket = rcu_dereference(from->rt6i_exception_bucket); 1607 1608 #ifdef CONFIG_IPV6_SUBTREES 1609 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1610 * and exception table is indexed by a hash of 1611 * both rt6i_dst and rt6i_src. 1612 * Otherwise, the exception table is indexed by 1613 * a hash of only rt6i_dst. 1614 */ 1615 if (from->fib6_src.plen) 1616 src_key = &rt->rt6i_src.addr; 1617 #endif 1618 rt6_ex = __rt6_find_exception_rcu(&bucket, 1619 &rt->rt6i_dst.addr, 1620 src_key); 1621 if (rt6_ex) 1622 rt6_ex->stamp = jiffies; 1623 1624 rcu_read_unlock(); 1625 } 1626 1627 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt) 1628 { 1629 struct rt6_exception_bucket *bucket; 1630 struct rt6_exception *rt6_ex; 1631 int i; 1632 1633 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1634 lockdep_is_held(&rt6_exception_lock)); 1635 1636 if (bucket) { 1637 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1638 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1639 rt6_ex->rt6i->rt6i_prefsrc.plen = 0; 1640 } 1641 bucket++; 1642 } 1643 } 1644 } 1645 1646 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, 1647 struct rt6_info *rt, int mtu) 1648 { 1649 /* If the new MTU is lower than the route PMTU, this new MTU will be the 1650 * lowest MTU in the path: always allow updating the route PMTU to 1651 * reflect PMTU decreases. 1652 * 1653 * If the new MTU is higher, and the route PMTU is equal to the local 1654 * MTU, this means the old MTU is the lowest in the path, so allow 1655 * updating it: if other nodes now have lower MTUs, PMTU discovery will 1656 * handle this. 1657 */ 1658 1659 if (dst_mtu(&rt->dst) >= mtu) 1660 return true; 1661 1662 if (dst_mtu(&rt->dst) == idev->cnf.mtu6) 1663 return true; 1664 1665 return false; 1666 } 1667 1668 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, 1669 struct fib6_info *rt, int mtu) 1670 { 1671 struct rt6_exception_bucket *bucket; 1672 struct rt6_exception *rt6_ex; 1673 int i; 1674 1675 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1676 lockdep_is_held(&rt6_exception_lock)); 1677 1678 if (!bucket) 1679 return; 1680 1681 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1682 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1683 struct rt6_info *entry = rt6_ex->rt6i; 1684 1685 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected 1686 * route), the metrics of its rt->from have already 1687 * been updated. 1688 */ 1689 if (dst_metric_raw(&entry->dst, RTAX_MTU) && 1690 rt6_mtu_change_route_allowed(idev, entry, mtu)) 1691 dst_metric_set(&entry->dst, RTAX_MTU, mtu); 1692 } 1693 bucket++; 1694 } 1695 } 1696 1697 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 1698 1699 static void rt6_exceptions_clean_tohost(struct fib6_info *rt, 1700 struct in6_addr *gateway) 1701 { 1702 struct rt6_exception_bucket *bucket; 1703 struct rt6_exception *rt6_ex; 1704 struct hlist_node *tmp; 1705 int i; 1706 1707 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1708 return; 1709 1710 spin_lock_bh(&rt6_exception_lock); 1711 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1712 lockdep_is_held(&rt6_exception_lock)); 1713 1714 if (bucket) { 1715 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1716 hlist_for_each_entry_safe(rt6_ex, tmp, 1717 &bucket->chain, hlist) { 1718 struct rt6_info *entry = rt6_ex->rt6i; 1719 1720 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == 1721 RTF_CACHE_GATEWAY && 1722 ipv6_addr_equal(gateway, 1723 &entry->rt6i_gateway)) { 1724 rt6_remove_exception(bucket, rt6_ex); 1725 } 1726 } 1727 bucket++; 1728 } 1729 } 1730 1731 spin_unlock_bh(&rt6_exception_lock); 1732 } 1733 1734 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, 1735 struct rt6_exception *rt6_ex, 1736 struct fib6_gc_args *gc_args, 1737 unsigned long now) 1738 { 1739 struct rt6_info *rt = rt6_ex->rt6i; 1740 1741 /* we are pruning and obsoleting aged-out and non gateway exceptions 1742 * even if others have still references to them, so that on next 1743 * dst_check() such references can be dropped. 1744 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when 1745 * expired, independently from their aging, as per RFC 8201 section 4 1746 */ 1747 if (!(rt->rt6i_flags & RTF_EXPIRES)) { 1748 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { 1749 RT6_TRACE("aging clone %p\n", rt); 1750 rt6_remove_exception(bucket, rt6_ex); 1751 return; 1752 } 1753 } else if (time_after(jiffies, rt->dst.expires)) { 1754 RT6_TRACE("purging expired route %p\n", rt); 1755 rt6_remove_exception(bucket, rt6_ex); 1756 return; 1757 } 1758 1759 if (rt->rt6i_flags & RTF_GATEWAY) { 1760 struct neighbour *neigh; 1761 __u8 neigh_flags = 0; 1762 1763 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); 1764 if (neigh) 1765 neigh_flags = neigh->flags; 1766 1767 if (!(neigh_flags & NTF_ROUTER)) { 1768 RT6_TRACE("purging route %p via non-router but gateway\n", 1769 rt); 1770 rt6_remove_exception(bucket, rt6_ex); 1771 return; 1772 } 1773 } 1774 1775 gc_args->more++; 1776 } 1777 1778 void rt6_age_exceptions(struct fib6_info *rt, 1779 struct fib6_gc_args *gc_args, 1780 unsigned long now) 1781 { 1782 struct rt6_exception_bucket *bucket; 1783 struct rt6_exception *rt6_ex; 1784 struct hlist_node *tmp; 1785 int i; 1786 1787 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1788 return; 1789 1790 rcu_read_lock_bh(); 1791 spin_lock(&rt6_exception_lock); 1792 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1793 lockdep_is_held(&rt6_exception_lock)); 1794 1795 if (bucket) { 1796 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1797 hlist_for_each_entry_safe(rt6_ex, tmp, 1798 &bucket->chain, hlist) { 1799 rt6_age_examine_exception(bucket, rt6_ex, 1800 gc_args, now); 1801 } 1802 bucket++; 1803 } 1804 } 1805 spin_unlock(&rt6_exception_lock); 1806 rcu_read_unlock_bh(); 1807 } 1808 1809 /* must be called with rcu lock held */ 1810 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, 1811 int oif, struct flowi6 *fl6, int strict) 1812 { 1813 struct fib6_node *fn, *saved_fn; 1814 struct fib6_info *f6i; 1815 1816 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1817 saved_fn = fn; 1818 1819 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1820 oif = 0; 1821 1822 redo_rt6_select: 1823 f6i = rt6_select(net, fn, oif, strict); 1824 if (f6i == net->ipv6.fib6_null_entry) { 1825 fn = fib6_backtrack(fn, &fl6->saddr); 1826 if (fn) 1827 goto redo_rt6_select; 1828 else if (strict & RT6_LOOKUP_F_REACHABLE) { 1829 /* also consider unreachable route */ 1830 strict &= ~RT6_LOOKUP_F_REACHABLE; 1831 fn = saved_fn; 1832 goto redo_rt6_select; 1833 } 1834 } 1835 1836 trace_fib6_table_lookup(net, f6i, table, fl6); 1837 1838 return f6i; 1839 } 1840 1841 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, 1842 int oif, struct flowi6 *fl6, 1843 const struct sk_buff *skb, int flags) 1844 { 1845 struct fib6_info *f6i; 1846 struct rt6_info *rt; 1847 int strict = 0; 1848 1849 strict |= flags & RT6_LOOKUP_F_IFACE; 1850 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; 1851 if (net->ipv6.devconf_all->forwarding == 0) 1852 strict |= RT6_LOOKUP_F_REACHABLE; 1853 1854 rcu_read_lock(); 1855 1856 f6i = fib6_table_lookup(net, table, oif, fl6, strict); 1857 if (f6i->fib6_nsiblings) 1858 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); 1859 1860 if (f6i == net->ipv6.fib6_null_entry) { 1861 rt = net->ipv6.ip6_null_entry; 1862 rcu_read_unlock(); 1863 dst_hold(&rt->dst); 1864 return rt; 1865 } 1866 1867 /*Search through exception table */ 1868 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1869 if (rt) { 1870 if (ip6_hold_safe(net, &rt, true)) 1871 dst_use_noref(&rt->dst, jiffies); 1872 1873 rcu_read_unlock(); 1874 return rt; 1875 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && 1876 !(f6i->fib6_flags & RTF_GATEWAY))) { 1877 /* Create a RTF_CACHE clone which will not be 1878 * owned by the fib6 tree. It is for the special case where 1879 * the daddr in the skb during the neighbor look-up is different 1880 * from the fl6->daddr used to look-up route here. 1881 */ 1882 struct rt6_info *uncached_rt; 1883 1884 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); 1885 1886 rcu_read_unlock(); 1887 1888 if (uncached_rt) { 1889 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() 1890 * No need for another dst_hold() 1891 */ 1892 rt6_uncached_list_add(uncached_rt); 1893 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 1894 } else { 1895 uncached_rt = net->ipv6.ip6_null_entry; 1896 dst_hold(&uncached_rt->dst); 1897 } 1898 1899 return uncached_rt; 1900 } else { 1901 /* Get a percpu copy */ 1902 1903 struct rt6_info *pcpu_rt; 1904 1905 local_bh_disable(); 1906 pcpu_rt = rt6_get_pcpu_route(f6i); 1907 1908 if (!pcpu_rt) 1909 pcpu_rt = rt6_make_pcpu_route(net, f6i); 1910 1911 local_bh_enable(); 1912 rcu_read_unlock(); 1913 1914 return pcpu_rt; 1915 } 1916 } 1917 EXPORT_SYMBOL_GPL(ip6_pol_route); 1918 1919 static struct rt6_info *ip6_pol_route_input(struct net *net, 1920 struct fib6_table *table, 1921 struct flowi6 *fl6, 1922 const struct sk_buff *skb, 1923 int flags) 1924 { 1925 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); 1926 } 1927 1928 struct dst_entry *ip6_route_input_lookup(struct net *net, 1929 struct net_device *dev, 1930 struct flowi6 *fl6, 1931 const struct sk_buff *skb, 1932 int flags) 1933 { 1934 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) 1935 flags |= RT6_LOOKUP_F_IFACE; 1936 1937 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); 1938 } 1939 EXPORT_SYMBOL_GPL(ip6_route_input_lookup); 1940 1941 static void ip6_multipath_l3_keys(const struct sk_buff *skb, 1942 struct flow_keys *keys, 1943 struct flow_keys *flkeys) 1944 { 1945 const struct ipv6hdr *outer_iph = ipv6_hdr(skb); 1946 const struct ipv6hdr *key_iph = outer_iph; 1947 struct flow_keys *_flkeys = flkeys; 1948 const struct ipv6hdr *inner_iph; 1949 const struct icmp6hdr *icmph; 1950 struct ipv6hdr _inner_iph; 1951 struct icmp6hdr _icmph; 1952 1953 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) 1954 goto out; 1955 1956 icmph = skb_header_pointer(skb, skb_transport_offset(skb), 1957 sizeof(_icmph), &_icmph); 1958 if (!icmph) 1959 goto out; 1960 1961 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && 1962 icmph->icmp6_type != ICMPV6_PKT_TOOBIG && 1963 icmph->icmp6_type != ICMPV6_TIME_EXCEED && 1964 icmph->icmp6_type != ICMPV6_PARAMPROB) 1965 goto out; 1966 1967 inner_iph = skb_header_pointer(skb, 1968 skb_transport_offset(skb) + sizeof(*icmph), 1969 sizeof(_inner_iph), &_inner_iph); 1970 if (!inner_iph) 1971 goto out; 1972 1973 key_iph = inner_iph; 1974 _flkeys = NULL; 1975 out: 1976 if (_flkeys) { 1977 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; 1978 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; 1979 keys->tags.flow_label = _flkeys->tags.flow_label; 1980 keys->basic.ip_proto = _flkeys->basic.ip_proto; 1981 } else { 1982 keys->addrs.v6addrs.src = key_iph->saddr; 1983 keys->addrs.v6addrs.dst = key_iph->daddr; 1984 keys->tags.flow_label = ip6_flowlabel(key_iph); 1985 keys->basic.ip_proto = key_iph->nexthdr; 1986 } 1987 } 1988 1989 /* if skb is set it will be used and fl6 can be NULL */ 1990 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 1991 const struct sk_buff *skb, struct flow_keys *flkeys) 1992 { 1993 struct flow_keys hash_keys; 1994 u32 mhash; 1995 1996 switch (ip6_multipath_hash_policy(net)) { 1997 case 0: 1998 memset(&hash_keys, 0, sizeof(hash_keys)); 1999 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2000 if (skb) { 2001 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2002 } else { 2003 hash_keys.addrs.v6addrs.src = fl6->saddr; 2004 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2005 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2006 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2007 } 2008 break; 2009 case 1: 2010 if (skb) { 2011 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 2012 struct flow_keys keys; 2013 2014 /* short-circuit if we already have L4 hash present */ 2015 if (skb->l4_hash) 2016 return skb_get_hash_raw(skb) >> 1; 2017 2018 memset(&hash_keys, 0, sizeof(hash_keys)); 2019 2020 if (!flkeys) { 2021 skb_flow_dissect_flow_keys(skb, &keys, flag); 2022 flkeys = &keys; 2023 } 2024 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2025 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2026 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2027 hash_keys.ports.src = flkeys->ports.src; 2028 hash_keys.ports.dst = flkeys->ports.dst; 2029 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2030 } else { 2031 memset(&hash_keys, 0, sizeof(hash_keys)); 2032 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2033 hash_keys.addrs.v6addrs.src = fl6->saddr; 2034 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2035 hash_keys.ports.src = fl6->fl6_sport; 2036 hash_keys.ports.dst = fl6->fl6_dport; 2037 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2038 } 2039 break; 2040 } 2041 mhash = flow_hash_from_keys(&hash_keys); 2042 2043 return mhash >> 1; 2044 } 2045 2046 void ip6_route_input(struct sk_buff *skb) 2047 { 2048 const struct ipv6hdr *iph = ipv6_hdr(skb); 2049 struct net *net = dev_net(skb->dev); 2050 int flags = RT6_LOOKUP_F_HAS_SADDR; 2051 struct ip_tunnel_info *tun_info; 2052 struct flowi6 fl6 = { 2053 .flowi6_iif = skb->dev->ifindex, 2054 .daddr = iph->daddr, 2055 .saddr = iph->saddr, 2056 .flowlabel = ip6_flowinfo(iph), 2057 .flowi6_mark = skb->mark, 2058 .flowi6_proto = iph->nexthdr, 2059 }; 2060 struct flow_keys *flkeys = NULL, _flkeys; 2061 2062 tun_info = skb_tunnel_info(skb); 2063 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2064 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2065 2066 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2067 flkeys = &_flkeys; 2068 2069 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2070 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2071 skb_dst_drop(skb); 2072 skb_dst_set(skb, 2073 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); 2074 } 2075 2076 static struct rt6_info *ip6_pol_route_output(struct net *net, 2077 struct fib6_table *table, 2078 struct flowi6 *fl6, 2079 const struct sk_buff *skb, 2080 int flags) 2081 { 2082 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2083 } 2084 2085 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, 2086 struct flowi6 *fl6, int flags) 2087 { 2088 bool any_src; 2089 2090 if (rt6_need_strict(&fl6->daddr)) { 2091 struct dst_entry *dst; 2092 2093 dst = l3mdev_link_scope_lookup(net, fl6); 2094 if (dst) 2095 return dst; 2096 } 2097 2098 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2099 2100 any_src = ipv6_addr_any(&fl6->saddr); 2101 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2102 (fl6->flowi6_oif && any_src)) 2103 flags |= RT6_LOOKUP_F_IFACE; 2104 2105 if (!any_src) 2106 flags |= RT6_LOOKUP_F_HAS_SADDR; 2107 else if (sk) 2108 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2109 2110 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2111 } 2112 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2113 2114 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2115 { 2116 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2117 struct net_device *loopback_dev = net->loopback_dev; 2118 struct dst_entry *new = NULL; 2119 2120 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2121 DST_OBSOLETE_DEAD, 0); 2122 if (rt) { 2123 rt6_info_init(rt); 2124 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2125 2126 new = &rt->dst; 2127 new->__use = 1; 2128 new->input = dst_discard; 2129 new->output = dst_discard_out; 2130 2131 dst_copy_metrics(new, &ort->dst); 2132 2133 rt->rt6i_idev = in6_dev_get(loopback_dev); 2134 rt->rt6i_gateway = ort->rt6i_gateway; 2135 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2136 2137 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2138 #ifdef CONFIG_IPV6_SUBTREES 2139 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2140 #endif 2141 } 2142 2143 dst_release(dst_orig); 2144 return new ? new : ERR_PTR(-ENOMEM); 2145 } 2146 2147 /* 2148 * Destination cache support functions 2149 */ 2150 2151 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2152 { 2153 u32 rt_cookie = 0; 2154 2155 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2156 return false; 2157 2158 if (fib6_check_expired(f6i)) 2159 return false; 2160 2161 return true; 2162 } 2163 2164 static struct dst_entry *rt6_check(struct rt6_info *rt, 2165 struct fib6_info *from, 2166 u32 cookie) 2167 { 2168 u32 rt_cookie = 0; 2169 2170 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2171 rt_cookie != cookie) 2172 return NULL; 2173 2174 if (rt6_check_expired(rt)) 2175 return NULL; 2176 2177 return &rt->dst; 2178 } 2179 2180 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2181 struct fib6_info *from, 2182 u32 cookie) 2183 { 2184 if (!__rt6_check_expired(rt) && 2185 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2186 fib6_check(from, cookie)) 2187 return &rt->dst; 2188 else 2189 return NULL; 2190 } 2191 2192 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2193 { 2194 struct dst_entry *dst_ret; 2195 struct fib6_info *from; 2196 struct rt6_info *rt; 2197 2198 rt = container_of(dst, struct rt6_info, dst); 2199 2200 rcu_read_lock(); 2201 2202 /* All IPV6 dsts are created with ->obsolete set to the value 2203 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2204 * into this function always. 2205 */ 2206 2207 from = rcu_dereference(rt->from); 2208 2209 if (from && (rt->rt6i_flags & RTF_PCPU || 2210 unlikely(!list_empty(&rt->rt6i_uncached)))) 2211 dst_ret = rt6_dst_from_check(rt, from, cookie); 2212 else 2213 dst_ret = rt6_check(rt, from, cookie); 2214 2215 rcu_read_unlock(); 2216 2217 return dst_ret; 2218 } 2219 2220 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2221 { 2222 struct rt6_info *rt = (struct rt6_info *) dst; 2223 2224 if (rt) { 2225 if (rt->rt6i_flags & RTF_CACHE) { 2226 rcu_read_lock(); 2227 if (rt6_check_expired(rt)) { 2228 rt6_remove_exception_rt(rt); 2229 dst = NULL; 2230 } 2231 rcu_read_unlock(); 2232 } else { 2233 dst_release(dst); 2234 dst = NULL; 2235 } 2236 } 2237 return dst; 2238 } 2239 2240 static void ip6_link_failure(struct sk_buff *skb) 2241 { 2242 struct rt6_info *rt; 2243 2244 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2245 2246 rt = (struct rt6_info *) skb_dst(skb); 2247 if (rt) { 2248 rcu_read_lock(); 2249 if (rt->rt6i_flags & RTF_CACHE) { 2250 if (dst_hold_safe(&rt->dst)) 2251 rt6_remove_exception_rt(rt); 2252 } else { 2253 struct fib6_info *from; 2254 struct fib6_node *fn; 2255 2256 from = rcu_dereference(rt->from); 2257 if (from) { 2258 fn = rcu_dereference(from->fib6_node); 2259 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2260 fn->fn_sernum = -1; 2261 } 2262 } 2263 rcu_read_unlock(); 2264 } 2265 } 2266 2267 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2268 { 2269 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2270 struct fib6_info *from; 2271 2272 rcu_read_lock(); 2273 from = rcu_dereference(rt0->from); 2274 if (from) 2275 rt0->dst.expires = from->expires; 2276 rcu_read_unlock(); 2277 } 2278 2279 dst_set_expires(&rt0->dst, timeout); 2280 rt0->rt6i_flags |= RTF_EXPIRES; 2281 } 2282 2283 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2284 { 2285 struct net *net = dev_net(rt->dst.dev); 2286 2287 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2288 rt->rt6i_flags |= RTF_MODIFIED; 2289 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2290 } 2291 2292 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2293 { 2294 bool from_set; 2295 2296 rcu_read_lock(); 2297 from_set = !!rcu_dereference(rt->from); 2298 rcu_read_unlock(); 2299 2300 return !(rt->rt6i_flags & RTF_CACHE) && 2301 (rt->rt6i_flags & RTF_PCPU || from_set); 2302 } 2303 2304 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2305 const struct ipv6hdr *iph, u32 mtu) 2306 { 2307 const struct in6_addr *daddr, *saddr; 2308 struct rt6_info *rt6 = (struct rt6_info *)dst; 2309 2310 if (rt6->rt6i_flags & RTF_LOCAL) 2311 return; 2312 2313 if (dst_metric_locked(dst, RTAX_MTU)) 2314 return; 2315 2316 if (iph) { 2317 daddr = &iph->daddr; 2318 saddr = &iph->saddr; 2319 } else if (sk) { 2320 daddr = &sk->sk_v6_daddr; 2321 saddr = &inet6_sk(sk)->saddr; 2322 } else { 2323 daddr = NULL; 2324 saddr = NULL; 2325 } 2326 dst_confirm_neigh(dst, daddr); 2327 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2328 if (mtu >= dst_mtu(dst)) 2329 return; 2330 2331 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2332 rt6_do_update_pmtu(rt6, mtu); 2333 /* update rt6_ex->stamp for cache */ 2334 if (rt6->rt6i_flags & RTF_CACHE) 2335 rt6_update_exception_stamp_rt(rt6); 2336 } else if (daddr) { 2337 struct fib6_info *from; 2338 struct rt6_info *nrt6; 2339 2340 rcu_read_lock(); 2341 from = rcu_dereference(rt6->from); 2342 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2343 if (nrt6) { 2344 rt6_do_update_pmtu(nrt6, mtu); 2345 if (rt6_insert_exception(nrt6, from)) 2346 dst_release_immediate(&nrt6->dst); 2347 } 2348 rcu_read_unlock(); 2349 } 2350 } 2351 2352 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2353 struct sk_buff *skb, u32 mtu) 2354 { 2355 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2356 } 2357 2358 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2359 int oif, u32 mark, kuid_t uid) 2360 { 2361 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2362 struct dst_entry *dst; 2363 struct flowi6 fl6; 2364 2365 memset(&fl6, 0, sizeof(fl6)); 2366 fl6.flowi6_oif = oif; 2367 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2368 fl6.daddr = iph->daddr; 2369 fl6.saddr = iph->saddr; 2370 fl6.flowlabel = ip6_flowinfo(iph); 2371 fl6.flowi6_uid = uid; 2372 2373 dst = ip6_route_output(net, NULL, &fl6); 2374 if (!dst->error) 2375 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2376 dst_release(dst); 2377 } 2378 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2379 2380 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2381 { 2382 struct dst_entry *dst; 2383 2384 ip6_update_pmtu(skb, sock_net(sk), mtu, 2385 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2386 2387 dst = __sk_dst_get(sk); 2388 if (!dst || !dst->obsolete || 2389 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2390 return; 2391 2392 bh_lock_sock(sk); 2393 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2394 ip6_datagram_dst_update(sk, false); 2395 bh_unlock_sock(sk); 2396 } 2397 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2398 2399 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2400 const struct flowi6 *fl6) 2401 { 2402 #ifdef CONFIG_IPV6_SUBTREES 2403 struct ipv6_pinfo *np = inet6_sk(sk); 2404 #endif 2405 2406 ip6_dst_store(sk, dst, 2407 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2408 &sk->sk_v6_daddr : NULL, 2409 #ifdef CONFIG_IPV6_SUBTREES 2410 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2411 &np->saddr : 2412 #endif 2413 NULL); 2414 } 2415 2416 /* Handle redirects */ 2417 struct ip6rd_flowi { 2418 struct flowi6 fl6; 2419 struct in6_addr gateway; 2420 }; 2421 2422 static struct rt6_info *__ip6_route_redirect(struct net *net, 2423 struct fib6_table *table, 2424 struct flowi6 *fl6, 2425 const struct sk_buff *skb, 2426 int flags) 2427 { 2428 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2429 struct rt6_info *ret = NULL, *rt_cache; 2430 struct fib6_info *rt; 2431 struct fib6_node *fn; 2432 2433 /* Get the "current" route for this destination and 2434 * check if the redirect has come from appropriate router. 2435 * 2436 * RFC 4861 specifies that redirects should only be 2437 * accepted if they come from the nexthop to the target. 2438 * Due to the way the routes are chosen, this notion 2439 * is a bit fuzzy and one might need to check all possible 2440 * routes. 2441 */ 2442 2443 rcu_read_lock(); 2444 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2445 restart: 2446 for_each_fib6_node_rt_rcu(fn) { 2447 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2448 continue; 2449 if (fib6_check_expired(rt)) 2450 continue; 2451 if (rt->fib6_flags & RTF_REJECT) 2452 break; 2453 if (!(rt->fib6_flags & RTF_GATEWAY)) 2454 continue; 2455 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2456 continue; 2457 /* rt_cache's gateway might be different from its 'parent' 2458 * in the case of an ip redirect. 2459 * So we keep searching in the exception table if the gateway 2460 * is different. 2461 */ 2462 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2463 rt_cache = rt6_find_cached_rt(rt, 2464 &fl6->daddr, 2465 &fl6->saddr); 2466 if (rt_cache && 2467 ipv6_addr_equal(&rdfl->gateway, 2468 &rt_cache->rt6i_gateway)) { 2469 ret = rt_cache; 2470 break; 2471 } 2472 continue; 2473 } 2474 break; 2475 } 2476 2477 if (!rt) 2478 rt = net->ipv6.fib6_null_entry; 2479 else if (rt->fib6_flags & RTF_REJECT) { 2480 ret = net->ipv6.ip6_null_entry; 2481 goto out; 2482 } 2483 2484 if (rt == net->ipv6.fib6_null_entry) { 2485 fn = fib6_backtrack(fn, &fl6->saddr); 2486 if (fn) 2487 goto restart; 2488 } 2489 2490 out: 2491 if (ret) 2492 dst_hold(&ret->dst); 2493 else 2494 ret = ip6_create_rt_rcu(rt); 2495 2496 rcu_read_unlock(); 2497 2498 trace_fib6_table_lookup(net, rt, table, fl6); 2499 return ret; 2500 }; 2501 2502 static struct dst_entry *ip6_route_redirect(struct net *net, 2503 const struct flowi6 *fl6, 2504 const struct sk_buff *skb, 2505 const struct in6_addr *gateway) 2506 { 2507 int flags = RT6_LOOKUP_F_HAS_SADDR; 2508 struct ip6rd_flowi rdfl; 2509 2510 rdfl.fl6 = *fl6; 2511 rdfl.gateway = *gateway; 2512 2513 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2514 flags, __ip6_route_redirect); 2515 } 2516 2517 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2518 kuid_t uid) 2519 { 2520 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2521 struct dst_entry *dst; 2522 struct flowi6 fl6; 2523 2524 memset(&fl6, 0, sizeof(fl6)); 2525 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2526 fl6.flowi6_oif = oif; 2527 fl6.flowi6_mark = mark; 2528 fl6.daddr = iph->daddr; 2529 fl6.saddr = iph->saddr; 2530 fl6.flowlabel = ip6_flowinfo(iph); 2531 fl6.flowi6_uid = uid; 2532 2533 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2534 rt6_do_redirect(dst, NULL, skb); 2535 dst_release(dst); 2536 } 2537 EXPORT_SYMBOL_GPL(ip6_redirect); 2538 2539 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2540 u32 mark) 2541 { 2542 const struct ipv6hdr *iph = ipv6_hdr(skb); 2543 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2544 struct dst_entry *dst; 2545 struct flowi6 fl6; 2546 2547 memset(&fl6, 0, sizeof(fl6)); 2548 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2549 fl6.flowi6_oif = oif; 2550 fl6.flowi6_mark = mark; 2551 fl6.daddr = msg->dest; 2552 fl6.saddr = iph->daddr; 2553 fl6.flowi6_uid = sock_net_uid(net, NULL); 2554 2555 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2556 rt6_do_redirect(dst, NULL, skb); 2557 dst_release(dst); 2558 } 2559 2560 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2561 { 2562 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2563 sk->sk_uid); 2564 } 2565 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2566 2567 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2568 { 2569 struct net_device *dev = dst->dev; 2570 unsigned int mtu = dst_mtu(dst); 2571 struct net *net = dev_net(dev); 2572 2573 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2574 2575 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2576 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2577 2578 /* 2579 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2580 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 2581 * IPV6_MAXPLEN is also valid and means: "any MSS, 2582 * rely only on pmtu discovery" 2583 */ 2584 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2585 mtu = IPV6_MAXPLEN; 2586 return mtu; 2587 } 2588 2589 static unsigned int ip6_mtu(const struct dst_entry *dst) 2590 { 2591 struct inet6_dev *idev; 2592 unsigned int mtu; 2593 2594 mtu = dst_metric_raw(dst, RTAX_MTU); 2595 if (mtu) 2596 goto out; 2597 2598 mtu = IPV6_MIN_MTU; 2599 2600 rcu_read_lock(); 2601 idev = __in6_dev_get(dst->dev); 2602 if (idev) 2603 mtu = idev->cnf.mtu6; 2604 rcu_read_unlock(); 2605 2606 out: 2607 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2608 2609 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2610 } 2611 2612 /* MTU selection: 2613 * 1. mtu on route is locked - use it 2614 * 2. mtu from nexthop exception 2615 * 3. mtu from egress device 2616 * 2617 * based on ip6_dst_mtu_forward and exception logic of 2618 * rt6_find_cached_rt; called with rcu_read_lock 2619 */ 2620 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2621 struct in6_addr *saddr) 2622 { 2623 struct rt6_exception_bucket *bucket; 2624 struct rt6_exception *rt6_ex; 2625 struct in6_addr *src_key; 2626 struct inet6_dev *idev; 2627 u32 mtu = 0; 2628 2629 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2630 mtu = f6i->fib6_pmtu; 2631 if (mtu) 2632 goto out; 2633 } 2634 2635 src_key = NULL; 2636 #ifdef CONFIG_IPV6_SUBTREES 2637 if (f6i->fib6_src.plen) 2638 src_key = saddr; 2639 #endif 2640 2641 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2642 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2643 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2644 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2645 2646 if (likely(!mtu)) { 2647 struct net_device *dev = fib6_info_nh_dev(f6i); 2648 2649 mtu = IPV6_MIN_MTU; 2650 idev = __in6_dev_get(dev); 2651 if (idev && idev->cnf.mtu6 > mtu) 2652 mtu = idev->cnf.mtu6; 2653 } 2654 2655 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2656 out: 2657 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2658 } 2659 2660 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2661 struct flowi6 *fl6) 2662 { 2663 struct dst_entry *dst; 2664 struct rt6_info *rt; 2665 struct inet6_dev *idev = in6_dev_get(dev); 2666 struct net *net = dev_net(dev); 2667 2668 if (unlikely(!idev)) 2669 return ERR_PTR(-ENODEV); 2670 2671 rt = ip6_dst_alloc(net, dev, 0); 2672 if (unlikely(!rt)) { 2673 in6_dev_put(idev); 2674 dst = ERR_PTR(-ENOMEM); 2675 goto out; 2676 } 2677 2678 rt->dst.flags |= DST_HOST; 2679 rt->dst.input = ip6_input; 2680 rt->dst.output = ip6_output; 2681 rt->rt6i_gateway = fl6->daddr; 2682 rt->rt6i_dst.addr = fl6->daddr; 2683 rt->rt6i_dst.plen = 128; 2684 rt->rt6i_idev = idev; 2685 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2686 2687 /* Add this dst into uncached_list so that rt6_disable_ip() can 2688 * do proper release of the net_device 2689 */ 2690 rt6_uncached_list_add(rt); 2691 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2692 2693 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2694 2695 out: 2696 return dst; 2697 } 2698 2699 static int ip6_dst_gc(struct dst_ops *ops) 2700 { 2701 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2702 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2703 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2704 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2705 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2706 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2707 int entries; 2708 2709 entries = dst_entries_get_fast(ops); 2710 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2711 entries <= rt_max_size) 2712 goto out; 2713 2714 net->ipv6.ip6_rt_gc_expire++; 2715 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2716 entries = dst_entries_get_slow(ops); 2717 if (entries < ops->gc_thresh) 2718 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2719 out: 2720 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2721 return entries > rt_max_size; 2722 } 2723 2724 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, 2725 struct fib6_config *cfg) 2726 { 2727 struct dst_metrics *p; 2728 2729 if (!cfg->fc_mx) 2730 return 0; 2731 2732 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); 2733 if (unlikely(!p)) 2734 return -ENOMEM; 2735 2736 refcount_set(&p->refcnt, 1); 2737 rt->fib6_metrics = p; 2738 2739 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics); 2740 } 2741 2742 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2743 struct fib6_config *cfg, 2744 const struct in6_addr *gw_addr, 2745 u32 tbid, int flags) 2746 { 2747 struct flowi6 fl6 = { 2748 .flowi6_oif = cfg->fc_ifindex, 2749 .daddr = *gw_addr, 2750 .saddr = cfg->fc_prefsrc, 2751 }; 2752 struct fib6_table *table; 2753 struct rt6_info *rt; 2754 2755 table = fib6_get_table(net, tbid); 2756 if (!table) 2757 return NULL; 2758 2759 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2760 flags |= RT6_LOOKUP_F_HAS_SADDR; 2761 2762 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2763 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2764 2765 /* if table lookup failed, fall back to full lookup */ 2766 if (rt == net->ipv6.ip6_null_entry) { 2767 ip6_rt_put(rt); 2768 rt = NULL; 2769 } 2770 2771 return rt; 2772 } 2773 2774 static int ip6_route_check_nh_onlink(struct net *net, 2775 struct fib6_config *cfg, 2776 const struct net_device *dev, 2777 struct netlink_ext_ack *extack) 2778 { 2779 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2780 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2781 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2782 struct rt6_info *grt; 2783 int err; 2784 2785 err = 0; 2786 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2787 if (grt) { 2788 if (!grt->dst.error && 2789 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2790 NL_SET_ERR_MSG(extack, 2791 "Nexthop has invalid gateway or device mismatch"); 2792 err = -EINVAL; 2793 } 2794 2795 ip6_rt_put(grt); 2796 } 2797 2798 return err; 2799 } 2800 2801 static int ip6_route_check_nh(struct net *net, 2802 struct fib6_config *cfg, 2803 struct net_device **_dev, 2804 struct inet6_dev **idev) 2805 { 2806 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2807 struct net_device *dev = _dev ? *_dev : NULL; 2808 struct rt6_info *grt = NULL; 2809 int err = -EHOSTUNREACH; 2810 2811 if (cfg->fc_table) { 2812 int flags = RT6_LOOKUP_F_IFACE; 2813 2814 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2815 cfg->fc_table, flags); 2816 if (grt) { 2817 if (grt->rt6i_flags & RTF_GATEWAY || 2818 (dev && dev != grt->dst.dev)) { 2819 ip6_rt_put(grt); 2820 grt = NULL; 2821 } 2822 } 2823 } 2824 2825 if (!grt) 2826 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2827 2828 if (!grt) 2829 goto out; 2830 2831 if (dev) { 2832 if (dev != grt->dst.dev) { 2833 ip6_rt_put(grt); 2834 goto out; 2835 } 2836 } else { 2837 *_dev = dev = grt->dst.dev; 2838 *idev = grt->rt6i_idev; 2839 dev_hold(dev); 2840 in6_dev_hold(grt->rt6i_idev); 2841 } 2842 2843 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2844 err = 0; 2845 2846 ip6_rt_put(grt); 2847 2848 out: 2849 return err; 2850 } 2851 2852 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2853 struct net_device **_dev, struct inet6_dev **idev, 2854 struct netlink_ext_ack *extack) 2855 { 2856 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2857 int gwa_type = ipv6_addr_type(gw_addr); 2858 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2859 const struct net_device *dev = *_dev; 2860 bool need_addr_check = !dev; 2861 int err = -EINVAL; 2862 2863 /* if gw_addr is local we will fail to detect this in case 2864 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2865 * will return already-added prefix route via interface that 2866 * prefix route was assigned to, which might be non-loopback. 2867 */ 2868 if (dev && 2869 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2870 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2871 goto out; 2872 } 2873 2874 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2875 /* IPv6 strictly inhibits using not link-local 2876 * addresses as nexthop address. 2877 * Otherwise, router will not able to send redirects. 2878 * It is very good, but in some (rare!) circumstances 2879 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2880 * some exceptions. --ANK 2881 * We allow IPv4-mapped nexthops to support RFC4798-type 2882 * addressing 2883 */ 2884 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2885 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2886 goto out; 2887 } 2888 2889 if (cfg->fc_flags & RTNH_F_ONLINK) 2890 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2891 else 2892 err = ip6_route_check_nh(net, cfg, _dev, idev); 2893 2894 if (err) 2895 goto out; 2896 } 2897 2898 /* reload in case device was changed */ 2899 dev = *_dev; 2900 2901 err = -EINVAL; 2902 if (!dev) { 2903 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2904 goto out; 2905 } else if (dev->flags & IFF_LOOPBACK) { 2906 NL_SET_ERR_MSG(extack, 2907 "Egress device can not be loopback device for this route"); 2908 goto out; 2909 } 2910 2911 /* if we did not check gw_addr above, do so now that the 2912 * egress device has been resolved. 2913 */ 2914 if (need_addr_check && 2915 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2916 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2917 goto out; 2918 } 2919 2920 err = 0; 2921 out: 2922 return err; 2923 } 2924 2925 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2926 gfp_t gfp_flags, 2927 struct netlink_ext_ack *extack) 2928 { 2929 struct net *net = cfg->fc_nlinfo.nl_net; 2930 struct fib6_info *rt = NULL; 2931 struct net_device *dev = NULL; 2932 struct inet6_dev *idev = NULL; 2933 struct fib6_table *table; 2934 int addr_type; 2935 int err = -EINVAL; 2936 2937 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2938 if (cfg->fc_flags & RTF_PCPU) { 2939 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2940 goto out; 2941 } 2942 2943 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2944 if (cfg->fc_flags & RTF_CACHE) { 2945 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2946 goto out; 2947 } 2948 2949 if (cfg->fc_type > RTN_MAX) { 2950 NL_SET_ERR_MSG(extack, "Invalid route type"); 2951 goto out; 2952 } 2953 2954 if (cfg->fc_dst_len > 128) { 2955 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2956 goto out; 2957 } 2958 if (cfg->fc_src_len > 128) { 2959 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2960 goto out; 2961 } 2962 #ifndef CONFIG_IPV6_SUBTREES 2963 if (cfg->fc_src_len) { 2964 NL_SET_ERR_MSG(extack, 2965 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2966 goto out; 2967 } 2968 #endif 2969 if (cfg->fc_ifindex) { 2970 err = -ENODEV; 2971 dev = dev_get_by_index(net, cfg->fc_ifindex); 2972 if (!dev) 2973 goto out; 2974 idev = in6_dev_get(dev); 2975 if (!idev) 2976 goto out; 2977 } 2978 2979 if (cfg->fc_metric == 0) 2980 cfg->fc_metric = IP6_RT_PRIO_USER; 2981 2982 if (cfg->fc_flags & RTNH_F_ONLINK) { 2983 if (!dev) { 2984 NL_SET_ERR_MSG(extack, 2985 "Nexthop device required for onlink"); 2986 err = -ENODEV; 2987 goto out; 2988 } 2989 2990 if (!(dev->flags & IFF_UP)) { 2991 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2992 err = -ENETDOWN; 2993 goto out; 2994 } 2995 } 2996 2997 err = -ENOBUFS; 2998 if (cfg->fc_nlinfo.nlh && 2999 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3000 table = fib6_get_table(net, cfg->fc_table); 3001 if (!table) { 3002 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3003 table = fib6_new_table(net, cfg->fc_table); 3004 } 3005 } else { 3006 table = fib6_new_table(net, cfg->fc_table); 3007 } 3008 3009 if (!table) 3010 goto out; 3011 3012 err = -ENOMEM; 3013 rt = fib6_info_alloc(gfp_flags); 3014 if (!rt) 3015 goto out; 3016 3017 if (cfg->fc_flags & RTF_ADDRCONF) 3018 rt->dst_nocount = true; 3019 3020 err = ip6_convert_metrics(net, rt, cfg); 3021 if (err < 0) 3022 goto out; 3023 3024 if (cfg->fc_flags & RTF_EXPIRES) 3025 fib6_set_expires(rt, jiffies + 3026 clock_t_to_jiffies(cfg->fc_expires)); 3027 else 3028 fib6_clean_expires(rt); 3029 3030 if (cfg->fc_protocol == RTPROT_UNSPEC) 3031 cfg->fc_protocol = RTPROT_BOOT; 3032 rt->fib6_protocol = cfg->fc_protocol; 3033 3034 addr_type = ipv6_addr_type(&cfg->fc_dst); 3035 3036 if (cfg->fc_encap) { 3037 struct lwtunnel_state *lwtstate; 3038 3039 err = lwtunnel_build_state(cfg->fc_encap_type, 3040 cfg->fc_encap, AF_INET6, cfg, 3041 &lwtstate, extack); 3042 if (err) 3043 goto out; 3044 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 3045 } 3046 3047 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3048 rt->fib6_dst.plen = cfg->fc_dst_len; 3049 if (rt->fib6_dst.plen == 128) 3050 rt->dst_host = true; 3051 3052 #ifdef CONFIG_IPV6_SUBTREES 3053 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3054 rt->fib6_src.plen = cfg->fc_src_len; 3055 #endif 3056 3057 rt->fib6_metric = cfg->fc_metric; 3058 rt->fib6_nh.nh_weight = 1; 3059 3060 rt->fib6_type = cfg->fc_type; 3061 3062 /* We cannot add true routes via loopback here, 3063 they would result in kernel looping; promote them to reject routes 3064 */ 3065 if ((cfg->fc_flags & RTF_REJECT) || 3066 (dev && (dev->flags & IFF_LOOPBACK) && 3067 !(addr_type & IPV6_ADDR_LOOPBACK) && 3068 !(cfg->fc_flags & RTF_LOCAL))) { 3069 /* hold loopback dev/idev if we haven't done so. */ 3070 if (dev != net->loopback_dev) { 3071 if (dev) { 3072 dev_put(dev); 3073 in6_dev_put(idev); 3074 } 3075 dev = net->loopback_dev; 3076 dev_hold(dev); 3077 idev = in6_dev_get(dev); 3078 if (!idev) { 3079 err = -ENODEV; 3080 goto out; 3081 } 3082 } 3083 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3084 goto install_route; 3085 } 3086 3087 if (cfg->fc_flags & RTF_GATEWAY) { 3088 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3089 if (err) 3090 goto out; 3091 3092 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3093 } 3094 3095 err = -ENODEV; 3096 if (!dev) 3097 goto out; 3098 3099 if (idev->cnf.disable_ipv6) { 3100 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3101 err = -EACCES; 3102 goto out; 3103 } 3104 3105 if (!(dev->flags & IFF_UP)) { 3106 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3107 err = -ENETDOWN; 3108 goto out; 3109 } 3110 3111 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3112 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3113 NL_SET_ERR_MSG(extack, "Invalid source address"); 3114 err = -EINVAL; 3115 goto out; 3116 } 3117 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3118 rt->fib6_prefsrc.plen = 128; 3119 } else 3120 rt->fib6_prefsrc.plen = 0; 3121 3122 rt->fib6_flags = cfg->fc_flags; 3123 3124 install_route: 3125 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3126 !netif_carrier_ok(dev)) 3127 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3128 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3129 rt->fib6_nh.nh_dev = dev; 3130 rt->fib6_table = table; 3131 3132 cfg->fc_nlinfo.nl_net = dev_net(dev); 3133 3134 if (idev) 3135 in6_dev_put(idev); 3136 3137 return rt; 3138 out: 3139 if (dev) 3140 dev_put(dev); 3141 if (idev) 3142 in6_dev_put(idev); 3143 3144 fib6_info_release(rt); 3145 return ERR_PTR(err); 3146 } 3147 3148 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3149 struct netlink_ext_ack *extack) 3150 { 3151 struct fib6_info *rt; 3152 int err; 3153 3154 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3155 if (IS_ERR(rt)) 3156 return PTR_ERR(rt); 3157 3158 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3159 fib6_info_release(rt); 3160 3161 return err; 3162 } 3163 3164 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3165 { 3166 struct net *net = info->nl_net; 3167 struct fib6_table *table; 3168 int err; 3169 3170 if (rt == net->ipv6.fib6_null_entry) { 3171 err = -ENOENT; 3172 goto out; 3173 } 3174 3175 table = rt->fib6_table; 3176 spin_lock_bh(&table->tb6_lock); 3177 err = fib6_del(rt, info); 3178 spin_unlock_bh(&table->tb6_lock); 3179 3180 out: 3181 fib6_info_release(rt); 3182 return err; 3183 } 3184 3185 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3186 { 3187 struct nl_info info = { .nl_net = net }; 3188 3189 return __ip6_del_rt(rt, &info); 3190 } 3191 3192 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3193 { 3194 struct nl_info *info = &cfg->fc_nlinfo; 3195 struct net *net = info->nl_net; 3196 struct sk_buff *skb = NULL; 3197 struct fib6_table *table; 3198 int err = -ENOENT; 3199 3200 if (rt == net->ipv6.fib6_null_entry) 3201 goto out_put; 3202 table = rt->fib6_table; 3203 spin_lock_bh(&table->tb6_lock); 3204 3205 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3206 struct fib6_info *sibling, *next_sibling; 3207 3208 /* prefer to send a single notification with all hops */ 3209 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3210 if (skb) { 3211 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3212 3213 if (rt6_fill_node(net, skb, rt, NULL, 3214 NULL, NULL, 0, RTM_DELROUTE, 3215 info->portid, seq, 0) < 0) { 3216 kfree_skb(skb); 3217 skb = NULL; 3218 } else 3219 info->skip_notify = 1; 3220 } 3221 3222 list_for_each_entry_safe(sibling, next_sibling, 3223 &rt->fib6_siblings, 3224 fib6_siblings) { 3225 err = fib6_del(sibling, info); 3226 if (err) 3227 goto out_unlock; 3228 } 3229 } 3230 3231 err = fib6_del(rt, info); 3232 out_unlock: 3233 spin_unlock_bh(&table->tb6_lock); 3234 out_put: 3235 fib6_info_release(rt); 3236 3237 if (skb) { 3238 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3239 info->nlh, gfp_any()); 3240 } 3241 return err; 3242 } 3243 3244 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3245 { 3246 int rc = -ESRCH; 3247 3248 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3249 goto out; 3250 3251 if (cfg->fc_flags & RTF_GATEWAY && 3252 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3253 goto out; 3254 if (dst_hold_safe(&rt->dst)) 3255 rc = rt6_remove_exception_rt(rt); 3256 out: 3257 return rc; 3258 } 3259 3260 static int ip6_route_del(struct fib6_config *cfg, 3261 struct netlink_ext_ack *extack) 3262 { 3263 struct rt6_info *rt_cache; 3264 struct fib6_table *table; 3265 struct fib6_info *rt; 3266 struct fib6_node *fn; 3267 int err = -ESRCH; 3268 3269 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3270 if (!table) { 3271 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3272 return err; 3273 } 3274 3275 rcu_read_lock(); 3276 3277 fn = fib6_locate(&table->tb6_root, 3278 &cfg->fc_dst, cfg->fc_dst_len, 3279 &cfg->fc_src, cfg->fc_src_len, 3280 !(cfg->fc_flags & RTF_CACHE)); 3281 3282 if (fn) { 3283 for_each_fib6_node_rt_rcu(fn) { 3284 if (cfg->fc_flags & RTF_CACHE) { 3285 int rc; 3286 3287 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3288 &cfg->fc_src); 3289 if (rt_cache) { 3290 rc = ip6_del_cached_rt(rt_cache, cfg); 3291 if (rc != -ESRCH) { 3292 rcu_read_unlock(); 3293 return rc; 3294 } 3295 } 3296 continue; 3297 } 3298 if (cfg->fc_ifindex && 3299 (!rt->fib6_nh.nh_dev || 3300 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3301 continue; 3302 if (cfg->fc_flags & RTF_GATEWAY && 3303 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3304 continue; 3305 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3306 continue; 3307 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3308 continue; 3309 fib6_info_hold(rt); 3310 rcu_read_unlock(); 3311 3312 /* if gateway was specified only delete the one hop */ 3313 if (cfg->fc_flags & RTF_GATEWAY) 3314 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3315 3316 return __ip6_del_rt_siblings(rt, cfg); 3317 } 3318 } 3319 rcu_read_unlock(); 3320 3321 return err; 3322 } 3323 3324 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3325 { 3326 struct netevent_redirect netevent; 3327 struct rt6_info *rt, *nrt = NULL; 3328 struct ndisc_options ndopts; 3329 struct inet6_dev *in6_dev; 3330 struct neighbour *neigh; 3331 struct fib6_info *from; 3332 struct rd_msg *msg; 3333 int optlen, on_link; 3334 u8 *lladdr; 3335 3336 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3337 optlen -= sizeof(*msg); 3338 3339 if (optlen < 0) { 3340 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3341 return; 3342 } 3343 3344 msg = (struct rd_msg *)icmp6_hdr(skb); 3345 3346 if (ipv6_addr_is_multicast(&msg->dest)) { 3347 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3348 return; 3349 } 3350 3351 on_link = 0; 3352 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3353 on_link = 1; 3354 } else if (ipv6_addr_type(&msg->target) != 3355 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3356 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3357 return; 3358 } 3359 3360 in6_dev = __in6_dev_get(skb->dev); 3361 if (!in6_dev) 3362 return; 3363 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3364 return; 3365 3366 /* RFC2461 8.1: 3367 * The IP source address of the Redirect MUST be the same as the current 3368 * first-hop router for the specified ICMP Destination Address. 3369 */ 3370 3371 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3372 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3373 return; 3374 } 3375 3376 lladdr = NULL; 3377 if (ndopts.nd_opts_tgt_lladdr) { 3378 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3379 skb->dev); 3380 if (!lladdr) { 3381 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3382 return; 3383 } 3384 } 3385 3386 rt = (struct rt6_info *) dst; 3387 if (rt->rt6i_flags & RTF_REJECT) { 3388 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3389 return; 3390 } 3391 3392 /* Redirect received -> path was valid. 3393 * Look, redirects are sent only in response to data packets, 3394 * so that this nexthop apparently is reachable. --ANK 3395 */ 3396 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3397 3398 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3399 if (!neigh) 3400 return; 3401 3402 /* 3403 * We have finally decided to accept it. 3404 */ 3405 3406 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3407 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3408 NEIGH_UPDATE_F_OVERRIDE| 3409 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3410 NEIGH_UPDATE_F_ISROUTER)), 3411 NDISC_REDIRECT, &ndopts); 3412 3413 rcu_read_lock(); 3414 from = rcu_dereference(rt->from); 3415 fib6_info_hold(from); 3416 rcu_read_unlock(); 3417 3418 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3419 if (!nrt) 3420 goto out; 3421 3422 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3423 if (on_link) 3424 nrt->rt6i_flags &= ~RTF_GATEWAY; 3425 3426 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3427 3428 /* No need to remove rt from the exception table if rt is 3429 * a cached route because rt6_insert_exception() will 3430 * takes care of it 3431 */ 3432 if (rt6_insert_exception(nrt, from)) { 3433 dst_release_immediate(&nrt->dst); 3434 goto out; 3435 } 3436 3437 netevent.old = &rt->dst; 3438 netevent.new = &nrt->dst; 3439 netevent.daddr = &msg->dest; 3440 netevent.neigh = neigh; 3441 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3442 3443 out: 3444 fib6_info_release(from); 3445 neigh_release(neigh); 3446 } 3447 3448 #ifdef CONFIG_IPV6_ROUTE_INFO 3449 static struct fib6_info *rt6_get_route_info(struct net *net, 3450 const struct in6_addr *prefix, int prefixlen, 3451 const struct in6_addr *gwaddr, 3452 struct net_device *dev) 3453 { 3454 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3455 int ifindex = dev->ifindex; 3456 struct fib6_node *fn; 3457 struct fib6_info *rt = NULL; 3458 struct fib6_table *table; 3459 3460 table = fib6_get_table(net, tb_id); 3461 if (!table) 3462 return NULL; 3463 3464 rcu_read_lock(); 3465 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3466 if (!fn) 3467 goto out; 3468 3469 for_each_fib6_node_rt_rcu(fn) { 3470 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3471 continue; 3472 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3473 continue; 3474 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3475 continue; 3476 fib6_info_hold(rt); 3477 break; 3478 } 3479 out: 3480 rcu_read_unlock(); 3481 return rt; 3482 } 3483 3484 static struct fib6_info *rt6_add_route_info(struct net *net, 3485 const struct in6_addr *prefix, int prefixlen, 3486 const struct in6_addr *gwaddr, 3487 struct net_device *dev, 3488 unsigned int pref) 3489 { 3490 struct fib6_config cfg = { 3491 .fc_metric = IP6_RT_PRIO_USER, 3492 .fc_ifindex = dev->ifindex, 3493 .fc_dst_len = prefixlen, 3494 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3495 RTF_UP | RTF_PREF(pref), 3496 .fc_protocol = RTPROT_RA, 3497 .fc_type = RTN_UNICAST, 3498 .fc_nlinfo.portid = 0, 3499 .fc_nlinfo.nlh = NULL, 3500 .fc_nlinfo.nl_net = net, 3501 }; 3502 3503 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3504 cfg.fc_dst = *prefix; 3505 cfg.fc_gateway = *gwaddr; 3506 3507 /* We should treat it as a default route if prefix length is 0. */ 3508 if (!prefixlen) 3509 cfg.fc_flags |= RTF_DEFAULT; 3510 3511 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3512 3513 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3514 } 3515 #endif 3516 3517 struct fib6_info *rt6_get_dflt_router(struct net *net, 3518 const struct in6_addr *addr, 3519 struct net_device *dev) 3520 { 3521 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3522 struct fib6_info *rt; 3523 struct fib6_table *table; 3524 3525 table = fib6_get_table(net, tb_id); 3526 if (!table) 3527 return NULL; 3528 3529 rcu_read_lock(); 3530 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3531 if (dev == rt->fib6_nh.nh_dev && 3532 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3533 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3534 break; 3535 } 3536 if (rt) 3537 fib6_info_hold(rt); 3538 rcu_read_unlock(); 3539 return rt; 3540 } 3541 3542 struct fib6_info *rt6_add_dflt_router(struct net *net, 3543 const struct in6_addr *gwaddr, 3544 struct net_device *dev, 3545 unsigned int pref) 3546 { 3547 struct fib6_config cfg = { 3548 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3549 .fc_metric = IP6_RT_PRIO_USER, 3550 .fc_ifindex = dev->ifindex, 3551 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3552 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3553 .fc_protocol = RTPROT_RA, 3554 .fc_type = RTN_UNICAST, 3555 .fc_nlinfo.portid = 0, 3556 .fc_nlinfo.nlh = NULL, 3557 .fc_nlinfo.nl_net = net, 3558 }; 3559 3560 cfg.fc_gateway = *gwaddr; 3561 3562 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3563 struct fib6_table *table; 3564 3565 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3566 if (table) 3567 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3568 } 3569 3570 return rt6_get_dflt_router(net, gwaddr, dev); 3571 } 3572 3573 static void __rt6_purge_dflt_routers(struct net *net, 3574 struct fib6_table *table) 3575 { 3576 struct fib6_info *rt; 3577 3578 restart: 3579 rcu_read_lock(); 3580 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3581 struct net_device *dev = fib6_info_nh_dev(rt); 3582 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3583 3584 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3585 (!idev || idev->cnf.accept_ra != 2)) { 3586 fib6_info_hold(rt); 3587 rcu_read_unlock(); 3588 ip6_del_rt(net, rt); 3589 goto restart; 3590 } 3591 } 3592 rcu_read_unlock(); 3593 3594 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3595 } 3596 3597 void rt6_purge_dflt_routers(struct net *net) 3598 { 3599 struct fib6_table *table; 3600 struct hlist_head *head; 3601 unsigned int h; 3602 3603 rcu_read_lock(); 3604 3605 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3606 head = &net->ipv6.fib_table_hash[h]; 3607 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3608 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3609 __rt6_purge_dflt_routers(net, table); 3610 } 3611 } 3612 3613 rcu_read_unlock(); 3614 } 3615 3616 static void rtmsg_to_fib6_config(struct net *net, 3617 struct in6_rtmsg *rtmsg, 3618 struct fib6_config *cfg) 3619 { 3620 memset(cfg, 0, sizeof(*cfg)); 3621 3622 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3623 : RT6_TABLE_MAIN; 3624 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3625 cfg->fc_metric = rtmsg->rtmsg_metric; 3626 cfg->fc_expires = rtmsg->rtmsg_info; 3627 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3628 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3629 cfg->fc_flags = rtmsg->rtmsg_flags; 3630 cfg->fc_type = rtmsg->rtmsg_type; 3631 3632 cfg->fc_nlinfo.nl_net = net; 3633 3634 cfg->fc_dst = rtmsg->rtmsg_dst; 3635 cfg->fc_src = rtmsg->rtmsg_src; 3636 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3637 } 3638 3639 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3640 { 3641 struct fib6_config cfg; 3642 struct in6_rtmsg rtmsg; 3643 int err; 3644 3645 switch (cmd) { 3646 case SIOCADDRT: /* Add a route */ 3647 case SIOCDELRT: /* Delete a route */ 3648 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3649 return -EPERM; 3650 err = copy_from_user(&rtmsg, arg, 3651 sizeof(struct in6_rtmsg)); 3652 if (err) 3653 return -EFAULT; 3654 3655 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3656 3657 rtnl_lock(); 3658 switch (cmd) { 3659 case SIOCADDRT: 3660 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3661 break; 3662 case SIOCDELRT: 3663 err = ip6_route_del(&cfg, NULL); 3664 break; 3665 default: 3666 err = -EINVAL; 3667 } 3668 rtnl_unlock(); 3669 3670 return err; 3671 } 3672 3673 return -EINVAL; 3674 } 3675 3676 /* 3677 * Drop the packet on the floor 3678 */ 3679 3680 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3681 { 3682 int type; 3683 struct dst_entry *dst = skb_dst(skb); 3684 switch (ipstats_mib_noroutes) { 3685 case IPSTATS_MIB_INNOROUTES: 3686 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3687 if (type == IPV6_ADDR_ANY) { 3688 IP6_INC_STATS(dev_net(dst->dev), 3689 __in6_dev_get_safely(skb->dev), 3690 IPSTATS_MIB_INADDRERRORS); 3691 break; 3692 } 3693 /* FALLTHROUGH */ 3694 case IPSTATS_MIB_OUTNOROUTES: 3695 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3696 ipstats_mib_noroutes); 3697 break; 3698 } 3699 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3700 kfree_skb(skb); 3701 return 0; 3702 } 3703 3704 static int ip6_pkt_discard(struct sk_buff *skb) 3705 { 3706 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3707 } 3708 3709 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3710 { 3711 skb->dev = skb_dst(skb)->dev; 3712 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3713 } 3714 3715 static int ip6_pkt_prohibit(struct sk_buff *skb) 3716 { 3717 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3718 } 3719 3720 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3721 { 3722 skb->dev = skb_dst(skb)->dev; 3723 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3724 } 3725 3726 /* 3727 * Allocate a dst for local (unicast / anycast) address. 3728 */ 3729 3730 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3731 struct inet6_dev *idev, 3732 const struct in6_addr *addr, 3733 bool anycast, gfp_t gfp_flags) 3734 { 3735 u32 tb_id; 3736 struct net_device *dev = idev->dev; 3737 struct fib6_info *f6i; 3738 3739 f6i = fib6_info_alloc(gfp_flags); 3740 if (!f6i) 3741 return ERR_PTR(-ENOMEM); 3742 3743 f6i->dst_nocount = true; 3744 f6i->dst_host = true; 3745 f6i->fib6_protocol = RTPROT_KERNEL; 3746 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3747 if (anycast) { 3748 f6i->fib6_type = RTN_ANYCAST; 3749 f6i->fib6_flags |= RTF_ANYCAST; 3750 } else { 3751 f6i->fib6_type = RTN_LOCAL; 3752 f6i->fib6_flags |= RTF_LOCAL; 3753 } 3754 3755 f6i->fib6_nh.nh_gw = *addr; 3756 dev_hold(dev); 3757 f6i->fib6_nh.nh_dev = dev; 3758 f6i->fib6_dst.addr = *addr; 3759 f6i->fib6_dst.plen = 128; 3760 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3761 f6i->fib6_table = fib6_get_table(net, tb_id); 3762 3763 return f6i; 3764 } 3765 3766 /* remove deleted ip from prefsrc entries */ 3767 struct arg_dev_net_ip { 3768 struct net_device *dev; 3769 struct net *net; 3770 struct in6_addr *addr; 3771 }; 3772 3773 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3774 { 3775 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3776 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3777 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3778 3779 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3780 rt != net->ipv6.fib6_null_entry && 3781 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3782 spin_lock_bh(&rt6_exception_lock); 3783 /* remove prefsrc entry */ 3784 rt->fib6_prefsrc.plen = 0; 3785 /* need to update cache as well */ 3786 rt6_exceptions_remove_prefsrc(rt); 3787 spin_unlock_bh(&rt6_exception_lock); 3788 } 3789 return 0; 3790 } 3791 3792 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3793 { 3794 struct net *net = dev_net(ifp->idev->dev); 3795 struct arg_dev_net_ip adni = { 3796 .dev = ifp->idev->dev, 3797 .net = net, 3798 .addr = &ifp->addr, 3799 }; 3800 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3801 } 3802 3803 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3804 3805 /* Remove routers and update dst entries when gateway turn into host. */ 3806 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3807 { 3808 struct in6_addr *gateway = (struct in6_addr *)arg; 3809 3810 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3811 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3812 return -1; 3813 } 3814 3815 /* Further clean up cached routes in exception table. 3816 * This is needed because cached route may have a different 3817 * gateway than its 'parent' in the case of an ip redirect. 3818 */ 3819 rt6_exceptions_clean_tohost(rt, gateway); 3820 3821 return 0; 3822 } 3823 3824 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3825 { 3826 fib6_clean_all(net, fib6_clean_tohost, gateway); 3827 } 3828 3829 struct arg_netdev_event { 3830 const struct net_device *dev; 3831 union { 3832 unsigned int nh_flags; 3833 unsigned long event; 3834 }; 3835 }; 3836 3837 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3838 { 3839 struct fib6_info *iter; 3840 struct fib6_node *fn; 3841 3842 fn = rcu_dereference_protected(rt->fib6_node, 3843 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3844 iter = rcu_dereference_protected(fn->leaf, 3845 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3846 while (iter) { 3847 if (iter->fib6_metric == rt->fib6_metric && 3848 iter->fib6_nsiblings) 3849 return iter; 3850 iter = rcu_dereference_protected(iter->fib6_next, 3851 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3852 } 3853 3854 return NULL; 3855 } 3856 3857 static bool rt6_is_dead(const struct fib6_info *rt) 3858 { 3859 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3860 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3861 fib6_ignore_linkdown(rt))) 3862 return true; 3863 3864 return false; 3865 } 3866 3867 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3868 { 3869 struct fib6_info *iter; 3870 int total = 0; 3871 3872 if (!rt6_is_dead(rt)) 3873 total += rt->fib6_nh.nh_weight; 3874 3875 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3876 if (!rt6_is_dead(iter)) 3877 total += iter->fib6_nh.nh_weight; 3878 } 3879 3880 return total; 3881 } 3882 3883 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3884 { 3885 int upper_bound = -1; 3886 3887 if (!rt6_is_dead(rt)) { 3888 *weight += rt->fib6_nh.nh_weight; 3889 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3890 total) - 1; 3891 } 3892 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3893 } 3894 3895 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3896 { 3897 struct fib6_info *iter; 3898 int weight = 0; 3899 3900 rt6_upper_bound_set(rt, &weight, total); 3901 3902 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3903 rt6_upper_bound_set(iter, &weight, total); 3904 } 3905 3906 void rt6_multipath_rebalance(struct fib6_info *rt) 3907 { 3908 struct fib6_info *first; 3909 int total; 3910 3911 /* In case the entire multipath route was marked for flushing, 3912 * then there is no need to rebalance upon the removal of every 3913 * sibling route. 3914 */ 3915 if (!rt->fib6_nsiblings || rt->should_flush) 3916 return; 3917 3918 /* During lookup routes are evaluated in order, so we need to 3919 * make sure upper bounds are assigned from the first sibling 3920 * onwards. 3921 */ 3922 first = rt6_multipath_first_sibling(rt); 3923 if (WARN_ON_ONCE(!first)) 3924 return; 3925 3926 total = rt6_multipath_total_weight(first); 3927 rt6_multipath_upper_bound_set(first, total); 3928 } 3929 3930 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3931 { 3932 const struct arg_netdev_event *arg = p_arg; 3933 struct net *net = dev_net(arg->dev); 3934 3935 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3936 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3937 fib6_update_sernum_upto_root(net, rt); 3938 rt6_multipath_rebalance(rt); 3939 } 3940 3941 return 0; 3942 } 3943 3944 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3945 { 3946 struct arg_netdev_event arg = { 3947 .dev = dev, 3948 { 3949 .nh_flags = nh_flags, 3950 }, 3951 }; 3952 3953 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3954 arg.nh_flags |= RTNH_F_LINKDOWN; 3955 3956 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3957 } 3958 3959 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3960 const struct net_device *dev) 3961 { 3962 struct fib6_info *iter; 3963 3964 if (rt->fib6_nh.nh_dev == dev) 3965 return true; 3966 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3967 if (iter->fib6_nh.nh_dev == dev) 3968 return true; 3969 3970 return false; 3971 } 3972 3973 static void rt6_multipath_flush(struct fib6_info *rt) 3974 { 3975 struct fib6_info *iter; 3976 3977 rt->should_flush = 1; 3978 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3979 iter->should_flush = 1; 3980 } 3981 3982 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3983 const struct net_device *down_dev) 3984 { 3985 struct fib6_info *iter; 3986 unsigned int dead = 0; 3987 3988 if (rt->fib6_nh.nh_dev == down_dev || 3989 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3990 dead++; 3991 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3992 if (iter->fib6_nh.nh_dev == down_dev || 3993 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3994 dead++; 3995 3996 return dead; 3997 } 3998 3999 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4000 const struct net_device *dev, 4001 unsigned int nh_flags) 4002 { 4003 struct fib6_info *iter; 4004 4005 if (rt->fib6_nh.nh_dev == dev) 4006 rt->fib6_nh.nh_flags |= nh_flags; 4007 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4008 if (iter->fib6_nh.nh_dev == dev) 4009 iter->fib6_nh.nh_flags |= nh_flags; 4010 } 4011 4012 /* called with write lock held for table with rt */ 4013 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4014 { 4015 const struct arg_netdev_event *arg = p_arg; 4016 const struct net_device *dev = arg->dev; 4017 struct net *net = dev_net(dev); 4018 4019 if (rt == net->ipv6.fib6_null_entry) 4020 return 0; 4021 4022 switch (arg->event) { 4023 case NETDEV_UNREGISTER: 4024 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4025 case NETDEV_DOWN: 4026 if (rt->should_flush) 4027 return -1; 4028 if (!rt->fib6_nsiblings) 4029 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4030 if (rt6_multipath_uses_dev(rt, dev)) { 4031 unsigned int count; 4032 4033 count = rt6_multipath_dead_count(rt, dev); 4034 if (rt->fib6_nsiblings + 1 == count) { 4035 rt6_multipath_flush(rt); 4036 return -1; 4037 } 4038 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4039 RTNH_F_LINKDOWN); 4040 fib6_update_sernum(net, rt); 4041 rt6_multipath_rebalance(rt); 4042 } 4043 return -2; 4044 case NETDEV_CHANGE: 4045 if (rt->fib6_nh.nh_dev != dev || 4046 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4047 break; 4048 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 4049 rt6_multipath_rebalance(rt); 4050 break; 4051 } 4052 4053 return 0; 4054 } 4055 4056 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4057 { 4058 struct arg_netdev_event arg = { 4059 .dev = dev, 4060 { 4061 .event = event, 4062 }, 4063 }; 4064 4065 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 4066 } 4067 4068 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4069 { 4070 rt6_sync_down_dev(dev, event); 4071 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4072 neigh_ifdown(&nd_tbl, dev); 4073 } 4074 4075 struct rt6_mtu_change_arg { 4076 struct net_device *dev; 4077 unsigned int mtu; 4078 }; 4079 4080 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4081 { 4082 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4083 struct inet6_dev *idev; 4084 4085 /* In IPv6 pmtu discovery is not optional, 4086 so that RTAX_MTU lock cannot disable it. 4087 We still use this lock to block changes 4088 caused by addrconf/ndisc. 4089 */ 4090 4091 idev = __in6_dev_get(arg->dev); 4092 if (!idev) 4093 return 0; 4094 4095 /* For administrative MTU increase, there is no way to discover 4096 IPv6 PMTU increase, so PMTU increase should be updated here. 4097 Since RFC 1981 doesn't include administrative MTU increase 4098 update PMTU increase is a MUST. (i.e. jumbo frame) 4099 */ 4100 if (rt->fib6_nh.nh_dev == arg->dev && 4101 !fib6_metric_locked(rt, RTAX_MTU)) { 4102 u32 mtu = rt->fib6_pmtu; 4103 4104 if (mtu >= arg->mtu || 4105 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4106 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4107 4108 spin_lock_bh(&rt6_exception_lock); 4109 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4110 spin_unlock_bh(&rt6_exception_lock); 4111 } 4112 return 0; 4113 } 4114 4115 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4116 { 4117 struct rt6_mtu_change_arg arg = { 4118 .dev = dev, 4119 .mtu = mtu, 4120 }; 4121 4122 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4123 } 4124 4125 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4126 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4127 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4128 [RTA_OIF] = { .type = NLA_U32 }, 4129 [RTA_IIF] = { .type = NLA_U32 }, 4130 [RTA_PRIORITY] = { .type = NLA_U32 }, 4131 [RTA_METRICS] = { .type = NLA_NESTED }, 4132 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4133 [RTA_PREF] = { .type = NLA_U8 }, 4134 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4135 [RTA_ENCAP] = { .type = NLA_NESTED }, 4136 [RTA_EXPIRES] = { .type = NLA_U32 }, 4137 [RTA_UID] = { .type = NLA_U32 }, 4138 [RTA_MARK] = { .type = NLA_U32 }, 4139 [RTA_TABLE] = { .type = NLA_U32 }, 4140 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4141 [RTA_SPORT] = { .type = NLA_U16 }, 4142 [RTA_DPORT] = { .type = NLA_U16 }, 4143 }; 4144 4145 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4146 struct fib6_config *cfg, 4147 struct netlink_ext_ack *extack) 4148 { 4149 struct rtmsg *rtm; 4150 struct nlattr *tb[RTA_MAX+1]; 4151 unsigned int pref; 4152 int err; 4153 4154 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4155 NULL); 4156 if (err < 0) 4157 goto errout; 4158 4159 err = -EINVAL; 4160 rtm = nlmsg_data(nlh); 4161 memset(cfg, 0, sizeof(*cfg)); 4162 4163 cfg->fc_table = rtm->rtm_table; 4164 cfg->fc_dst_len = rtm->rtm_dst_len; 4165 cfg->fc_src_len = rtm->rtm_src_len; 4166 cfg->fc_flags = RTF_UP; 4167 cfg->fc_protocol = rtm->rtm_protocol; 4168 cfg->fc_type = rtm->rtm_type; 4169 4170 if (rtm->rtm_type == RTN_UNREACHABLE || 4171 rtm->rtm_type == RTN_BLACKHOLE || 4172 rtm->rtm_type == RTN_PROHIBIT || 4173 rtm->rtm_type == RTN_THROW) 4174 cfg->fc_flags |= RTF_REJECT; 4175 4176 if (rtm->rtm_type == RTN_LOCAL) 4177 cfg->fc_flags |= RTF_LOCAL; 4178 4179 if (rtm->rtm_flags & RTM_F_CLONED) 4180 cfg->fc_flags |= RTF_CACHE; 4181 4182 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4183 4184 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4185 cfg->fc_nlinfo.nlh = nlh; 4186 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4187 4188 if (tb[RTA_GATEWAY]) { 4189 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4190 cfg->fc_flags |= RTF_GATEWAY; 4191 } 4192 4193 if (tb[RTA_DST]) { 4194 int plen = (rtm->rtm_dst_len + 7) >> 3; 4195 4196 if (nla_len(tb[RTA_DST]) < plen) 4197 goto errout; 4198 4199 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4200 } 4201 4202 if (tb[RTA_SRC]) { 4203 int plen = (rtm->rtm_src_len + 7) >> 3; 4204 4205 if (nla_len(tb[RTA_SRC]) < plen) 4206 goto errout; 4207 4208 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4209 } 4210 4211 if (tb[RTA_PREFSRC]) 4212 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4213 4214 if (tb[RTA_OIF]) 4215 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4216 4217 if (tb[RTA_PRIORITY]) 4218 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4219 4220 if (tb[RTA_METRICS]) { 4221 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4222 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4223 } 4224 4225 if (tb[RTA_TABLE]) 4226 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4227 4228 if (tb[RTA_MULTIPATH]) { 4229 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4230 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4231 4232 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4233 cfg->fc_mp_len, extack); 4234 if (err < 0) 4235 goto errout; 4236 } 4237 4238 if (tb[RTA_PREF]) { 4239 pref = nla_get_u8(tb[RTA_PREF]); 4240 if (pref != ICMPV6_ROUTER_PREF_LOW && 4241 pref != ICMPV6_ROUTER_PREF_HIGH) 4242 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4243 cfg->fc_flags |= RTF_PREF(pref); 4244 } 4245 4246 if (tb[RTA_ENCAP]) 4247 cfg->fc_encap = tb[RTA_ENCAP]; 4248 4249 if (tb[RTA_ENCAP_TYPE]) { 4250 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4251 4252 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4253 if (err < 0) 4254 goto errout; 4255 } 4256 4257 if (tb[RTA_EXPIRES]) { 4258 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4259 4260 if (addrconf_finite_timeout(timeout)) { 4261 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4262 cfg->fc_flags |= RTF_EXPIRES; 4263 } 4264 } 4265 4266 err = 0; 4267 errout: 4268 return err; 4269 } 4270 4271 struct rt6_nh { 4272 struct fib6_info *fib6_info; 4273 struct fib6_config r_cfg; 4274 struct list_head next; 4275 }; 4276 4277 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 4278 { 4279 struct rt6_nh *nh; 4280 4281 list_for_each_entry(nh, rt6_nh_list, next) { 4282 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 4283 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 4284 nh->r_cfg.fc_ifindex); 4285 } 4286 } 4287 4288 static int ip6_route_info_append(struct net *net, 4289 struct list_head *rt6_nh_list, 4290 struct fib6_info *rt, 4291 struct fib6_config *r_cfg) 4292 { 4293 struct rt6_nh *nh; 4294 int err = -EEXIST; 4295 4296 list_for_each_entry(nh, rt6_nh_list, next) { 4297 /* check if fib6_info already exists */ 4298 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4299 return err; 4300 } 4301 4302 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4303 if (!nh) 4304 return -ENOMEM; 4305 nh->fib6_info = rt; 4306 err = ip6_convert_metrics(net, rt, r_cfg); 4307 if (err) { 4308 kfree(nh); 4309 return err; 4310 } 4311 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4312 list_add_tail(&nh->next, rt6_nh_list); 4313 4314 return 0; 4315 } 4316 4317 static void ip6_route_mpath_notify(struct fib6_info *rt, 4318 struct fib6_info *rt_last, 4319 struct nl_info *info, 4320 __u16 nlflags) 4321 { 4322 /* if this is an APPEND route, then rt points to the first route 4323 * inserted and rt_last points to last route inserted. Userspace 4324 * wants a consistent dump of the route which starts at the first 4325 * nexthop. Since sibling routes are always added at the end of 4326 * the list, find the first sibling of the last route appended 4327 */ 4328 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4329 rt = list_first_entry(&rt_last->fib6_siblings, 4330 struct fib6_info, 4331 fib6_siblings); 4332 } 4333 4334 if (rt) 4335 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4336 } 4337 4338 static int ip6_route_multipath_add(struct fib6_config *cfg, 4339 struct netlink_ext_ack *extack) 4340 { 4341 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4342 struct nl_info *info = &cfg->fc_nlinfo; 4343 struct fib6_config r_cfg; 4344 struct rtnexthop *rtnh; 4345 struct fib6_info *rt; 4346 struct rt6_nh *err_nh; 4347 struct rt6_nh *nh, *nh_safe; 4348 __u16 nlflags; 4349 int remaining; 4350 int attrlen; 4351 int err = 1; 4352 int nhn = 0; 4353 int replace = (cfg->fc_nlinfo.nlh && 4354 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4355 LIST_HEAD(rt6_nh_list); 4356 4357 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 4358 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4359 nlflags |= NLM_F_APPEND; 4360 4361 remaining = cfg->fc_mp_len; 4362 rtnh = (struct rtnexthop *)cfg->fc_mp; 4363 4364 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4365 * fib6_info structs per nexthop 4366 */ 4367 while (rtnh_ok(rtnh, remaining)) { 4368 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4369 if (rtnh->rtnh_ifindex) 4370 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4371 4372 attrlen = rtnh_attrlen(rtnh); 4373 if (attrlen > 0) { 4374 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4375 4376 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4377 if (nla) { 4378 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4379 r_cfg.fc_flags |= RTF_GATEWAY; 4380 } 4381 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4382 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4383 if (nla) 4384 r_cfg.fc_encap_type = nla_get_u16(nla); 4385 } 4386 4387 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4388 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4389 if (IS_ERR(rt)) { 4390 err = PTR_ERR(rt); 4391 rt = NULL; 4392 goto cleanup; 4393 } 4394 4395 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; 4396 4397 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4398 rt, &r_cfg); 4399 if (err) { 4400 fib6_info_release(rt); 4401 goto cleanup; 4402 } 4403 4404 rtnh = rtnh_next(rtnh, &remaining); 4405 } 4406 4407 /* for add and replace send one notification with all nexthops. 4408 * Skip the notification in fib6_add_rt2node and send one with 4409 * the full route when done 4410 */ 4411 info->skip_notify = 1; 4412 4413 err_nh = NULL; 4414 list_for_each_entry(nh, &rt6_nh_list, next) { 4415 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4416 fib6_info_release(nh->fib6_info); 4417 4418 if (!err) { 4419 /* save reference to last route successfully inserted */ 4420 rt_last = nh->fib6_info; 4421 4422 /* save reference to first route for notification */ 4423 if (!rt_notif) 4424 rt_notif = nh->fib6_info; 4425 } 4426 4427 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 4428 nh->fib6_info = NULL; 4429 if (err) { 4430 if (replace && nhn) 4431 ip6_print_replace_route_err(&rt6_nh_list); 4432 err_nh = nh; 4433 goto add_errout; 4434 } 4435 4436 /* Because each route is added like a single route we remove 4437 * these flags after the first nexthop: if there is a collision, 4438 * we have already failed to add the first nexthop: 4439 * fib6_add_rt2node() has rejected it; when replacing, old 4440 * nexthops have been replaced by first new, the rest should 4441 * be added to it. 4442 */ 4443 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4444 NLM_F_REPLACE); 4445 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND; 4446 nhn++; 4447 } 4448 4449 /* success ... tell user about new route */ 4450 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4451 goto cleanup; 4452 4453 add_errout: 4454 /* send notification for routes that were added so that 4455 * the delete notifications sent by ip6_route_del are 4456 * coherent 4457 */ 4458 if (rt_notif) 4459 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4460 4461 /* Delete routes that were already added */ 4462 list_for_each_entry(nh, &rt6_nh_list, next) { 4463 if (err_nh == nh) 4464 break; 4465 ip6_route_del(&nh->r_cfg, extack); 4466 } 4467 4468 cleanup: 4469 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4470 if (nh->fib6_info) 4471 fib6_info_release(nh->fib6_info); 4472 list_del(&nh->next); 4473 kfree(nh); 4474 } 4475 4476 return err; 4477 } 4478 4479 static int ip6_route_multipath_del(struct fib6_config *cfg, 4480 struct netlink_ext_ack *extack) 4481 { 4482 struct fib6_config r_cfg; 4483 struct rtnexthop *rtnh; 4484 int remaining; 4485 int attrlen; 4486 int err = 1, last_err = 0; 4487 4488 remaining = cfg->fc_mp_len; 4489 rtnh = (struct rtnexthop *)cfg->fc_mp; 4490 4491 /* Parse a Multipath Entry */ 4492 while (rtnh_ok(rtnh, remaining)) { 4493 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4494 if (rtnh->rtnh_ifindex) 4495 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4496 4497 attrlen = rtnh_attrlen(rtnh); 4498 if (attrlen > 0) { 4499 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4500 4501 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4502 if (nla) { 4503 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4504 r_cfg.fc_flags |= RTF_GATEWAY; 4505 } 4506 } 4507 err = ip6_route_del(&r_cfg, extack); 4508 if (err) 4509 last_err = err; 4510 4511 rtnh = rtnh_next(rtnh, &remaining); 4512 } 4513 4514 return last_err; 4515 } 4516 4517 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4518 struct netlink_ext_ack *extack) 4519 { 4520 struct fib6_config cfg; 4521 int err; 4522 4523 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4524 if (err < 0) 4525 return err; 4526 4527 if (cfg.fc_mp) 4528 return ip6_route_multipath_del(&cfg, extack); 4529 else { 4530 cfg.fc_delete_all_nh = 1; 4531 return ip6_route_del(&cfg, extack); 4532 } 4533 } 4534 4535 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4536 struct netlink_ext_ack *extack) 4537 { 4538 struct fib6_config cfg; 4539 int err; 4540 4541 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4542 if (err < 0) 4543 return err; 4544 4545 if (cfg.fc_mp) 4546 return ip6_route_multipath_add(&cfg, extack); 4547 else 4548 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4549 } 4550 4551 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4552 { 4553 int nexthop_len = 0; 4554 4555 if (rt->fib6_nsiblings) { 4556 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4557 + NLA_ALIGN(sizeof(struct rtnexthop)) 4558 + nla_total_size(16) /* RTA_GATEWAY */ 4559 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); 4560 4561 nexthop_len *= rt->fib6_nsiblings; 4562 } 4563 4564 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4565 + nla_total_size(16) /* RTA_SRC */ 4566 + nla_total_size(16) /* RTA_DST */ 4567 + nla_total_size(16) /* RTA_GATEWAY */ 4568 + nla_total_size(16) /* RTA_PREFSRC */ 4569 + nla_total_size(4) /* RTA_TABLE */ 4570 + nla_total_size(4) /* RTA_IIF */ 4571 + nla_total_size(4) /* RTA_OIF */ 4572 + nla_total_size(4) /* RTA_PRIORITY */ 4573 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4574 + nla_total_size(sizeof(struct rta_cacheinfo)) 4575 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4576 + nla_total_size(1) /* RTA_PREF */ 4577 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) 4578 + nexthop_len; 4579 } 4580 4581 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, 4582 unsigned int *flags, bool skip_oif) 4583 { 4584 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4585 *flags |= RTNH_F_DEAD; 4586 4587 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { 4588 *flags |= RTNH_F_LINKDOWN; 4589 4590 rcu_read_lock(); 4591 if (fib6_ignore_linkdown(rt)) 4592 *flags |= RTNH_F_DEAD; 4593 rcu_read_unlock(); 4594 } 4595 4596 if (rt->fib6_flags & RTF_GATEWAY) { 4597 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) 4598 goto nla_put_failure; 4599 } 4600 4601 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); 4602 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) 4603 *flags |= RTNH_F_OFFLOAD; 4604 4605 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4606 if (!skip_oif && rt->fib6_nh.nh_dev && 4607 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) 4608 goto nla_put_failure; 4609 4610 if (rt->fib6_nh.nh_lwtstate && 4611 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) 4612 goto nla_put_failure; 4613 4614 return 0; 4615 4616 nla_put_failure: 4617 return -EMSGSIZE; 4618 } 4619 4620 /* add multipath next hop */ 4621 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) 4622 { 4623 const struct net_device *dev = rt->fib6_nh.nh_dev; 4624 struct rtnexthop *rtnh; 4625 unsigned int flags = 0; 4626 4627 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4628 if (!rtnh) 4629 goto nla_put_failure; 4630 4631 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; 4632 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; 4633 4634 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4635 goto nla_put_failure; 4636 4637 rtnh->rtnh_flags = flags; 4638 4639 /* length of rtnetlink header + attributes */ 4640 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4641 4642 return 0; 4643 4644 nla_put_failure: 4645 return -EMSGSIZE; 4646 } 4647 4648 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4649 struct fib6_info *rt, struct dst_entry *dst, 4650 struct in6_addr *dest, struct in6_addr *src, 4651 int iif, int type, u32 portid, u32 seq, 4652 unsigned int flags) 4653 { 4654 struct rtmsg *rtm; 4655 struct nlmsghdr *nlh; 4656 long expires = 0; 4657 u32 *pmetrics; 4658 u32 table; 4659 4660 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4661 if (!nlh) 4662 return -EMSGSIZE; 4663 4664 rtm = nlmsg_data(nlh); 4665 rtm->rtm_family = AF_INET6; 4666 rtm->rtm_dst_len = rt->fib6_dst.plen; 4667 rtm->rtm_src_len = rt->fib6_src.plen; 4668 rtm->rtm_tos = 0; 4669 if (rt->fib6_table) 4670 table = rt->fib6_table->tb6_id; 4671 else 4672 table = RT6_TABLE_UNSPEC; 4673 rtm->rtm_table = table; 4674 if (nla_put_u32(skb, RTA_TABLE, table)) 4675 goto nla_put_failure; 4676 4677 rtm->rtm_type = rt->fib6_type; 4678 rtm->rtm_flags = 0; 4679 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4680 rtm->rtm_protocol = rt->fib6_protocol; 4681 4682 if (rt->fib6_flags & RTF_CACHE) 4683 rtm->rtm_flags |= RTM_F_CLONED; 4684 4685 if (dest) { 4686 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4687 goto nla_put_failure; 4688 rtm->rtm_dst_len = 128; 4689 } else if (rtm->rtm_dst_len) 4690 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) 4691 goto nla_put_failure; 4692 #ifdef CONFIG_IPV6_SUBTREES 4693 if (src) { 4694 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4695 goto nla_put_failure; 4696 rtm->rtm_src_len = 128; 4697 } else if (rtm->rtm_src_len && 4698 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) 4699 goto nla_put_failure; 4700 #endif 4701 if (iif) { 4702 #ifdef CONFIG_IPV6_MROUTE 4703 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { 4704 int err = ip6mr_get_route(net, skb, rtm, portid); 4705 4706 if (err == 0) 4707 return 0; 4708 if (err < 0) 4709 goto nla_put_failure; 4710 } else 4711 #endif 4712 if (nla_put_u32(skb, RTA_IIF, iif)) 4713 goto nla_put_failure; 4714 } else if (dest) { 4715 struct in6_addr saddr_buf; 4716 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4717 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4718 goto nla_put_failure; 4719 } 4720 4721 if (rt->fib6_prefsrc.plen) { 4722 struct in6_addr saddr_buf; 4723 saddr_buf = rt->fib6_prefsrc.addr; 4724 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4725 goto nla_put_failure; 4726 } 4727 4728 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4729 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4730 goto nla_put_failure; 4731 4732 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4733 goto nla_put_failure; 4734 4735 /* For multipath routes, walk the siblings list and add 4736 * each as a nexthop within RTA_MULTIPATH. 4737 */ 4738 if (rt->fib6_nsiblings) { 4739 struct fib6_info *sibling, *next_sibling; 4740 struct nlattr *mp; 4741 4742 mp = nla_nest_start(skb, RTA_MULTIPATH); 4743 if (!mp) 4744 goto nla_put_failure; 4745 4746 if (rt6_add_nexthop(skb, rt) < 0) 4747 goto nla_put_failure; 4748 4749 list_for_each_entry_safe(sibling, next_sibling, 4750 &rt->fib6_siblings, fib6_siblings) { 4751 if (rt6_add_nexthop(skb, sibling) < 0) 4752 goto nla_put_failure; 4753 } 4754 4755 nla_nest_end(skb, mp); 4756 } else { 4757 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4758 goto nla_put_failure; 4759 } 4760 4761 if (rt->fib6_flags & RTF_EXPIRES) { 4762 expires = dst ? dst->expires : rt->expires; 4763 expires -= jiffies; 4764 } 4765 4766 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4767 goto nla_put_failure; 4768 4769 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) 4770 goto nla_put_failure; 4771 4772 4773 nlmsg_end(skb, nlh); 4774 return 0; 4775 4776 nla_put_failure: 4777 nlmsg_cancel(skb, nlh); 4778 return -EMSGSIZE; 4779 } 4780 4781 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4782 { 4783 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4784 struct net *net = arg->net; 4785 4786 if (rt == net->ipv6.fib6_null_entry) 4787 return 0; 4788 4789 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4790 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 4791 4792 /* user wants prefix routes only */ 4793 if (rtm->rtm_flags & RTM_F_PREFIX && 4794 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4795 /* success since this is not a prefix route */ 4796 return 1; 4797 } 4798 } 4799 4800 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4801 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4802 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); 4803 } 4804 4805 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4806 struct netlink_ext_ack *extack) 4807 { 4808 struct net *net = sock_net(in_skb->sk); 4809 struct nlattr *tb[RTA_MAX+1]; 4810 int err, iif = 0, oif = 0; 4811 struct fib6_info *from; 4812 struct dst_entry *dst; 4813 struct rt6_info *rt; 4814 struct sk_buff *skb; 4815 struct rtmsg *rtm; 4816 struct flowi6 fl6; 4817 bool fibmatch; 4818 4819 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4820 extack); 4821 if (err < 0) 4822 goto errout; 4823 4824 err = -EINVAL; 4825 memset(&fl6, 0, sizeof(fl6)); 4826 rtm = nlmsg_data(nlh); 4827 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4828 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4829 4830 if (tb[RTA_SRC]) { 4831 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4832 goto errout; 4833 4834 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4835 } 4836 4837 if (tb[RTA_DST]) { 4838 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4839 goto errout; 4840 4841 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4842 } 4843 4844 if (tb[RTA_IIF]) 4845 iif = nla_get_u32(tb[RTA_IIF]); 4846 4847 if (tb[RTA_OIF]) 4848 oif = nla_get_u32(tb[RTA_OIF]); 4849 4850 if (tb[RTA_MARK]) 4851 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4852 4853 if (tb[RTA_UID]) 4854 fl6.flowi6_uid = make_kuid(current_user_ns(), 4855 nla_get_u32(tb[RTA_UID])); 4856 else 4857 fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); 4858 4859 if (tb[RTA_SPORT]) 4860 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 4861 4862 if (tb[RTA_DPORT]) 4863 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 4864 4865 if (tb[RTA_IP_PROTO]) { 4866 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 4867 &fl6.flowi6_proto, extack); 4868 if (err) 4869 goto errout; 4870 } 4871 4872 if (iif) { 4873 struct net_device *dev; 4874 int flags = 0; 4875 4876 rcu_read_lock(); 4877 4878 dev = dev_get_by_index_rcu(net, iif); 4879 if (!dev) { 4880 rcu_read_unlock(); 4881 err = -ENODEV; 4882 goto errout; 4883 } 4884 4885 fl6.flowi6_iif = iif; 4886 4887 if (!ipv6_addr_any(&fl6.saddr)) 4888 flags |= RT6_LOOKUP_F_HAS_SADDR; 4889 4890 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 4891 4892 rcu_read_unlock(); 4893 } else { 4894 fl6.flowi6_oif = oif; 4895 4896 dst = ip6_route_output(net, NULL, &fl6); 4897 } 4898 4899 4900 rt = container_of(dst, struct rt6_info, dst); 4901 if (rt->dst.error) { 4902 err = rt->dst.error; 4903 ip6_rt_put(rt); 4904 goto errout; 4905 } 4906 4907 if (rt == net->ipv6.ip6_null_entry) { 4908 err = rt->dst.error; 4909 ip6_rt_put(rt); 4910 goto errout; 4911 } 4912 4913 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 4914 if (!skb) { 4915 ip6_rt_put(rt); 4916 err = -ENOBUFS; 4917 goto errout; 4918 } 4919 4920 skb_dst_set(skb, &rt->dst); 4921 4922 rcu_read_lock(); 4923 from = rcu_dereference(rt->from); 4924 4925 if (fibmatch) 4926 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 4927 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4928 nlh->nlmsg_seq, 0); 4929 else 4930 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 4931 &fl6.saddr, iif, RTM_NEWROUTE, 4932 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 4933 0); 4934 rcu_read_unlock(); 4935 4936 if (err < 0) { 4937 kfree_skb(skb); 4938 goto errout; 4939 } 4940 4941 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 4942 errout: 4943 return err; 4944 } 4945 4946 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 4947 unsigned int nlm_flags) 4948 { 4949 struct sk_buff *skb; 4950 struct net *net = info->nl_net; 4951 u32 seq; 4952 int err; 4953 4954 err = -ENOBUFS; 4955 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 4956 4957 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 4958 if (!skb) 4959 goto errout; 4960 4961 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 4962 event, info->portid, seq, nlm_flags); 4963 if (err < 0) { 4964 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 4965 WARN_ON(err == -EMSGSIZE); 4966 kfree_skb(skb); 4967 goto errout; 4968 } 4969 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 4970 info->nlh, gfp_any()); 4971 return; 4972 errout: 4973 if (err < 0) 4974 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 4975 } 4976 4977 static int ip6_route_dev_notify(struct notifier_block *this, 4978 unsigned long event, void *ptr) 4979 { 4980 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 4981 struct net *net = dev_net(dev); 4982 4983 if (!(dev->flags & IFF_LOOPBACK)) 4984 return NOTIFY_OK; 4985 4986 if (event == NETDEV_REGISTER) { 4987 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; 4988 net->ipv6.ip6_null_entry->dst.dev = dev; 4989 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 4990 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4991 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 4992 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 4993 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 4994 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 4995 #endif 4996 } else if (event == NETDEV_UNREGISTER && 4997 dev->reg_state != NETREG_UNREGISTERED) { 4998 /* NETDEV_UNREGISTER could be fired for multiple times by 4999 * netdev_wait_allrefs(). Make sure we only call this once. 5000 */ 5001 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 5002 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5003 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 5004 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 5005 #endif 5006 } 5007 5008 return NOTIFY_OK; 5009 } 5010 5011 /* 5012 * /proc 5013 */ 5014 5015 #ifdef CONFIG_PROC_FS 5016 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 5017 { 5018 struct net *net = (struct net *)seq->private; 5019 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 5020 net->ipv6.rt6_stats->fib_nodes, 5021 net->ipv6.rt6_stats->fib_route_nodes, 5022 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 5023 net->ipv6.rt6_stats->fib_rt_entries, 5024 net->ipv6.rt6_stats->fib_rt_cache, 5025 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 5026 net->ipv6.rt6_stats->fib_discarded_routes); 5027 5028 return 0; 5029 } 5030 #endif /* CONFIG_PROC_FS */ 5031 5032 #ifdef CONFIG_SYSCTL 5033 5034 static 5035 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 5036 void __user *buffer, size_t *lenp, loff_t *ppos) 5037 { 5038 struct net *net; 5039 int delay; 5040 if (!write) 5041 return -EINVAL; 5042 5043 net = (struct net *)ctl->extra1; 5044 delay = net->ipv6.sysctl.flush_delay; 5045 proc_dointvec(ctl, write, buffer, lenp, ppos); 5046 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); 5047 return 0; 5048 } 5049 5050 struct ctl_table ipv6_route_table_template[] = { 5051 { 5052 .procname = "flush", 5053 .data = &init_net.ipv6.sysctl.flush_delay, 5054 .maxlen = sizeof(int), 5055 .mode = 0200, 5056 .proc_handler = ipv6_sysctl_rtcache_flush 5057 }, 5058 { 5059 .procname = "gc_thresh", 5060 .data = &ip6_dst_ops_template.gc_thresh, 5061 .maxlen = sizeof(int), 5062 .mode = 0644, 5063 .proc_handler = proc_dointvec, 5064 }, 5065 { 5066 .procname = "max_size", 5067 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5068 .maxlen = sizeof(int), 5069 .mode = 0644, 5070 .proc_handler = proc_dointvec, 5071 }, 5072 { 5073 .procname = "gc_min_interval", 5074 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5075 .maxlen = sizeof(int), 5076 .mode = 0644, 5077 .proc_handler = proc_dointvec_jiffies, 5078 }, 5079 { 5080 .procname = "gc_timeout", 5081 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5082 .maxlen = sizeof(int), 5083 .mode = 0644, 5084 .proc_handler = proc_dointvec_jiffies, 5085 }, 5086 { 5087 .procname = "gc_interval", 5088 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5089 .maxlen = sizeof(int), 5090 .mode = 0644, 5091 .proc_handler = proc_dointvec_jiffies, 5092 }, 5093 { 5094 .procname = "gc_elasticity", 5095 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5096 .maxlen = sizeof(int), 5097 .mode = 0644, 5098 .proc_handler = proc_dointvec, 5099 }, 5100 { 5101 .procname = "mtu_expires", 5102 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5103 .maxlen = sizeof(int), 5104 .mode = 0644, 5105 .proc_handler = proc_dointvec_jiffies, 5106 }, 5107 { 5108 .procname = "min_adv_mss", 5109 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5110 .maxlen = sizeof(int), 5111 .mode = 0644, 5112 .proc_handler = proc_dointvec, 5113 }, 5114 { 5115 .procname = "gc_min_interval_ms", 5116 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5117 .maxlen = sizeof(int), 5118 .mode = 0644, 5119 .proc_handler = proc_dointvec_ms_jiffies, 5120 }, 5121 { } 5122 }; 5123 5124 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5125 { 5126 struct ctl_table *table; 5127 5128 table = kmemdup(ipv6_route_table_template, 5129 sizeof(ipv6_route_table_template), 5130 GFP_KERNEL); 5131 5132 if (table) { 5133 table[0].data = &net->ipv6.sysctl.flush_delay; 5134 table[0].extra1 = net; 5135 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5136 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5137 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5138 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5139 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5140 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5141 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5142 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5143 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5144 5145 /* Don't export sysctls to unprivileged users */ 5146 if (net->user_ns != &init_user_ns) 5147 table[0].procname = NULL; 5148 } 5149 5150 return table; 5151 } 5152 #endif 5153 5154 static int __net_init ip6_route_net_init(struct net *net) 5155 { 5156 int ret = -ENOMEM; 5157 5158 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5159 sizeof(net->ipv6.ip6_dst_ops)); 5160 5161 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5162 goto out_ip6_dst_ops; 5163 5164 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5165 sizeof(*net->ipv6.fib6_null_entry), 5166 GFP_KERNEL); 5167 if (!net->ipv6.fib6_null_entry) 5168 goto out_ip6_dst_entries; 5169 5170 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5171 sizeof(*net->ipv6.ip6_null_entry), 5172 GFP_KERNEL); 5173 if (!net->ipv6.ip6_null_entry) 5174 goto out_fib6_null_entry; 5175 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5176 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5177 ip6_template_metrics, true); 5178 5179 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5180 net->ipv6.fib6_has_custom_rules = false; 5181 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5182 sizeof(*net->ipv6.ip6_prohibit_entry), 5183 GFP_KERNEL); 5184 if (!net->ipv6.ip6_prohibit_entry) 5185 goto out_ip6_null_entry; 5186 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5187 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5188 ip6_template_metrics, true); 5189 5190 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5191 sizeof(*net->ipv6.ip6_blk_hole_entry), 5192 GFP_KERNEL); 5193 if (!net->ipv6.ip6_blk_hole_entry) 5194 goto out_ip6_prohibit_entry; 5195 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5196 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5197 ip6_template_metrics, true); 5198 #endif 5199 5200 net->ipv6.sysctl.flush_delay = 0; 5201 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5202 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5203 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5204 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5205 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5206 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5207 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5208 5209 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5210 5211 ret = 0; 5212 out: 5213 return ret; 5214 5215 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5216 out_ip6_prohibit_entry: 5217 kfree(net->ipv6.ip6_prohibit_entry); 5218 out_ip6_null_entry: 5219 kfree(net->ipv6.ip6_null_entry); 5220 #endif 5221 out_fib6_null_entry: 5222 kfree(net->ipv6.fib6_null_entry); 5223 out_ip6_dst_entries: 5224 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5225 out_ip6_dst_ops: 5226 goto out; 5227 } 5228 5229 static void __net_exit ip6_route_net_exit(struct net *net) 5230 { 5231 kfree(net->ipv6.fib6_null_entry); 5232 kfree(net->ipv6.ip6_null_entry); 5233 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5234 kfree(net->ipv6.ip6_prohibit_entry); 5235 kfree(net->ipv6.ip6_blk_hole_entry); 5236 #endif 5237 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5238 } 5239 5240 static int __net_init ip6_route_net_init_late(struct net *net) 5241 { 5242 #ifdef CONFIG_PROC_FS 5243 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5244 sizeof(struct ipv6_route_iter)); 5245 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5246 rt6_stats_seq_show, NULL); 5247 #endif 5248 return 0; 5249 } 5250 5251 static void __net_exit ip6_route_net_exit_late(struct net *net) 5252 { 5253 #ifdef CONFIG_PROC_FS 5254 remove_proc_entry("ipv6_route", net->proc_net); 5255 remove_proc_entry("rt6_stats", net->proc_net); 5256 #endif 5257 } 5258 5259 static struct pernet_operations ip6_route_net_ops = { 5260 .init = ip6_route_net_init, 5261 .exit = ip6_route_net_exit, 5262 }; 5263 5264 static int __net_init ipv6_inetpeer_init(struct net *net) 5265 { 5266 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5267 5268 if (!bp) 5269 return -ENOMEM; 5270 inet_peer_base_init(bp); 5271 net->ipv6.peers = bp; 5272 return 0; 5273 } 5274 5275 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5276 { 5277 struct inet_peer_base *bp = net->ipv6.peers; 5278 5279 net->ipv6.peers = NULL; 5280 inetpeer_invalidate_tree(bp); 5281 kfree(bp); 5282 } 5283 5284 static struct pernet_operations ipv6_inetpeer_ops = { 5285 .init = ipv6_inetpeer_init, 5286 .exit = ipv6_inetpeer_exit, 5287 }; 5288 5289 static struct pernet_operations ip6_route_net_late_ops = { 5290 .init = ip6_route_net_init_late, 5291 .exit = ip6_route_net_exit_late, 5292 }; 5293 5294 static struct notifier_block ip6_route_dev_notifier = { 5295 .notifier_call = ip6_route_dev_notify, 5296 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5297 }; 5298 5299 void __init ip6_route_init_special_entries(void) 5300 { 5301 /* Registering of the loopback is done before this portion of code, 5302 * the loopback reference in rt6_info will not be taken, do it 5303 * manually for init_net */ 5304 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; 5305 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5306 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5308 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5309 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5310 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5311 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5312 #endif 5313 } 5314 5315 int __init ip6_route_init(void) 5316 { 5317 int ret; 5318 int cpu; 5319 5320 ret = -ENOMEM; 5321 ip6_dst_ops_template.kmem_cachep = 5322 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5323 SLAB_HWCACHE_ALIGN, NULL); 5324 if (!ip6_dst_ops_template.kmem_cachep) 5325 goto out; 5326 5327 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5328 if (ret) 5329 goto out_kmem_cache; 5330 5331 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5332 if (ret) 5333 goto out_dst_entries; 5334 5335 ret = register_pernet_subsys(&ip6_route_net_ops); 5336 if (ret) 5337 goto out_register_inetpeer; 5338 5339 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5340 5341 ret = fib6_init(); 5342 if (ret) 5343 goto out_register_subsys; 5344 5345 ret = xfrm6_init(); 5346 if (ret) 5347 goto out_fib6_init; 5348 5349 ret = fib6_rules_init(); 5350 if (ret) 5351 goto xfrm6_init; 5352 5353 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5354 if (ret) 5355 goto fib6_rules_init; 5356 5357 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5358 inet6_rtm_newroute, NULL, 0); 5359 if (ret < 0) 5360 goto out_register_late_subsys; 5361 5362 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5363 inet6_rtm_delroute, NULL, 0); 5364 if (ret < 0) 5365 goto out_register_late_subsys; 5366 5367 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5368 inet6_rtm_getroute, NULL, 5369 RTNL_FLAG_DOIT_UNLOCKED); 5370 if (ret < 0) 5371 goto out_register_late_subsys; 5372 5373 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5374 if (ret) 5375 goto out_register_late_subsys; 5376 5377 for_each_possible_cpu(cpu) { 5378 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 5379 5380 INIT_LIST_HEAD(&ul->head); 5381 spin_lock_init(&ul->lock); 5382 } 5383 5384 out: 5385 return ret; 5386 5387 out_register_late_subsys: 5388 rtnl_unregister_all(PF_INET6); 5389 unregister_pernet_subsys(&ip6_route_net_late_ops); 5390 fib6_rules_init: 5391 fib6_rules_cleanup(); 5392 xfrm6_init: 5393 xfrm6_fini(); 5394 out_fib6_init: 5395 fib6_gc_cleanup(); 5396 out_register_subsys: 5397 unregister_pernet_subsys(&ip6_route_net_ops); 5398 out_register_inetpeer: 5399 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5400 out_dst_entries: 5401 dst_entries_destroy(&ip6_dst_blackhole_ops); 5402 out_kmem_cache: 5403 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5404 goto out; 5405 } 5406 5407 void ip6_route_cleanup(void) 5408 { 5409 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5410 unregister_pernet_subsys(&ip6_route_net_late_ops); 5411 fib6_rules_cleanup(); 5412 xfrm6_fini(); 5413 fib6_gc_cleanup(); 5414 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5415 unregister_pernet_subsys(&ip6_route_net_ops); 5416 dst_entries_destroy(&ip6_dst_blackhole_ops); 5417 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5418 } 5419