/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
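/* Note: the per-cpu rt6_uncached_list above tracks RTF_CACHE dsts that live
 * outside the fib tree (see the FLOWI_FLAG_KNOWN_NH path in ip6_pol_route()
 * below). When a device disappears, rt6_uncached_list_flush_dev() retargets
 * their idev and dev to loopback so the dsts stay usable until the last
 * reference drops; per-cpu lists keep the add/del locking cheap.
 */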
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
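/* Note: all of the template dsts above are created with ->obsolete set to
 * DST_OBSOLETE_FORCE_CHK, which funnels every dst_check() on them back into
 * ip6_dst_check() (see the comment in that function further down).
 */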
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
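/* Hash-threshold multipath (cf. RFC 2992): each sibling nexthop owns a
 * slice of the 31-bit flow-hash space bounded above by nh_upper_bound.
 * A sketch of the selection done by fib6_multipath_select() below:
 *
 *	h = rt6_multipath_hash(net, fl6, skb, NULL);
 *	for (r = match, then each sibling)
 *		if (h <= atomic_read(&r->fib6_nh.nh_upper_bound))
 *			return r;
 *
 * with the extra twist that the walk stops early when a sibling scores
 * negative, i.e. its nexthop is unusable.
 */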
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * a case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 *	Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
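/* The score produced by rt6_score_route() packs several signals into one
 * int: bit 1 is set when the route matches the requested oif, and (under
 * CONFIG_IPV6_ROUTER_PREF) the decoded RA preference lands in bits 2-3.
 * Negative values are the RT6_NUD_FAIL_* codes from rt6_check_neigh().
 */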
/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
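/* rt6_select() below implements the round-robin half of default router
 * selection: fn->rr_ptr remembers where the previous walk stopped, and when
 * find_rr_leaf() reports do_rr, rr_ptr is advanced to the next entry of
 * equal metric so that equally preferable routers take turns.
 */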
752 */ 753 key_plen = rt0->fib6_dst.plen; 754 #ifdef CONFIG_IPV6_SUBTREES 755 if (rt0->fib6_src.plen) 756 key_plen = rt0->fib6_src.plen; 757 #endif 758 if (fn->fn_bit != key_plen) 759 return net->ipv6.fib6_null_entry; 760 761 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, 762 &do_rr); 763 764 if (do_rr) { 765 struct fib6_info *next = rcu_dereference(rt0->fib6_next); 766 767 /* no entries matched; do round-robin */ 768 if (!next || next->fib6_metric != rt0->fib6_metric) 769 next = leaf; 770 771 if (next != rt0) { 772 spin_lock_bh(&leaf->fib6_table->tb6_lock); 773 /* make sure next is not being deleted from the tree */ 774 if (next->fib6_node) 775 rcu_assign_pointer(fn->rr_ptr, next); 776 spin_unlock_bh(&leaf->fib6_table->tb6_lock); 777 } 778 } 779 780 return match ? match : net->ipv6.fib6_null_entry; 781 } 782 783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) 784 { 785 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); 786 } 787 788 #ifdef CONFIG_IPV6_ROUTE_INFO 789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, 790 const struct in6_addr *gwaddr) 791 { 792 struct net *net = dev_net(dev); 793 struct route_info *rinfo = (struct route_info *) opt; 794 struct in6_addr prefix_buf, *prefix; 795 unsigned int pref; 796 unsigned long lifetime; 797 struct fib6_info *rt; 798 799 if (len < sizeof(struct route_info)) { 800 return -EINVAL; 801 } 802 803 /* Sanity check for prefix_len and length */ 804 if (rinfo->length > 3) { 805 return -EINVAL; 806 } else if (rinfo->prefix_len > 128) { 807 return -EINVAL; 808 } else if (rinfo->prefix_len > 64) { 809 if (rinfo->length < 2) { 810 return -EINVAL; 811 } 812 } else if (rinfo->prefix_len > 0) { 813 if (rinfo->length < 1) { 814 return -EINVAL; 815 } 816 } 817 818 pref = rinfo->route_pref; 819 if (pref == ICMPV6_ROUTER_PREF_INVALID) 820 return -EINVAL; 821 822 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); 823 824 if (rinfo->length == 3) 825 prefix = (struct in6_addr *)rinfo->prefix; 826 else { 827 /* this function is safe */ 828 ipv6_addr_prefix(&prefix_buf, 829 (struct in6_addr *)rinfo->prefix, 830 rinfo->prefix_len); 831 prefix = &prefix_buf; 832 } 833 834 if (rinfo->prefix_len == 0) 835 rt = rt6_get_dflt_router(net, gwaddr, dev); 836 else 837 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, 838 gwaddr, dev); 839 840 if (rt && !lifetime) { 841 ip6_del_rt(net, rt); 842 rt = NULL; 843 } 844 845 if (!rt && lifetime) 846 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, 847 dev, pref); 848 else if (rt) 849 rt->fib6_flags = RTF_ROUTEINFO | 850 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 851 852 if (rt) { 853 if (!addrconf_finite_timeout(lifetime)) 854 fib6_clean_expires(rt); 855 else 856 fib6_set_expires(rt, jiffies + HZ * lifetime); 857 858 fib6_info_release(rt); 859 } 860 return 0; 861 } 862 #endif 863 864 /* 865 * Misc support functions 866 */ 867 868 /* called with rcu_lock held */ 869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) 870 { 871 struct net_device *dev = rt->fib6_nh.nh_dev; 872 873 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { 874 /* for copies of local routes, dst->dev needs to be the 875 * device if it is a master device, the master device if 876 * device is enslaved, and the loopback as the default 877 */ 878 if (netif_is_l3_slave(dev) && 879 !rt6_need_strict(&rt->fib6_dst.addr)) 880 dev = l3mdev_master_dev_rcu(dev); 881 else if (!netif_is_l3_master(dev)) 882 dev = dev_net(dev)->loopback_dev; 883 /* last case is 
/*
 *	Misc support functions
 */

/* called with rcu_read_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}
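/* Note: rt6_set_from() above shares the fib entry's metrics array with the
 * dst instead of copying it; DST_METRICS_REFCOUNTED plus the refcount bump
 * keeps the array alive even if the fib6_info is freed first.
 */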
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}

static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;

	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_read_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
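/* Note: rt6_lookup() below is the flow-less convenience wrapper: it builds
 * a flowi6 from the given addresses, runs the same policy lookup as above
 * and returns the dst only when no error is set, so callers see either a
 * usable route or NULL.
 */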
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
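/* Note: rt6_make_pcpu_route() above claims the per-cpu slot with cmpxchg()
 * and asserts it was still NULL; the caller (ip6_pol_route()) runs with BHs
 * disabled, so nothing else on this CPU can have filled the slot meanwhile.
 */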
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
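/* Note: both __rt6_find_exception_*() helpers above advance *bucket by the
 * computed hash, so on return the caller's pointer already addresses the
 * chain for (daddr, saddr) and can be passed straight to
 * rt6_remove_exception().
 */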
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
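/* Note: a bucket never grows beyond FIB6_MAX_DEPTH entries;
 * rt6_insert_exception() above evicts the exception with the oldest stamp
 * first, so a burst of PMTU or redirect learning cannot pin unbounded
 * memory.
 */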
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
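/* Apply an MTU change to every cached exception route hanging off this fib
 * entry, subject to rt6_mtu_change_route_allowed() above. Caller holds
 * rt6_exception_lock.
 */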
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* We prune and obsolete aged-out and non-gateway exceptions even if
	 * others still hold references to them, so that on the next
	 * dst_check() such references can be dropped.
	 * RTF_EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4.
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
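/* Note: ip6_pol_route() below hands back one of three kinds of dst for a
 * fib match: a cached exception route when one exists, an uncached
 * RTF_CACHE clone for the FLOWI_FLAG_KNOWN_NH corner case, or the per-cpu
 * copy that serves the common path.
 */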
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */
		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
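/* Note: for ICMPv6 errors, ip6_multipath_l3_keys() below hashes on the
 * addresses of the inner (quoted) header rather than the outer one, so an
 * error message follows the same nexthop as the flow that triggered it.
 */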
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
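/* Note: both hash policies below return mhash >> 1, folding the flow hash
 * into 31 bits; this keeps it non-negative for comparison against the
 * signed per-nexthop upper bounds read in fib6_multipath_select().
 */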
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
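/* Note: ip6_blackhole_route() below consumes the caller's reference on
 * dst_orig and returns a dst that retains the original metrics but
 * silently discards anything sent through it.
 */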
2114 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2115 { 2116 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2117 struct net_device *loopback_dev = net->loopback_dev; 2118 struct dst_entry *new = NULL; 2119 2120 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2121 DST_OBSOLETE_DEAD, 0); 2122 if (rt) { 2123 rt6_info_init(rt); 2124 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2125 2126 new = &rt->dst; 2127 new->__use = 1; 2128 new->input = dst_discard; 2129 new->output = dst_discard_out; 2130 2131 dst_copy_metrics(new, &ort->dst); 2132 2133 rt->rt6i_idev = in6_dev_get(loopback_dev); 2134 rt->rt6i_gateway = ort->rt6i_gateway; 2135 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2136 2137 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2138 #ifdef CONFIG_IPV6_SUBTREES 2139 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2140 #endif 2141 } 2142 2143 dst_release(dst_orig); 2144 return new ? new : ERR_PTR(-ENOMEM); 2145 } 2146 2147 /* 2148 * Destination cache support functions 2149 */ 2150 2151 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2152 { 2153 u32 rt_cookie = 0; 2154 2155 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2156 return false; 2157 2158 if (fib6_check_expired(f6i)) 2159 return false; 2160 2161 return true; 2162 } 2163 2164 static struct dst_entry *rt6_check(struct rt6_info *rt, 2165 struct fib6_info *from, 2166 u32 cookie) 2167 { 2168 u32 rt_cookie = 0; 2169 2170 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2171 rt_cookie != cookie) 2172 return NULL; 2173 2174 if (rt6_check_expired(rt)) 2175 return NULL; 2176 2177 return &rt->dst; 2178 } 2179 2180 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2181 struct fib6_info *from, 2182 u32 cookie) 2183 { 2184 if (!__rt6_check_expired(rt) && 2185 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2186 fib6_check(from, cookie)) 2187 return &rt->dst; 2188 else 2189 return NULL; 2190 } 2191 2192 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2193 { 2194 struct dst_entry *dst_ret; 2195 struct fib6_info *from; 2196 struct rt6_info *rt; 2197 2198 rt = container_of(dst, struct rt6_info, dst); 2199 2200 rcu_read_lock(); 2201 2202 /* All IPV6 dsts are created with ->obsolete set to the value 2203 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2204 * into this function always. 
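 *
 * Both validation paths below compare the caller's cookie with the
 * current fib6 tree serial number (via fib6_get_cookie_safe()), so
 * any change to the routing tree invalidates the dst; pcpu copies
 * and entries on the uncached list take the rt6_dst_from_check()
 * path, everything else goes through rt6_check().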
2205 */ 2206 2207 from = rcu_dereference(rt->from); 2208 2209 if (from && (rt->rt6i_flags & RTF_PCPU || 2210 unlikely(!list_empty(&rt->rt6i_uncached)))) 2211 dst_ret = rt6_dst_from_check(rt, from, cookie); 2212 else 2213 dst_ret = rt6_check(rt, from, cookie); 2214 2215 rcu_read_unlock(); 2216 2217 return dst_ret; 2218 } 2219 2220 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2221 { 2222 struct rt6_info *rt = (struct rt6_info *) dst; 2223 2224 if (rt) { 2225 if (rt->rt6i_flags & RTF_CACHE) { 2226 rcu_read_lock(); 2227 if (rt6_check_expired(rt)) { 2228 rt6_remove_exception_rt(rt); 2229 dst = NULL; 2230 } 2231 rcu_read_unlock(); 2232 } else { 2233 dst_release(dst); 2234 dst = NULL; 2235 } 2236 } 2237 return dst; 2238 } 2239 2240 static void ip6_link_failure(struct sk_buff *skb) 2241 { 2242 struct rt6_info *rt; 2243 2244 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2245 2246 rt = (struct rt6_info *) skb_dst(skb); 2247 if (rt) { 2248 rcu_read_lock(); 2249 if (rt->rt6i_flags & RTF_CACHE) { 2250 if (dst_hold_safe(&rt->dst)) 2251 rt6_remove_exception_rt(rt); 2252 } else { 2253 struct fib6_info *from; 2254 struct fib6_node *fn; 2255 2256 from = rcu_dereference(rt->from); 2257 if (from) { 2258 fn = rcu_dereference(from->fib6_node); 2259 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2260 fn->fn_sernum = -1; 2261 } 2262 } 2263 rcu_read_unlock(); 2264 } 2265 } 2266 2267 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2268 { 2269 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2270 struct fib6_info *from; 2271 2272 rcu_read_lock(); 2273 from = rcu_dereference(rt0->from); 2274 if (from) 2275 rt0->dst.expires = from->expires; 2276 rcu_read_unlock(); 2277 } 2278 2279 dst_set_expires(&rt0->dst, timeout); 2280 rt0->rt6i_flags |= RTF_EXPIRES; 2281 } 2282 2283 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2284 { 2285 struct net *net = dev_net(rt->dst.dev); 2286 2287 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2288 rt->rt6i_flags |= RTF_MODIFIED; 2289 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2290 } 2291 2292 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2293 { 2294 bool from_set; 2295 2296 rcu_read_lock(); 2297 from_set = !!rcu_dereference(rt->from); 2298 rcu_read_unlock(); 2299 2300 return !(rt->rt6i_flags & RTF_CACHE) && 2301 (rt->rt6i_flags & RTF_PCPU || from_set); 2302 } 2303 2304 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2305 const struct ipv6hdr *iph, u32 mtu) 2306 { 2307 const struct in6_addr *daddr, *saddr; 2308 struct rt6_info *rt6 = (struct rt6_info *)dst; 2309 2310 if (rt6->rt6i_flags & RTF_LOCAL) 2311 return; 2312 2313 if (dst_metric_locked(dst, RTAX_MTU)) 2314 return; 2315 2316 if (iph) { 2317 daddr = &iph->daddr; 2318 saddr = &iph->saddr; 2319 } else if (sk) { 2320 daddr = &sk->sk_v6_daddr; 2321 saddr = &inet6_sk(sk)->saddr; 2322 } else { 2323 daddr = NULL; 2324 saddr = NULL; 2325 } 2326 dst_confirm_neigh(dst, daddr); 2327 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2328 if (mtu >= dst_mtu(dst)) 2329 return; 2330 2331 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2332 rt6_do_update_pmtu(rt6, mtu); 2333 /* update rt6_ex->stamp for cache */ 2334 if (rt6->rt6i_flags & RTF_CACHE) 2335 rt6_update_exception_stamp_rt(rt6); 2336 } else if (daddr) { 2337 struct fib6_info *from; 2338 struct rt6_info *nrt6; 2339 2340 rcu_read_lock(); 2341 from = rcu_dereference(rt6->from); 2342 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2343 if (nrt6) { 2344 rt6_do_update_pmtu(nrt6, mtu); 2345 if 
(rt6_insert_exception(nrt6, from)) 2346 dst_release_immediate(&nrt6->dst); 2347 } 2348 rcu_read_unlock(); 2349 } 2350 } 2351 2352 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2353 struct sk_buff *skb, u32 mtu) 2354 { 2355 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2356 } 2357 2358 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2359 int oif, u32 mark, kuid_t uid) 2360 { 2361 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2362 struct dst_entry *dst; 2363 struct flowi6 fl6; 2364 2365 memset(&fl6, 0, sizeof(fl6)); 2366 fl6.flowi6_oif = oif; 2367 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2368 fl6.daddr = iph->daddr; 2369 fl6.saddr = iph->saddr; 2370 fl6.flowlabel = ip6_flowinfo(iph); 2371 fl6.flowi6_uid = uid; 2372 2373 dst = ip6_route_output(net, NULL, &fl6); 2374 if (!dst->error) 2375 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2376 dst_release(dst); 2377 } 2378 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2379 2380 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2381 { 2382 struct dst_entry *dst; 2383 2384 ip6_update_pmtu(skb, sock_net(sk), mtu, 2385 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2386 2387 dst = __sk_dst_get(sk); 2388 if (!dst || !dst->obsolete || 2389 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2390 return; 2391 2392 bh_lock_sock(sk); 2393 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2394 ip6_datagram_dst_update(sk, false); 2395 bh_unlock_sock(sk); 2396 } 2397 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2398 2399 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2400 const struct flowi6 *fl6) 2401 { 2402 #ifdef CONFIG_IPV6_SUBTREES 2403 struct ipv6_pinfo *np = inet6_sk(sk); 2404 #endif 2405 2406 ip6_dst_store(sk, dst, 2407 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2408 &sk->sk_v6_daddr : NULL, 2409 #ifdef CONFIG_IPV6_SUBTREES 2410 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2411 &np->saddr : 2412 #endif 2413 NULL); 2414 } 2415 2416 /* Handle redirects */ 2417 struct ip6rd_flowi { 2418 struct flowi6 fl6; 2419 struct in6_addr gateway; 2420 }; 2421 2422 static struct rt6_info *__ip6_route_redirect(struct net *net, 2423 struct fib6_table *table, 2424 struct flowi6 *fl6, 2425 const struct sk_buff *skb, 2426 int flags) 2427 { 2428 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2429 struct rt6_info *ret = NULL, *rt_cache; 2430 struct fib6_info *rt; 2431 struct fib6_node *fn; 2432 2433 /* Get the "current" route for this destination and 2434 * check if the redirect has come from appropriate router. 2435 * 2436 * RFC 4861 specifies that redirects should only be 2437 * accepted if they come from the nexthop to the target. 2438 * Due to the way the routes are chosen, this notion 2439 * is a bit fuzzy and one might need to check all possible 2440 * routes. 2441 */ 2442 2443 rcu_read_lock(); 2444 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2445 restart: 2446 for_each_fib6_node_rt_rcu(fn) { 2447 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2448 continue; 2449 if (fib6_check_expired(rt)) 2450 continue; 2451 if (rt->fib6_flags & RTF_REJECT) 2452 break; 2453 if (!(rt->fib6_flags & RTF_GATEWAY)) 2454 continue; 2455 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2456 continue; 2457 /* rt_cache's gateway might be different from its 'parent' 2458 * in the case of an ip redirect. 2459 * So we keep searching in the exception table if the gateway 2460 * is different. 
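 * (That is what the rt6_find_cached_rt() lookup below does: when the
 * FIB route's own gateway does not match rdfl->gateway, a cached
 * exception for this daddr/saddr may still carry the gateway the
 * redirect installed.)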
2461 */ 2462 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2463 rt_cache = rt6_find_cached_rt(rt, 2464 &fl6->daddr, 2465 &fl6->saddr); 2466 if (rt_cache && 2467 ipv6_addr_equal(&rdfl->gateway, 2468 &rt_cache->rt6i_gateway)) { 2469 ret = rt_cache; 2470 break; 2471 } 2472 continue; 2473 } 2474 break; 2475 } 2476 2477 if (!rt) 2478 rt = net->ipv6.fib6_null_entry; 2479 else if (rt->fib6_flags & RTF_REJECT) { 2480 ret = net->ipv6.ip6_null_entry; 2481 goto out; 2482 } 2483 2484 if (rt == net->ipv6.fib6_null_entry) { 2485 fn = fib6_backtrack(fn, &fl6->saddr); 2486 if (fn) 2487 goto restart; 2488 } 2489 2490 out: 2491 if (ret) 2492 dst_hold(&ret->dst); 2493 else 2494 ret = ip6_create_rt_rcu(rt); 2495 2496 rcu_read_unlock(); 2497 2498 trace_fib6_table_lookup(net, rt, table, fl6); 2499 return ret; 2500 }; 2501 2502 static struct dst_entry *ip6_route_redirect(struct net *net, 2503 const struct flowi6 *fl6, 2504 const struct sk_buff *skb, 2505 const struct in6_addr *gateway) 2506 { 2507 int flags = RT6_LOOKUP_F_HAS_SADDR; 2508 struct ip6rd_flowi rdfl; 2509 2510 rdfl.fl6 = *fl6; 2511 rdfl.gateway = *gateway; 2512 2513 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2514 flags, __ip6_route_redirect); 2515 } 2516 2517 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2518 kuid_t uid) 2519 { 2520 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2521 struct dst_entry *dst; 2522 struct flowi6 fl6; 2523 2524 memset(&fl6, 0, sizeof(fl6)); 2525 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2526 fl6.flowi6_oif = oif; 2527 fl6.flowi6_mark = mark; 2528 fl6.daddr = iph->daddr; 2529 fl6.saddr = iph->saddr; 2530 fl6.flowlabel = ip6_flowinfo(iph); 2531 fl6.flowi6_uid = uid; 2532 2533 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2534 rt6_do_redirect(dst, NULL, skb); 2535 dst_release(dst); 2536 } 2537 EXPORT_SYMBOL_GPL(ip6_redirect); 2538 2539 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2540 u32 mark) 2541 { 2542 const struct ipv6hdr *iph = ipv6_hdr(skb); 2543 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2544 struct dst_entry *dst; 2545 struct flowi6 fl6; 2546 2547 memset(&fl6, 0, sizeof(fl6)); 2548 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2549 fl6.flowi6_oif = oif; 2550 fl6.flowi6_mark = mark; 2551 fl6.daddr = msg->dest; 2552 fl6.saddr = iph->daddr; 2553 fl6.flowi6_uid = sock_net_uid(net, NULL); 2554 2555 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2556 rt6_do_redirect(dst, NULL, skb); 2557 dst_release(dst); 2558 } 2559 2560 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2561 { 2562 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2563 sk->sk_uid); 2564 } 2565 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2566 2567 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2568 { 2569 struct net_device *dev = dst->dev; 2570 unsigned int mtu = dst_mtu(dst); 2571 struct net *net = dev_net(dev); 2572 2573 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2574 2575 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2576 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2577 2578 /* 2579 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2580 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
2581 * IPV6_MAXPLEN is also valid and means: "any MSS, 2582 * rely only on pmtu discovery" 2583 */ 2584 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2585 mtu = IPV6_MAXPLEN; 2586 return mtu; 2587 } 2588 2589 static unsigned int ip6_mtu(const struct dst_entry *dst) 2590 { 2591 struct inet6_dev *idev; 2592 unsigned int mtu; 2593 2594 mtu = dst_metric_raw(dst, RTAX_MTU); 2595 if (mtu) 2596 goto out; 2597 2598 mtu = IPV6_MIN_MTU; 2599 2600 rcu_read_lock(); 2601 idev = __in6_dev_get(dst->dev); 2602 if (idev) 2603 mtu = idev->cnf.mtu6; 2604 rcu_read_unlock(); 2605 2606 out: 2607 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2608 2609 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2610 } 2611 2612 /* MTU selection: 2613 * 1. mtu on route is locked - use it 2614 * 2. mtu from nexthop exception 2615 * 3. mtu from egress device 2616 * 2617 * based on ip6_dst_mtu_forward and exception logic of 2618 * rt6_find_cached_rt; called with rcu_read_lock 2619 */ 2620 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2621 struct in6_addr *saddr) 2622 { 2623 struct rt6_exception_bucket *bucket; 2624 struct rt6_exception *rt6_ex; 2625 struct in6_addr *src_key; 2626 struct inet6_dev *idev; 2627 u32 mtu = 0; 2628 2629 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2630 mtu = f6i->fib6_pmtu; 2631 if (mtu) 2632 goto out; 2633 } 2634 2635 src_key = NULL; 2636 #ifdef CONFIG_IPV6_SUBTREES 2637 if (f6i->fib6_src.plen) 2638 src_key = saddr; 2639 #endif 2640 2641 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2642 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2643 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2644 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2645 2646 if (likely(!mtu)) { 2647 struct net_device *dev = fib6_info_nh_dev(f6i); 2648 2649 mtu = IPV6_MIN_MTU; 2650 idev = __in6_dev_get(dev); 2651 if (idev && idev->cnf.mtu6 > mtu) 2652 mtu = idev->cnf.mtu6; 2653 } 2654 2655 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2656 out: 2657 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2658 } 2659 2660 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2661 struct flowi6 *fl6) 2662 { 2663 struct dst_entry *dst; 2664 struct rt6_info *rt; 2665 struct inet6_dev *idev = in6_dev_get(dev); 2666 struct net *net = dev_net(dev); 2667 2668 if (unlikely(!idev)) 2669 return ERR_PTR(-ENODEV); 2670 2671 rt = ip6_dst_alloc(net, dev, 0); 2672 if (unlikely(!rt)) { 2673 in6_dev_put(idev); 2674 dst = ERR_PTR(-ENOMEM); 2675 goto out; 2676 } 2677 2678 rt->dst.flags |= DST_HOST; 2679 rt->dst.input = ip6_input; 2680 rt->dst.output = ip6_output; 2681 rt->rt6i_gateway = fl6->daddr; 2682 rt->rt6i_dst.addr = fl6->daddr; 2683 rt->rt6i_dst.plen = 128; 2684 rt->rt6i_idev = idev; 2685 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2686 2687 /* Add this dst into uncached_list so that rt6_disable_ip() can 2688 * do proper release of the net_device 2689 */ 2690 rt6_uncached_list_add(rt); 2691 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2692 2693 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2694 2695 out: 2696 return dst; 2697 } 2698 2699 static int ip6_dst_gc(struct dst_ops *ops) 2700 { 2701 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2702 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2703 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2704 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2705 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2706 unsigned long rt_last_gc = 
net->ipv6.ip6_rt_last_gc; 2707 int entries; 2708 2709 entries = dst_entries_get_fast(ops); 2710 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2711 entries <= rt_max_size) 2712 goto out; 2713 2714 net->ipv6.ip6_rt_gc_expire++; 2715 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2716 entries = dst_entries_get_slow(ops); 2717 if (entries < ops->gc_thresh) 2718 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2719 out: 2720 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2721 return entries > rt_max_size; 2722 } 2723 2724 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, 2725 struct fib6_config *cfg) 2726 { 2727 struct dst_metrics *p; 2728 2729 if (!cfg->fc_mx) 2730 return 0; 2731 2732 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); 2733 if (unlikely(!p)) 2734 return -ENOMEM; 2735 2736 refcount_set(&p->refcnt, 1); 2737 rt->fib6_metrics = p; 2738 2739 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics); 2740 } 2741 2742 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2743 struct fib6_config *cfg, 2744 const struct in6_addr *gw_addr, 2745 u32 tbid, int flags) 2746 { 2747 struct flowi6 fl6 = { 2748 .flowi6_oif = cfg->fc_ifindex, 2749 .daddr = *gw_addr, 2750 .saddr = cfg->fc_prefsrc, 2751 }; 2752 struct fib6_table *table; 2753 struct rt6_info *rt; 2754 2755 table = fib6_get_table(net, tbid); 2756 if (!table) 2757 return NULL; 2758 2759 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2760 flags |= RT6_LOOKUP_F_HAS_SADDR; 2761 2762 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2763 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2764 2765 /* if table lookup failed, fall back to full lookup */ 2766 if (rt == net->ipv6.ip6_null_entry) { 2767 ip6_rt_put(rt); 2768 rt = NULL; 2769 } 2770 2771 return rt; 2772 } 2773 2774 static int ip6_route_check_nh_onlink(struct net *net, 2775 struct fib6_config *cfg, 2776 const struct net_device *dev, 2777 struct netlink_ext_ack *extack) 2778 { 2779 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2780 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2781 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2782 struct rt6_info *grt; 2783 int err; 2784 2785 err = 0; 2786 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2787 if (grt) { 2788 if (!grt->dst.error && 2789 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2790 NL_SET_ERR_MSG(extack, 2791 "Nexthop has invalid gateway or device mismatch"); 2792 err = -EINVAL; 2793 } 2794 2795 ip6_rt_put(grt); 2796 } 2797 2798 return err; 2799 } 2800 2801 static int ip6_route_check_nh(struct net *net, 2802 struct fib6_config *cfg, 2803 struct net_device **_dev, 2804 struct inet6_dev **idev) 2805 { 2806 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2807 struct net_device *dev = _dev ? 
*_dev : NULL; 2808 struct rt6_info *grt = NULL; 2809 int err = -EHOSTUNREACH; 2810 2811 if (cfg->fc_table) { 2812 int flags = RT6_LOOKUP_F_IFACE; 2813 2814 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2815 cfg->fc_table, flags); 2816 if (grt) { 2817 if (grt->rt6i_flags & RTF_GATEWAY || 2818 (dev && dev != grt->dst.dev)) { 2819 ip6_rt_put(grt); 2820 grt = NULL; 2821 } 2822 } 2823 } 2824 2825 if (!grt) 2826 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2827 2828 if (!grt) 2829 goto out; 2830 2831 if (dev) { 2832 if (dev != grt->dst.dev) { 2833 ip6_rt_put(grt); 2834 goto out; 2835 } 2836 } else { 2837 *_dev = dev = grt->dst.dev; 2838 *idev = grt->rt6i_idev; 2839 dev_hold(dev); 2840 in6_dev_hold(grt->rt6i_idev); 2841 } 2842 2843 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2844 err = 0; 2845 2846 ip6_rt_put(grt); 2847 2848 out: 2849 return err; 2850 } 2851 2852 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2853 struct net_device **_dev, struct inet6_dev **idev, 2854 struct netlink_ext_ack *extack) 2855 { 2856 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2857 int gwa_type = ipv6_addr_type(gw_addr); 2858 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2859 const struct net_device *dev = *_dev; 2860 bool need_addr_check = !dev; 2861 int err = -EINVAL; 2862 2863 /* if gw_addr is local we will fail to detect this in case the 2864 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2865 * will return the already-added prefix route via the interface 2866 * the prefix route was assigned to, which might be non-loopback. 2867 */ 2868 if (dev && 2869 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2870 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2871 goto out; 2872 } 2873 2874 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2875 /* IPv6 strictly inhibits using non-link-local 2876 * addresses as nexthop addresses. 2877 * Otherwise, the router will not be able to send redirects. 2878 * It is very good, but in some (rare!) circumstances 2879 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2880 * some exceptions. --ANK 2881 * We allow IPv4-mapped nexthops to support RFC4798-type 2882 * addressing. 2883 */ 2884 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2885 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2886 goto out; 2887 } 2888 2889 if (cfg->fc_flags & RTNH_F_ONLINK) 2890 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2891 else 2892 err = ip6_route_check_nh(net, cfg, _dev, idev); 2893 2894 if (err) 2895 goto out; 2896 } 2897 2898 /* reload in case the device was changed */ 2899 dev = *_dev; 2900 2901 err = -EINVAL; 2902 if (!dev) { 2903 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2904 goto out; 2905 } else if (dev->flags & IFF_LOOPBACK) { 2906 NL_SET_ERR_MSG(extack, 2907 "Egress device can not be loopback device for this route"); 2908 goto out; 2909 } 2910 2911 /* if we did not check gw_addr above, do so now that the 2912 * egress device has been resolved.
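 * (need_addr_check is only set when the caller did not supply a
 * device, i.e. it was resolved by ip6_route_check_nh() above.)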
2913 */ 2914 if (need_addr_check && 2915 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2916 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2917 goto out; 2918 } 2919 2920 err = 0; 2921 out: 2922 return err; 2923 } 2924 2925 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2926 gfp_t gfp_flags, 2927 struct netlink_ext_ack *extack) 2928 { 2929 struct net *net = cfg->fc_nlinfo.nl_net; 2930 struct fib6_info *rt = NULL; 2931 struct net_device *dev = NULL; 2932 struct inet6_dev *idev = NULL; 2933 struct fib6_table *table; 2934 int addr_type; 2935 int err = -EINVAL; 2936 2937 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2938 if (cfg->fc_flags & RTF_PCPU) { 2939 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2940 goto out; 2941 } 2942 2943 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2944 if (cfg->fc_flags & RTF_CACHE) { 2945 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2946 goto out; 2947 } 2948 2949 if (cfg->fc_type > RTN_MAX) { 2950 NL_SET_ERR_MSG(extack, "Invalid route type"); 2951 goto out; 2952 } 2953 2954 if (cfg->fc_dst_len > 128) { 2955 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2956 goto out; 2957 } 2958 if (cfg->fc_src_len > 128) { 2959 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2960 goto out; 2961 } 2962 #ifndef CONFIG_IPV6_SUBTREES 2963 if (cfg->fc_src_len) { 2964 NL_SET_ERR_MSG(extack, 2965 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2966 goto out; 2967 } 2968 #endif 2969 if (cfg->fc_ifindex) { 2970 err = -ENODEV; 2971 dev = dev_get_by_index(net, cfg->fc_ifindex); 2972 if (!dev) 2973 goto out; 2974 idev = in6_dev_get(dev); 2975 if (!idev) 2976 goto out; 2977 } 2978 2979 if (cfg->fc_metric == 0) 2980 cfg->fc_metric = IP6_RT_PRIO_USER; 2981 2982 if (cfg->fc_flags & RTNH_F_ONLINK) { 2983 if (!dev) { 2984 NL_SET_ERR_MSG(extack, 2985 "Nexthop device required for onlink"); 2986 err = -ENODEV; 2987 goto out; 2988 } 2989 2990 if (!(dev->flags & IFF_UP)) { 2991 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2992 err = -ENETDOWN; 2993 goto out; 2994 } 2995 } 2996 2997 err = -ENOBUFS; 2998 if (cfg->fc_nlinfo.nlh && 2999 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3000 table = fib6_get_table(net, cfg->fc_table); 3001 if (!table) { 3002 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3003 table = fib6_new_table(net, cfg->fc_table); 3004 } 3005 } else { 3006 table = fib6_new_table(net, cfg->fc_table); 3007 } 3008 3009 if (!table) 3010 goto out; 3011 3012 err = -ENOMEM; 3013 rt = fib6_info_alloc(gfp_flags); 3014 if (!rt) 3015 goto out; 3016 3017 if (cfg->fc_flags & RTF_ADDRCONF) 3018 rt->dst_nocount = true; 3019 3020 err = ip6_convert_metrics(net, rt, cfg); 3021 if (err < 0) 3022 goto out; 3023 3024 if (cfg->fc_flags & RTF_EXPIRES) 3025 fib6_set_expires(rt, jiffies + 3026 clock_t_to_jiffies(cfg->fc_expires)); 3027 else 3028 fib6_clean_expires(rt); 3029 3030 if (cfg->fc_protocol == RTPROT_UNSPEC) 3031 cfg->fc_protocol = RTPROT_BOOT; 3032 rt->fib6_protocol = cfg->fc_protocol; 3033 3034 addr_type = ipv6_addr_type(&cfg->fc_dst); 3035 3036 if (cfg->fc_encap) { 3037 struct lwtunnel_state *lwtstate; 3038 3039 err = lwtunnel_build_state(cfg->fc_encap_type, 3040 cfg->fc_encap, AF_INET6, cfg, 3041 &lwtstate, extack); 3042 if (err) 3043 goto out; 3044 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 3045 } 3046 3047 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3048 rt->fib6_dst.plen = 
cfg->fc_dst_len; 3049 if (rt->fib6_dst.plen == 128) 3050 rt->dst_host = true; 3051 3052 #ifdef CONFIG_IPV6_SUBTREES 3053 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3054 rt->fib6_src.plen = cfg->fc_src_len; 3055 #endif 3056 3057 rt->fib6_metric = cfg->fc_metric; 3058 rt->fib6_nh.nh_weight = 1; 3059 3060 rt->fib6_type = cfg->fc_type; 3061 3062 /* We cannot add true routes via loopback here, 3063 they would result in kernel looping; promote them to reject routes 3064 */ 3065 if ((cfg->fc_flags & RTF_REJECT) || 3066 (dev && (dev->flags & IFF_LOOPBACK) && 3067 !(addr_type & IPV6_ADDR_LOOPBACK) && 3068 !(cfg->fc_flags & RTF_LOCAL))) { 3069 /* hold loopback dev/idev if we haven't done so. */ 3070 if (dev != net->loopback_dev) { 3071 if (dev) { 3072 dev_put(dev); 3073 in6_dev_put(idev); 3074 } 3075 dev = net->loopback_dev; 3076 dev_hold(dev); 3077 idev = in6_dev_get(dev); 3078 if (!idev) { 3079 err = -ENODEV; 3080 goto out; 3081 } 3082 } 3083 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3084 goto install_route; 3085 } 3086 3087 if (cfg->fc_flags & RTF_GATEWAY) { 3088 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3089 if (err) 3090 goto out; 3091 3092 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3093 } 3094 3095 err = -ENODEV; 3096 if (!dev) 3097 goto out; 3098 3099 if (idev->cnf.disable_ipv6) { 3100 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3101 err = -EACCES; 3102 goto out; 3103 } 3104 3105 if (!(dev->flags & IFF_UP)) { 3106 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3107 err = -ENETDOWN; 3108 goto out; 3109 } 3110 3111 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3112 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3113 NL_SET_ERR_MSG(extack, "Invalid source address"); 3114 err = -EINVAL; 3115 goto out; 3116 } 3117 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3118 rt->fib6_prefsrc.plen = 128; 3119 } else 3120 rt->fib6_prefsrc.plen = 0; 3121 3122 rt->fib6_flags = cfg->fc_flags; 3123 3124 install_route: 3125 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3126 !netif_carrier_ok(dev)) 3127 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3128 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3129 rt->fib6_nh.nh_dev = dev; 3130 rt->fib6_table = table; 3131 3132 cfg->fc_nlinfo.nl_net = dev_net(dev); 3133 3134 if (idev) 3135 in6_dev_put(idev); 3136 3137 return rt; 3138 out: 3139 if (dev) 3140 dev_put(dev); 3141 if (idev) 3142 in6_dev_put(idev); 3143 3144 fib6_info_release(rt); 3145 return ERR_PTR(err); 3146 } 3147 3148 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3149 struct netlink_ext_ack *extack) 3150 { 3151 struct fib6_info *rt; 3152 int err; 3153 3154 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3155 if (IS_ERR(rt)) 3156 return PTR_ERR(rt); 3157 3158 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3159 fib6_info_release(rt); 3160 3161 return err; 3162 } 3163 3164 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3165 { 3166 struct net *net = info->nl_net; 3167 struct fib6_table *table; 3168 int err; 3169 3170 if (rt == net->ipv6.fib6_null_entry) { 3171 err = -ENOENT; 3172 goto out; 3173 } 3174 3175 table = rt->fib6_table; 3176 spin_lock_bh(&table->tb6_lock); 3177 err = fib6_del(rt, info); 3178 spin_unlock_bh(&table->tb6_lock); 3179 3180 out: 3181 fib6_info_release(rt); 3182 return err; 3183 } 3184 3185 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3186 { 3187 struct nl_info info = { .nl_net = net }; 3188 3189 return __ip6_del_rt(rt, &info); 3190 } 3191 3192 static int __ip6_del_rt_siblings(struct 
fib6_info *rt, struct fib6_config *cfg) 3193 { 3194 struct nl_info *info = &cfg->fc_nlinfo; 3195 struct net *net = info->nl_net; 3196 struct sk_buff *skb = NULL; 3197 struct fib6_table *table; 3198 int err = -ENOENT; 3199 3200 if (rt == net->ipv6.fib6_null_entry) 3201 goto out_put; 3202 table = rt->fib6_table; 3203 spin_lock_bh(&table->tb6_lock); 3204 3205 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3206 struct fib6_info *sibling, *next_sibling; 3207 3208 /* prefer to send a single notification with all hops */ 3209 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3210 if (skb) { 3211 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3212 3213 if (rt6_fill_node(net, skb, rt, NULL, 3214 NULL, NULL, 0, RTM_DELROUTE, 3215 info->portid, seq, 0) < 0) { 3216 kfree_skb(skb); 3217 skb = NULL; 3218 } else 3219 info->skip_notify = 1; 3220 } 3221 3222 list_for_each_entry_safe(sibling, next_sibling, 3223 &rt->fib6_siblings, 3224 fib6_siblings) { 3225 err = fib6_del(sibling, info); 3226 if (err) 3227 goto out_unlock; 3228 } 3229 } 3230 3231 err = fib6_del(rt, info); 3232 out_unlock: 3233 spin_unlock_bh(&table->tb6_lock); 3234 out_put: 3235 fib6_info_release(rt); 3236 3237 if (skb) { 3238 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3239 info->nlh, gfp_any()); 3240 } 3241 return err; 3242 } 3243 3244 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3245 { 3246 int rc = -ESRCH; 3247 3248 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3249 goto out; 3250 3251 if (cfg->fc_flags & RTF_GATEWAY && 3252 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3253 goto out; 3254 if (dst_hold_safe(&rt->dst)) 3255 rc = rt6_remove_exception_rt(rt); 3256 out: 3257 return rc; 3258 } 3259 3260 static int ip6_route_del(struct fib6_config *cfg, 3261 struct netlink_ext_ack *extack) 3262 { 3263 struct rt6_info *rt_cache; 3264 struct fib6_table *table; 3265 struct fib6_info *rt; 3266 struct fib6_node *fn; 3267 int err = -ESRCH; 3268 3269 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3270 if (!table) { 3271 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3272 return err; 3273 } 3274 3275 rcu_read_lock(); 3276 3277 fn = fib6_locate(&table->tb6_root, 3278 &cfg->fc_dst, cfg->fc_dst_len, 3279 &cfg->fc_src, cfg->fc_src_len, 3280 !(cfg->fc_flags & RTF_CACHE)); 3281 3282 if (fn) { 3283 for_each_fib6_node_rt_rcu(fn) { 3284 if (cfg->fc_flags & RTF_CACHE) { 3285 int rc; 3286 3287 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3288 &cfg->fc_src); 3289 if (rt_cache) { 3290 rc = ip6_del_cached_rt(rt_cache, cfg); 3291 if (rc != -ESRCH) { 3292 rcu_read_unlock(); 3293 return rc; 3294 } 3295 } 3296 continue; 3297 } 3298 if (cfg->fc_ifindex && 3299 (!rt->fib6_nh.nh_dev || 3300 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3301 continue; 3302 if (cfg->fc_flags & RTF_GATEWAY && 3303 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3304 continue; 3305 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3306 continue; 3307 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3308 continue; 3309 fib6_info_hold(rt); 3310 rcu_read_unlock(); 3311 3312 /* if gateway was specified only delete the one hop */ 3313 if (cfg->fc_flags & RTF_GATEWAY) 3314 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3315 3316 return __ip6_del_rt_siblings(rt, cfg); 3317 } 3318 } 3319 rcu_read_unlock(); 3320 3321 return err; 3322 } 3323 3324 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3325 { 3326 struct netevent_redirect netevent; 
3327 struct rt6_info *rt, *nrt = NULL; 3328 struct ndisc_options ndopts; 3329 struct inet6_dev *in6_dev; 3330 struct neighbour *neigh; 3331 struct fib6_info *from; 3332 struct rd_msg *msg; 3333 int optlen, on_link; 3334 u8 *lladdr; 3335 3336 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3337 optlen -= sizeof(*msg); 3338 3339 if (optlen < 0) { 3340 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3341 return; 3342 } 3343 3344 msg = (struct rd_msg *)icmp6_hdr(skb); 3345 3346 if (ipv6_addr_is_multicast(&msg->dest)) { 3347 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3348 return; 3349 } 3350 3351 on_link = 0; 3352 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3353 on_link = 1; 3354 } else if (ipv6_addr_type(&msg->target) != 3355 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3356 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3357 return; 3358 } 3359 3360 in6_dev = __in6_dev_get(skb->dev); 3361 if (!in6_dev) 3362 return; 3363 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3364 return; 3365 3366 /* RFC2461 8.1: 3367 * The IP source address of the Redirect MUST be the same as the current 3368 * first-hop router for the specified ICMP Destination Address. 3369 */ 3370 3371 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3372 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3373 return; 3374 } 3375 3376 lladdr = NULL; 3377 if (ndopts.nd_opts_tgt_lladdr) { 3378 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3379 skb->dev); 3380 if (!lladdr) { 3381 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3382 return; 3383 } 3384 } 3385 3386 rt = (struct rt6_info *) dst; 3387 if (rt->rt6i_flags & RTF_REJECT) { 3388 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3389 return; 3390 } 3391 3392 /* Redirect received -> path was valid. 3393 * Look, redirects are sent only in response to data packets, 3394 * so that this nexthop apparently is reachable. --ANK 3395 */ 3396 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3397 3398 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3399 if (!neigh) 3400 return; 3401 3402 /* 3403 * We have finally decided to accept it. 3404 */ 3405 3406 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3407 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3408 NEIGH_UPDATE_F_OVERRIDE| 3409 (on_link ? 
0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3410 NEIGH_UPDATE_F_ISROUTER)), 3411 NDISC_REDIRECT, &ndopts); 3412 3413 rcu_read_lock(); 3414 from = rcu_dereference(rt->from); 3415 fib6_info_hold(from); 3416 rcu_read_unlock(); 3417 3418 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3419 if (!nrt) 3420 goto out; 3421 3422 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3423 if (on_link) 3424 nrt->rt6i_flags &= ~RTF_GATEWAY; 3425 3426 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3427 3428 /* No need to remove rt from the exception table if rt is 3429 * a cached route because rt6_insert_exception() will 3430 * take care of it 3431 */ 3432 if (rt6_insert_exception(nrt, from)) { 3433 dst_release_immediate(&nrt->dst); 3434 goto out; 3435 } 3436 3437 netevent.old = &rt->dst; 3438 netevent.new = &nrt->dst; 3439 netevent.daddr = &msg->dest; 3440 netevent.neigh = neigh; 3441 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3442 3443 out: 3444 fib6_info_release(from); 3445 neigh_release(neigh); 3446 } 3447 3448 #ifdef CONFIG_IPV6_ROUTE_INFO 3449 static struct fib6_info *rt6_get_route_info(struct net *net, 3450 const struct in6_addr *prefix, int prefixlen, 3451 const struct in6_addr *gwaddr, 3452 struct net_device *dev) 3453 { 3454 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3455 int ifindex = dev->ifindex; 3456 struct fib6_node *fn; 3457 struct fib6_info *rt = NULL; 3458 struct fib6_table *table; 3459 3460 table = fib6_get_table(net, tb_id); 3461 if (!table) 3462 return NULL; 3463 3464 rcu_read_lock(); 3465 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3466 if (!fn) 3467 goto out; 3468 3469 for_each_fib6_node_rt_rcu(fn) { 3470 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3471 continue; 3472 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3473 continue; 3474 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3475 continue; 3476 fib6_info_hold(rt); 3477 break; 3478 } 3479 out: 3480 rcu_read_unlock(); 3481 return rt; 3482 } 3483 3484 static struct fib6_info *rt6_add_route_info(struct net *net, 3485 const struct in6_addr *prefix, int prefixlen, 3486 const struct in6_addr *gwaddr, 3487 struct net_device *dev, 3488 unsigned int pref) 3489 { 3490 struct fib6_config cfg = { 3491 .fc_metric = IP6_RT_PRIO_USER, 3492 .fc_ifindex = dev->ifindex, 3493 .fc_dst_len = prefixlen, 3494 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3495 RTF_UP | RTF_PREF(pref), 3496 .fc_protocol = RTPROT_RA, 3497 .fc_type = RTN_UNICAST, 3498 .fc_nlinfo.portid = 0, 3499 .fc_nlinfo.nlh = NULL, 3500 .fc_nlinfo.nl_net = net, 3501 }; 3502 3503 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3504 cfg.fc_dst = *prefix; 3505 cfg.fc_gateway = *gwaddr; 3506 3507 /* We should treat it as a default route if prefix length is 0. */ 3508 if (!prefixlen) 3509 cfg.fc_flags |= RTF_DEFAULT; 3510 3511 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3512 3513 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3514 } 3515 #endif 3516 3517 struct fib6_info *rt6_get_dflt_router(struct net *net, 3518 const struct in6_addr *addr, 3519 struct net_device *dev) 3520 { 3521 u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_DFLT; 3522 struct fib6_info *rt; 3523 struct fib6_table *table; 3524 3525 table = fib6_get_table(net, tb_id); 3526 if (!table) 3527 return NULL; 3528 3529 rcu_read_lock(); 3530 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3531 if (dev == rt->fib6_nh.nh_dev && 3532 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3533 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3534 break; 3535 } 3536 if (rt) 3537 fib6_info_hold(rt); 3538 rcu_read_unlock(); 3539 return rt; 3540 } 3541 3542 struct fib6_info *rt6_add_dflt_router(struct net *net, 3543 const struct in6_addr *gwaddr, 3544 struct net_device *dev, 3545 unsigned int pref) 3546 { 3547 struct fib6_config cfg = { 3548 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3549 .fc_metric = IP6_RT_PRIO_USER, 3550 .fc_ifindex = dev->ifindex, 3551 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3552 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3553 .fc_protocol = RTPROT_RA, 3554 .fc_type = RTN_UNICAST, 3555 .fc_nlinfo.portid = 0, 3556 .fc_nlinfo.nlh = NULL, 3557 .fc_nlinfo.nl_net = net, 3558 }; 3559 3560 cfg.fc_gateway = *gwaddr; 3561 3562 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3563 struct fib6_table *table; 3564 3565 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3566 if (table) 3567 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3568 } 3569 3570 return rt6_get_dflt_router(net, gwaddr, dev); 3571 } 3572 3573 static void __rt6_purge_dflt_routers(struct net *net, 3574 struct fib6_table *table) 3575 { 3576 struct fib6_info *rt; 3577 3578 restart: 3579 rcu_read_lock(); 3580 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3581 struct net_device *dev = fib6_info_nh_dev(rt); 3582 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3583 3584 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3585 (!idev || idev->cnf.accept_ra != 2)) { 3586 fib6_info_hold(rt); 3587 rcu_read_unlock(); 3588 ip6_del_rt(net, rt); 3589 goto restart; 3590 } 3591 } 3592 rcu_read_unlock(); 3593 3594 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3595 } 3596 3597 void rt6_purge_dflt_routers(struct net *net) 3598 { 3599 struct fib6_table *table; 3600 struct hlist_head *head; 3601 unsigned int h; 3602 3603 rcu_read_lock(); 3604 3605 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3606 head = &net->ipv6.fib_table_hash[h]; 3607 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3608 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3609 __rt6_purge_dflt_routers(net, table); 3610 } 3611 } 3612 3613 rcu_read_unlock(); 3614 } 3615 3616 static void rtmsg_to_fib6_config(struct net *net, 3617 struct in6_rtmsg *rtmsg, 3618 struct fib6_config *cfg) 3619 { 3620 memset(cfg, 0, sizeof(*cfg)); 3621 3622 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3623 : RT6_TABLE_MAIN; 3624 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3625 cfg->fc_metric = rtmsg->rtmsg_metric; 3626 cfg->fc_expires = rtmsg->rtmsg_info; 3627 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3628 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3629 cfg->fc_flags = rtmsg->rtmsg_flags; 3630 cfg->fc_type = rtmsg->rtmsg_type; 3631 3632 cfg->fc_nlinfo.nl_net = net; 3633 3634 cfg->fc_dst = rtmsg->rtmsg_dst; 3635 cfg->fc_src = rtmsg->rtmsg_src; 3636 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3637 } 3638 3639 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3640 { 3641 struct fib6_config cfg; 3642 struct in6_rtmsg rtmsg; 3643 int err; 3644 3645 switch (cmd) { 3646 case SIOCADDRT: /* Add a route */ 3647 case SIOCDELRT: /* Delete a route */ 3648 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3649 return -EPERM; 3650 err = copy_from_user(&rtmsg, arg, 3651 sizeof(struct in6_rtmsg)); 3652 if (err) 3653 return -EFAULT; 3654 3655 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3656 3657 rtnl_lock(); 3658 switch (cmd) { 3659 case SIOCADDRT: 3660 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3661 break; 3662 case SIOCDELRT: 3663 err = ip6_route_del(&cfg, NULL); 3664 break; 3665 default: 3666 err = -EINVAL; 3667 } 3668 rtnl_unlock(); 3669 3670 return err; 3671 } 3672 3673 return -EINVAL; 3674 } 3675 3676 /* 3677 * Drop the packet on the floor 3678 */ 3679 3680 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3681 { 3682 int type; 3683 struct dst_entry *dst = skb_dst(skb); 3684 switch (ipstats_mib_noroutes) { 3685 case IPSTATS_MIB_INNOROUTES: 3686 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3687 if (type == IPV6_ADDR_ANY) { 3688 IP6_INC_STATS(dev_net(dst->dev), 3689 __in6_dev_get_safely(skb->dev), 3690 IPSTATS_MIB_INADDRERRORS); 3691 break; 3692 } 3693 /* FALLTHROUGH */ 3694 case IPSTATS_MIB_OUTNOROUTES: 3695 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3696 ipstats_mib_noroutes); 3697 break; 3698 } 3699 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3700 kfree_skb(skb); 3701 return 0; 3702 } 3703 3704 static int ip6_pkt_discard(struct sk_buff *skb) 3705 { 3706 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3707 } 3708 3709 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3710 { 3711 skb->dev = skb_dst(skb)->dev; 3712 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3713 } 3714 3715 static int ip6_pkt_prohibit(struct sk_buff *skb) 3716 { 3717 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3718 } 3719 3720 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3721 { 3722 skb->dev = skb_dst(skb)->dev; 3723 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3724 } 3725 3726 /* 3727 * Allocate a dst for local (unicast / anycast) address. 
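 *
 * These entries back the addresses configured on an interface:
 * addrconf_f6i_alloc() below builds an RTN_LOCAL (or RTN_ANYCAST)
 * /128 fib6_info aimed at the local table (RT6_TABLE_LOCAL, or the
 * l3mdev table when the device is enslaved) for addrconf to insert.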
3728 */ 3729 3730 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3731 struct inet6_dev *idev, 3732 const struct in6_addr *addr, 3733 bool anycast, gfp_t gfp_flags) 3734 { 3735 u32 tb_id; 3736 struct net_device *dev = idev->dev; 3737 struct fib6_info *f6i; 3738 3739 f6i = fib6_info_alloc(gfp_flags); 3740 if (!f6i) 3741 return ERR_PTR(-ENOMEM); 3742 3743 f6i->dst_nocount = true; 3744 f6i->dst_host = true; 3745 f6i->fib6_protocol = RTPROT_KERNEL; 3746 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3747 if (anycast) { 3748 f6i->fib6_type = RTN_ANYCAST; 3749 f6i->fib6_flags |= RTF_ANYCAST; 3750 } else { 3751 f6i->fib6_type = RTN_LOCAL; 3752 f6i->fib6_flags |= RTF_LOCAL; 3753 } 3754 3755 f6i->fib6_nh.nh_gw = *addr; 3756 dev_hold(dev); 3757 f6i->fib6_nh.nh_dev = dev; 3758 f6i->fib6_dst.addr = *addr; 3759 f6i->fib6_dst.plen = 128; 3760 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3761 f6i->fib6_table = fib6_get_table(net, tb_id); 3762 3763 return f6i; 3764 } 3765 3766 /* Remove a deleted IP from prefsrc entries */ 3767 struct arg_dev_net_ip { 3768 struct net_device *dev; 3769 struct net *net; 3770 struct in6_addr *addr; 3771 }; 3772 3773 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3774 { 3775 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3776 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3777 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3778 3779 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3780 rt != net->ipv6.fib6_null_entry && 3781 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3782 spin_lock_bh(&rt6_exception_lock); 3783 /* remove prefsrc entry */ 3784 rt->fib6_prefsrc.plen = 0; 3785 /* need to update cache as well */ 3786 rt6_exceptions_remove_prefsrc(rt); 3787 spin_unlock_bh(&rt6_exception_lock); 3788 } 3789 return 0; 3790 } 3791 3792 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3793 { 3794 struct net *net = dev_net(ifp->idev->dev); 3795 struct arg_dev_net_ip adni = { 3796 .dev = ifp->idev->dev, 3797 .net = net, 3798 .addr = &ifp->addr, 3799 }; 3800 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3801 } 3802 3803 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3804 3805 /* Remove routers and update dst entries when a gateway turns into a host. */ 3806 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3807 { 3808 struct in6_addr *gateway = (struct in6_addr *)arg; 3809 3810 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3811 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3812 return -1; 3813 } 3814 3815 /* Further clean up cached routes in the exception table. 3816 * This is needed because a cached route may have a different 3817 * gateway than its 'parent' in the case of an ip redirect.
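 * (Returning -1 above, in contrast, asks fib6_clean_all() to delete
 * the matching router entry itself.)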
3818 */ 3819 rt6_exceptions_clean_tohost(rt, gateway); 3820 3821 return 0; 3822 } 3823 3824 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3825 { 3826 fib6_clean_all(net, fib6_clean_tohost, gateway); 3827 } 3828 3829 struct arg_netdev_event { 3830 const struct net_device *dev; 3831 union { 3832 unsigned int nh_flags; 3833 unsigned long event; 3834 }; 3835 }; 3836 3837 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3838 { 3839 struct fib6_info *iter; 3840 struct fib6_node *fn; 3841 3842 fn = rcu_dereference_protected(rt->fib6_node, 3843 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3844 iter = rcu_dereference_protected(fn->leaf, 3845 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3846 while (iter) { 3847 if (iter->fib6_metric == rt->fib6_metric && 3848 iter->fib6_nsiblings) 3849 return iter; 3850 iter = rcu_dereference_protected(iter->fib6_next, 3851 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3852 } 3853 3854 return NULL; 3855 } 3856 3857 static bool rt6_is_dead(const struct fib6_info *rt) 3858 { 3859 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3860 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3861 fib6_ignore_linkdown(rt))) 3862 return true; 3863 3864 return false; 3865 } 3866 3867 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3868 { 3869 struct fib6_info *iter; 3870 int total = 0; 3871 3872 if (!rt6_is_dead(rt)) 3873 total += rt->fib6_nh.nh_weight; 3874 3875 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3876 if (!rt6_is_dead(iter)) 3877 total += iter->fib6_nh.nh_weight; 3878 } 3879 3880 return total; 3881 } 3882 3883 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3884 { 3885 int upper_bound = -1; 3886 3887 if (!rt6_is_dead(rt)) { 3888 *weight += rt->fib6_nh.nh_weight; 3889 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3890 total) - 1; 3891 } 3892 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3893 } 3894 3895 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3896 { 3897 struct fib6_info *iter; 3898 int weight = 0; 3899 3900 rt6_upper_bound_set(rt, &weight, total); 3901 3902 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3903 rt6_upper_bound_set(iter, &weight, total); 3904 } 3905 3906 void rt6_multipath_rebalance(struct fib6_info *rt) 3907 { 3908 struct fib6_info *first; 3909 int total; 3910 3911 /* In case the entire multipath route was marked for flushing, 3912 * then there is no need to rebalance upon the removal of every 3913 * sibling route. 3914 */ 3915 if (!rt->fib6_nsiblings || rt->should_flush) 3916 return; 3917 3918 /* During lookup routes are evaluated in order, so we need to 3919 * make sure upper bounds are assigned from the first sibling 3920 * onwards. 
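 *
 * Worked example of the bound computation in rt6_upper_bound_set():
 * for two sibling nexthops with weights 1 and 2, total = 3, the
 * cumulative upper bounds come out as (1 << 31) / 3 - 1 = 715827882
 * and (3 << 31) / 3 - 1 = 2147483647, so a 31-bit multipath hash
 * picks the first nexthop for roughly one third of flows and the
 * second for the rest.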
3921 */ 3922 first = rt6_multipath_first_sibling(rt); 3923 if (WARN_ON_ONCE(!first)) 3924 return; 3925 3926 total = rt6_multipath_total_weight(first); 3927 rt6_multipath_upper_bound_set(first, total); 3928 } 3929 3930 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3931 { 3932 const struct arg_netdev_event *arg = p_arg; 3933 struct net *net = dev_net(arg->dev); 3934 3935 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3936 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3937 fib6_update_sernum_upto_root(net, rt); 3938 rt6_multipath_rebalance(rt); 3939 } 3940 3941 return 0; 3942 } 3943 3944 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3945 { 3946 struct arg_netdev_event arg = { 3947 .dev = dev, 3948 { 3949 .nh_flags = nh_flags, 3950 }, 3951 }; 3952 3953 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3954 arg.nh_flags |= RTNH_F_LINKDOWN; 3955 3956 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3957 } 3958 3959 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3960 const struct net_device *dev) 3961 { 3962 struct fib6_info *iter; 3963 3964 if (rt->fib6_nh.nh_dev == dev) 3965 return true; 3966 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3967 if (iter->fib6_nh.nh_dev == dev) 3968 return true; 3969 3970 return false; 3971 } 3972 3973 static void rt6_multipath_flush(struct fib6_info *rt) 3974 { 3975 struct fib6_info *iter; 3976 3977 rt->should_flush = 1; 3978 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3979 iter->should_flush = 1; 3980 } 3981 3982 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3983 const struct net_device *down_dev) 3984 { 3985 struct fib6_info *iter; 3986 unsigned int dead = 0; 3987 3988 if (rt->fib6_nh.nh_dev == down_dev || 3989 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3990 dead++; 3991 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3992 if (iter->fib6_nh.nh_dev == down_dev || 3993 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3994 dead++; 3995 3996 return dead; 3997 } 3998 3999 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4000 const struct net_device *dev, 4001 unsigned int nh_flags) 4002 { 4003 struct fib6_info *iter; 4004 4005 if (rt->fib6_nh.nh_dev == dev) 4006 rt->fib6_nh.nh_flags |= nh_flags; 4007 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4008 if (iter->fib6_nh.nh_dev == dev) 4009 iter->fib6_nh.nh_flags |= nh_flags; 4010 } 4011 4012 /* called with write lock held for table with rt */ 4013 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4014 { 4015 const struct arg_netdev_event *arg = p_arg; 4016 const struct net_device *dev = arg->dev; 4017 struct net *net = dev_net(dev); 4018 4019 if (rt == net->ipv6.fib6_null_entry) 4020 return 0; 4021 4022 switch (arg->event) { 4023 case NETDEV_UNREGISTER: 4024 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4025 case NETDEV_DOWN: 4026 if (rt->should_flush) 4027 return -1; 4028 if (!rt->fib6_nsiblings) 4029 return rt->fib6_nh.nh_dev == dev ? 
-1 : 0; 4030 if (rt6_multipath_uses_dev(rt, dev)) { 4031 unsigned int count; 4032 4033 count = rt6_multipath_dead_count(rt, dev); 4034 if (rt->fib6_nsiblings + 1 == count) { 4035 rt6_multipath_flush(rt); 4036 return -1; 4037 } 4038 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4039 RTNH_F_LINKDOWN); 4040 fib6_update_sernum(net, rt); 4041 rt6_multipath_rebalance(rt); 4042 } 4043 return -2; 4044 case NETDEV_CHANGE: 4045 if (rt->fib6_nh.nh_dev != dev || 4046 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4047 break; 4048 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 4049 rt6_multipath_rebalance(rt); 4050 break; 4051 } 4052 4053 return 0; 4054 } 4055 4056 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4057 { 4058 struct arg_netdev_event arg = { 4059 .dev = dev, 4060 { 4061 .event = event, 4062 }, 4063 }; 4064 4065 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 4066 } 4067 4068 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4069 { 4070 rt6_sync_down_dev(dev, event); 4071 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4072 neigh_ifdown(&nd_tbl, dev); 4073 } 4074 4075 struct rt6_mtu_change_arg { 4076 struct net_device *dev; 4077 unsigned int mtu; 4078 }; 4079 4080 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4081 { 4082 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4083 struct inet6_dev *idev; 4084 4085 /* In IPv6, PMTU discovery is not optional, 4086 so the RTAX_MTU lock cannot disable it. 4087 We still use this lock to block changes 4088 caused by addrconf/ndisc. 4089 */ 4090 4091 idev = __in6_dev_get(arg->dev); 4092 if (!idev) 4093 return 0; 4094 4095 /* For an administrative MTU increase, there is no way to discover 4096 an IPv6 PMTU increase, so the PMTU should be raised here. 4097 Since RFC 1981 doesn't cover administrative MTU increases, 4098 updating the PMTU on such an increase is a MUST (e.g. for a
jumbo frame) 4099 */ 4100 if (rt->fib6_nh.nh_dev == arg->dev && 4101 !fib6_metric_locked(rt, RTAX_MTU)) { 4102 u32 mtu = rt->fib6_pmtu; 4103 4104 if (mtu >= arg->mtu || 4105 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4106 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4107 4108 spin_lock_bh(&rt6_exception_lock); 4109 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4110 spin_unlock_bh(&rt6_exception_lock); 4111 } 4112 return 0; 4113 } 4114 4115 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4116 { 4117 struct rt6_mtu_change_arg arg = { 4118 .dev = dev, 4119 .mtu = mtu, 4120 }; 4121 4122 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4123 } 4124 4125 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4126 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4127 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4128 [RTA_OIF] = { .type = NLA_U32 }, 4129 [RTA_IIF] = { .type = NLA_U32 }, 4130 [RTA_PRIORITY] = { .type = NLA_U32 }, 4131 [RTA_METRICS] = { .type = NLA_NESTED }, 4132 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4133 [RTA_PREF] = { .type = NLA_U8 }, 4134 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4135 [RTA_ENCAP] = { .type = NLA_NESTED }, 4136 [RTA_EXPIRES] = { .type = NLA_U32 }, 4137 [RTA_UID] = { .type = NLA_U32 }, 4138 [RTA_MARK] = { .type = NLA_U32 }, 4139 [RTA_TABLE] = { .type = NLA_U32 }, 4140 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4141 [RTA_SPORT] = { .type = NLA_U16 }, 4142 [RTA_DPORT] = { .type = NLA_U16 }, 4143 }; 4144 4145 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4146 struct fib6_config *cfg, 4147 struct netlink_ext_ack *extack) 4148 { 4149 struct rtmsg *rtm; 4150 struct nlattr *tb[RTA_MAX+1]; 4151 unsigned int pref; 4152 int err; 4153 4154 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4155 NULL); 4156 if (err < 0) 4157 goto errout; 4158 4159 err = -EINVAL; 4160 rtm = nlmsg_data(nlh); 4161 memset(cfg, 0, sizeof(*cfg)); 4162 4163 cfg->fc_table = rtm->rtm_table; 4164 cfg->fc_dst_len = rtm->rtm_dst_len; 4165 cfg->fc_src_len = rtm->rtm_src_len; 4166 cfg->fc_flags = RTF_UP; 4167 cfg->fc_protocol = rtm->rtm_protocol; 4168 cfg->fc_type = rtm->rtm_type; 4169 4170 if (rtm->rtm_type == RTN_UNREACHABLE || 4171 rtm->rtm_type == RTN_BLACKHOLE || 4172 rtm->rtm_type == RTN_PROHIBIT || 4173 rtm->rtm_type == RTN_THROW) 4174 cfg->fc_flags |= RTF_REJECT; 4175 4176 if (rtm->rtm_type == RTN_LOCAL) 4177 cfg->fc_flags |= RTF_LOCAL; 4178 4179 if (rtm->rtm_flags & RTM_F_CLONED) 4180 cfg->fc_flags |= RTF_CACHE; 4181 4182 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4183 4184 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4185 cfg->fc_nlinfo.nlh = nlh; 4186 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4187 4188 if (tb[RTA_GATEWAY]) { 4189 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4190 cfg->fc_flags |= RTF_GATEWAY; 4191 } 4192 4193 if (tb[RTA_DST]) { 4194 int plen = (rtm->rtm_dst_len + 7) >> 3; 4195 4196 if (nla_len(tb[RTA_DST]) < plen) 4197 goto errout; 4198 4199 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4200 } 4201 4202 if (tb[RTA_SRC]) { 4203 int plen = (rtm->rtm_src_len + 7) >> 3; 4204 4205 if (nla_len(tb[RTA_SRC]) < plen) 4206 goto errout; 4207 4208 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4209 } 4210 4211 if (tb[RTA_PREFSRC]) 4212 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4213 4214 if (tb[RTA_OIF]) 4215 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4216 4217 if (tb[RTA_PRIORITY]) 4218 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4219 4220 if (tb[RTA_METRICS]) { 4221 
cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4222 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4223 }
4224
4225 if (tb[RTA_TABLE])
4226 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4227
4228 if (tb[RTA_MULTIPATH]) {
4229 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4230 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4231
4232 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4233 cfg->fc_mp_len, extack);
4234 if (err < 0)
4235 goto errout;
4236 }
4237
4238 if (tb[RTA_PREF]) {
4239 pref = nla_get_u8(tb[RTA_PREF]);
4240 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4241 pref != ICMPV6_ROUTER_PREF_HIGH)
4242 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4243 cfg->fc_flags |= RTF_PREF(pref);
4244 }
4245
4246 if (tb[RTA_ENCAP])
4247 cfg->fc_encap = tb[RTA_ENCAP];
4248
4249 if (tb[RTA_ENCAP_TYPE]) {
4250 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4251
4252 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4253 if (err < 0)
4254 goto errout;
4255 }
4256
4257 if (tb[RTA_EXPIRES]) {
4258 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4259
4260 if (addrconf_finite_timeout(timeout)) {
4261 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4262 cfg->fc_flags |= RTF_EXPIRES;
4263 }
4264 }
4265
4266 err = 0;
4267 errout:
4268 return err;
4269 }
4270
4271 struct rt6_nh {
4272 struct fib6_info *fib6_info;
4273 struct fib6_config r_cfg;
4274 struct list_head next;
4275 };
4276
4277 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4278 {
4279 struct rt6_nh *nh;
4280
4281 list_for_each_entry(nh, rt6_nh_list, next) {
4282 pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4283 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4284 nh->r_cfg.fc_ifindex);
4285 }
4286 }
4287
4288 static int ip6_route_info_append(struct net *net,
4289 struct list_head *rt6_nh_list,
4290 struct fib6_info *rt,
4291 struct fib6_config *r_cfg)
4292 {
4293 struct rt6_nh *nh;
4294 int err = -EEXIST;
4295
4296 list_for_each_entry(nh, rt6_nh_list, next) {
4297 /* check if the fib6_info already exists */
4298 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4299 return err;
4300 }
4301
4302 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4303 if (!nh)
4304 return -ENOMEM;
4305 nh->fib6_info = rt;
4306 err = ip6_convert_metrics(net, rt, r_cfg);
4307 if (err) {
4308 kfree(nh);
4309 return err;
4310 }
4311 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4312 list_add_tail(&nh->next, rt6_nh_list);
4313
4314 return 0;
4315 }
4316
4317 static void ip6_route_mpath_notify(struct fib6_info *rt,
4318 struct fib6_info *rt_last,
4319 struct nl_info *info,
4320 __u16 nlflags)
4321 {
4322 /* if this is an APPEND route, then rt points to the first route
4323 * inserted and rt_last points to the last route inserted. Userspace
4324 * wants a consistent dump of the route which starts at the first
4325 * nexthop.
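 * For instance, when a third nexthop is appended to an existing
 * two-nexthop route, rt and rt_last both point to the newly
 * inserted route, yet the notification has to start at the
 * original first nexthop.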
 * Since sibling routes are always added at the end of
4326 * the list, find the first sibling of the last route appended.
4327 */
4328 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4329 rt = list_first_entry(&rt_last->fib6_siblings,
4330 struct fib6_info,
4331 fib6_siblings);
4332 }
4333
4334 if (rt)
4335 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4336 }
4337
4338 static int ip6_route_multipath_add(struct fib6_config *cfg,
4339 struct netlink_ext_ack *extack)
4340 {
4341 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4342 struct nl_info *info = &cfg->fc_nlinfo;
4343 struct fib6_config r_cfg;
4344 struct rtnexthop *rtnh;
4345 struct fib6_info *rt;
4346 struct rt6_nh *err_nh;
4347 struct rt6_nh *nh, *nh_safe;
4348 __u16 nlflags;
4349 int remaining;
4350 int attrlen;
4351 int err = 1;
4352 int nhn = 0;
4353 int replace = (cfg->fc_nlinfo.nlh &&
4354 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4355 LIST_HEAD(rt6_nh_list);
4356
4357 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4358 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4359 nlflags |= NLM_F_APPEND;
4360
4361 remaining = cfg->fc_mp_len;
4362 rtnh = (struct rtnexthop *)cfg->fc_mp;
4363
4364 /* Parse the RTA_MULTIPATH attribute and build a list (rt6_nh_list)
4365 * of fib6_info structs, one per nexthop
4366 */
4367 while (rtnh_ok(rtnh, remaining)) {
4368 memcpy(&r_cfg, cfg, sizeof(*cfg));
4369 if (rtnh->rtnh_ifindex)
4370 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4371
4372 attrlen = rtnh_attrlen(rtnh);
4373 if (attrlen > 0) {
4374 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4375
4376 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4377 if (nla) {
4378 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4379 r_cfg.fc_flags |= RTF_GATEWAY;
4380 }
4381 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4382 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4383 if (nla)
4384 r_cfg.fc_encap_type = nla_get_u16(nla);
4385 }
4386
4387 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4388 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4389 if (IS_ERR(rt)) {
4390 err = PTR_ERR(rt);
4391 rt = NULL;
4392 goto cleanup;
4393 }
4394
4395 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4396
4397 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4398 rt, &r_cfg);
4399 if (err) {
4400 fib6_info_release(rt);
4401 goto cleanup;
4402 }
4403
4404 rtnh = rtnh_next(rtnh, &remaining);
4405 }
4406
4407 /* for add and replace, send one notification with all nexthops.
4408 * Skip the notification in fib6_add_rt2node and send one with
4409 * the full route when done
4410 */
4411 info->skip_notify = 1;
4412
4413 err_nh = NULL;
4414 list_for_each_entry(nh, &rt6_nh_list, next) {
4415 rt_last = nh->fib6_info;
4416 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4417 fib6_info_release(nh->fib6_info);
4418
4419 /* save a reference to the first route for the notification */
4420 if (!rt_notif && !err)
4421 rt_notif = nh->fib6_info;
4422
4423 /* nh->fib6_info is used or freed at this point, reset to NULL */
4424 nh->fib6_info = NULL;
4425 if (err) {
4426 if (replace && nhn)
4427 ip6_print_replace_route_err(&rt6_nh_list);
4428 err_nh = nh;
4429 goto add_errout;
4430 }
4431
4432 /* Because each route is added as if it were a single route, we
4433 * remove the EXCL/REPLACE flags after the first nexthop: if there
4434 * is a collision, adding the first nexthop has already failed:
4435 * fib6_add_rt2node() has rejected it; when replacing, the old
4436 * nexthops have been replaced by the first new one, and the rest
4437 * should be appended to it.
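 * Concretely, replacing an existing route with a three-nexthop one
 * inserts the first new nexthop with NLM_F_REPLACE (evicting the
 * old route) and the remaining two with NLM_F_APPEND.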
4438 */ 4439 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4440 NLM_F_REPLACE); 4441 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND; 4442 nhn++; 4443 } 4444 4445 /* success ... tell user about new route */ 4446 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4447 goto cleanup; 4448 4449 add_errout: 4450 /* send notification for routes that were added so that 4451 * the delete notifications sent by ip6_route_del are 4452 * coherent 4453 */ 4454 if (rt_notif) 4455 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4456 4457 /* Delete routes that were already added */ 4458 list_for_each_entry(nh, &rt6_nh_list, next) { 4459 if (err_nh == nh) 4460 break; 4461 ip6_route_del(&nh->r_cfg, extack); 4462 } 4463 4464 cleanup: 4465 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4466 if (nh->fib6_info) 4467 fib6_info_release(nh->fib6_info); 4468 list_del(&nh->next); 4469 kfree(nh); 4470 } 4471 4472 return err; 4473 } 4474 4475 static int ip6_route_multipath_del(struct fib6_config *cfg, 4476 struct netlink_ext_ack *extack) 4477 { 4478 struct fib6_config r_cfg; 4479 struct rtnexthop *rtnh; 4480 int remaining; 4481 int attrlen; 4482 int err = 1, last_err = 0; 4483 4484 remaining = cfg->fc_mp_len; 4485 rtnh = (struct rtnexthop *)cfg->fc_mp; 4486 4487 /* Parse a Multipath Entry */ 4488 while (rtnh_ok(rtnh, remaining)) { 4489 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4490 if (rtnh->rtnh_ifindex) 4491 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4492 4493 attrlen = rtnh_attrlen(rtnh); 4494 if (attrlen > 0) { 4495 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4496 4497 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4498 if (nla) { 4499 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4500 r_cfg.fc_flags |= RTF_GATEWAY; 4501 } 4502 } 4503 err = ip6_route_del(&r_cfg, extack); 4504 if (err) 4505 last_err = err; 4506 4507 rtnh = rtnh_next(rtnh, &remaining); 4508 } 4509 4510 return last_err; 4511 } 4512 4513 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4514 struct netlink_ext_ack *extack) 4515 { 4516 struct fib6_config cfg; 4517 int err; 4518 4519 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4520 if (err < 0) 4521 return err; 4522 4523 if (cfg.fc_mp) 4524 return ip6_route_multipath_del(&cfg, extack); 4525 else { 4526 cfg.fc_delete_all_nh = 1; 4527 return ip6_route_del(&cfg, extack); 4528 } 4529 } 4530 4531 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4532 struct netlink_ext_ack *extack) 4533 { 4534 struct fib6_config cfg; 4535 int err; 4536 4537 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4538 if (err < 0) 4539 return err; 4540 4541 if (cfg.fc_mp) 4542 return ip6_route_multipath_add(&cfg, extack); 4543 else 4544 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4545 } 4546 4547 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4548 { 4549 int nexthop_len = 0; 4550 4551 if (rt->fib6_nsiblings) { 4552 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4553 + NLA_ALIGN(sizeof(struct rtnexthop)) 4554 + nla_total_size(16) /* RTA_GATEWAY */ 4555 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); 4556 4557 nexthop_len *= rt->fib6_nsiblings; 4558 } 4559 4560 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4561 + nla_total_size(16) /* RTA_SRC */ 4562 + nla_total_size(16) /* RTA_DST */ 4563 + nla_total_size(16) /* RTA_GATEWAY */ 4564 + nla_total_size(16) /* RTA_PREFSRC */ 4565 + nla_total_size(4) /* RTA_TABLE */ 4566 + nla_total_size(4) /* RTA_IIF */ 4567 + nla_total_size(4) /* RTA_OIF */ 4568 + nla_total_size(4) /* RTA_PRIORITY */ 4569 + RTAX_MAX * 
nla_total_size(4) /* RTA_METRICS */ 4570 + nla_total_size(sizeof(struct rta_cacheinfo)) 4571 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4572 + nla_total_size(1) /* RTA_PREF */ 4573 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) 4574 + nexthop_len; 4575 } 4576 4577 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, 4578 unsigned int *flags, bool skip_oif) 4579 { 4580 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4581 *flags |= RTNH_F_DEAD; 4582 4583 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { 4584 *flags |= RTNH_F_LINKDOWN; 4585 4586 rcu_read_lock(); 4587 if (fib6_ignore_linkdown(rt)) 4588 *flags |= RTNH_F_DEAD; 4589 rcu_read_unlock(); 4590 } 4591 4592 if (rt->fib6_flags & RTF_GATEWAY) { 4593 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) 4594 goto nla_put_failure; 4595 } 4596 4597 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); 4598 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) 4599 *flags |= RTNH_F_OFFLOAD; 4600 4601 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4602 if (!skip_oif && rt->fib6_nh.nh_dev && 4603 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) 4604 goto nla_put_failure; 4605 4606 if (rt->fib6_nh.nh_lwtstate && 4607 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) 4608 goto nla_put_failure; 4609 4610 return 0; 4611 4612 nla_put_failure: 4613 return -EMSGSIZE; 4614 } 4615 4616 /* add multipath next hop */ 4617 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) 4618 { 4619 const struct net_device *dev = rt->fib6_nh.nh_dev; 4620 struct rtnexthop *rtnh; 4621 unsigned int flags = 0; 4622 4623 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4624 if (!rtnh) 4625 goto nla_put_failure; 4626 4627 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; 4628 rtnh->rtnh_ifindex = dev ? 
dev->ifindex : 0; 4629 4630 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4631 goto nla_put_failure; 4632 4633 rtnh->rtnh_flags = flags; 4634 4635 /* length of rtnetlink header + attributes */ 4636 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4637 4638 return 0; 4639 4640 nla_put_failure: 4641 return -EMSGSIZE; 4642 } 4643 4644 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4645 struct fib6_info *rt, struct dst_entry *dst, 4646 struct in6_addr *dest, struct in6_addr *src, 4647 int iif, int type, u32 portid, u32 seq, 4648 unsigned int flags) 4649 { 4650 struct rtmsg *rtm; 4651 struct nlmsghdr *nlh; 4652 long expires = 0; 4653 u32 *pmetrics; 4654 u32 table; 4655 4656 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4657 if (!nlh) 4658 return -EMSGSIZE; 4659 4660 rtm = nlmsg_data(nlh); 4661 rtm->rtm_family = AF_INET6; 4662 rtm->rtm_dst_len = rt->fib6_dst.plen; 4663 rtm->rtm_src_len = rt->fib6_src.plen; 4664 rtm->rtm_tos = 0; 4665 if (rt->fib6_table) 4666 table = rt->fib6_table->tb6_id; 4667 else 4668 table = RT6_TABLE_UNSPEC; 4669 rtm->rtm_table = table; 4670 if (nla_put_u32(skb, RTA_TABLE, table)) 4671 goto nla_put_failure; 4672 4673 rtm->rtm_type = rt->fib6_type; 4674 rtm->rtm_flags = 0; 4675 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4676 rtm->rtm_protocol = rt->fib6_protocol; 4677 4678 if (rt->fib6_flags & RTF_CACHE) 4679 rtm->rtm_flags |= RTM_F_CLONED; 4680 4681 if (dest) { 4682 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4683 goto nla_put_failure; 4684 rtm->rtm_dst_len = 128; 4685 } else if (rtm->rtm_dst_len) 4686 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) 4687 goto nla_put_failure; 4688 #ifdef CONFIG_IPV6_SUBTREES 4689 if (src) { 4690 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4691 goto nla_put_failure; 4692 rtm->rtm_src_len = 128; 4693 } else if (rtm->rtm_src_len && 4694 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) 4695 goto nla_put_failure; 4696 #endif 4697 if (iif) { 4698 #ifdef CONFIG_IPV6_MROUTE 4699 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { 4700 int err = ip6mr_get_route(net, skb, rtm, portid); 4701 4702 if (err == 0) 4703 return 0; 4704 if (err < 0) 4705 goto nla_put_failure; 4706 } else 4707 #endif 4708 if (nla_put_u32(skb, RTA_IIF, iif)) 4709 goto nla_put_failure; 4710 } else if (dest) { 4711 struct in6_addr saddr_buf; 4712 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4713 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4714 goto nla_put_failure; 4715 } 4716 4717 if (rt->fib6_prefsrc.plen) { 4718 struct in6_addr saddr_buf; 4719 saddr_buf = rt->fib6_prefsrc.addr; 4720 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4721 goto nla_put_failure; 4722 } 4723 4724 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4725 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4726 goto nla_put_failure; 4727 4728 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4729 goto nla_put_failure; 4730 4731 /* For multipath routes, walk the siblings list and add 4732 * each as a nexthop within RTA_MULTIPATH. 
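 * The layout mirrors what userspace sends when inserting the route,
 * roughly:
 *   RTA_MULTIPATH
 *     struct rtnexthop (len, flags, hops, ifindex) + RTA_GATEWAY ...
 *     struct rtnexthop ...  -- one entry per sibling route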
4733 */ 4734 if (rt->fib6_nsiblings) { 4735 struct fib6_info *sibling, *next_sibling; 4736 struct nlattr *mp; 4737 4738 mp = nla_nest_start(skb, RTA_MULTIPATH); 4739 if (!mp) 4740 goto nla_put_failure; 4741 4742 if (rt6_add_nexthop(skb, rt) < 0) 4743 goto nla_put_failure; 4744 4745 list_for_each_entry_safe(sibling, next_sibling, 4746 &rt->fib6_siblings, fib6_siblings) { 4747 if (rt6_add_nexthop(skb, sibling) < 0) 4748 goto nla_put_failure; 4749 } 4750 4751 nla_nest_end(skb, mp); 4752 } else { 4753 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4754 goto nla_put_failure; 4755 } 4756 4757 if (rt->fib6_flags & RTF_EXPIRES) { 4758 expires = dst ? dst->expires : rt->expires; 4759 expires -= jiffies; 4760 } 4761 4762 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4763 goto nla_put_failure; 4764 4765 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) 4766 goto nla_put_failure; 4767 4768 4769 nlmsg_end(skb, nlh); 4770 return 0; 4771 4772 nla_put_failure: 4773 nlmsg_cancel(skb, nlh); 4774 return -EMSGSIZE; 4775 } 4776 4777 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4778 { 4779 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4780 struct net *net = arg->net; 4781 4782 if (rt == net->ipv6.fib6_null_entry) 4783 return 0; 4784 4785 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4786 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 4787 4788 /* user wants prefix routes only */ 4789 if (rtm->rtm_flags & RTM_F_PREFIX && 4790 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4791 /* success since this is not a prefix route */ 4792 return 1; 4793 } 4794 } 4795 4796 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4797 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4798 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); 4799 } 4800 4801 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4802 struct netlink_ext_ack *extack) 4803 { 4804 struct net *net = sock_net(in_skb->sk); 4805 struct nlattr *tb[RTA_MAX+1]; 4806 int err, iif = 0, oif = 0; 4807 struct fib6_info *from; 4808 struct dst_entry *dst; 4809 struct rt6_info *rt; 4810 struct sk_buff *skb; 4811 struct rtmsg *rtm; 4812 struct flowi6 fl6; 4813 bool fibmatch; 4814 4815 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4816 extack); 4817 if (err < 0) 4818 goto errout; 4819 4820 err = -EINVAL; 4821 memset(&fl6, 0, sizeof(fl6)); 4822 rtm = nlmsg_data(nlh); 4823 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4824 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4825 4826 if (tb[RTA_SRC]) { 4827 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4828 goto errout; 4829 4830 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4831 } 4832 4833 if (tb[RTA_DST]) { 4834 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4835 goto errout; 4836 4837 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4838 } 4839 4840 if (tb[RTA_IIF]) 4841 iif = nla_get_u32(tb[RTA_IIF]); 4842 4843 if (tb[RTA_OIF]) 4844 oif = nla_get_u32(tb[RTA_OIF]); 4845 4846 if (tb[RTA_MARK]) 4847 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4848 4849 if (tb[RTA_UID]) 4850 fl6.flowi6_uid = make_kuid(current_user_ns(), 4851 nla_get_u32(tb[RTA_UID])); 4852 else 4853 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 4854 4855 if (tb[RTA_SPORT]) 4856 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 4857 4858 if (tb[RTA_DPORT]) 4859 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 4860 4861 if (tb[RTA_IP_PROTO]) { 4862 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 4863 &fl6.flowi6_proto, extack); 4864 if (err) 4865 goto errout; 4866 } 4867 4868 if (iif) { 4869 struct net_device *dev; 4870 int flags = 0; 4871 4872 rcu_read_lock(); 4873 4874 dev = dev_get_by_index_rcu(net, iif); 4875 if (!dev) { 4876 rcu_read_unlock(); 4877 err = -ENODEV; 4878 goto errout; 4879 } 4880 4881 fl6.flowi6_iif = iif; 4882 4883 if (!ipv6_addr_any(&fl6.saddr)) 4884 flags |= RT6_LOOKUP_F_HAS_SADDR; 4885 4886 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 4887 4888 rcu_read_unlock(); 4889 } else { 4890 fl6.flowi6_oif = oif; 4891 4892 dst = ip6_route_output(net, NULL, &fl6); 4893 } 4894 4895 4896 rt = container_of(dst, struct rt6_info, dst); 4897 if (rt->dst.error) { 4898 err = rt->dst.error; 4899 ip6_rt_put(rt); 4900 goto errout; 4901 } 4902 4903 if (rt == net->ipv6.ip6_null_entry) { 4904 err = rt->dst.error; 4905 ip6_rt_put(rt); 4906 goto errout; 4907 } 4908 4909 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 4910 if (!skb) { 4911 ip6_rt_put(rt); 4912 err = -ENOBUFS; 4913 goto errout; 4914 } 4915 4916 skb_dst_set(skb, &rt->dst); 4917 4918 rcu_read_lock(); 4919 from = rcu_dereference(rt->from); 4920 4921 if (fibmatch) 4922 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 4923 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4924 nlh->nlmsg_seq, 0); 4925 else 4926 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 4927 &fl6.saddr, iif, RTM_NEWROUTE, 4928 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 4929 0); 4930 rcu_read_unlock(); 4931 4932 if (err < 0) { 4933 kfree_skb(skb); 4934 goto errout; 4935 } 4936 4937 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 4938 errout: 4939 return err; 4940 } 4941 4942 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 4943 unsigned int nlm_flags) 4944 { 4945 struct sk_buff *skb; 4946 struct net *net = info->nl_net; 4947 u32 seq; 4948 int err; 4949 4950 err = -ENOBUFS; 4951 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; 4952 4953 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 4954 if (!skb) 4955 goto errout; 4956 4957 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 4958 event, info->portid, seq, nlm_flags); 4959 if (err < 0) { 4960 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 4961 WARN_ON(err == -EMSGSIZE); 4962 kfree_skb(skb); 4963 goto errout; 4964 } 4965 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 4966 info->nlh, gfp_any()); 4967 return; 4968 errout: 4969 if (err < 0) 4970 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 4971 } 4972 4973 static int ip6_route_dev_notify(struct notifier_block *this, 4974 unsigned long event, void *ptr) 4975 { 4976 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 4977 struct net *net = dev_net(dev); 4978 4979 if (!(dev->flags & IFF_LOOPBACK)) 4980 return NOTIFY_OK; 4981 4982 if (event == NETDEV_REGISTER) { 4983 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; 4984 net->ipv6.ip6_null_entry->dst.dev = dev; 4985 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 4986 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4987 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 4988 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 4989 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 4990 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 4991 #endif 4992 } else if (event == NETDEV_UNREGISTER && 4993 dev->reg_state != NETREG_UNREGISTERED) { 4994 /* NETDEV_UNREGISTER could be fired for multiple times by 4995 * netdev_wait_allrefs(). Make sure we only call this once. 4996 */ 4997 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 4998 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4999 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 5000 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 5001 #endif 5002 } 5003 5004 return NOTIFY_OK; 5005 } 5006 5007 /* 5008 * /proc 5009 */ 5010 5011 #ifdef CONFIG_PROC_FS 5012 5013 static const struct file_operations ipv6_route_proc_fops = { 5014 .open = ipv6_route_open, 5015 .read = seq_read, 5016 .llseek = seq_lseek, 5017 .release = seq_release_net, 5018 }; 5019 5020 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 5021 { 5022 struct net *net = (struct net *)seq->private; 5023 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 5024 net->ipv6.rt6_stats->fib_nodes, 5025 net->ipv6.rt6_stats->fib_route_nodes, 5026 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 5027 net->ipv6.rt6_stats->fib_rt_entries, 5028 net->ipv6.rt6_stats->fib_rt_cache, 5029 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 5030 net->ipv6.rt6_stats->fib_discarded_routes); 5031 5032 return 0; 5033 } 5034 5035 static int rt6_stats_seq_open(struct inode *inode, struct file *file) 5036 { 5037 return single_open_net(inode, file, rt6_stats_seq_show); 5038 } 5039 5040 static const struct file_operations rt6_stats_seq_fops = { 5041 .open = rt6_stats_seq_open, 5042 .read = seq_read, 5043 .llseek = seq_lseek, 5044 .release = single_release_net, 5045 }; 5046 #endif /* CONFIG_PROC_FS */ 5047 5048 #ifdef CONFIG_SYSCTL 5049 5050 static 5051 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 5052 void __user *buffer, size_t *lenp, loff_t *ppos) 5053 { 5054 struct net *net; 5055 int delay; 5056 if (!write) 5057 return -EINVAL; 5058 5059 net = (struct net *)ctl->extra1; 5060 delay = net->ipv6.sysctl.flush_delay; 5061 proc_dointvec(ctl, write, buffer, lenp, ppos); 5062 fib6_run_gc(delay <= 0 ? 
0 : (unsigned long)delay, net, delay > 0); 5063 return 0; 5064 } 5065 5066 struct ctl_table ipv6_route_table_template[] = { 5067 { 5068 .procname = "flush", 5069 .data = &init_net.ipv6.sysctl.flush_delay, 5070 .maxlen = sizeof(int), 5071 .mode = 0200, 5072 .proc_handler = ipv6_sysctl_rtcache_flush 5073 }, 5074 { 5075 .procname = "gc_thresh", 5076 .data = &ip6_dst_ops_template.gc_thresh, 5077 .maxlen = sizeof(int), 5078 .mode = 0644, 5079 .proc_handler = proc_dointvec, 5080 }, 5081 { 5082 .procname = "max_size", 5083 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5084 .maxlen = sizeof(int), 5085 .mode = 0644, 5086 .proc_handler = proc_dointvec, 5087 }, 5088 { 5089 .procname = "gc_min_interval", 5090 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5091 .maxlen = sizeof(int), 5092 .mode = 0644, 5093 .proc_handler = proc_dointvec_jiffies, 5094 }, 5095 { 5096 .procname = "gc_timeout", 5097 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5098 .maxlen = sizeof(int), 5099 .mode = 0644, 5100 .proc_handler = proc_dointvec_jiffies, 5101 }, 5102 { 5103 .procname = "gc_interval", 5104 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5105 .maxlen = sizeof(int), 5106 .mode = 0644, 5107 .proc_handler = proc_dointvec_jiffies, 5108 }, 5109 { 5110 .procname = "gc_elasticity", 5111 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5112 .maxlen = sizeof(int), 5113 .mode = 0644, 5114 .proc_handler = proc_dointvec, 5115 }, 5116 { 5117 .procname = "mtu_expires", 5118 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5119 .maxlen = sizeof(int), 5120 .mode = 0644, 5121 .proc_handler = proc_dointvec_jiffies, 5122 }, 5123 { 5124 .procname = "min_adv_mss", 5125 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5126 .maxlen = sizeof(int), 5127 .mode = 0644, 5128 .proc_handler = proc_dointvec, 5129 }, 5130 { 5131 .procname = "gc_min_interval_ms", 5132 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5133 .maxlen = sizeof(int), 5134 .mode = 0644, 5135 .proc_handler = proc_dointvec_ms_jiffies, 5136 }, 5137 { } 5138 }; 5139 5140 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5141 { 5142 struct ctl_table *table; 5143 5144 table = kmemdup(ipv6_route_table_template, 5145 sizeof(ipv6_route_table_template), 5146 GFP_KERNEL); 5147 5148 if (table) { 5149 table[0].data = &net->ipv6.sysctl.flush_delay; 5150 table[0].extra1 = net; 5151 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5152 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5153 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5154 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5155 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5156 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5157 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5158 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5159 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5160 5161 /* Don't export sysctls to unprivileged users */ 5162 if (net->user_ns != &init_user_ns) 5163 table[0].procname = NULL; 5164 } 5165 5166 return table; 5167 } 5168 #endif 5169 5170 static int __net_init ip6_route_net_init(struct net *net) 5171 { 5172 int ret = -ENOMEM; 5173 5174 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5175 sizeof(net->ipv6.ip6_dst_ops)); 5176 5177 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5178 goto out_ip6_dst_ops; 5179 5180 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5181 sizeof(*net->ipv6.fib6_null_entry), 5182 GFP_KERNEL); 5183 if (!net->ipv6.fib6_null_entry) 5184 goto 
out_ip6_dst_entries; 5185 5186 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5187 sizeof(*net->ipv6.ip6_null_entry), 5188 GFP_KERNEL); 5189 if (!net->ipv6.ip6_null_entry) 5190 goto out_fib6_null_entry; 5191 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5192 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5193 ip6_template_metrics, true); 5194 5195 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5196 net->ipv6.fib6_has_custom_rules = false; 5197 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5198 sizeof(*net->ipv6.ip6_prohibit_entry), 5199 GFP_KERNEL); 5200 if (!net->ipv6.ip6_prohibit_entry) 5201 goto out_ip6_null_entry; 5202 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5203 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5204 ip6_template_metrics, true); 5205 5206 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5207 sizeof(*net->ipv6.ip6_blk_hole_entry), 5208 GFP_KERNEL); 5209 if (!net->ipv6.ip6_blk_hole_entry) 5210 goto out_ip6_prohibit_entry; 5211 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5212 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5213 ip6_template_metrics, true); 5214 #endif 5215 5216 net->ipv6.sysctl.flush_delay = 0; 5217 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5218 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5219 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5220 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5221 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5222 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5223 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5224 5225 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5226 5227 ret = 0; 5228 out: 5229 return ret; 5230 5231 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5232 out_ip6_prohibit_entry: 5233 kfree(net->ipv6.ip6_prohibit_entry); 5234 out_ip6_null_entry: 5235 kfree(net->ipv6.ip6_null_entry); 5236 #endif 5237 out_fib6_null_entry: 5238 kfree(net->ipv6.fib6_null_entry); 5239 out_ip6_dst_entries: 5240 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5241 out_ip6_dst_ops: 5242 goto out; 5243 } 5244 5245 static void __net_exit ip6_route_net_exit(struct net *net) 5246 { 5247 kfree(net->ipv6.fib6_null_entry); 5248 kfree(net->ipv6.ip6_null_entry); 5249 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5250 kfree(net->ipv6.ip6_prohibit_entry); 5251 kfree(net->ipv6.ip6_blk_hole_entry); 5252 #endif 5253 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5254 } 5255 5256 static int __net_init ip6_route_net_init_late(struct net *net) 5257 { 5258 #ifdef CONFIG_PROC_FS 5259 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); 5260 proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops); 5261 #endif 5262 return 0; 5263 } 5264 5265 static void __net_exit ip6_route_net_exit_late(struct net *net) 5266 { 5267 #ifdef CONFIG_PROC_FS 5268 remove_proc_entry("ipv6_route", net->proc_net); 5269 remove_proc_entry("rt6_stats", net->proc_net); 5270 #endif 5271 } 5272 5273 static struct pernet_operations ip6_route_net_ops = { 5274 .init = ip6_route_net_init, 5275 .exit = ip6_route_net_exit, 5276 }; 5277 5278 static int __net_init ipv6_inetpeer_init(struct net *net) 5279 { 5280 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5281 5282 if (!bp) 5283 return -ENOMEM; 5284 inet_peer_base_init(bp); 5285 net->ipv6.peers = bp; 5286 return 0; 5287 } 5288 5289 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5290 { 5291 struct inet_peer_base *bp = net->ipv6.peers; 5292 5293 net->ipv6.peers = NULL; 5294 inetpeer_invalidate_tree(bp); 
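/* invalidating the tree releases every cached peer entry,
 * after which the base itself can be freed
 */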
5295 kfree(bp); 5296 } 5297 5298 static struct pernet_operations ipv6_inetpeer_ops = { 5299 .init = ipv6_inetpeer_init, 5300 .exit = ipv6_inetpeer_exit, 5301 }; 5302 5303 static struct pernet_operations ip6_route_net_late_ops = { 5304 .init = ip6_route_net_init_late, 5305 .exit = ip6_route_net_exit_late, 5306 }; 5307 5308 static struct notifier_block ip6_route_dev_notifier = { 5309 .notifier_call = ip6_route_dev_notify, 5310 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5311 }; 5312 5313 void __init ip6_route_init_special_entries(void) 5314 { 5315 /* Registering of the loopback is done before this portion of code, 5316 * the loopback reference in rt6_info will not be taken, do it 5317 * manually for init_net */ 5318 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; 5319 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5320 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5321 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5322 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5323 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5324 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5325 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5326 #endif 5327 } 5328 5329 int __init ip6_route_init(void) 5330 { 5331 int ret; 5332 int cpu; 5333 5334 ret = -ENOMEM; 5335 ip6_dst_ops_template.kmem_cachep = 5336 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5337 SLAB_HWCACHE_ALIGN, NULL); 5338 if (!ip6_dst_ops_template.kmem_cachep) 5339 goto out; 5340 5341 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5342 if (ret) 5343 goto out_kmem_cache; 5344 5345 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5346 if (ret) 5347 goto out_dst_entries; 5348 5349 ret = register_pernet_subsys(&ip6_route_net_ops); 5350 if (ret) 5351 goto out_register_inetpeer; 5352 5353 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5354 5355 ret = fib6_init(); 5356 if (ret) 5357 goto out_register_subsys; 5358 5359 ret = xfrm6_init(); 5360 if (ret) 5361 goto out_fib6_init; 5362 5363 ret = fib6_rules_init(); 5364 if (ret) 5365 goto xfrm6_init; 5366 5367 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5368 if (ret) 5369 goto fib6_rules_init; 5370 5371 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5372 inet6_rtm_newroute, NULL, 0); 5373 if (ret < 0) 5374 goto out_register_late_subsys; 5375 5376 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5377 inet6_rtm_delroute, NULL, 0); 5378 if (ret < 0) 5379 goto out_register_late_subsys; 5380 5381 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5382 inet6_rtm_getroute, NULL, 5383 RTNL_FLAG_DOIT_UNLOCKED); 5384 if (ret < 0) 5385 goto out_register_late_subsys; 5386 5387 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5388 if (ret) 5389 goto out_register_late_subsys; 5390 5391 for_each_possible_cpu(cpu) { 5392 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 5393 5394 INIT_LIST_HEAD(&ul->head); 5395 spin_lock_init(&ul->lock); 5396 } 5397 5398 out: 5399 return ret; 5400 5401 out_register_late_subsys: 5402 rtnl_unregister_all(PF_INET6); 5403 unregister_pernet_subsys(&ip6_route_net_late_ops); 5404 fib6_rules_init: 5405 fib6_rules_cleanup(); 5406 xfrm6_init: 5407 xfrm6_fini(); 5408 out_fib6_init: 5409 fib6_gc_cleanup(); 5410 out_register_subsys: 5411 unregister_pernet_subsys(&ip6_route_net_ops); 5412 out_register_inetpeer: 5413 
unregister_pernet_subsys(&ipv6_inetpeer_ops); 5414 out_dst_entries: 5415 dst_entries_destroy(&ip6_dst_blackhole_ops); 5416 out_kmem_cache: 5417 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5418 goto out; 5419 } 5420 5421 void ip6_route_cleanup(void) 5422 { 5423 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5424 unregister_pernet_subsys(&ip6_route_net_late_ops); 5425 fib6_rules_cleanup(); 5426 xfrm6_fini(); 5427 fib6_gc_cleanup(); 5428 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5429 unregister_pernet_subsys(&ip6_route_net_ops); 5430 dst_entries_destroy(&ip6_dst_blackhole_ops); 5431 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5432 } 5433
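/* Userspace view of the rtnetlink handlers registered above
 * (illustrative iproute2 commands, not taken from this file):
 *
 *   ip -6 route add 2001:db8::/64 via fe80::1 dev eth0
 *       -> RTM_NEWROUTE -> inet6_rtm_newroute()
 *   ip -6 route append 2001:db8::/64 nexthop via fe80::1 dev eth0 \
 *                                    nexthop via fe80::2 dev eth1
 *       -> RTM_NEWROUTE with RTA_MULTIPATH -> ip6_route_multipath_add()
 *   ip -6 route del 2001:db8::/64
 *       -> RTM_DELROUTE -> inet6_rtm_delroute()
 *   ip -6 route get 2001:db8::1
 *       -> RTM_GETROUTE -> inet6_rtm_getroute()
 */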