/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t lock;
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
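
/* Router Reachability Probing: the neighbour solicitation is not sent
 * directly from the lookup path. rt6_probe() runs under
 * rcu_read_lock_bh() (note the GFP_ATOMIC allocation below), so it only
 * queues a __rt6_probe_work item; rt6_probe_deferred() then sends the
 * actual solicitation from the workqueue and drops the device reference
 * taken when the work was queued.
 */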
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);

		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
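
/* rt6_select() picks the route to use from fn's leaf list, restarting
 * the scan at fn->rr_ptr so that equal-cost default routers are used
 * round-robin: when find_match() reports RT6_NUD_FAIL_DO_RR for the
 * current head, rr_ptr is advanced to the next entry with the same
 * metric (under tb6_lock, and only if that entry is still linked into
 * the tree).
 */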
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_read_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true, in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
}

static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;

	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_read_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
	else
		fib6_info_release(rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
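
/* rt6_lookup() is a convenience wrapper around fib6_rule_lookup(): it
 * builds the flowi6 from the given addresses and oif, and converts an
 * error dst into a NULL return for callers that only care about hits.
 */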
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
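
/* Per-cpu dst cache: rt6_get_pcpu_route() returns this CPU's cached
 * clone of a fib6_info, and rt6_make_pcpu_route() installs one with
 * cmpxchg(). The BUG_ON(prev) below holds because the caller looks up
 * the slot first and runs with BHs disabled, so nothing else can
 * populate this CPU's slot in between.
 */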
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
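
/* Effective MTU of a fib entry: the stored PMTU if there is one,
 * otherwise the device's current mtu6, clamped to IP6_MAX_MTU and
 * reduced by any lwtunnel encap headroom.
 */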
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
1663 * 1664 * If the new MTU is higher, and the route PMTU is equal to the local 1665 * MTU, this means the old MTU is the lowest in the path, so allow 1666 * updating it: if other nodes now have lower MTUs, PMTU discovery will 1667 * handle this. 1668 */ 1669 1670 if (dst_mtu(&rt->dst) >= mtu) 1671 return true; 1672 1673 if (dst_mtu(&rt->dst) == idev->cnf.mtu6) 1674 return true; 1675 1676 return false; 1677 } 1678 1679 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, 1680 struct fib6_info *rt, int mtu) 1681 { 1682 struct rt6_exception_bucket *bucket; 1683 struct rt6_exception *rt6_ex; 1684 int i; 1685 1686 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1687 lockdep_is_held(&rt6_exception_lock)); 1688 1689 if (!bucket) 1690 return; 1691 1692 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1693 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1694 struct rt6_info *entry = rt6_ex->rt6i; 1695 1696 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected 1697 * route), the metrics of its rt->from have already 1698 * been updated. 1699 */ 1700 if (dst_metric_raw(&entry->dst, RTAX_MTU) && 1701 rt6_mtu_change_route_allowed(idev, entry, mtu)) 1702 dst_metric_set(&entry->dst, RTAX_MTU, mtu); 1703 } 1704 bucket++; 1705 } 1706 } 1707 1708 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 1709 1710 static void rt6_exceptions_clean_tohost(struct fib6_info *rt, 1711 struct in6_addr *gateway) 1712 { 1713 struct rt6_exception_bucket *bucket; 1714 struct rt6_exception *rt6_ex; 1715 struct hlist_node *tmp; 1716 int i; 1717 1718 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1719 return; 1720 1721 spin_lock_bh(&rt6_exception_lock); 1722 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1723 lockdep_is_held(&rt6_exception_lock)); 1724 1725 if (bucket) { 1726 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1727 hlist_for_each_entry_safe(rt6_ex, tmp, 1728 &bucket->chain, hlist) { 1729 struct rt6_info *entry = rt6_ex->rt6i; 1730 1731 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == 1732 RTF_CACHE_GATEWAY && 1733 ipv6_addr_equal(gateway, 1734 &entry->rt6i_gateway)) { 1735 rt6_remove_exception(bucket, rt6_ex); 1736 } 1737 } 1738 bucket++; 1739 } 1740 } 1741 1742 spin_unlock_bh(&rt6_exception_lock); 1743 } 1744 1745 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, 1746 struct rt6_exception *rt6_ex, 1747 struct fib6_gc_args *gc_args, 1748 unsigned long now) 1749 { 1750 struct rt6_info *rt = rt6_ex->rt6i; 1751 1752 /* we are pruning and obsoleting aged-out and non gateway exceptions 1753 * even if others have still references to them, so that on next 1754 * dst_check() such references can be dropped. 1755 * EXPIRES exceptions - e.g. 
	 * pmtu-generated ones - are pruned when expired, independently of
	 * their age, as per RFC 8201 section 4.
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
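
/* ip6_pol_route() resolves a fib6_info into a dst that can be attached
 * to a packet. In order of preference it returns: a matching entry from
 * the exception (RTF_CACHE) table; a freshly allocated uncached clone
 * for FLOWI_FLAG_KNOWN_NH lookups on routes without a gateway; or this
 * CPU's per-cpu copy of the route.
 */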
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */
		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
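
/* Multipath flow hash. Policy 0 (net.ipv6.fib_multipath_hash_policy)
 * hashes the L3 addresses and flow label, using the inner packet for
 * ICMPv6 errors so that errors follow the flow that triggered them;
 * policy 1 hashes the L4 five-tuple. The result is shifted right by one
 * bit so it stays within the positive range used for the nh_upper_bound
 * comparison in fib6_multipath_select().
 */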
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
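
/* Output route lookup: strict interface matching (RT6_LOOKUP_F_IFACE)
 * is applied when the socket is bound to a device, when the destination
 * needs a strict scope (link-local, multicast or loopback), or when an
 * oif was given without a source address.
 */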
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
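/* Usage sketch (hypothetical caller, for illustration only): senders
 * typically go through the ip6_route_output() wrapper, which simply
 * calls ip6_route_output_flags() with flags == 0:
 *
 *	struct flowi6 fl6 = {
 *		.flowi6_oif = sk->sk_bound_dev_if,
 *		.daddr = *daddr,
 *		.flowi6_proto = IPPROTO_UDP,
 *	};
 *	struct dst_entry *dst = ip6_route_output(net, sk, &fl6);
 *
 *	if (dst->error) ...	(reject route or lookup failure)
 */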
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPv6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
		     unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
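/* Informal example of the cookie contract: a socket caching a dst
 * remembers the fib6 node's serial number as its cookie.  When the
 * routing table changes, the node's sernum is bumped, so
 * fib6_get_cookie_safe() reports a value different from the cached
 * cookie, ip6_dst_check() returns NULL, and the caller knows it must
 * drop the stale dst and perform a fresh route lookup.
 */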
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}

static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	bool from_set;

	rcu_read_lock();
	from_set = !!rcu_dereference(rt->from);
	rcu_read_unlock();

	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || from_set);
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from the appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		ip6_hold_safe(net, &ret, true);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
};

static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6, skb,
				flags, __ip6_route_redirect);
}

void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}
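/* Worked example (informal): on a 1500-byte MTU path the advertised
 * MSS comes out as 1500 - 40 (IPv6 header) - 20 (base TCP header)
 * = 1440 bytes, unless the ip6_rt_min_advmss sysctl imposes a higher
 * floor.  Only for huge, jumbogram-capable MTUs does the IPV6_MAXPLEN
 * special case kick in.
 */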
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	struct inet6_dev *idev;
	unsigned int mtu;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}

struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}

static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout >> 1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire >> rt_elasticity;
	return entries > rt_max_size;
}
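/* How the backoff above behaves (informal sketch): every time the
 * allocator finds us over budget, ip6_rt_gc_expire grows, so
 * fib6_run_gc() ages out entries more aggressively under sustained
 * pressure.  Once the table drops below gc_thresh the expiry resets
 * to half of ip6_rt_gc_timeout, and the final line decays it by
 * 1/2^ip6_rt_gc_elasticity per call, so a pressure spike is gradually
 * forgotten.
 */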
static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
			       struct fib6_config *cfg)
{
	struct dst_metrics *p;

	if (!cfg->fc_mx)
		return 0;

	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
	if (unlikely(!p))
		return -ENOMEM;

	refcount_set(&p->refcnt, 1);
	rt->fib6_metrics = p;

	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
}

static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}

static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		if (!grt->dst.error &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}

		ip6_rt_put(grt);
	}

	return err;
}

static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case the
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return the already-added prefix route via the interface
	 * that the prefix route was assigned to, which might be
	 * non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using non-link-local
		 * addresses as nexthop addresses.
		 * Otherwise, the router will not be able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
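/* Informal example: "ip -6 route add 2001:db8::/64 via fe80::1 dev eth0"
 * supplies a link-local unicast gateway, so the nexthop reachability
 * checks above are skipped entirely.  A global gateway instead goes
 * through ip6_route_check_nh() (or the onlink variant) so that the
 * gateway is resolved to, and checked against, a concrete egress
 * device before the route is accepted.
 */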
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					       gfp_t gfp_flags,
					       struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				 clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
{
	struct fib6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, gfp_flags, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	fib6_info_release(rt);

	return err;
}
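/* Reference-counting note (informal): ip6_route_info_create() returns
 * a fib6_info holding the caller's reference; the tree takes its own
 * reference when the entry is linked in by __ip6_ins_rt(), which is
 * why the creation reference can be dropped here regardless of
 * whether the insert succeeded.
 */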
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}

int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}

static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}

static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	int rc = -ESRCH;

	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		goto out;

	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		goto out;
	if (dst_hold_safe(&rt->dst))
		rc = rt6_remove_exception_rt(rt);
out:
	return rc;
}

static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct fib6_info *from;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	/* This fib6_info_hold() is safe here because we hold reference to rt
	 * and rt already holds reference to fib6_info.
	 */
	fib6_info_hold(from);
	rcu_read_unlock();

	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() takes
	 * care of it
	 */
	if (rt6_insert_exception(nrt, from)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	fib6_info_release(from);
	neigh_release(neigh);
}
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}

static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
		.fc_protocol	= RTPROT_RA,
		.fc_type	= RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;

	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
		cfg.fc_flags |= RTF_DEFAULT;

	ip6_route_add(&cfg, GFP_ATOMIC, NULL);

	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif

struct fib6_info *rt6_get_dflt_router(struct net *net,
				      const struct in6_addr *addr,
				      struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
struct fib6_info *rt6_add_dflt_router(struct net *net,
				      const struct in6_addr *gwaddr,
				      struct net_device *dev,
				      unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol	= RTPROT_RA,
		.fc_type	= RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}

static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(net, table);
		}
	}

	rcu_read_unlock();
}
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			: RT6_TABLE_MAIN;
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;
	cfg->fc_type = rtmsg->rtmsg_type;

	cfg->fc_nlinfo.nl_net = net;

	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
}

int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}
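/* Usage sketch (hypothetical user-space caller, illustration only):
 *
 *	struct in6_rtmsg rtmsg = {
 *		.rtmsg_dst	= ...,	(destination prefix)
 *		.rtmsg_dst_len	= 64,
 *		.rtmsg_flags	= RTF_UP,
 *		.rtmsg_ifindex	= if_nametoindex("eth0"),
 *	};
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *	ioctl(fd, SIOCADDRT, &rtmsg);
 *
 * arrives here with cmd == SIOCADDRT.  Newer tooling speaks rtnetlink
 * instead (see rtm_to_fib6_config() below); this is the legacy path.
 */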
/*
 *	Drop the packet on the floor
 */

static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);

	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}

static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct fib6_info *f6i;

	f6i = fib6_info_alloc(gfp_flags);
	if (!f6i)
		return ERR_PTR(-ENOMEM);

	f6i->dst_nocount = true;
	f6i->dst_host = true;
	f6i->fib6_protocol = RTPROT_KERNEL;
	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		f6i->fib6_type = RTN_ANYCAST;
		f6i->fib6_flags |= RTF_ANYCAST;
	} else {
		f6i->fib6_type = RTN_LOCAL;
		f6i->fib6_flags |= RTF_LOCAL;
	}

	f6i->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
	f6i->fib6_nh.nh_dev = dev;
	f6i->fib6_dst.addr = *addr;
	f6i->fib6_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	f6i->fib6_table = fib6_get_table(net, tb_id);

	return f6i;
}

/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};

static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
	    rt != net->ipv6.fib6_null_entry &&
	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->fib6_prefsrc.plen = 0;
		/* need to update cache as well */
		rt6_exceptions_remove_prefsrc(rt);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}

#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when a gateway turns into a host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because a cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}

struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};

static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}

static bool rt6_is_dead(const struct fib6_info *rt)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	     fib6_ignore_linkdown(rt)))
		return true;

	return false;
}

static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	int total = 0;

	if (!rt6_is_dead(rt))
		total += rt->fib6_nh.nh_weight;

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
		if (!rt6_is_dead(iter))
			total += iter->fib6_nh.nh_weight;
	}

	return total;
}

static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}

static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}
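/* Worked example (informal): three live siblings with weights 2, 1 and
 * 1 give total = 4.  Walking them in order, the cumulative weights are
 * 2, 3 and 4, so the upper bounds become roughly (2/4) * 2^31 - 1,
 * (3/4) * 2^31 - 1 and 2^31 - 1.  Path selection compares the 31-bit
 * flow hash against these bounds in order, yielding a 2:1:1 traffic
 * split; a dead nexthop keeps the bound -1 and is never chosen.
 */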
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* If the entire multipath route was marked for flushing,
	 * there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}

static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(net, rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}

void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}

static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
				   const struct net_device *dev)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		return true;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			return true;

	return false;
}

static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}

static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
					     const struct net_device *down_dev)
{
	struct fib6_info *iter;
	unsigned int dead = 0;

	if (rt->fib6_nh.nh_dev == down_dev ||
	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == down_dev ||
		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}
static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
				       const struct net_device *dev,
				       unsigned int nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		rt->fib6_nh.nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			iter->fib6_nh.nh_flags |= nh_flags;
}

/* called with write lock held for table with rt */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}

void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}

void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}

struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};

static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	 * so the RTAX_MTU lock cannot disable it.
	 * We still use this lock to block changes
	 * caused by addrconf/ndisc.
	 */

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For an administrative MTU increase, there is no way to discover
	 * an IPv6 PMTU increase, so the PMTU must be updated here.
	 * Since RFC 1981 doesn't include administrative MTU increase,
	 * updating PMTU on increase is a MUST (e.g. for jumbo frames).
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
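/* Informal example: after "ip link set eth0 mtu 9000", addrconf's
 * NETDEV_CHANGEMTU handling ends up calling rt6_mtu_change(); routes
 * on eth0 whose recorded MTU matched the old device MTU (or exceeded
 * the new one) are updated to 9000, since RFC 1981 offers no on-wire
 * signal for a PMTU increase.
 */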
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
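/* Informal example: "ip -6 route add 2001:db8::/32 via 2001:db8::1
 * dev eth0 metric 1024" arrives as an RTM_NEWROUTE message with
 * rtm_dst_len == 32 plus RTA_DST, RTA_GATEWAY, RTA_OIF and
 * RTA_PRIORITY attributes.  Note the prefix copy above only requires
 * (32 + 7) >> 3 == 4 leading bytes of RTA_DST to be present.
 */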
cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4235 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4236 } 4237 4238 if (tb[RTA_TABLE]) 4239 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4240 4241 if (tb[RTA_MULTIPATH]) { 4242 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4243 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4244 4245 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4246 cfg->fc_mp_len, extack); 4247 if (err < 0) 4248 goto errout; 4249 } 4250 4251 if (tb[RTA_PREF]) { 4252 pref = nla_get_u8(tb[RTA_PREF]); 4253 if (pref != ICMPV6_ROUTER_PREF_LOW && 4254 pref != ICMPV6_ROUTER_PREF_HIGH) 4255 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4256 cfg->fc_flags |= RTF_PREF(pref); 4257 } 4258 4259 if (tb[RTA_ENCAP]) 4260 cfg->fc_encap = tb[RTA_ENCAP]; 4261 4262 if (tb[RTA_ENCAP_TYPE]) { 4263 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4264 4265 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4266 if (err < 0) 4267 goto errout; 4268 } 4269 4270 if (tb[RTA_EXPIRES]) { 4271 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4272 4273 if (addrconf_finite_timeout(timeout)) { 4274 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4275 cfg->fc_flags |= RTF_EXPIRES; 4276 } 4277 } 4278 4279 err = 0; 4280 errout: 4281 return err; 4282 } 4283 4284 struct rt6_nh { 4285 struct fib6_info *fib6_info; 4286 struct fib6_config r_cfg; 4287 struct list_head next; 4288 }; 4289 4290 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 4291 { 4292 struct rt6_nh *nh; 4293 4294 list_for_each_entry(nh, rt6_nh_list, next) { 4295 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 4296 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 4297 nh->r_cfg.fc_ifindex); 4298 } 4299 } 4300 4301 static int ip6_route_info_append(struct net *net, 4302 struct list_head *rt6_nh_list, 4303 struct fib6_info *rt, 4304 struct fib6_config *r_cfg) 4305 { 4306 struct rt6_nh *nh; 4307 int err = -EEXIST; 4308 4309 list_for_each_entry(nh, rt6_nh_list, next) { 4310 /* check if fib6_info already exists */ 4311 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4312 return err; 4313 } 4314 4315 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4316 if (!nh) 4317 return -ENOMEM; 4318 nh->fib6_info = rt; 4319 err = ip6_convert_metrics(net, rt, r_cfg); 4320 if (err) { 4321 kfree(nh); 4322 return err; 4323 } 4324 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4325 list_add_tail(&nh->next, rt6_nh_list); 4326 4327 return 0; 4328 } 4329 4330 static void ip6_route_mpath_notify(struct fib6_info *rt, 4331 struct fib6_info *rt_last, 4332 struct nl_info *info, 4333 __u16 nlflags) 4334 { 4335 /* if this is an APPEND route, then rt points to the first route 4336 * inserted and rt_last points to last route inserted. Userspace 4337 * wants a consistent dump of the route which starts at the first 4338 * nexthop. 
Since sibling routes are always added at the end of 4339 * the list, find the first sibling of the last route appended 4340 */ 4341 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4342 rt = list_first_entry(&rt_last->fib6_siblings, 4343 struct fib6_info, 4344 fib6_siblings); 4345 } 4346 4347 if (rt) 4348 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4349 } 4350 4351 static int ip6_route_multipath_add(struct fib6_config *cfg, 4352 struct netlink_ext_ack *extack) 4353 { 4354 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4355 struct nl_info *info = &cfg->fc_nlinfo; 4356 struct fib6_config r_cfg; 4357 struct rtnexthop *rtnh; 4358 struct fib6_info *rt; 4359 struct rt6_nh *err_nh; 4360 struct rt6_nh *nh, *nh_safe; 4361 __u16 nlflags; 4362 int remaining; 4363 int attrlen; 4364 int err = 1; 4365 int nhn = 0; 4366 int replace = (cfg->fc_nlinfo.nlh && 4367 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4368 LIST_HEAD(rt6_nh_list); 4369 4370 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 4371 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4372 nlflags |= NLM_F_APPEND; 4373 4374 remaining = cfg->fc_mp_len; 4375 rtnh = (struct rtnexthop *)cfg->fc_mp; 4376 4377 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4378 * fib6_info structs per nexthop 4379 */ 4380 while (rtnh_ok(rtnh, remaining)) { 4381 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4382 if (rtnh->rtnh_ifindex) 4383 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4384 4385 attrlen = rtnh_attrlen(rtnh); 4386 if (attrlen > 0) { 4387 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4388 4389 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4390 if (nla) { 4391 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4392 r_cfg.fc_flags |= RTF_GATEWAY; 4393 } 4394 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4395 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4396 if (nla) 4397 r_cfg.fc_encap_type = nla_get_u16(nla); 4398 } 4399 4400 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4401 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4402 if (IS_ERR(rt)) { 4403 err = PTR_ERR(rt); 4404 rt = NULL; 4405 goto cleanup; 4406 } 4407 if (!rt6_qualify_for_ecmp(rt)) { 4408 err = -EINVAL; 4409 NL_SET_ERR_MSG(extack, 4410 "Device only routes can not be added for IPv6 using the multipath API."); 4411 fib6_info_release(rt); 4412 goto cleanup; 4413 } 4414 4415 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; 4416 4417 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4418 rt, &r_cfg); 4419 if (err) { 4420 fib6_info_release(rt); 4421 goto cleanup; 4422 } 4423 4424 rtnh = rtnh_next(rtnh, &remaining); 4425 } 4426 4427 /* for add and replace send one notification with all nexthops. 
4428 * Skip the notification in fib6_add_rt2node and send one with 4429 * the full route when done 4430 */ 4431 info->skip_notify = 1; 4432 4433 err_nh = NULL; 4434 list_for_each_entry(nh, &rt6_nh_list, next) { 4435 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4436 fib6_info_release(nh->fib6_info); 4437 4438 if (!err) { 4439 /* save reference to last route successfully inserted */ 4440 rt_last = nh->fib6_info; 4441 4442 /* save reference to first route for notification */ 4443 if (!rt_notif) 4444 rt_notif = nh->fib6_info; 4445 } 4446 4447 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 4448 nh->fib6_info = NULL; 4449 if (err) { 4450 if (replace && nhn) 4451 ip6_print_replace_route_err(&rt6_nh_list); 4452 err_nh = nh; 4453 goto add_errout; 4454 } 4455 4456 /* Because each route is added like a single route we remove 4457 * these flags after the first nexthop: if there is a collision, 4458 * we have already failed to add the first nexthop: 4459 * fib6_add_rt2node() has rejected it; when replacing, old 4460 * nexthops have been replaced by first new, the rest should 4461 * be added to it. 4462 */ 4463 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4464 NLM_F_REPLACE); 4465 nhn++; 4466 } 4467 4468 /* success ... tell user about new route */ 4469 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4470 goto cleanup; 4471 4472 add_errout: 4473 /* send notification for routes that were added so that 4474 * the delete notifications sent by ip6_route_del are 4475 * coherent 4476 */ 4477 if (rt_notif) 4478 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4479 4480 /* Delete routes that were already added */ 4481 list_for_each_entry(nh, &rt6_nh_list, next) { 4482 if (err_nh == nh) 4483 break; 4484 ip6_route_del(&nh->r_cfg, extack); 4485 } 4486 4487 cleanup: 4488 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4489 if (nh->fib6_info) 4490 fib6_info_release(nh->fib6_info); 4491 list_del(&nh->next); 4492 kfree(nh); 4493 } 4494 4495 return err; 4496 } 4497 4498 static int ip6_route_multipath_del(struct fib6_config *cfg, 4499 struct netlink_ext_ack *extack) 4500 { 4501 struct fib6_config r_cfg; 4502 struct rtnexthop *rtnh; 4503 int remaining; 4504 int attrlen; 4505 int err = 1, last_err = 0; 4506 4507 remaining = cfg->fc_mp_len; 4508 rtnh = (struct rtnexthop *)cfg->fc_mp; 4509 4510 /* Parse a Multipath Entry */ 4511 while (rtnh_ok(rtnh, remaining)) { 4512 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4513 if (rtnh->rtnh_ifindex) 4514 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4515 4516 attrlen = rtnh_attrlen(rtnh); 4517 if (attrlen > 0) { 4518 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4519 4520 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4521 if (nla) { 4522 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4523 r_cfg.fc_flags |= RTF_GATEWAY; 4524 } 4525 } 4526 err = ip6_route_del(&r_cfg, extack); 4527 if (err) 4528 last_err = err; 4529 4530 rtnh = rtnh_next(rtnh, &remaining); 4531 } 4532 4533 return last_err; 4534 } 4535 4536 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4537 struct netlink_ext_ack *extack) 4538 { 4539 struct fib6_config cfg; 4540 int err; 4541 4542 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4543 if (err < 0) 4544 return err; 4545 4546 if (cfg.fc_mp) 4547 return ip6_route_multipath_del(&cfg, extack); 4548 else { 4549 cfg.fc_delete_all_nh = 1; 4550 return ip6_route_del(&cfg, extack); 4551 } 4552 } 4553 4554 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4555 struct netlink_ext_ack *extack) 4556 { 
4557 struct fib6_config cfg; 4558 int err; 4559 4560 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4561 if (err < 0) 4562 return err; 4563 4564 if (cfg.fc_mp) 4565 return ip6_route_multipath_add(&cfg, extack); 4566 else 4567 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4568 } 4569 4570 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4571 { 4572 int nexthop_len = 0; 4573 4574 if (rt->fib6_nsiblings) { 4575 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4576 + NLA_ALIGN(sizeof(struct rtnexthop)) 4577 + nla_total_size(16) /* RTA_GATEWAY */ 4578 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); 4579 4580 nexthop_len *= rt->fib6_nsiblings; 4581 } 4582 4583 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4584 + nla_total_size(16) /* RTA_SRC */ 4585 + nla_total_size(16) /* RTA_DST */ 4586 + nla_total_size(16) /* RTA_GATEWAY */ 4587 + nla_total_size(16) /* RTA_PREFSRC */ 4588 + nla_total_size(4) /* RTA_TABLE */ 4589 + nla_total_size(4) /* RTA_IIF */ 4590 + nla_total_size(4) /* RTA_OIF */ 4591 + nla_total_size(4) /* RTA_PRIORITY */ 4592 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4593 + nla_total_size(sizeof(struct rta_cacheinfo)) 4594 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4595 + nla_total_size(1) /* RTA_PREF */ 4596 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) 4597 + nexthop_len; 4598 } 4599 4600 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, 4601 unsigned int *flags, bool skip_oif) 4602 { 4603 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4604 *flags |= RTNH_F_DEAD; 4605 4606 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { 4607 *flags |= RTNH_F_LINKDOWN; 4608 4609 rcu_read_lock(); 4610 if (fib6_ignore_linkdown(rt)) 4611 *flags |= RTNH_F_DEAD; 4612 rcu_read_unlock(); 4613 } 4614 4615 if (rt->fib6_flags & RTF_GATEWAY) { 4616 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) 4617 goto nla_put_failure; 4618 } 4619 4620 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); 4621 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) 4622 *flags |= RTNH_F_OFFLOAD; 4623 4624 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4625 if (!skip_oif && rt->fib6_nh.nh_dev && 4626 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) 4627 goto nla_put_failure; 4628 4629 if (rt->fib6_nh.nh_lwtstate && 4630 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) 4631 goto nla_put_failure; 4632 4633 return 0; 4634 4635 nla_put_failure: 4636 return -EMSGSIZE; 4637 } 4638 4639 /* add multipath next hop */ 4640 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) 4641 { 4642 const struct net_device *dev = rt->fib6_nh.nh_dev; 4643 struct rtnexthop *rtnh; 4644 unsigned int flags = 0; 4645 4646 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4647 if (!rtnh) 4648 goto nla_put_failure; 4649 4650 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; 4651 rtnh->rtnh_ifindex = dev ? 
dev->ifindex : 0; 4652 4653 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4654 goto nla_put_failure; 4655 4656 rtnh->rtnh_flags = flags; 4657 4658 /* length of rtnetlink header + attributes */ 4659 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4660 4661 return 0; 4662 4663 nla_put_failure: 4664 return -EMSGSIZE; 4665 } 4666 4667 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4668 struct fib6_info *rt, struct dst_entry *dst, 4669 struct in6_addr *dest, struct in6_addr *src, 4670 int iif, int type, u32 portid, u32 seq, 4671 unsigned int flags) 4672 { 4673 struct rtmsg *rtm; 4674 struct nlmsghdr *nlh; 4675 long expires = 0; 4676 u32 *pmetrics; 4677 u32 table; 4678 4679 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4680 if (!nlh) 4681 return -EMSGSIZE; 4682 4683 rtm = nlmsg_data(nlh); 4684 rtm->rtm_family = AF_INET6; 4685 rtm->rtm_dst_len = rt->fib6_dst.plen; 4686 rtm->rtm_src_len = rt->fib6_src.plen; 4687 rtm->rtm_tos = 0; 4688 if (rt->fib6_table) 4689 table = rt->fib6_table->tb6_id; 4690 else 4691 table = RT6_TABLE_UNSPEC; 4692 rtm->rtm_table = table; 4693 if (nla_put_u32(skb, RTA_TABLE, table)) 4694 goto nla_put_failure; 4695 4696 rtm->rtm_type = rt->fib6_type; 4697 rtm->rtm_flags = 0; 4698 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4699 rtm->rtm_protocol = rt->fib6_protocol; 4700 4701 if (rt->fib6_flags & RTF_CACHE) 4702 rtm->rtm_flags |= RTM_F_CLONED; 4703 4704 if (dest) { 4705 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4706 goto nla_put_failure; 4707 rtm->rtm_dst_len = 128; 4708 } else if (rtm->rtm_dst_len) 4709 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) 4710 goto nla_put_failure; 4711 #ifdef CONFIG_IPV6_SUBTREES 4712 if (src) { 4713 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4714 goto nla_put_failure; 4715 rtm->rtm_src_len = 128; 4716 } else if (rtm->rtm_src_len && 4717 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) 4718 goto nla_put_failure; 4719 #endif 4720 if (iif) { 4721 #ifdef CONFIG_IPV6_MROUTE 4722 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { 4723 int err = ip6mr_get_route(net, skb, rtm, portid); 4724 4725 if (err == 0) 4726 return 0; 4727 if (err < 0) 4728 goto nla_put_failure; 4729 } else 4730 #endif 4731 if (nla_put_u32(skb, RTA_IIF, iif)) 4732 goto nla_put_failure; 4733 } else if (dest) { 4734 struct in6_addr saddr_buf; 4735 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4736 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4737 goto nla_put_failure; 4738 } 4739 4740 if (rt->fib6_prefsrc.plen) { 4741 struct in6_addr saddr_buf; 4742 saddr_buf = rt->fib6_prefsrc.addr; 4743 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4744 goto nla_put_failure; 4745 } 4746 4747 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4748 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4749 goto nla_put_failure; 4750 4751 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4752 goto nla_put_failure; 4753 4754 /* For multipath routes, walk the siblings list and add 4755 * each as a nexthop within RTA_MULTIPATH. 
4756 */ 4757 if (rt->fib6_nsiblings) { 4758 struct fib6_info *sibling, *next_sibling; 4759 struct nlattr *mp; 4760 4761 mp = nla_nest_start(skb, RTA_MULTIPATH); 4762 if (!mp) 4763 goto nla_put_failure; 4764 4765 if (rt6_add_nexthop(skb, rt) < 0) 4766 goto nla_put_failure; 4767 4768 list_for_each_entry_safe(sibling, next_sibling, 4769 &rt->fib6_siblings, fib6_siblings) { 4770 if (rt6_add_nexthop(skb, sibling) < 0) 4771 goto nla_put_failure; 4772 } 4773 4774 nla_nest_end(skb, mp); 4775 } else { 4776 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4777 goto nla_put_failure; 4778 } 4779 4780 if (rt->fib6_flags & RTF_EXPIRES) { 4781 expires = dst ? dst->expires : rt->expires; 4782 expires -= jiffies; 4783 } 4784 4785 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4786 goto nla_put_failure; 4787 4788 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) 4789 goto nla_put_failure; 4790 4791 4792 nlmsg_end(skb, nlh); 4793 return 0; 4794 4795 nla_put_failure: 4796 nlmsg_cancel(skb, nlh); 4797 return -EMSGSIZE; 4798 } 4799 4800 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4801 { 4802 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4803 struct net *net = arg->net; 4804 4805 if (rt == net->ipv6.fib6_null_entry) 4806 return 0; 4807 4808 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4809 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 4810 4811 /* user wants prefix routes only */ 4812 if (rtm->rtm_flags & RTM_F_PREFIX && 4813 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4814 /* success since this is not a prefix route */ 4815 return 1; 4816 } 4817 } 4818 4819 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4820 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4821 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); 4822 } 4823 4824 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4825 struct netlink_ext_ack *extack) 4826 { 4827 struct net *net = sock_net(in_skb->sk); 4828 struct nlattr *tb[RTA_MAX+1]; 4829 int err, iif = 0, oif = 0; 4830 struct fib6_info *from; 4831 struct dst_entry *dst; 4832 struct rt6_info *rt; 4833 struct sk_buff *skb; 4834 struct rtmsg *rtm; 4835 struct flowi6 fl6; 4836 bool fibmatch; 4837 4838 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4839 extack); 4840 if (err < 0) 4841 goto errout; 4842 4843 err = -EINVAL; 4844 memset(&fl6, 0, sizeof(fl6)); 4845 rtm = nlmsg_data(nlh); 4846 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4847 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4848 4849 if (tb[RTA_SRC]) { 4850 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4851 goto errout; 4852 4853 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4854 } 4855 4856 if (tb[RTA_DST]) { 4857 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4858 goto errout; 4859 4860 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4861 } 4862 4863 if (tb[RTA_IIF]) 4864 iif = nla_get_u32(tb[RTA_IIF]); 4865 4866 if (tb[RTA_OIF]) 4867 oif = nla_get_u32(tb[RTA_OIF]); 4868 4869 if (tb[RTA_MARK]) 4870 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4871 4872 if (tb[RTA_UID]) 4873 fl6.flowi6_uid = make_kuid(current_user_ns(), 4874 nla_get_u32(tb[RTA_UID])); 4875 else 4876 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 4877 4878 if (tb[RTA_SPORT]) 4879 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 4880 4881 if (tb[RTA_DPORT]) 4882 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 4883 4884 if (tb[RTA_IP_PROTO]) { 4885 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 4886 &fl6.flowi6_proto, extack); 4887 if (err) 4888 goto errout; 4889 } 4890 4891 if (iif) { 4892 struct net_device *dev; 4893 int flags = 0; 4894 4895 rcu_read_lock(); 4896 4897 dev = dev_get_by_index_rcu(net, iif); 4898 if (!dev) { 4899 rcu_read_unlock(); 4900 err = -ENODEV; 4901 goto errout; 4902 } 4903 4904 fl6.flowi6_iif = iif; 4905 4906 if (!ipv6_addr_any(&fl6.saddr)) 4907 flags |= RT6_LOOKUP_F_HAS_SADDR; 4908 4909 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 4910 4911 rcu_read_unlock(); 4912 } else { 4913 fl6.flowi6_oif = oif; 4914 4915 dst = ip6_route_output(net, NULL, &fl6); 4916 } 4917 4918 4919 rt = container_of(dst, struct rt6_info, dst); 4920 if (rt->dst.error) { 4921 err = rt->dst.error; 4922 ip6_rt_put(rt); 4923 goto errout; 4924 } 4925 4926 if (rt == net->ipv6.ip6_null_entry) { 4927 err = rt->dst.error; 4928 ip6_rt_put(rt); 4929 goto errout; 4930 } 4931 4932 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 4933 if (!skb) { 4934 ip6_rt_put(rt); 4935 err = -ENOBUFS; 4936 goto errout; 4937 } 4938 4939 skb_dst_set(skb, &rt->dst); 4940 4941 rcu_read_lock(); 4942 from = rcu_dereference(rt->from); 4943 4944 if (fibmatch) 4945 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 4946 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4947 nlh->nlmsg_seq, 0); 4948 else 4949 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 4950 &fl6.saddr, iif, RTM_NEWROUTE, 4951 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 4952 0); 4953 rcu_read_unlock(); 4954 4955 if (err < 0) { 4956 kfree_skb(skb); 4957 goto errout; 4958 } 4959 4960 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 4961 errout: 4962 return err; 4963 } 4964 4965 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 4966 unsigned int nlm_flags) 4967 { 4968 struct sk_buff *skb; 4969 struct net *net = info->nl_net; 4970 u32 seq; 4971 int err; 4972 4973 err = -ENOBUFS; 4974 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0;
4975
4976 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4977 if (!skb)
4978 goto errout;
4979
4980 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4981 event, info->portid, seq, nlm_flags);
4982 if (err < 0) {
4983 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4984 WARN_ON(err == -EMSGSIZE);
4985 kfree_skb(skb);
4986 goto errout;
4987 }
4988 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4989 info->nlh, gfp_any());
4990 return;
4991 errout:
4992 if (err < 0)
4993 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4994 }
4995
4996 static int ip6_route_dev_notify(struct notifier_block *this,
4997 unsigned long event, void *ptr)
4998 {
4999 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5000 struct net *net = dev_net(dev);
5001
5002 if (!(dev->flags & IFF_LOOPBACK))
5003 return NOTIFY_OK;
5004
5005 if (event == NETDEV_REGISTER) {
5006 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5007 net->ipv6.ip6_null_entry->dst.dev = dev;
5008 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5009 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5010 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5011 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5012 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5013 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5014 #endif
5015 } else if (event == NETDEV_UNREGISTER &&
5016 dev->reg_state != NETREG_UNREGISTERED) {
5017 /* NETDEV_UNREGISTER can be fired multiple times by
5018 * netdev_wait_allrefs(). Make sure we only call this once.
5019 */
5020 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5021 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5022 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5023 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5024 #endif
5025 }
5026
5027 return NOTIFY_OK;
5028 }
5029
5030 /*
5031 * /proc (usage notes for these entries appear at the end of this file)
5032 */
5033
5034 #ifdef CONFIG_PROC_FS
5035 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5036 {
5037 struct net *net = (struct net *)seq->private;
5038 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5039 net->ipv6.rt6_stats->fib_nodes,
5040 net->ipv6.rt6_stats->fib_route_nodes,
5041 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5042 net->ipv6.rt6_stats->fib_rt_entries,
5043 net->ipv6.rt6_stats->fib_rt_cache,
5044 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5045 net->ipv6.rt6_stats->fib_discarded_routes);
5046
5047 return 0;
5048 }
5049 #endif /* CONFIG_PROC_FS */
5050
5051 #ifdef CONFIG_SYSCTL
5052
5053 static
5054 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5055 void __user *buffer, size_t *lenp, loff_t *ppos)
5056 {
5057 struct net *net;
5058 int delay;
5059 if (!write)
5060 return -EINVAL;
5061
5062 net = (struct net *)ctl->extra1;
5063 delay = net->ipv6.sysctl.flush_delay;
5064 proc_dointvec(ctl, write, buffer, lenp, ppos);
5065 fib6_run_gc(delay <= 0 ?
0 : (unsigned long)delay, net, delay > 0); 5066 return 0; 5067 } 5068 5069 struct ctl_table ipv6_route_table_template[] = { 5070 { 5071 .procname = "flush", 5072 .data = &init_net.ipv6.sysctl.flush_delay, 5073 .maxlen = sizeof(int), 5074 .mode = 0200, 5075 .proc_handler = ipv6_sysctl_rtcache_flush 5076 }, 5077 { 5078 .procname = "gc_thresh", 5079 .data = &ip6_dst_ops_template.gc_thresh, 5080 .maxlen = sizeof(int), 5081 .mode = 0644, 5082 .proc_handler = proc_dointvec, 5083 }, 5084 { 5085 .procname = "max_size", 5086 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5087 .maxlen = sizeof(int), 5088 .mode = 0644, 5089 .proc_handler = proc_dointvec, 5090 }, 5091 { 5092 .procname = "gc_min_interval", 5093 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5094 .maxlen = sizeof(int), 5095 .mode = 0644, 5096 .proc_handler = proc_dointvec_jiffies, 5097 }, 5098 { 5099 .procname = "gc_timeout", 5100 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5101 .maxlen = sizeof(int), 5102 .mode = 0644, 5103 .proc_handler = proc_dointvec_jiffies, 5104 }, 5105 { 5106 .procname = "gc_interval", 5107 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5108 .maxlen = sizeof(int), 5109 .mode = 0644, 5110 .proc_handler = proc_dointvec_jiffies, 5111 }, 5112 { 5113 .procname = "gc_elasticity", 5114 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5115 .maxlen = sizeof(int), 5116 .mode = 0644, 5117 .proc_handler = proc_dointvec, 5118 }, 5119 { 5120 .procname = "mtu_expires", 5121 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5122 .maxlen = sizeof(int), 5123 .mode = 0644, 5124 .proc_handler = proc_dointvec_jiffies, 5125 }, 5126 { 5127 .procname = "min_adv_mss", 5128 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5129 .maxlen = sizeof(int), 5130 .mode = 0644, 5131 .proc_handler = proc_dointvec, 5132 }, 5133 { 5134 .procname = "gc_min_interval_ms", 5135 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5136 .maxlen = sizeof(int), 5137 .mode = 0644, 5138 .proc_handler = proc_dointvec_ms_jiffies, 5139 }, 5140 { } 5141 }; 5142 5143 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5144 { 5145 struct ctl_table *table; 5146 5147 table = kmemdup(ipv6_route_table_template, 5148 sizeof(ipv6_route_table_template), 5149 GFP_KERNEL); 5150 5151 if (table) { 5152 table[0].data = &net->ipv6.sysctl.flush_delay; 5153 table[0].extra1 = net; 5154 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5155 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5156 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5157 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5158 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5159 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5160 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5161 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5162 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5163 5164 /* Don't export sysctls to unprivileged users */ 5165 if (net->user_ns != &init_user_ns) 5166 table[0].procname = NULL; 5167 } 5168 5169 return table; 5170 } 5171 #endif 5172 5173 static int __net_init ip6_route_net_init(struct net *net) 5174 { 5175 int ret = -ENOMEM; 5176 5177 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5178 sizeof(net->ipv6.ip6_dst_ops)); 5179 5180 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5181 goto out_ip6_dst_ops; 5182 5183 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5184 sizeof(*net->ipv6.fib6_null_entry), 5185 GFP_KERNEL); 5186 if (!net->ipv6.fib6_null_entry) 5187 goto 
out_ip6_dst_entries; 5188 5189 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5190 sizeof(*net->ipv6.ip6_null_entry), 5191 GFP_KERNEL); 5192 if (!net->ipv6.ip6_null_entry) 5193 goto out_fib6_null_entry; 5194 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5195 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5196 ip6_template_metrics, true); 5197 5198 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5199 net->ipv6.fib6_has_custom_rules = false; 5200 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5201 sizeof(*net->ipv6.ip6_prohibit_entry), 5202 GFP_KERNEL); 5203 if (!net->ipv6.ip6_prohibit_entry) 5204 goto out_ip6_null_entry; 5205 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5206 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5207 ip6_template_metrics, true); 5208 5209 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5210 sizeof(*net->ipv6.ip6_blk_hole_entry), 5211 GFP_KERNEL); 5212 if (!net->ipv6.ip6_blk_hole_entry) 5213 goto out_ip6_prohibit_entry; 5214 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5215 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5216 ip6_template_metrics, true); 5217 #endif 5218 5219 net->ipv6.sysctl.flush_delay = 0; 5220 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5221 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5222 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5223 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5224 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5225 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5226 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5227 5228 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5229 5230 ret = 0; 5231 out: 5232 return ret; 5233 5234 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5235 out_ip6_prohibit_entry: 5236 kfree(net->ipv6.ip6_prohibit_entry); 5237 out_ip6_null_entry: 5238 kfree(net->ipv6.ip6_null_entry); 5239 #endif 5240 out_fib6_null_entry: 5241 kfree(net->ipv6.fib6_null_entry); 5242 out_ip6_dst_entries: 5243 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5244 out_ip6_dst_ops: 5245 goto out; 5246 } 5247 5248 static void __net_exit ip6_route_net_exit(struct net *net) 5249 { 5250 kfree(net->ipv6.fib6_null_entry); 5251 kfree(net->ipv6.ip6_null_entry); 5252 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5253 kfree(net->ipv6.ip6_prohibit_entry); 5254 kfree(net->ipv6.ip6_blk_hole_entry); 5255 #endif 5256 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5257 } 5258 5259 static int __net_init ip6_route_net_init_late(struct net *net) 5260 { 5261 #ifdef CONFIG_PROC_FS 5262 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5263 sizeof(struct ipv6_route_iter)); 5264 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5265 rt6_stats_seq_show, NULL); 5266 #endif 5267 return 0; 5268 } 5269 5270 static void __net_exit ip6_route_net_exit_late(struct net *net) 5271 { 5272 #ifdef CONFIG_PROC_FS 5273 remove_proc_entry("ipv6_route", net->proc_net); 5274 remove_proc_entry("rt6_stats", net->proc_net); 5275 #endif 5276 } 5277 5278 static struct pernet_operations ip6_route_net_ops = { 5279 .init = ip6_route_net_init, 5280 .exit = ip6_route_net_exit, 5281 }; 5282 5283 static int __net_init ipv6_inetpeer_init(struct net *net) 5284 { 5285 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5286 5287 if (!bp) 5288 return -ENOMEM; 5289 inet_peer_base_init(bp); 5290 net->ipv6.peers = bp; 5291 return 0; 5292 } 5293 5294 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5295 { 5296 struct inet_peer_base *bp = net->ipv6.peers; 5297 5298 
net->ipv6.peers = NULL; 5299 inetpeer_invalidate_tree(bp); 5300 kfree(bp); 5301 } 5302 5303 static struct pernet_operations ipv6_inetpeer_ops = { 5304 .init = ipv6_inetpeer_init, 5305 .exit = ipv6_inetpeer_exit, 5306 }; 5307 5308 static struct pernet_operations ip6_route_net_late_ops = { 5309 .init = ip6_route_net_init_late, 5310 .exit = ip6_route_net_exit_late, 5311 }; 5312 5313 static struct notifier_block ip6_route_dev_notifier = { 5314 .notifier_call = ip6_route_dev_notify, 5315 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5316 }; 5317 5318 void __init ip6_route_init_special_entries(void) 5319 { 5320 /* Registering of the loopback is done before this portion of code, 5321 * the loopback reference in rt6_info will not be taken, do it 5322 * manually for init_net */ 5323 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; 5324 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5325 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5326 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5327 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5328 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5329 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5330 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5331 #endif 5332 } 5333 5334 int __init ip6_route_init(void) 5335 { 5336 int ret; 5337 int cpu; 5338 5339 ret = -ENOMEM; 5340 ip6_dst_ops_template.kmem_cachep = 5341 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5342 SLAB_HWCACHE_ALIGN, NULL); 5343 if (!ip6_dst_ops_template.kmem_cachep) 5344 goto out; 5345 5346 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5347 if (ret) 5348 goto out_kmem_cache; 5349 5350 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5351 if (ret) 5352 goto out_dst_entries; 5353 5354 ret = register_pernet_subsys(&ip6_route_net_ops); 5355 if (ret) 5356 goto out_register_inetpeer; 5357 5358 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5359 5360 ret = fib6_init(); 5361 if (ret) 5362 goto out_register_subsys; 5363 5364 ret = xfrm6_init(); 5365 if (ret) 5366 goto out_fib6_init; 5367 5368 ret = fib6_rules_init(); 5369 if (ret) 5370 goto xfrm6_init; 5371 5372 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5373 if (ret) 5374 goto fib6_rules_init; 5375 5376 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5377 inet6_rtm_newroute, NULL, 0); 5378 if (ret < 0) 5379 goto out_register_late_subsys; 5380 5381 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5382 inet6_rtm_delroute, NULL, 0); 5383 if (ret < 0) 5384 goto out_register_late_subsys; 5385 5386 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5387 inet6_rtm_getroute, NULL, 5388 RTNL_FLAG_DOIT_UNLOCKED); 5389 if (ret < 0) 5390 goto out_register_late_subsys; 5391 5392 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5393 if (ret) 5394 goto out_register_late_subsys; 5395 5396 for_each_possible_cpu(cpu) { 5397 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 5398 5399 INIT_LIST_HEAD(&ul->head); 5400 spin_lock_init(&ul->lock); 5401 } 5402 5403 out: 5404 return ret; 5405 5406 out_register_late_subsys: 5407 rtnl_unregister_all(PF_INET6); 5408 unregister_pernet_subsys(&ip6_route_net_late_ops); 5409 fib6_rules_init: 5410 fib6_rules_cleanup(); 5411 xfrm6_init: 5412 xfrm6_fini(); 5413 out_fib6_init: 5414 fib6_gc_cleanup(); 5415 out_register_subsys: 5416 
unregister_pernet_subsys(&ip6_route_net_ops); 5417 out_register_inetpeer: 5418 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5419 out_dst_entries: 5420 dst_entries_destroy(&ip6_dst_blackhole_ops); 5421 out_kmem_cache: 5422 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5423 goto out; 5424 } 5425 5426 void ip6_route_cleanup(void) 5427 { 5428 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5429 unregister_pernet_subsys(&ip6_route_net_late_ops); 5430 fib6_rules_cleanup(); 5431 xfrm6_fini(); 5432 fib6_gc_cleanup(); 5433 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5434 unregister_pernet_subsys(&ip6_route_net_ops); 5435 dst_entries_destroy(&ip6_dst_blackhole_ops); 5436 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5437 } 5438
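/* Worked example for the NETDEV_DOWN accounting in fib6_ifdown() above
 * (illustrative; device names are placeholders): take a route with three
 * nexthops, so each fib6_info in the group has fib6_nsiblings == 2. When
 * eth0 goes down, rt6_multipath_dead_count() counts the nexthops that
 * sit on eth0 or are already marked RTNH_F_DEAD. Only when that count
 * reaches fib6_nsiblings + 1 == 3, i.e. every nexthop in the group, is
 * the whole route flushed by rt6_multipath_flush(); otherwise the
 * affected nexthops are merely flagged RTNH_F_DEAD | RTNH_F_LINKDOWN and
 * traffic is redistributed over the survivors by
 * rt6_multipath_rebalance().
 */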
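/* The MTU walk in rt6_mtu_change_route() is typically triggered
 * administratively, e.g. "ip link set dev eth0 mtu 9000" (device and
 * value are examples): the NETDEV_CHANGEMTU notifier, handled in
 * addrconf, calls rt6_mtu_change(), and rt6_mtu_change_route() then
 * re-syncs RTAX_MTU on routes over that device, clamping routes whose
 * PMTU now exceeds the link MTU and raising routes that were tracking
 * the old device MTU. A metric lock ("ip -6 route add ... mtu lock
 * 1280") exempts a route via fib6_metric_locked().
 */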
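/* For reference, a minimal userspace sketch of the RTM_NEWROUTE request
 * that rtm_to_fib6_config() parses. This is illustrative only: the
 * prefix, gateway and device name are placeholders, and error handling
 * plus the NLM_F_ACK read-back are elided.
 *
 *	#include <arpa/inet.h>
 *	#include <linux/netlink.h>
 *	#include <linux/rtnetlink.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void add_rta(struct nlmsghdr *nlh, unsigned short type,
 *			    const void *data, int len)
 *	{
 *		struct rtattr *rta = (struct rtattr *)
 *			((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));
 *
 *		rta->rta_type = type;
 *		rta->rta_len = RTA_LENGTH(len);
 *		memcpy(RTA_DATA(rta), data, len);
 *		nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) +
 *				 RTA_ALIGN(rta->rta_len);
 *	}
 *
 *	int main(void)
 *	{
 *		struct {
 *			struct nlmsghdr nlh;
 *			struct rtmsg rtm;
 *			char attrs[128];
 *		} req = {
 *			.nlh = {
 *				.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
 *				.nlmsg_type = RTM_NEWROUTE,
 *				.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
 *					       NLM_F_EXCL | NLM_F_ACK,
 *			},
 *			.rtm = {
 *				.rtm_family = AF_INET6,
 *				.rtm_dst_len = 64,
 *				.rtm_table = RT_TABLE_MAIN,
 *				.rtm_protocol = RTPROT_STATIC,
 *				.rtm_scope = RT_SCOPE_UNIVERSE,
 *				.rtm_type = RTN_UNICAST,
 *			},
 *		};
 *		struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
 *		struct in6_addr dst, gw;
 *		unsigned int oif = if_nametoindex("eth0");
 *		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *		inet_pton(AF_INET6, "2001:db8:1::", &dst);
 *		inet_pton(AF_INET6, "fe80::1", &gw);
 *		add_rta(&req.nlh, RTA_DST, &dst, sizeof(dst));
 *		add_rta(&req.nlh, RTA_GATEWAY, &gw, sizeof(gw));
 *		add_rta(&req.nlh, RTA_OIF, &oif, sizeof(oif));
 *		sendto(fd, &req, req.nlh.nlmsg_len, 0,
 *		       (struct sockaddr *)&sa, sizeof(sa));
 *		close(fd);
 *		return 0;
 *	}
 */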
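/* Usage notes for the RTA_MULTIPATH handlers. A hedged example (the
 * prefix, link-local gateways and device names are placeholders):
 *
 *	ip -6 route add 2001:db8:100::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 2
 *
 * arrives as a single RTM_NEWROUTE whose RTA_MULTIPATH attribute holds a
 * list of struct rtnexthop entries. ip6_route_multipath_add() builds one
 * fib6_info per entry and stores nh_weight = rtnh_hops + 1 (iproute2's
 * "weight N" is carried as rtnh_hops == N - 1); rt6_add_nexthop()
 * reverses this when dumping. With NLM_F_REPLACE ("ip -6 route replace
 * ..."), the first nexthop replaces the old route and the rest are
 * appended to it, which is why the insert loop strips NLM_F_EXCL |
 * NLM_F_REPLACE after the first nexthop. "ip -6 route del ... nexthop
 * ..." takes the ip6_route_multipath_del() path, deleting entry by
 * entry.
 */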
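/* A sizing note on rt6_nlmsg_size(): for a route with
 * fib6_nsiblings == N, the fixed part (rtmsg plus the always-reserved
 * attributes, including one RTA_GATEWAY and the lwtunnel encap for the
 * route's own nexthop) is reserved once, and nexthop_len (rtnexthop
 * header, RTA_GATEWAY, encap) N more times for the siblings. The
 * estimate intentionally errs high; reserving too little would surface
 * as the WARN_ON(err == -EMSGSIZE) in inet6_rt_notify().
 */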
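/* "ip -6 route get 2001:db8::1" (address is an example) lands in
 * inet6_rtm_getroute(): without RTA_IIF the kernel resolves via
 * ip6_route_output(), with one via ip6_route_input_lookup(), and answers
 * with a single RTM_NEWROUTE. Setting RTM_F_FIB_MATCH, spelled
 * "ip -6 route get fibmatch 2001:db8::1" in iproute2, reports the
 * matching FIB entry instead of the resolved destination, which is why
 * that branch calls rt6_fill_node() without the dst/daddr/saddr
 * arguments.
 */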
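/* inet6_rt_notify() multicasts each add/delete to RTNLGRP_IPV6_ROUTE.
 * Any subscriber to that group sees the messages; "ip -6 monitor route"
 * is a convenient listener that simply binds a netlink socket to the
 * group and decodes the RTM_NEWROUTE/RTM_DELROUTE notifications
 * produced here.
 */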
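/* The sysctl table is exposed per network namespace under
 * /proc/sys/net/ipv6/route/. "flush" is write-only (mode 0200); writing
 * to it, e.g. "echo 1 > /proc/sys/net/ipv6/route/flush", invokes
 * ipv6_sysctl_rtcache_flush() and thus fib6_run_gc(). The counters from
 * rt6_stats_seq_show() appear as seven hex words in /proc/net/rt6_stats,
 * in order: fib nodes, route nodes, route allocations, route entries,
 * cached routes, dst entries, discarded routes. A sample line would look
 * like "0010 000c 0000 0015 0000 0002 0000" (values invented for
 * illustration).
 */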