1 /* 2 * Linux INET6 implementation 3 * FIB front-end. 4 * 5 * Authors: 6 * Pedro Roque <roque@di.fc.ul.pt> 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public License 10 * as published by the Free Software Foundation; either version 11 * 2 of the License, or (at your option) any later version. 12 */ 13 14 /* Changes: 15 * 16 * YOSHIFUJI Hideaki @USAGI 17 * reworked default router selection. 18 * - respect outgoing interface 19 * - select from (probably) reachable routers (i.e. 20 * routers in REACHABLE, STALE, DELAY or PROBE states). 21 * - always select the same router if it is (probably) 22 * reachable. otherwise, round-robin the list. 23 * Ville Nuorvala 24 * Fixed routing subtrees. 25 */ 26 27 #define pr_fmt(fmt) "IPv6: " fmt 28 29 #include <linux/capability.h> 30 #include <linux/errno.h> 31 #include <linux/export.h> 32 #include <linux/types.h> 33 #include <linux/times.h> 34 #include <linux/socket.h> 35 #include <linux/sockios.h> 36 #include <linux/net.h> 37 #include <linux/route.h> 38 #include <linux/netdevice.h> 39 #include <linux/in6.h> 40 #include <linux/mroute6.h> 41 #include <linux/init.h> 42 #include <linux/if_arp.h> 43 #include <linux/proc_fs.h> 44 #include <linux/seq_file.h> 45 #include <linux/nsproxy.h> 46 #include <linux/slab.h> 47 #include <linux/jhash.h> 48 #include <net/net_namespace.h> 49 #include <net/snmp.h> 50 #include <net/ipv6.h> 51 #include <net/ip6_fib.h> 52 #include <net/ip6_route.h> 53 #include <net/ndisc.h> 54 #include <net/addrconf.h> 55 #include <net/tcp.h> 56 #include <linux/rtnetlink.h> 57 #include <net/dst.h> 58 #include <net/dst_metadata.h> 59 #include <net/xfrm.h> 60 #include <net/netevent.h> 61 #include <net/netlink.h> 62 #include <net/nexthop.h> 63 #include <net/lwtunnel.h> 64 #include <net/ip_tunnels.h> 65 #include <net/l3mdev.h> 66 #include <trace/events/fib6.h> 67 68 #include <linux/uaccess.h> 69 70 #ifdef CONFIG_SYSCTL 71 #include <linux/sysctl.h> 72 #endif 73 74 enum rt6_nud_state { 75 RT6_NUD_FAIL_HARD = -3, 76 RT6_NUD_FAIL_PROBE = -2, 77 RT6_NUD_FAIL_DO_RR = -1, 78 RT6_NUD_SUCCEED = 1 79 }; 80 81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); 82 static unsigned int ip6_default_advmss(const struct dst_entry *dst); 83 static unsigned int ip6_mtu(const struct dst_entry *dst); 84 static struct dst_entry *ip6_negative_advice(struct dst_entry *); 85 static void ip6_dst_destroy(struct dst_entry *); 86 static void ip6_dst_ifdown(struct dst_entry *, 87 struct net_device *dev, int how); 88 static int ip6_dst_gc(struct dst_ops *ops); 89 90 static int ip6_pkt_discard(struct sk_buff *skb); 91 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); 92 static int ip6_pkt_prohibit(struct sk_buff *skb); 93 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); 94 static void ip6_link_failure(struct sk_buff *skb); 95 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 96 struct sk_buff *skb, u32 mtu); 97 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, 98 struct sk_buff *skb); 99 static int rt6_score_route(struct fib6_info *rt, int oif, int strict); 100 static size_t rt6_nlmsg_size(struct fib6_info *rt); 101 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 102 struct fib6_info *rt, struct dst_entry *dst, 103 struct in6_addr *dest, struct in6_addr *src, 104 int iif, int type, u32 portid, u32 seq, 105 unsigned int flags); 106 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 107 struct in6_addr *daddr, 108 struct in6_addr *saddr); 109 110 #ifdef CONFIG_IPV6_ROUTE_INFO 111 static struct fib6_info *rt6_add_route_info(struct net *net, 112 const struct in6_addr *prefix, int prefixlen, 113 const struct in6_addr *gwaddr, 114 struct net_device *dev, 115 unsigned int pref); 116 static struct fib6_info *rt6_get_route_info(struct net *net, 117 const struct in6_addr *prefix, int prefixlen, 118 const struct in6_addr *gwaddr, 119 struct net_device *dev); 120 #endif 121 122 struct uncached_list { 123 spinlock_t lock; 124 struct list_head head; 125 }; 126 127 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); 128 129 void rt6_uncached_list_add(struct rt6_info *rt) 130 { 131 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); 132 133 rt->rt6i_uncached_list = ul; 134 135 spin_lock_bh(&ul->lock); 136 list_add_tail(&rt->rt6i_uncached, &ul->head); 137 spin_unlock_bh(&ul->lock); 138 } 139 140 void rt6_uncached_list_del(struct rt6_info *rt) 141 { 142 if (!list_empty(&rt->rt6i_uncached)) { 143 struct uncached_list *ul = rt->rt6i_uncached_list; 144 struct net *net = dev_net(rt->dst.dev); 145 146 spin_lock_bh(&ul->lock); 147 list_del(&rt->rt6i_uncached); 148 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); 149 spin_unlock_bh(&ul->lock); 150 } 151 } 152 153 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) 154 { 155 struct net_device *loopback_dev = net->loopback_dev; 156 int cpu; 157 158 if (dev == loopback_dev) 159 return; 160 161 for_each_possible_cpu(cpu) { 162 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 163 struct rt6_info *rt; 164 165 spin_lock_bh(&ul->lock); 166 list_for_each_entry(rt, &ul->head, rt6i_uncached) { 167 struct inet6_dev *rt_idev = rt->rt6i_idev; 168 struct net_device *rt_dev = rt->dst.dev; 169 170 if (rt_idev->dev == dev) { 171 rt->rt6i_idev = in6_dev_get(loopback_dev); 172 in6_dev_put(rt_idev); 173 } 174 175 if (rt_dev == dev) { 176 rt->dst.dev = loopback_dev; 177 dev_hold(rt->dst.dev); 178 dev_put(rt_dev); 179 } 180 } 181 spin_unlock_bh(&ul->lock); 182 } 183 } 184 185 static inline const void *choose_neigh_daddr(const struct in6_addr *p, 186 struct sk_buff *skb, 187 const void *daddr) 188 { 189 if (!ipv6_addr_any(p)) 190 return (const void *) p; 191 else if (skb) 192 return &ipv6_hdr(skb)->daddr; 193 return daddr; 194 } 195 196 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, 197 struct net_device *dev, 198 struct sk_buff *skb, 199 const void *daddr) 200 { 201 struct neighbour *n; 202 203 daddr = choose_neigh_daddr(gw, skb, daddr); 204 n = __ipv6_neigh_lookup(dev, daddr); 205 if (n) 206 return n; 207 return neigh_create(&nd_tbl, daddr, dev); 208 } 209 210 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, 211 struct sk_buff *skb, 212 const void *daddr) 213 { 214 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); 215 216 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr); 217 } 218 219 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) 220 { 221 struct net_device *dev = dst->dev; 222 struct rt6_info *rt = (struct rt6_info *)dst; 223 224 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); 225 if (!daddr) 226 return; 227 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) 228 return; 229 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) 230 return; 231 __ipv6_confirm_neigh(dev, daddr); 232 } 233 234 static struct dst_ops ip6_dst_ops_template = { 235 .family = AF_INET6, 236 .gc = ip6_dst_gc, 237 .gc_thresh = 1024, 238 .check = ip6_dst_check, 239 .default_advmss = ip6_default_advmss, 240 .mtu = ip6_mtu, 241 .cow_metrics = dst_cow_metrics_generic, 242 .destroy = ip6_dst_destroy, 243 .ifdown = ip6_dst_ifdown, 244 .negative_advice = ip6_negative_advice, 245 .link_failure = ip6_link_failure, 246 .update_pmtu = ip6_rt_update_pmtu, 247 .redirect = rt6_do_redirect, 248 .local_out = __ip6_local_out, 249 .neigh_lookup = ip6_dst_neigh_lookup, 250 .confirm_neigh = ip6_confirm_neigh, 251 }; 252 253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) 254 { 255 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 256 257 return mtu ? : dst->dev->mtu; 258 } 259 260 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 261 struct sk_buff *skb, u32 mtu) 262 { 263 } 264 265 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 266 struct sk_buff *skb) 267 { 268 } 269 270 static struct dst_ops ip6_dst_blackhole_ops = { 271 .family = AF_INET6, 272 .destroy = ip6_dst_destroy, 273 .check = ip6_dst_check, 274 .mtu = ip6_blackhole_mtu, 275 .default_advmss = ip6_default_advmss, 276 .update_pmtu = ip6_rt_blackhole_update_pmtu, 277 .redirect = ip6_rt_blackhole_redirect, 278 .cow_metrics = dst_cow_metrics_generic, 279 .neigh_lookup = ip6_dst_neigh_lookup, 280 }; 281 282 static const u32 ip6_template_metrics[RTAX_MAX] = { 283 [RTAX_HOPLIMIT - 1] = 0, 284 }; 285 286 static const struct fib6_info fib6_null_entry_template = { 287 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), 288 .fib6_protocol = RTPROT_KERNEL, 289 .fib6_metric = ~(u32)0, 290 .fib6_ref = ATOMIC_INIT(1), 291 .fib6_type = RTN_UNREACHABLE, 292 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, 293 }; 294 295 static const struct rt6_info ip6_null_entry_template = { 296 .dst = { 297 .__refcnt = ATOMIC_INIT(1), 298 .__use = 1, 299 .obsolete = DST_OBSOLETE_FORCE_CHK, 300 .error = -ENETUNREACH, 301 .input = ip6_pkt_discard, 302 .output = ip6_pkt_discard_out, 303 }, 304 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 305 }; 306 307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 308 309 static const struct rt6_info ip6_prohibit_entry_template = { 310 .dst = { 311 .__refcnt = ATOMIC_INIT(1), 312 .__use = 1, 313 .obsolete = DST_OBSOLETE_FORCE_CHK, 314 .error = -EACCES, 315 .input = ip6_pkt_prohibit, 316 .output = ip6_pkt_prohibit_out, 317 }, 318 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 319 }; 320 321 static const struct rt6_info ip6_blk_hole_entry_template = { 322 .dst = { 323 .__refcnt = ATOMIC_INIT(1), 324 .__use = 1, 325 .obsolete = DST_OBSOLETE_FORCE_CHK, 326 .error = -EINVAL, 327 .input = dst_discard, 328 .output = dst_discard_out, 329 }, 330 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 331 }; 332 333 #endif 334 335 static void rt6_info_init(struct rt6_info *rt) 336 { 337 struct dst_entry *dst = &rt->dst; 338 339 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); 340 INIT_LIST_HEAD(&rt->rt6i_uncached); 341 } 342 343 /* allocate dst with ip6_dst_ops */ 344 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, 345 int flags) 346 { 347 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 348 1, DST_OBSOLETE_FORCE_CHK, flags); 349 350 if (rt) { 351 rt6_info_init(rt); 352 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 353 } 354 355 return rt; 356 } 357 EXPORT_SYMBOL(ip6_dst_alloc); 358 359 static void ip6_dst_destroy(struct dst_entry *dst) 360 { 361 struct rt6_info *rt = (struct rt6_info *)dst; 362 struct fib6_info *from = rt->from; 363 struct inet6_dev *idev; 364 365 dst_destroy_metrics_generic(dst); 366 rt6_uncached_list_del(rt); 367 368 idev = rt->rt6i_idev; 369 if (idev) { 370 rt->rt6i_idev = NULL; 371 in6_dev_put(idev); 372 } 373 374 rt->from = NULL; 375 fib6_info_release(from); 376 } 377 378 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 379 int how) 380 { 381 struct rt6_info *rt = (struct rt6_info *)dst; 382 struct inet6_dev *idev = rt->rt6i_idev; 383 struct net_device *loopback_dev = 384 dev_net(dev)->loopback_dev; 385 386 if (idev && idev->dev != loopback_dev) { 387 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); 388 if (loopback_idev) { 389 rt->rt6i_idev = loopback_idev; 390 in6_dev_put(idev); 391 } 392 } 393 } 394 395 static bool __rt6_check_expired(const struct rt6_info *rt) 396 { 397 if (rt->rt6i_flags & RTF_EXPIRES) 398 return time_after(jiffies, rt->dst.expires); 399 else 400 return false; 401 } 402 403 static bool rt6_check_expired(const struct rt6_info *rt) 404 { 405 if (rt->rt6i_flags & RTF_EXPIRES) { 406 if (time_after(jiffies, rt->dst.expires)) 407 return true; 408 } else if (rt->from) { 409 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || 410 fib6_check_expired(rt->from); 411 } 412 return false; 413 } 414 415 static struct fib6_info *rt6_multipath_select(const struct net *net, 416 struct fib6_info *match, 417 struct flowi6 *fl6, int oif, 418 const struct sk_buff *skb, 419 int strict) 420 { 421 struct fib6_info *sibling, *next_sibling; 422 423 /* We might have already computed the hash for ICMPv6 errors. In such 424 * case it will always be non-zero. Otherwise now is the time to do it. 425 */ 426 if (!fl6->mp_hash) 427 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); 428 429 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) 430 return match; 431 432 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, 433 fib6_siblings) { 434 int nh_upper_bound; 435 436 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); 437 if (fl6->mp_hash > nh_upper_bound) 438 continue; 439 if (rt6_score_route(sibling, oif, strict) < 0) 440 break; 441 match = sibling; 442 break; 443 } 444 445 return match; 446 } 447 448 /* 449 * Route lookup. rcu_read_lock() should be held. 450 */ 451 452 static inline struct fib6_info *rt6_device_match(struct net *net, 453 struct fib6_info *rt, 454 const struct in6_addr *saddr, 455 int oif, 456 int flags) 457 { 458 struct fib6_info *local = NULL; 459 struct fib6_info *sprt; 460 461 if (!oif && ipv6_addr_any(saddr) && 462 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) 463 return rt; 464 465 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { 466 const struct net_device *dev = sprt->fib6_nh.nh_dev; 467 468 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) 469 continue; 470 471 if (oif) { 472 if (dev->ifindex == oif) 473 return sprt; 474 if (dev->flags & IFF_LOOPBACK) { 475 if (!sprt->fib6_idev || 476 sprt->fib6_idev->dev->ifindex != oif) { 477 if (flags & RT6_LOOKUP_F_IFACE) 478 continue; 479 if (local && 480 local->fib6_idev->dev->ifindex == oif) 481 continue; 482 } 483 local = sprt; 484 } 485 } else { 486 if (ipv6_chk_addr(net, saddr, dev, 487 flags & RT6_LOOKUP_F_IFACE)) 488 return sprt; 489 } 490 } 491 492 if (oif) { 493 if (local) 494 return local; 495 496 if (flags & RT6_LOOKUP_F_IFACE) 497 return net->ipv6.fib6_null_entry; 498 } 499 500 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt; 501 } 502 503 #ifdef CONFIG_IPV6_ROUTER_PREF 504 struct __rt6_probe_work { 505 struct work_struct work; 506 struct in6_addr target; 507 struct net_device *dev; 508 }; 509 510 static void rt6_probe_deferred(struct work_struct *w) 511 { 512 struct in6_addr mcaddr; 513 struct __rt6_probe_work *work = 514 container_of(w, struct __rt6_probe_work, work); 515 516 addrconf_addr_solict_mult(&work->target, &mcaddr); 517 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); 518 dev_put(work->dev); 519 kfree(work); 520 } 521 522 static void rt6_probe(struct fib6_info *rt) 523 { 524 struct __rt6_probe_work *work; 525 const struct in6_addr *nh_gw; 526 struct neighbour *neigh; 527 struct net_device *dev; 528 529 /* 530 * Okay, this does not seem to be appropriate 531 * for now, however, we need to check if it 532 * is really so; aka Router Reachability Probing. 533 * 534 * Router Reachability Probe MUST be rate-limited 535 * to no more than one per minute. 536 */ 537 if (!rt || !(rt->fib6_flags & RTF_GATEWAY)) 538 return; 539 540 nh_gw = &rt->fib6_nh.nh_gw; 541 dev = rt->fib6_nh.nh_dev; 542 rcu_read_lock_bh(); 543 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); 544 if (neigh) { 545 if (neigh->nud_state & NUD_VALID) 546 goto out; 547 548 work = NULL; 549 write_lock(&neigh->lock); 550 if (!(neigh->nud_state & NUD_VALID) && 551 time_after(jiffies, 552 neigh->updated + 553 rt->fib6_idev->cnf.rtr_probe_interval)) { 554 work = kmalloc(sizeof(*work), GFP_ATOMIC); 555 if (work) 556 __neigh_set_probe_once(neigh); 557 } 558 write_unlock(&neigh->lock); 559 } else { 560 work = kmalloc(sizeof(*work), GFP_ATOMIC); 561 } 562 563 if (work) { 564 INIT_WORK(&work->work, rt6_probe_deferred); 565 work->target = *nh_gw; 566 dev_hold(dev); 567 work->dev = dev; 568 schedule_work(&work->work); 569 } 570 571 out: 572 rcu_read_unlock_bh(); 573 } 574 #else 575 static inline void rt6_probe(struct fib6_info *rt) 576 { 577 } 578 #endif 579 580 /* 581 * Default Router Selection (RFC 2461 6.3.6) 582 */ 583 static inline int rt6_check_dev(struct fib6_info *rt, int oif) 584 { 585 const struct net_device *dev = rt->fib6_nh.nh_dev; 586 587 if (!oif || dev->ifindex == oif) 588 return 2; 589 if ((dev->flags & IFF_LOOPBACK) && 590 rt->fib6_idev && rt->fib6_idev->dev->ifindex == oif) 591 return 1; 592 return 0; 593 } 594 595 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) 596 { 597 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; 598 struct neighbour *neigh; 599 600 if (rt->fib6_flags & RTF_NONEXTHOP || 601 !(rt->fib6_flags & RTF_GATEWAY)) 602 return RT6_NUD_SUCCEED; 603 604 rcu_read_lock_bh(); 605 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev, 606 &rt->fib6_nh.nh_gw); 607 if (neigh) { 608 read_lock(&neigh->lock); 609 if (neigh->nud_state & NUD_VALID) 610 ret = RT6_NUD_SUCCEED; 611 #ifdef CONFIG_IPV6_ROUTER_PREF 612 else if (!(neigh->nud_state & NUD_FAILED)) 613 ret = RT6_NUD_SUCCEED; 614 else 615 ret = RT6_NUD_FAIL_PROBE; 616 #endif 617 read_unlock(&neigh->lock); 618 } else { 619 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? 620 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; 621 } 622 rcu_read_unlock_bh(); 623 624 return ret; 625 } 626 627 static int rt6_score_route(struct fib6_info *rt, int oif, int strict) 628 { 629 int m; 630 631 m = rt6_check_dev(rt, oif); 632 if (!m && (strict & RT6_LOOKUP_F_IFACE)) 633 return RT6_NUD_FAIL_HARD; 634 #ifdef CONFIG_IPV6_ROUTER_PREF 635 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; 636 #endif 637 if (strict & RT6_LOOKUP_F_REACHABLE) { 638 int n = rt6_check_neigh(rt); 639 if (n < 0) 640 return n; 641 } 642 return m; 643 } 644 645 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, 646 int *mpri, struct fib6_info *match, 647 bool *do_rr) 648 { 649 int m; 650 bool match_do_rr = false; 651 struct inet6_dev *idev = rt->fib6_idev; 652 653 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 654 goto out; 655 656 if (idev->cnf.ignore_routes_with_linkdown && 657 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 658 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) 659 goto out; 660 661 if (fib6_check_expired(rt)) 662 goto out; 663 664 m = rt6_score_route(rt, oif, strict); 665 if (m == RT6_NUD_FAIL_DO_RR) { 666 match_do_rr = true; 667 m = 0; /* lowest valid score */ 668 } else if (m == RT6_NUD_FAIL_HARD) { 669 goto out; 670 } 671 672 if (strict & RT6_LOOKUP_F_REACHABLE) 673 rt6_probe(rt); 674 675 /* note that m can be RT6_NUD_FAIL_PROBE at this point */ 676 if (m > *mpri) { 677 *do_rr = match_do_rr; 678 *mpri = m; 679 match = rt; 680 } 681 out: 682 return match; 683 } 684 685 static struct fib6_info *find_rr_leaf(struct fib6_node *fn, 686 struct fib6_info *leaf, 687 struct fib6_info *rr_head, 688 u32 metric, int oif, int strict, 689 bool *do_rr) 690 { 691 struct fib6_info *rt, *match, *cont; 692 int mpri = -1; 693 694 match = NULL; 695 cont = NULL; 696 for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) { 697 if (rt->fib6_metric != metric) { 698 cont = rt; 699 break; 700 } 701 702 match = find_match(rt, oif, strict, &mpri, match, do_rr); 703 } 704 705 for (rt = leaf; rt && rt != rr_head; 706 rt = rcu_dereference(rt->rt6_next)) { 707 if (rt->fib6_metric != metric) { 708 cont = rt; 709 break; 710 } 711 712 match = find_match(rt, oif, strict, &mpri, match, do_rr); 713 } 714 715 if (match || !cont) 716 return match; 717 718 for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next)) 719 match = find_match(rt, oif, strict, &mpri, match, do_rr); 720 721 return match; 722 } 723 724 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, 725 int oif, int strict) 726 { 727 struct fib6_info *leaf = rcu_dereference(fn->leaf); 728 struct fib6_info *match, *rt0; 729 bool do_rr = false; 730 int key_plen; 731 732 if (!leaf || leaf == net->ipv6.fib6_null_entry) 733 return net->ipv6.fib6_null_entry; 734 735 rt0 = rcu_dereference(fn->rr_ptr); 736 if (!rt0) 737 rt0 = leaf; 738 739 /* Double check to make sure fn is not an intermediate node 740 * and fn->leaf does not points to its child's leaf 741 * (This might happen if all routes under fn are deleted from 742 * the tree and fib6_repair_tree() is called on the node.) 743 */ 744 key_plen = rt0->fib6_dst.plen; 745 #ifdef CONFIG_IPV6_SUBTREES 746 if (rt0->fib6_src.plen) 747 key_plen = rt0->fib6_src.plen; 748 #endif 749 if (fn->fn_bit != key_plen) 750 return net->ipv6.fib6_null_entry; 751 752 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, 753 &do_rr); 754 755 if (do_rr) { 756 struct fib6_info *next = rcu_dereference(rt0->rt6_next); 757 758 /* no entries matched; do round-robin */ 759 if (!next || next->fib6_metric != rt0->fib6_metric) 760 next = leaf; 761 762 if (next != rt0) { 763 spin_lock_bh(&leaf->fib6_table->tb6_lock); 764 /* make sure next is not being deleted from the tree */ 765 if (next->fib6_node) 766 rcu_assign_pointer(fn->rr_ptr, next); 767 spin_unlock_bh(&leaf->fib6_table->tb6_lock); 768 } 769 } 770 771 return match ? match : net->ipv6.fib6_null_entry; 772 } 773 774 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) 775 { 776 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); 777 } 778 779 #ifdef CONFIG_IPV6_ROUTE_INFO 780 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, 781 const struct in6_addr *gwaddr) 782 { 783 struct net *net = dev_net(dev); 784 struct route_info *rinfo = (struct route_info *) opt; 785 struct in6_addr prefix_buf, *prefix; 786 unsigned int pref; 787 unsigned long lifetime; 788 struct fib6_info *rt; 789 790 if (len < sizeof(struct route_info)) { 791 return -EINVAL; 792 } 793 794 /* Sanity check for prefix_len and length */ 795 if (rinfo->length > 3) { 796 return -EINVAL; 797 } else if (rinfo->prefix_len > 128) { 798 return -EINVAL; 799 } else if (rinfo->prefix_len > 64) { 800 if (rinfo->length < 2) { 801 return -EINVAL; 802 } 803 } else if (rinfo->prefix_len > 0) { 804 if (rinfo->length < 1) { 805 return -EINVAL; 806 } 807 } 808 809 pref = rinfo->route_pref; 810 if (pref == ICMPV6_ROUTER_PREF_INVALID) 811 return -EINVAL; 812 813 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); 814 815 if (rinfo->length == 3) 816 prefix = (struct in6_addr *)rinfo->prefix; 817 else { 818 /* this function is safe */ 819 ipv6_addr_prefix(&prefix_buf, 820 (struct in6_addr *)rinfo->prefix, 821 rinfo->prefix_len); 822 prefix = &prefix_buf; 823 } 824 825 if (rinfo->prefix_len == 0) 826 rt = rt6_get_dflt_router(net, gwaddr, dev); 827 else 828 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, 829 gwaddr, dev); 830 831 if (rt && !lifetime) { 832 ip6_del_rt(net, rt); 833 rt = NULL; 834 } 835 836 if (!rt && lifetime) 837 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, 838 dev, pref); 839 else if (rt) 840 rt->fib6_flags = RTF_ROUTEINFO | 841 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 842 843 if (rt) { 844 if (!addrconf_finite_timeout(lifetime)) 845 fib6_clean_expires(rt); 846 else 847 fib6_set_expires(rt, jiffies + HZ * lifetime); 848 849 fib6_info_release(rt); 850 } 851 return 0; 852 } 853 #endif 854 855 /* 856 * Misc support functions 857 */ 858 859 /* called with rcu_lock held */ 860 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) 861 { 862 struct net_device *dev = rt->fib6_nh.nh_dev; 863 864 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { 865 /* for copies of local routes, dst->dev needs to be the 866 * device if it is a master device, the master device if 867 * device is enslaved, and the loopback as the default 868 */ 869 if (netif_is_l3_slave(dev) && 870 !rt6_need_strict(&rt->fib6_dst.addr)) 871 dev = l3mdev_master_dev_rcu(dev); 872 else if (!netif_is_l3_master(dev)) 873 dev = dev_net(dev)->loopback_dev; 874 /* last case is netif_is_l3_master(dev) is true in which 875 * case we want dev returned to be dev 876 */ 877 } 878 879 return dev; 880 } 881 882 static const int fib6_prop[RTN_MAX + 1] = { 883 [RTN_UNSPEC] = 0, 884 [RTN_UNICAST] = 0, 885 [RTN_LOCAL] = 0, 886 [RTN_BROADCAST] = 0, 887 [RTN_ANYCAST] = 0, 888 [RTN_MULTICAST] = 0, 889 [RTN_BLACKHOLE] = -EINVAL, 890 [RTN_UNREACHABLE] = -EHOSTUNREACH, 891 [RTN_PROHIBIT] = -EACCES, 892 [RTN_THROW] = -EAGAIN, 893 [RTN_NAT] = -EINVAL, 894 [RTN_XRESOLVE] = -EINVAL, 895 }; 896 897 static int ip6_rt_type_to_error(u8 fib6_type) 898 { 899 return fib6_prop[fib6_type]; 900 } 901 902 static unsigned short fib6_info_dst_flags(struct fib6_info *rt) 903 { 904 unsigned short flags = 0; 905 906 if (rt->dst_nocount) 907 flags |= DST_NOCOUNT; 908 if (rt->dst_nopolicy) 909 flags |= DST_NOPOLICY; 910 if (rt->dst_host) 911 flags |= DST_HOST; 912 913 return flags; 914 } 915 916 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) 917 { 918 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); 919 920 switch (ort->fib6_type) { 921 case RTN_BLACKHOLE: 922 rt->dst.output = dst_discard_out; 923 rt->dst.input = dst_discard; 924 break; 925 case RTN_PROHIBIT: 926 rt->dst.output = ip6_pkt_prohibit_out; 927 rt->dst.input = ip6_pkt_prohibit; 928 break; 929 case RTN_THROW: 930 case RTN_UNREACHABLE: 931 default: 932 rt->dst.output = ip6_pkt_discard_out; 933 rt->dst.input = ip6_pkt_discard; 934 break; 935 } 936 } 937 938 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) 939 { 940 rt->dst.flags |= fib6_info_dst_flags(ort); 941 942 if (ort->fib6_flags & RTF_REJECT) { 943 ip6_rt_init_dst_reject(rt, ort); 944 return; 945 } 946 947 rt->dst.error = 0; 948 rt->dst.output = ip6_output; 949 950 if (ort->fib6_type == RTN_LOCAL) { 951 rt->dst.input = ip6_input; 952 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { 953 rt->dst.input = ip6_mc_input; 954 } else { 955 rt->dst.input = ip6_forward; 956 } 957 958 if (ort->fib6_nh.nh_lwtstate) { 959 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); 960 lwtunnel_set_redirect(&rt->dst); 961 } 962 963 rt->dst.lastuse = jiffies; 964 } 965 966 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) 967 { 968 rt->rt6i_flags &= ~RTF_EXPIRES; 969 fib6_info_hold(from); 970 rt->from = from; 971 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); 972 if (from->fib6_metrics != &dst_default_metrics) { 973 rt->dst._metrics |= DST_METRICS_REFCOUNTED; 974 refcount_inc(&from->fib6_metrics->refcnt); 975 } 976 } 977 978 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) 979 { 980 ip6_rt_init_dst(rt, ort); 981 982 rt->rt6i_dst = ort->fib6_dst; 983 rt->rt6i_idev = ort->fib6_idev; 984 if (rt->rt6i_idev) 985 in6_dev_hold(rt->rt6i_idev); 986 rt->rt6i_gateway = ort->fib6_nh.nh_gw; 987 rt->rt6i_flags = ort->fib6_flags; 988 rt6_set_from(rt, ort); 989 #ifdef CONFIG_IPV6_SUBTREES 990 rt->rt6i_src = ort->fib6_src; 991 #endif 992 rt->rt6i_prefsrc = ort->fib6_prefsrc; 993 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); 994 } 995 996 static struct fib6_node* fib6_backtrack(struct fib6_node *fn, 997 struct in6_addr *saddr) 998 { 999 struct fib6_node *pn, *sn; 1000 while (1) { 1001 if (fn->fn_flags & RTN_TL_ROOT) 1002 return NULL; 1003 pn = rcu_dereference(fn->parent); 1004 sn = FIB6_SUBTREE(pn); 1005 if (sn && sn != fn) 1006 fn = fib6_lookup(sn, NULL, saddr); 1007 else 1008 fn = pn; 1009 if (fn->fn_flags & RTN_RTINFO) 1010 return fn; 1011 } 1012 } 1013 1014 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, 1015 bool null_fallback) 1016 { 1017 struct rt6_info *rt = *prt; 1018 1019 if (dst_hold_safe(&rt->dst)) 1020 return true; 1021 if (null_fallback) { 1022 rt = net->ipv6.ip6_null_entry; 1023 dst_hold(&rt->dst); 1024 } else { 1025 rt = NULL; 1026 } 1027 *prt = rt; 1028 return false; 1029 } 1030 1031 /* called with rcu_lock held */ 1032 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) 1033 { 1034 unsigned short flags = fib6_info_dst_flags(rt); 1035 struct net_device *dev = rt->fib6_nh.nh_dev; 1036 struct rt6_info *nrt; 1037 1038 nrt = ip6_dst_alloc(dev_net(dev), dev, flags); 1039 if (nrt) 1040 ip6_rt_copy_init(nrt, rt); 1041 1042 return nrt; 1043 } 1044 1045 static struct rt6_info *ip6_pol_route_lookup(struct net *net, 1046 struct fib6_table *table, 1047 struct flowi6 *fl6, 1048 const struct sk_buff *skb, 1049 int flags) 1050 { 1051 struct fib6_info *f6i; 1052 struct fib6_node *fn; 1053 struct rt6_info *rt; 1054 1055 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1056 flags &= ~RT6_LOOKUP_F_IFACE; 1057 1058 rcu_read_lock(); 1059 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1060 restart: 1061 f6i = rcu_dereference(fn->leaf); 1062 if (!f6i) { 1063 f6i = net->ipv6.fib6_null_entry; 1064 } else { 1065 f6i = rt6_device_match(net, f6i, &fl6->saddr, 1066 fl6->flowi6_oif, flags); 1067 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) 1068 f6i = rt6_multipath_select(net, f6i, fl6, 1069 fl6->flowi6_oif, skb, flags); 1070 } 1071 if (f6i == net->ipv6.fib6_null_entry) { 1072 fn = fib6_backtrack(fn, &fl6->saddr); 1073 if (fn) 1074 goto restart; 1075 } 1076 1077 /* Search through exception table */ 1078 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1079 if (rt) { 1080 if (ip6_hold_safe(net, &rt, true)) 1081 dst_use_noref(&rt->dst, jiffies); 1082 } else if (f6i == net->ipv6.fib6_null_entry) { 1083 rt = net->ipv6.ip6_null_entry; 1084 dst_hold(&rt->dst); 1085 } else { 1086 rt = ip6_create_rt_rcu(f6i); 1087 if (!rt) { 1088 rt = net->ipv6.ip6_null_entry; 1089 dst_hold(&rt->dst); 1090 } 1091 } 1092 1093 rcu_read_unlock(); 1094 1095 trace_fib6_table_lookup(net, rt, table, fl6); 1096 1097 return rt; 1098 } 1099 1100 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, 1101 const struct sk_buff *skb, int flags) 1102 { 1103 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); 1104 } 1105 EXPORT_SYMBOL_GPL(ip6_route_lookup); 1106 1107 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, 1108 const struct in6_addr *saddr, int oif, 1109 const struct sk_buff *skb, int strict) 1110 { 1111 struct flowi6 fl6 = { 1112 .flowi6_oif = oif, 1113 .daddr = *daddr, 1114 }; 1115 struct dst_entry *dst; 1116 int flags = strict ? RT6_LOOKUP_F_IFACE : 0; 1117 1118 if (saddr) { 1119 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 1120 flags |= RT6_LOOKUP_F_HAS_SADDR; 1121 } 1122 1123 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); 1124 if (dst->error == 0) 1125 return (struct rt6_info *) dst; 1126 1127 dst_release(dst); 1128 1129 return NULL; 1130 } 1131 EXPORT_SYMBOL(rt6_lookup); 1132 1133 /* ip6_ins_rt is called with FREE table->tb6_lock. 1134 * It takes new route entry, the addition fails by any reason the 1135 * route is released. 1136 * Caller must hold dst before calling it. 1137 */ 1138 1139 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, 1140 struct netlink_ext_ack *extack) 1141 { 1142 int err; 1143 struct fib6_table *table; 1144 1145 table = rt->fib6_table; 1146 spin_lock_bh(&table->tb6_lock); 1147 err = fib6_add(&table->tb6_root, rt, info, extack); 1148 spin_unlock_bh(&table->tb6_lock); 1149 1150 return err; 1151 } 1152 1153 int ip6_ins_rt(struct net *net, struct fib6_info *rt) 1154 { 1155 struct nl_info info = { .nl_net = net, }; 1156 1157 return __ip6_ins_rt(rt, &info, NULL); 1158 } 1159 1160 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, 1161 const struct in6_addr *daddr, 1162 const struct in6_addr *saddr) 1163 { 1164 struct net_device *dev; 1165 struct rt6_info *rt; 1166 1167 /* 1168 * Clone the route. 1169 */ 1170 1171 rcu_read_lock(); 1172 dev = ip6_rt_get_dev_rcu(ort); 1173 rt = ip6_dst_alloc(dev_net(dev), dev, 0); 1174 rcu_read_unlock(); 1175 if (!rt) 1176 return NULL; 1177 1178 ip6_rt_copy_init(rt, ort); 1179 rt->rt6i_flags |= RTF_CACHE; 1180 rt->dst.flags |= DST_HOST; 1181 rt->rt6i_dst.addr = *daddr; 1182 rt->rt6i_dst.plen = 128; 1183 1184 if (!rt6_is_gw_or_nonexthop(ort)) { 1185 if (ort->fib6_dst.plen != 128 && 1186 ipv6_addr_equal(&ort->fib6_dst.addr, daddr)) 1187 rt->rt6i_flags |= RTF_ANYCAST; 1188 #ifdef CONFIG_IPV6_SUBTREES 1189 if (rt->rt6i_src.plen && saddr) { 1190 rt->rt6i_src.addr = *saddr; 1191 rt->rt6i_src.plen = 128; 1192 } 1193 #endif 1194 } 1195 1196 return rt; 1197 } 1198 1199 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) 1200 { 1201 unsigned short flags = fib6_info_dst_flags(rt); 1202 struct net_device *dev; 1203 struct rt6_info *pcpu_rt; 1204 1205 rcu_read_lock(); 1206 dev = ip6_rt_get_dev_rcu(rt); 1207 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); 1208 rcu_read_unlock(); 1209 if (!pcpu_rt) 1210 return NULL; 1211 ip6_rt_copy_init(pcpu_rt, rt); 1212 pcpu_rt->rt6i_flags |= RTF_PCPU; 1213 return pcpu_rt; 1214 } 1215 1216 /* It should be called with rcu_read_lock() acquired */ 1217 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) 1218 { 1219 struct rt6_info *pcpu_rt, **p; 1220 1221 p = this_cpu_ptr(rt->rt6i_pcpu); 1222 pcpu_rt = *p; 1223 1224 if (pcpu_rt) 1225 ip6_hold_safe(NULL, &pcpu_rt, false); 1226 1227 return pcpu_rt; 1228 } 1229 1230 static struct rt6_info *rt6_make_pcpu_route(struct net *net, 1231 struct fib6_info *rt) 1232 { 1233 struct rt6_info *pcpu_rt, *prev, **p; 1234 1235 pcpu_rt = ip6_rt_pcpu_alloc(rt); 1236 if (!pcpu_rt) { 1237 dst_hold(&net->ipv6.ip6_null_entry->dst); 1238 return net->ipv6.ip6_null_entry; 1239 } 1240 1241 dst_hold(&pcpu_rt->dst); 1242 p = this_cpu_ptr(rt->rt6i_pcpu); 1243 prev = cmpxchg(p, NULL, pcpu_rt); 1244 BUG_ON(prev); 1245 1246 return pcpu_rt; 1247 } 1248 1249 /* exception hash table implementation 1250 */ 1251 static DEFINE_SPINLOCK(rt6_exception_lock); 1252 1253 /* Remove rt6_ex from hash table and free the memory 1254 * Caller must hold rt6_exception_lock 1255 */ 1256 static void rt6_remove_exception(struct rt6_exception_bucket *bucket, 1257 struct rt6_exception *rt6_ex) 1258 { 1259 struct net *net; 1260 1261 if (!bucket || !rt6_ex) 1262 return; 1263 1264 net = dev_net(rt6_ex->rt6i->dst.dev); 1265 hlist_del_rcu(&rt6_ex->hlist); 1266 dst_release(&rt6_ex->rt6i->dst); 1267 kfree_rcu(rt6_ex, rcu); 1268 WARN_ON_ONCE(!bucket->depth); 1269 bucket->depth--; 1270 net->ipv6.rt6_stats->fib_rt_cache--; 1271 } 1272 1273 /* Remove oldest rt6_ex in bucket and free the memory 1274 * Caller must hold rt6_exception_lock 1275 */ 1276 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) 1277 { 1278 struct rt6_exception *rt6_ex, *oldest = NULL; 1279 1280 if (!bucket) 1281 return; 1282 1283 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1284 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) 1285 oldest = rt6_ex; 1286 } 1287 rt6_remove_exception(bucket, oldest); 1288 } 1289 1290 static u32 rt6_exception_hash(const struct in6_addr *dst, 1291 const struct in6_addr *src) 1292 { 1293 static u32 seed __read_mostly; 1294 u32 val; 1295 1296 net_get_random_once(&seed, sizeof(seed)); 1297 val = jhash(dst, sizeof(*dst), seed); 1298 1299 #ifdef CONFIG_IPV6_SUBTREES 1300 if (src) 1301 val = jhash(src, sizeof(*src), val); 1302 #endif 1303 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); 1304 } 1305 1306 /* Helper function to find the cached rt in the hash table 1307 * and update bucket pointer to point to the bucket for this 1308 * (daddr, saddr) pair 1309 * Caller must hold rt6_exception_lock 1310 */ 1311 static struct rt6_exception * 1312 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, 1313 const struct in6_addr *daddr, 1314 const struct in6_addr *saddr) 1315 { 1316 struct rt6_exception *rt6_ex; 1317 u32 hval; 1318 1319 if (!(*bucket) || !daddr) 1320 return NULL; 1321 1322 hval = rt6_exception_hash(daddr, saddr); 1323 *bucket += hval; 1324 1325 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { 1326 struct rt6_info *rt6 = rt6_ex->rt6i; 1327 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1328 1329 #ifdef CONFIG_IPV6_SUBTREES 1330 if (matched && saddr) 1331 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1332 #endif 1333 if (matched) 1334 return rt6_ex; 1335 } 1336 return NULL; 1337 } 1338 1339 /* Helper function to find the cached rt in the hash table 1340 * and update bucket pointer to point to the bucket for this 1341 * (daddr, saddr) pair 1342 * Caller must hold rcu_read_lock() 1343 */ 1344 static struct rt6_exception * 1345 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, 1346 const struct in6_addr *daddr, 1347 const struct in6_addr *saddr) 1348 { 1349 struct rt6_exception *rt6_ex; 1350 u32 hval; 1351 1352 WARN_ON_ONCE(!rcu_read_lock_held()); 1353 1354 if (!(*bucket) || !daddr) 1355 return NULL; 1356 1357 hval = rt6_exception_hash(daddr, saddr); 1358 *bucket += hval; 1359 1360 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { 1361 struct rt6_info *rt6 = rt6_ex->rt6i; 1362 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1363 1364 #ifdef CONFIG_IPV6_SUBTREES 1365 if (matched && saddr) 1366 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1367 #endif 1368 if (matched) 1369 return rt6_ex; 1370 } 1371 return NULL; 1372 } 1373 1374 static unsigned int fib6_mtu(const struct fib6_info *rt) 1375 { 1376 unsigned int mtu; 1377 1378 mtu = rt->fib6_pmtu ? : rt->fib6_idev->cnf.mtu6; 1379 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 1380 1381 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); 1382 } 1383 1384 static int rt6_insert_exception(struct rt6_info *nrt, 1385 struct fib6_info *ort) 1386 { 1387 struct net *net = dev_net(nrt->dst.dev); 1388 struct rt6_exception_bucket *bucket; 1389 struct in6_addr *src_key = NULL; 1390 struct rt6_exception *rt6_ex; 1391 int err = 0; 1392 1393 spin_lock_bh(&rt6_exception_lock); 1394 1395 if (ort->exception_bucket_flushed) { 1396 err = -EINVAL; 1397 goto out; 1398 } 1399 1400 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, 1401 lockdep_is_held(&rt6_exception_lock)); 1402 if (!bucket) { 1403 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), 1404 GFP_ATOMIC); 1405 if (!bucket) { 1406 err = -ENOMEM; 1407 goto out; 1408 } 1409 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); 1410 } 1411 1412 #ifdef CONFIG_IPV6_SUBTREES 1413 /* rt6i_src.plen != 0 indicates ort is in subtree 1414 * and exception table is indexed by a hash of 1415 * both rt6i_dst and rt6i_src. 1416 * Otherwise, the exception table is indexed by 1417 * a hash of only rt6i_dst. 1418 */ 1419 if (ort->fib6_src.plen) 1420 src_key = &nrt->rt6i_src.addr; 1421 #endif 1422 1423 /* Update rt6i_prefsrc as it could be changed 1424 * in rt6_remove_prefsrc() 1425 */ 1426 nrt->rt6i_prefsrc = ort->fib6_prefsrc; 1427 /* rt6_mtu_change() might lower mtu on ort. 1428 * Only insert this exception route if its mtu 1429 * is less than ort's mtu value. 1430 */ 1431 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { 1432 err = -EINVAL; 1433 goto out; 1434 } 1435 1436 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, 1437 src_key); 1438 if (rt6_ex) 1439 rt6_remove_exception(bucket, rt6_ex); 1440 1441 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); 1442 if (!rt6_ex) { 1443 err = -ENOMEM; 1444 goto out; 1445 } 1446 rt6_ex->rt6i = nrt; 1447 rt6_ex->stamp = jiffies; 1448 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); 1449 bucket->depth++; 1450 net->ipv6.rt6_stats->fib_rt_cache++; 1451 1452 if (bucket->depth > FIB6_MAX_DEPTH) 1453 rt6_exception_remove_oldest(bucket); 1454 1455 out: 1456 spin_unlock_bh(&rt6_exception_lock); 1457 1458 /* Update fn->fn_sernum to invalidate all cached dst */ 1459 if (!err) { 1460 spin_lock_bh(&ort->fib6_table->tb6_lock); 1461 fib6_update_sernum(net, ort); 1462 spin_unlock_bh(&ort->fib6_table->tb6_lock); 1463 fib6_force_start_gc(net); 1464 } 1465 1466 return err; 1467 } 1468 1469 void rt6_flush_exceptions(struct fib6_info *rt) 1470 { 1471 struct rt6_exception_bucket *bucket; 1472 struct rt6_exception *rt6_ex; 1473 struct hlist_node *tmp; 1474 int i; 1475 1476 spin_lock_bh(&rt6_exception_lock); 1477 /* Prevent rt6_insert_exception() to recreate the bucket list */ 1478 rt->exception_bucket_flushed = 1; 1479 1480 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1481 lockdep_is_held(&rt6_exception_lock)); 1482 if (!bucket) 1483 goto out; 1484 1485 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1486 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) 1487 rt6_remove_exception(bucket, rt6_ex); 1488 WARN_ON_ONCE(bucket->depth); 1489 bucket++; 1490 } 1491 1492 out: 1493 spin_unlock_bh(&rt6_exception_lock); 1494 } 1495 1496 /* Find cached rt in the hash table inside passed in rt 1497 * Caller has to hold rcu_read_lock() 1498 */ 1499 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 1500 struct in6_addr *daddr, 1501 struct in6_addr *saddr) 1502 { 1503 struct rt6_exception_bucket *bucket; 1504 struct in6_addr *src_key = NULL; 1505 struct rt6_exception *rt6_ex; 1506 struct rt6_info *res = NULL; 1507 1508 bucket = rcu_dereference(rt->rt6i_exception_bucket); 1509 1510 #ifdef CONFIG_IPV6_SUBTREES 1511 /* rt6i_src.plen != 0 indicates rt is in subtree 1512 * and exception table is indexed by a hash of 1513 * both rt6i_dst and rt6i_src. 1514 * Otherwise, the exception table is indexed by 1515 * a hash of only rt6i_dst. 1516 */ 1517 if (rt->fib6_src.plen) 1518 src_key = saddr; 1519 #endif 1520 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 1521 1522 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 1523 res = rt6_ex->rt6i; 1524 1525 return res; 1526 } 1527 1528 /* Remove the passed in cached rt from the hash table that contains it */ 1529 static int rt6_remove_exception_rt(struct rt6_info *rt) 1530 { 1531 struct rt6_exception_bucket *bucket; 1532 struct fib6_info *from = rt->from; 1533 struct in6_addr *src_key = NULL; 1534 struct rt6_exception *rt6_ex; 1535 int err; 1536 1537 if (!from || 1538 !(rt->rt6i_flags & RTF_CACHE)) 1539 return -EINVAL; 1540 1541 if (!rcu_access_pointer(from->rt6i_exception_bucket)) 1542 return -ENOENT; 1543 1544 spin_lock_bh(&rt6_exception_lock); 1545 bucket = rcu_dereference_protected(from->rt6i_exception_bucket, 1546 lockdep_is_held(&rt6_exception_lock)); 1547 #ifdef CONFIG_IPV6_SUBTREES 1548 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1549 * and exception table is indexed by a hash of 1550 * both rt6i_dst and rt6i_src. 1551 * Otherwise, the exception table is indexed by 1552 * a hash of only rt6i_dst. 1553 */ 1554 if (from->fib6_src.plen) 1555 src_key = &rt->rt6i_src.addr; 1556 #endif 1557 rt6_ex = __rt6_find_exception_spinlock(&bucket, 1558 &rt->rt6i_dst.addr, 1559 src_key); 1560 if (rt6_ex) { 1561 rt6_remove_exception(bucket, rt6_ex); 1562 err = 0; 1563 } else { 1564 err = -ENOENT; 1565 } 1566 1567 spin_unlock_bh(&rt6_exception_lock); 1568 return err; 1569 } 1570 1571 /* Find rt6_ex which contains the passed in rt cache and 1572 * refresh its stamp 1573 */ 1574 static void rt6_update_exception_stamp_rt(struct rt6_info *rt) 1575 { 1576 struct rt6_exception_bucket *bucket; 1577 struct fib6_info *from = rt->from; 1578 struct in6_addr *src_key = NULL; 1579 struct rt6_exception *rt6_ex; 1580 1581 if (!from || 1582 !(rt->rt6i_flags & RTF_CACHE)) 1583 return; 1584 1585 rcu_read_lock(); 1586 bucket = rcu_dereference(from->rt6i_exception_bucket); 1587 1588 #ifdef CONFIG_IPV6_SUBTREES 1589 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1590 * and exception table is indexed by a hash of 1591 * both rt6i_dst and rt6i_src. 1592 * Otherwise, the exception table is indexed by 1593 * a hash of only rt6i_dst. 1594 */ 1595 if (from->fib6_src.plen) 1596 src_key = &rt->rt6i_src.addr; 1597 #endif 1598 rt6_ex = __rt6_find_exception_rcu(&bucket, 1599 &rt->rt6i_dst.addr, 1600 src_key); 1601 if (rt6_ex) 1602 rt6_ex->stamp = jiffies; 1603 1604 rcu_read_unlock(); 1605 } 1606 1607 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt) 1608 { 1609 struct rt6_exception_bucket *bucket; 1610 struct rt6_exception *rt6_ex; 1611 int i; 1612 1613 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1614 lockdep_is_held(&rt6_exception_lock)); 1615 1616 if (bucket) { 1617 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1618 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1619 rt6_ex->rt6i->rt6i_prefsrc.plen = 0; 1620 } 1621 bucket++; 1622 } 1623 } 1624 } 1625 1626 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, 1627 struct rt6_info *rt, int mtu) 1628 { 1629 /* If the new MTU is lower than the route PMTU, this new MTU will be the 1630 * lowest MTU in the path: always allow updating the route PMTU to 1631 * reflect PMTU decreases. 1632 * 1633 * If the new MTU is higher, and the route PMTU is equal to the local 1634 * MTU, this means the old MTU is the lowest in the path, so allow 1635 * updating it: if other nodes now have lower MTUs, PMTU discovery will 1636 * handle this. 1637 */ 1638 1639 if (dst_mtu(&rt->dst) >= mtu) 1640 return true; 1641 1642 if (dst_mtu(&rt->dst) == idev->cnf.mtu6) 1643 return true; 1644 1645 return false; 1646 } 1647 1648 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, 1649 struct fib6_info *rt, int mtu) 1650 { 1651 struct rt6_exception_bucket *bucket; 1652 struct rt6_exception *rt6_ex; 1653 int i; 1654 1655 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1656 lockdep_is_held(&rt6_exception_lock)); 1657 1658 if (!bucket) 1659 return; 1660 1661 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1662 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1663 struct rt6_info *entry = rt6_ex->rt6i; 1664 1665 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected 1666 * route), the metrics of its rt->from have already 1667 * been updated. 1668 */ 1669 if (dst_metric_raw(&entry->dst, RTAX_MTU) && 1670 rt6_mtu_change_route_allowed(idev, entry, mtu)) 1671 dst_metric_set(&entry->dst, RTAX_MTU, mtu); 1672 } 1673 bucket++; 1674 } 1675 } 1676 1677 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 1678 1679 static void rt6_exceptions_clean_tohost(struct fib6_info *rt, 1680 struct in6_addr *gateway) 1681 { 1682 struct rt6_exception_bucket *bucket; 1683 struct rt6_exception *rt6_ex; 1684 struct hlist_node *tmp; 1685 int i; 1686 1687 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1688 return; 1689 1690 spin_lock_bh(&rt6_exception_lock); 1691 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1692 lockdep_is_held(&rt6_exception_lock)); 1693 1694 if (bucket) { 1695 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1696 hlist_for_each_entry_safe(rt6_ex, tmp, 1697 &bucket->chain, hlist) { 1698 struct rt6_info *entry = rt6_ex->rt6i; 1699 1700 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == 1701 RTF_CACHE_GATEWAY && 1702 ipv6_addr_equal(gateway, 1703 &entry->rt6i_gateway)) { 1704 rt6_remove_exception(bucket, rt6_ex); 1705 } 1706 } 1707 bucket++; 1708 } 1709 } 1710 1711 spin_unlock_bh(&rt6_exception_lock); 1712 } 1713 1714 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, 1715 struct rt6_exception *rt6_ex, 1716 struct fib6_gc_args *gc_args, 1717 unsigned long now) 1718 { 1719 struct rt6_info *rt = rt6_ex->rt6i; 1720 1721 /* we are pruning and obsoleting aged-out and non gateway exceptions 1722 * even if others have still references to them, so that on next 1723 * dst_check() such references can be dropped. 1724 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when 1725 * expired, independently from their aging, as per RFC 8201 section 4 1726 */ 1727 if (!(rt->rt6i_flags & RTF_EXPIRES)) { 1728 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { 1729 RT6_TRACE("aging clone %p\n", rt); 1730 rt6_remove_exception(bucket, rt6_ex); 1731 return; 1732 } 1733 } else if (time_after(jiffies, rt->dst.expires)) { 1734 RT6_TRACE("purging expired route %p\n", rt); 1735 rt6_remove_exception(bucket, rt6_ex); 1736 return; 1737 } 1738 1739 if (rt->rt6i_flags & RTF_GATEWAY) { 1740 struct neighbour *neigh; 1741 __u8 neigh_flags = 0; 1742 1743 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); 1744 if (neigh) 1745 neigh_flags = neigh->flags; 1746 1747 if (!(neigh_flags & NTF_ROUTER)) { 1748 RT6_TRACE("purging route %p via non-router but gateway\n", 1749 rt); 1750 rt6_remove_exception(bucket, rt6_ex); 1751 return; 1752 } 1753 } 1754 1755 gc_args->more++; 1756 } 1757 1758 void rt6_age_exceptions(struct fib6_info *rt, 1759 struct fib6_gc_args *gc_args, 1760 unsigned long now) 1761 { 1762 struct rt6_exception_bucket *bucket; 1763 struct rt6_exception *rt6_ex; 1764 struct hlist_node *tmp; 1765 int i; 1766 1767 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1768 return; 1769 1770 rcu_read_lock_bh(); 1771 spin_lock(&rt6_exception_lock); 1772 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1773 lockdep_is_held(&rt6_exception_lock)); 1774 1775 if (bucket) { 1776 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1777 hlist_for_each_entry_safe(rt6_ex, tmp, 1778 &bucket->chain, hlist) { 1779 rt6_age_examine_exception(bucket, rt6_ex, 1780 gc_args, now); 1781 } 1782 bucket++; 1783 } 1784 } 1785 spin_unlock(&rt6_exception_lock); 1786 rcu_read_unlock_bh(); 1787 } 1788 1789 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, 1790 int oif, struct flowi6 *fl6, 1791 const struct sk_buff *skb, int flags) 1792 { 1793 struct fib6_node *fn, *saved_fn; 1794 struct fib6_info *f6i; 1795 struct rt6_info *rt; 1796 int strict = 0; 1797 1798 strict |= flags & RT6_LOOKUP_F_IFACE; 1799 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; 1800 if (net->ipv6.devconf_all->forwarding == 0) 1801 strict |= RT6_LOOKUP_F_REACHABLE; 1802 1803 rcu_read_lock(); 1804 1805 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1806 saved_fn = fn; 1807 1808 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1809 oif = 0; 1810 1811 redo_rt6_select: 1812 f6i = rt6_select(net, fn, oif, strict); 1813 if (f6i->fib6_nsiblings) 1814 f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict); 1815 if (f6i == net->ipv6.fib6_null_entry) { 1816 fn = fib6_backtrack(fn, &fl6->saddr); 1817 if (fn) 1818 goto redo_rt6_select; 1819 else if (strict & RT6_LOOKUP_F_REACHABLE) { 1820 /* also consider unreachable route */ 1821 strict &= ~RT6_LOOKUP_F_REACHABLE; 1822 fn = saved_fn; 1823 goto redo_rt6_select; 1824 } 1825 } 1826 1827 if (f6i == net->ipv6.fib6_null_entry) { 1828 rt = net->ipv6.ip6_null_entry; 1829 rcu_read_unlock(); 1830 dst_hold(&rt->dst); 1831 trace_fib6_table_lookup(net, rt, table, fl6); 1832 return rt; 1833 } 1834 1835 /*Search through exception table */ 1836 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1837 if (rt) { 1838 if (ip6_hold_safe(net, &rt, true)) 1839 dst_use_noref(&rt->dst, jiffies); 1840 1841 rcu_read_unlock(); 1842 trace_fib6_table_lookup(net, rt, table, fl6); 1843 return rt; 1844 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && 1845 !(f6i->fib6_flags & RTF_GATEWAY))) { 1846 /* Create a RTF_CACHE clone which will not be 1847 * owned by the fib6 tree. It is for the special case where 1848 * the daddr in the skb during the neighbor look-up is different 1849 * from the fl6->daddr used to look-up route here. 1850 */ 1851 1852 struct rt6_info *uncached_rt; 1853 1854 fib6_info_hold(f6i); 1855 rcu_read_unlock(); 1856 1857 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); 1858 fib6_info_release(f6i); 1859 1860 if (uncached_rt) { 1861 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() 1862 * No need for another dst_hold() 1863 */ 1864 rt6_uncached_list_add(uncached_rt); 1865 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 1866 } else { 1867 uncached_rt = net->ipv6.ip6_null_entry; 1868 dst_hold(&uncached_rt->dst); 1869 } 1870 1871 trace_fib6_table_lookup(net, uncached_rt, table, fl6); 1872 return uncached_rt; 1873 1874 } else { 1875 /* Get a percpu copy */ 1876 1877 struct rt6_info *pcpu_rt; 1878 1879 local_bh_disable(); 1880 pcpu_rt = rt6_get_pcpu_route(f6i); 1881 1882 if (!pcpu_rt) 1883 pcpu_rt = rt6_make_pcpu_route(net, f6i); 1884 1885 local_bh_enable(); 1886 rcu_read_unlock(); 1887 trace_fib6_table_lookup(net, pcpu_rt, table, fl6); 1888 return pcpu_rt; 1889 } 1890 } 1891 EXPORT_SYMBOL_GPL(ip6_pol_route); 1892 1893 static struct rt6_info *ip6_pol_route_input(struct net *net, 1894 struct fib6_table *table, 1895 struct flowi6 *fl6, 1896 const struct sk_buff *skb, 1897 int flags) 1898 { 1899 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); 1900 } 1901 1902 struct dst_entry *ip6_route_input_lookup(struct net *net, 1903 struct net_device *dev, 1904 struct flowi6 *fl6, 1905 const struct sk_buff *skb, 1906 int flags) 1907 { 1908 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) 1909 flags |= RT6_LOOKUP_F_IFACE; 1910 1911 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); 1912 } 1913 EXPORT_SYMBOL_GPL(ip6_route_input_lookup); 1914 1915 static void ip6_multipath_l3_keys(const struct sk_buff *skb, 1916 struct flow_keys *keys, 1917 struct flow_keys *flkeys) 1918 { 1919 const struct ipv6hdr *outer_iph = ipv6_hdr(skb); 1920 const struct ipv6hdr *key_iph = outer_iph; 1921 struct flow_keys *_flkeys = flkeys; 1922 const struct ipv6hdr *inner_iph; 1923 const struct icmp6hdr *icmph; 1924 struct ipv6hdr _inner_iph; 1925 1926 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) 1927 goto out; 1928 1929 icmph = icmp6_hdr(skb); 1930 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && 1931 icmph->icmp6_type != ICMPV6_PKT_TOOBIG && 1932 icmph->icmp6_type != ICMPV6_TIME_EXCEED && 1933 icmph->icmp6_type != ICMPV6_PARAMPROB) 1934 goto out; 1935 1936 inner_iph = skb_header_pointer(skb, 1937 skb_transport_offset(skb) + sizeof(*icmph), 1938 sizeof(_inner_iph), &_inner_iph); 1939 if (!inner_iph) 1940 goto out; 1941 1942 key_iph = inner_iph; 1943 _flkeys = NULL; 1944 out: 1945 if (_flkeys) { 1946 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; 1947 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; 1948 keys->tags.flow_label = _flkeys->tags.flow_label; 1949 keys->basic.ip_proto = _flkeys->basic.ip_proto; 1950 } else { 1951 keys->addrs.v6addrs.src = key_iph->saddr; 1952 keys->addrs.v6addrs.dst = key_iph->daddr; 1953 keys->tags.flow_label = ip6_flowinfo(key_iph); 1954 keys->basic.ip_proto = key_iph->nexthdr; 1955 } 1956 } 1957 1958 /* if skb is set it will be used and fl6 can be NULL */ 1959 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 1960 const struct sk_buff *skb, struct flow_keys *flkeys) 1961 { 1962 struct flow_keys hash_keys; 1963 u32 mhash; 1964 1965 switch (ip6_multipath_hash_policy(net)) { 1966 case 0: 1967 memset(&hash_keys, 0, sizeof(hash_keys)); 1968 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 1969 if (skb) { 1970 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 1971 } else { 1972 hash_keys.addrs.v6addrs.src = fl6->saddr; 1973 hash_keys.addrs.v6addrs.dst = fl6->daddr; 1974 hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; 1975 hash_keys.basic.ip_proto = fl6->flowi6_proto; 1976 } 1977 break; 1978 case 1: 1979 if (skb) { 1980 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 1981 struct flow_keys keys; 1982 1983 /* short-circuit if we already have L4 hash present */ 1984 if (skb->l4_hash) 1985 return skb_get_hash_raw(skb) >> 1; 1986 1987 memset(&hash_keys, 0, sizeof(hash_keys)); 1988 1989 if (!flkeys) { 1990 skb_flow_dissect_flow_keys(skb, &keys, flag); 1991 flkeys = &keys; 1992 } 1993 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 1994 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 1995 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 1996 hash_keys.ports.src = flkeys->ports.src; 1997 hash_keys.ports.dst = flkeys->ports.dst; 1998 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 1999 } else { 2000 memset(&hash_keys, 0, sizeof(hash_keys)); 2001 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2002 hash_keys.addrs.v6addrs.src = fl6->saddr; 2003 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2004 hash_keys.ports.src = fl6->fl6_sport; 2005 hash_keys.ports.dst = fl6->fl6_dport; 2006 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2007 } 2008 break; 2009 } 2010 mhash = flow_hash_from_keys(&hash_keys); 2011 2012 return mhash >> 1; 2013 } 2014 2015 void ip6_route_input(struct sk_buff *skb) 2016 { 2017 const struct ipv6hdr *iph = ipv6_hdr(skb); 2018 struct net *net = dev_net(skb->dev); 2019 int flags = RT6_LOOKUP_F_HAS_SADDR; 2020 struct ip_tunnel_info *tun_info; 2021 struct flowi6 fl6 = { 2022 .flowi6_iif = skb->dev->ifindex, 2023 .daddr = iph->daddr, 2024 .saddr = iph->saddr, 2025 .flowlabel = ip6_flowinfo(iph), 2026 .flowi6_mark = skb->mark, 2027 .flowi6_proto = iph->nexthdr, 2028 }; 2029 struct flow_keys *flkeys = NULL, _flkeys; 2030 2031 tun_info = skb_tunnel_info(skb); 2032 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2033 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2034 2035 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2036 flkeys = &_flkeys; 2037 2038 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2039 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2040 skb_dst_drop(skb); 2041 skb_dst_set(skb, 2042 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); 2043 } 2044 2045 static struct rt6_info *ip6_pol_route_output(struct net *net, 2046 struct fib6_table *table, 2047 struct flowi6 *fl6, 2048 const struct sk_buff *skb, 2049 int flags) 2050 { 2051 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2052 } 2053 2054 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, 2055 struct flowi6 *fl6, int flags) 2056 { 2057 bool any_src; 2058 2059 if (rt6_need_strict(&fl6->daddr)) { 2060 struct dst_entry *dst; 2061 2062 dst = l3mdev_link_scope_lookup(net, fl6); 2063 if (dst) 2064 return dst; 2065 } 2066 2067 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2068 2069 any_src = ipv6_addr_any(&fl6->saddr); 2070 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2071 (fl6->flowi6_oif && any_src)) 2072 flags |= RT6_LOOKUP_F_IFACE; 2073 2074 if (!any_src) 2075 flags |= RT6_LOOKUP_F_HAS_SADDR; 2076 else if (sk) 2077 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2078 2079 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2080 } 2081 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2082 2083 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2084 { 2085 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2086 struct net_device *loopback_dev = net->loopback_dev; 2087 struct dst_entry *new = NULL; 2088 2089 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2090 DST_OBSOLETE_DEAD, 0); 2091 if (rt) { 2092 rt6_info_init(rt); 2093 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2094 2095 new = &rt->dst; 2096 new->__use = 1; 2097 new->input = dst_discard; 2098 new->output = dst_discard_out; 2099 2100 dst_copy_metrics(new, &ort->dst); 2101 2102 rt->rt6i_idev = in6_dev_get(loopback_dev); 2103 rt->rt6i_gateway = ort->rt6i_gateway; 2104 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2105 2106 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2107 #ifdef CONFIG_IPV6_SUBTREES 2108 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2109 #endif 2110 } 2111 2112 dst_release(dst_orig); 2113 return new ? new : ERR_PTR(-ENOMEM); 2114 } 2115 2116 /* 2117 * Destination cache support functions 2118 */ 2119 2120 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2121 { 2122 u32 rt_cookie = 0; 2123 2124 if ((f6i && !rt6_get_cookie_safe(f6i, &rt_cookie)) || 2125 rt_cookie != cookie) 2126 return false; 2127 2128 if (fib6_check_expired(f6i)) 2129 return false; 2130 2131 return true; 2132 } 2133 2134 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) 2135 { 2136 u32 rt_cookie = 0; 2137 2138 if ((rt->from && !rt6_get_cookie_safe(rt->from, &rt_cookie)) || 2139 rt_cookie != cookie) 2140 return NULL; 2141 2142 if (rt6_check_expired(rt)) 2143 return NULL; 2144 2145 return &rt->dst; 2146 } 2147 2148 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) 2149 { 2150 if (!__rt6_check_expired(rt) && 2151 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2152 fib6_check(rt->from, cookie)) 2153 return &rt->dst; 2154 else 2155 return NULL; 2156 } 2157 2158 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2159 { 2160 struct rt6_info *rt; 2161 2162 rt = (struct rt6_info *) dst; 2163 2164 /* All IPV6 dsts are created with ->obsolete set to the value 2165 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2166 * into this function always. 2167 */ 2168 2169 if (rt->rt6i_flags & RTF_PCPU || 2170 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) 2171 return rt6_dst_from_check(rt, cookie); 2172 else 2173 return rt6_check(rt, cookie); 2174 } 2175 2176 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2177 { 2178 struct rt6_info *rt = (struct rt6_info *) dst; 2179 2180 if (rt) { 2181 if (rt->rt6i_flags & RTF_CACHE) { 2182 if (rt6_check_expired(rt)) { 2183 rt6_remove_exception_rt(rt); 2184 dst = NULL; 2185 } 2186 } else { 2187 dst_release(dst); 2188 dst = NULL; 2189 } 2190 } 2191 return dst; 2192 } 2193 2194 static void ip6_link_failure(struct sk_buff *skb) 2195 { 2196 struct rt6_info *rt; 2197 2198 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2199 2200 rt = (struct rt6_info *) skb_dst(skb); 2201 if (rt) { 2202 if (rt->rt6i_flags & RTF_CACHE) { 2203 if (dst_hold_safe(&rt->dst)) 2204 rt6_remove_exception_rt(rt); 2205 } else if (rt->from) { 2206 struct fib6_node *fn; 2207 2208 rcu_read_lock(); 2209 fn = rcu_dereference(rt->from->fib6_node); 2210 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2211 fn->fn_sernum = -1; 2212 rcu_read_unlock(); 2213 } 2214 } 2215 } 2216 2217 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2218 { 2219 struct net *net = dev_net(rt->dst.dev); 2220 2221 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2222 rt->rt6i_flags |= RTF_MODIFIED; 2223 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2224 } 2225 2226 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2227 { 2228 return !(rt->rt6i_flags & RTF_CACHE) && 2229 (rt->rt6i_flags & RTF_PCPU || rt->from); 2230 } 2231 2232 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2233 const struct ipv6hdr *iph, u32 mtu) 2234 { 2235 const struct in6_addr *daddr, *saddr; 2236 struct rt6_info *rt6 = (struct rt6_info *)dst; 2237 2238 if (rt6->rt6i_flags & RTF_LOCAL) 2239 return; 2240 2241 if (dst_metric_locked(dst, RTAX_MTU)) 2242 return; 2243 2244 if (iph) { 2245 daddr = &iph->daddr; 2246 saddr = &iph->saddr; 2247 } else if (sk) { 2248 daddr = &sk->sk_v6_daddr; 2249 saddr = &inet6_sk(sk)->saddr; 2250 } else { 2251 daddr = NULL; 2252 saddr = NULL; 2253 } 2254 dst_confirm_neigh(dst, daddr); 2255 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2256 if (mtu >= dst_mtu(dst)) 2257 return; 2258 2259 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2260 rt6_do_update_pmtu(rt6, mtu); 2261 /* update rt6_ex->stamp for cache */ 2262 if (rt6->rt6i_flags & RTF_CACHE) 2263 rt6_update_exception_stamp_rt(rt6); 2264 } else if (daddr) { 2265 struct rt6_info *nrt6; 2266 2267 nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr); 2268 if (nrt6) { 2269 rt6_do_update_pmtu(nrt6, mtu); 2270 if (rt6_insert_exception(nrt6, rt6->from)) 2271 dst_release_immediate(&nrt6->dst); 2272 } 2273 } 2274 } 2275 2276 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2277 struct sk_buff *skb, u32 mtu) 2278 { 2279 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2280 } 2281 2282 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2283 int oif, u32 mark, kuid_t uid) 2284 { 2285 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2286 struct dst_entry *dst; 2287 struct flowi6 fl6; 2288 2289 memset(&fl6, 0, sizeof(fl6)); 2290 fl6.flowi6_oif = oif; 2291 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2292 fl6.daddr = iph->daddr; 2293 fl6.saddr = iph->saddr; 2294 fl6.flowlabel = ip6_flowinfo(iph); 2295 fl6.flowi6_uid = uid; 2296 2297 dst = ip6_route_output(net, NULL, &fl6); 2298 if (!dst->error) 2299 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2300 dst_release(dst); 2301 } 2302 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2303 2304 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2305 { 2306 struct dst_entry *dst; 2307 2308 ip6_update_pmtu(skb, sock_net(sk), mtu, 2309 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2310 2311 dst = __sk_dst_get(sk); 2312 if (!dst || !dst->obsolete || 2313 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2314 return; 2315 2316 bh_lock_sock(sk); 2317 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2318 ip6_datagram_dst_update(sk, false); 2319 bh_unlock_sock(sk); 2320 } 2321 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2322 2323 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2324 const struct flowi6 *fl6) 2325 { 2326 #ifdef CONFIG_IPV6_SUBTREES 2327 struct ipv6_pinfo *np = inet6_sk(sk); 2328 #endif 2329 2330 ip6_dst_store(sk, dst, 2331 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2332 &sk->sk_v6_daddr : NULL, 2333 #ifdef CONFIG_IPV6_SUBTREES 2334 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2335 &np->saddr : 2336 #endif 2337 NULL); 2338 } 2339 2340 /* Handle redirects */ 2341 struct ip6rd_flowi { 2342 struct flowi6 fl6; 2343 struct in6_addr gateway; 2344 }; 2345 2346 static struct rt6_info *__ip6_route_redirect(struct net *net, 2347 struct fib6_table *table, 2348 struct flowi6 *fl6, 2349 const struct sk_buff *skb, 2350 int flags) 2351 { 2352 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2353 struct rt6_info *ret = NULL, *rt_cache; 2354 struct fib6_info *rt; 2355 struct fib6_node *fn; 2356 2357 /* Get the "current" route for this destination and 2358 * check if the redirect has come from appropriate router. 2359 * 2360 * RFC 4861 specifies that redirects should only be 2361 * accepted if they come from the nexthop to the target. 2362 * Due to the way the routes are chosen, this notion 2363 * is a bit fuzzy and one might need to check all possible 2364 * routes. 2365 */ 2366 2367 rcu_read_lock(); 2368 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2369 restart: 2370 for_each_fib6_node_rt_rcu(fn) { 2371 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2372 continue; 2373 if (fib6_check_expired(rt)) 2374 continue; 2375 if (rt->fib6_flags & RTF_REJECT) 2376 break; 2377 if (!(rt->fib6_flags & RTF_GATEWAY)) 2378 continue; 2379 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2380 continue; 2381 /* rt_cache's gateway might be different from its 'parent' 2382 * in the case of an ip redirect. 2383 * So we keep searching in the exception table if the gateway 2384 * is different. 2385 */ 2386 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2387 rt_cache = rt6_find_cached_rt(rt, 2388 &fl6->daddr, 2389 &fl6->saddr); 2390 if (rt_cache && 2391 ipv6_addr_equal(&rdfl->gateway, 2392 &rt_cache->rt6i_gateway)) { 2393 ret = rt_cache; 2394 break; 2395 } 2396 continue; 2397 } 2398 break; 2399 } 2400 2401 if (!rt) 2402 rt = net->ipv6.fib6_null_entry; 2403 else if (rt->fib6_flags & RTF_REJECT) { 2404 ret = net->ipv6.ip6_null_entry; 2405 goto out; 2406 } 2407 2408 if (rt == net->ipv6.fib6_null_entry) { 2409 fn = fib6_backtrack(fn, &fl6->saddr); 2410 if (fn) 2411 goto restart; 2412 } 2413 2414 out: 2415 if (ret) 2416 dst_hold(&ret->dst); 2417 else 2418 ret = ip6_create_rt_rcu(rt); 2419 2420 rcu_read_unlock(); 2421 2422 trace_fib6_table_lookup(net, ret, table, fl6); 2423 return ret; 2424 }; 2425 2426 static struct dst_entry *ip6_route_redirect(struct net *net, 2427 const struct flowi6 *fl6, 2428 const struct sk_buff *skb, 2429 const struct in6_addr *gateway) 2430 { 2431 int flags = RT6_LOOKUP_F_HAS_SADDR; 2432 struct ip6rd_flowi rdfl; 2433 2434 rdfl.fl6 = *fl6; 2435 rdfl.gateway = *gateway; 2436 2437 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2438 flags, __ip6_route_redirect); 2439 } 2440 2441 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2442 kuid_t uid) 2443 { 2444 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2445 struct dst_entry *dst; 2446 struct flowi6 fl6; 2447 2448 memset(&fl6, 0, sizeof(fl6)); 2449 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2450 fl6.flowi6_oif = oif; 2451 fl6.flowi6_mark = mark; 2452 fl6.daddr = iph->daddr; 2453 fl6.saddr = iph->saddr; 2454 fl6.flowlabel = ip6_flowinfo(iph); 2455 fl6.flowi6_uid = uid; 2456 2457 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2458 rt6_do_redirect(dst, NULL, skb); 2459 dst_release(dst); 2460 } 2461 EXPORT_SYMBOL_GPL(ip6_redirect); 2462 2463 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2464 u32 mark) 2465 { 2466 const struct ipv6hdr *iph = ipv6_hdr(skb); 2467 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2468 struct dst_entry *dst; 2469 struct flowi6 fl6; 2470 2471 memset(&fl6, 0, sizeof(fl6)); 2472 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2473 fl6.flowi6_oif = oif; 2474 fl6.flowi6_mark = mark; 2475 fl6.daddr = msg->dest; 2476 fl6.saddr = iph->daddr; 2477 fl6.flowi6_uid = sock_net_uid(net, NULL); 2478 2479 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2480 rt6_do_redirect(dst, NULL, skb); 2481 dst_release(dst); 2482 } 2483 2484 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2485 { 2486 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2487 sk->sk_uid); 2488 } 2489 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2490 2491 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2492 { 2493 struct net_device *dev = dst->dev; 2494 unsigned int mtu = dst_mtu(dst); 2495 struct net *net = dev_net(dev); 2496 2497 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2498 2499 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2500 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2501 2502 /* 2503 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2504 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 2505 * IPV6_MAXPLEN is also valid and means: "any MSS, 2506 * rely only on pmtu discovery" 2507 */ 2508 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2509 mtu = IPV6_MAXPLEN; 2510 return mtu; 2511 } 2512 2513 static unsigned int ip6_mtu(const struct dst_entry *dst) 2514 { 2515 struct inet6_dev *idev; 2516 unsigned int mtu; 2517 2518 mtu = dst_metric_raw(dst, RTAX_MTU); 2519 if (mtu) 2520 goto out; 2521 2522 mtu = IPV6_MIN_MTU; 2523 2524 rcu_read_lock(); 2525 idev = __in6_dev_get(dst->dev); 2526 if (idev) 2527 mtu = idev->cnf.mtu6; 2528 rcu_read_unlock(); 2529 2530 out: 2531 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2532 2533 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2534 } 2535 2536 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2537 struct flowi6 *fl6) 2538 { 2539 struct dst_entry *dst; 2540 struct rt6_info *rt; 2541 struct inet6_dev *idev = in6_dev_get(dev); 2542 struct net *net = dev_net(dev); 2543 2544 if (unlikely(!idev)) 2545 return ERR_PTR(-ENODEV); 2546 2547 rt = ip6_dst_alloc(net, dev, 0); 2548 if (unlikely(!rt)) { 2549 in6_dev_put(idev); 2550 dst = ERR_PTR(-ENOMEM); 2551 goto out; 2552 } 2553 2554 rt->dst.flags |= DST_HOST; 2555 rt->dst.input = ip6_input; 2556 rt->dst.output = ip6_output; 2557 rt->rt6i_gateway = fl6->daddr; 2558 rt->rt6i_dst.addr = fl6->daddr; 2559 rt->rt6i_dst.plen = 128; 2560 rt->rt6i_idev = idev; 2561 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2562 2563 /* Add this dst into uncached_list so that rt6_disable_ip() can 2564 * do proper release of the net_device 2565 */ 2566 rt6_uncached_list_add(rt); 2567 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2568 2569 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2570 2571 out: 2572 return dst; 2573 } 2574 2575 static int ip6_dst_gc(struct dst_ops *ops) 2576 { 2577 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2578 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2579 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2580 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2581 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2582 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2583 int entries; 2584 2585 entries = dst_entries_get_fast(ops); 2586 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2587 entries <= rt_max_size) 2588 goto out; 2589 2590 net->ipv6.ip6_rt_gc_expire++; 2591 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2592 entries = dst_entries_get_slow(ops); 2593 if (entries < ops->gc_thresh) 2594 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2595 out: 2596 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2597 return entries > rt_max_size; 2598 } 2599 2600 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, 2601 struct fib6_config *cfg) 2602 { 2603 int err = 0; 2604 2605 if (cfg->fc_mx) { 2606 rt->fib6_metrics = kzalloc(sizeof(*rt->fib6_metrics), 2607 GFP_KERNEL); 2608 if (unlikely(!rt->fib6_metrics)) 2609 return -ENOMEM; 2610 2611 refcount_set(&rt->fib6_metrics->refcnt, 1); 2612 2613 err = ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, 2614 rt->fib6_metrics->metrics); 2615 } 2616 2617 return err; 2618 } 2619 2620 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2621 struct fib6_config *cfg, 2622 const struct in6_addr *gw_addr, 2623 u32 tbid, int flags) 2624 { 2625 struct flowi6 fl6 = { 2626 .flowi6_oif = cfg->fc_ifindex, 2627 .daddr = *gw_addr, 2628 .saddr = cfg->fc_prefsrc, 2629 }; 2630 struct fib6_table *table; 2631 struct rt6_info *rt; 2632 2633 table = fib6_get_table(net, tbid); 2634 if (!table) 2635 return NULL; 2636 2637 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2638 flags |= RT6_LOOKUP_F_HAS_SADDR; 2639 2640 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2641 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2642 2643 /* if table lookup failed, fall back to full lookup */ 2644 if (rt == net->ipv6.ip6_null_entry) { 2645 ip6_rt_put(rt); 2646 rt = NULL; 2647 } 2648 2649 return rt; 2650 } 2651 2652 static int ip6_route_check_nh_onlink(struct net *net, 2653 struct fib6_config *cfg, 2654 const struct net_device *dev, 2655 struct netlink_ext_ack *extack) 2656 { 2657 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2658 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2659 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2660 struct rt6_info *grt; 2661 int err; 2662 2663 err = 0; 2664 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2665 if (grt) { 2666 if (!grt->dst.error && 2667 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2668 NL_SET_ERR_MSG(extack, 2669 "Nexthop has invalid gateway or device mismatch"); 2670 err = -EINVAL; 2671 } 2672 2673 ip6_rt_put(grt); 2674 } 2675 2676 return err; 2677 } 2678 2679 static int ip6_route_check_nh(struct net *net, 2680 struct fib6_config *cfg, 2681 struct net_device **_dev, 2682 struct inet6_dev **idev) 2683 { 2684 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2685 struct net_device *dev = _dev ? *_dev : NULL; 2686 struct rt6_info *grt = NULL; 2687 int err = -EHOSTUNREACH; 2688 2689 if (cfg->fc_table) { 2690 int flags = RT6_LOOKUP_F_IFACE; 2691 2692 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2693 cfg->fc_table, flags); 2694 if (grt) { 2695 if (grt->rt6i_flags & RTF_GATEWAY || 2696 (dev && dev != grt->dst.dev)) { 2697 ip6_rt_put(grt); 2698 grt = NULL; 2699 } 2700 } 2701 } 2702 2703 if (!grt) 2704 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2705 2706 if (!grt) 2707 goto out; 2708 2709 if (dev) { 2710 if (dev != grt->dst.dev) { 2711 ip6_rt_put(grt); 2712 goto out; 2713 } 2714 } else { 2715 *_dev = dev = grt->dst.dev; 2716 *idev = grt->rt6i_idev; 2717 dev_hold(dev); 2718 in6_dev_hold(grt->rt6i_idev); 2719 } 2720 2721 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2722 err = 0; 2723 2724 ip6_rt_put(grt); 2725 2726 out: 2727 return err; 2728 } 2729 2730 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2731 struct net_device **_dev, struct inet6_dev **idev, 2732 struct netlink_ext_ack *extack) 2733 { 2734 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2735 int gwa_type = ipv6_addr_type(gw_addr); 2736 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2737 const struct net_device *dev = *_dev; 2738 bool need_addr_check = !dev; 2739 int err = -EINVAL; 2740 2741 /* if gw_addr is local we will fail to detect this in case 2742 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2743 * will return already-added prefix route via interface that 2744 * prefix route was assigned to, which might be non-loopback. 2745 */ 2746 if (dev && 2747 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2748 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2749 goto out; 2750 } 2751 2752 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2753 /* IPv6 strictly inhibits using not link-local 2754 * addresses as nexthop address. 2755 * Otherwise, router will not able to send redirects. 2756 * It is very good, but in some (rare!) circumstances 2757 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2758 * some exceptions. --ANK 2759 * We allow IPv4-mapped nexthops to support RFC4798-type 2760 * addressing 2761 */ 2762 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2763 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2764 goto out; 2765 } 2766 2767 if (cfg->fc_flags & RTNH_F_ONLINK) 2768 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2769 else 2770 err = ip6_route_check_nh(net, cfg, _dev, idev); 2771 2772 if (err) 2773 goto out; 2774 } 2775 2776 /* reload in case device was changed */ 2777 dev = *_dev; 2778 2779 err = -EINVAL; 2780 if (!dev) { 2781 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2782 goto out; 2783 } else if (dev->flags & IFF_LOOPBACK) { 2784 NL_SET_ERR_MSG(extack, 2785 "Egress device can not be loopback device for this route"); 2786 goto out; 2787 } 2788 2789 /* if we did not check gw_addr above, do so now that the 2790 * egress device has been resolved. 2791 */ 2792 if (need_addr_check && 2793 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2794 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2795 goto out; 2796 } 2797 2798 err = 0; 2799 out: 2800 return err; 2801 } 2802 2803 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2804 gfp_t gfp_flags, 2805 struct netlink_ext_ack *extack) 2806 { 2807 struct net *net = cfg->fc_nlinfo.nl_net; 2808 struct fib6_info *rt = NULL; 2809 struct net_device *dev = NULL; 2810 struct inet6_dev *idev = NULL; 2811 struct fib6_table *table; 2812 int addr_type; 2813 int err = -EINVAL; 2814 2815 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2816 if (cfg->fc_flags & RTF_PCPU) { 2817 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2818 goto out; 2819 } 2820 2821 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2822 if (cfg->fc_flags & RTF_CACHE) { 2823 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2824 goto out; 2825 } 2826 2827 if (cfg->fc_type > RTN_MAX) { 2828 NL_SET_ERR_MSG(extack, "Invalid route type"); 2829 goto out; 2830 } 2831 2832 if (cfg->fc_dst_len > 128) { 2833 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2834 goto out; 2835 } 2836 if (cfg->fc_src_len > 128) { 2837 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2838 goto out; 2839 } 2840 #ifndef CONFIG_IPV6_SUBTREES 2841 if (cfg->fc_src_len) { 2842 NL_SET_ERR_MSG(extack, 2843 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2844 goto out; 2845 } 2846 #endif 2847 if (cfg->fc_ifindex) { 2848 err = -ENODEV; 2849 dev = dev_get_by_index(net, cfg->fc_ifindex); 2850 if (!dev) 2851 goto out; 2852 idev = in6_dev_get(dev); 2853 if (!idev) 2854 goto out; 2855 } 2856 2857 if (cfg->fc_metric == 0) 2858 cfg->fc_metric = IP6_RT_PRIO_USER; 2859 2860 if (cfg->fc_flags & RTNH_F_ONLINK) { 2861 if (!dev) { 2862 NL_SET_ERR_MSG(extack, 2863 "Nexthop device required for onlink"); 2864 err = -ENODEV; 2865 goto out; 2866 } 2867 2868 if (!(dev->flags & IFF_UP)) { 2869 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2870 err = -ENETDOWN; 2871 goto out; 2872 } 2873 } 2874 2875 err = -ENOBUFS; 2876 if (cfg->fc_nlinfo.nlh && 2877 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2878 table = fib6_get_table(net, cfg->fc_table); 2879 if (!table) { 2880 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2881 table = fib6_new_table(net, cfg->fc_table); 2882 } 2883 } else { 2884 table = fib6_new_table(net, cfg->fc_table); 2885 } 2886 2887 if (!table) 2888 goto out; 2889 2890 err = -ENOMEM; 2891 rt = fib6_info_alloc(gfp_flags); 2892 if (!rt) 2893 goto out; 2894 2895 if (cfg->fc_flags & RTF_ADDRCONF) 2896 rt->dst_nocount = true; 2897 2898 err = ip6_convert_metrics(net, rt, cfg); 2899 if (err < 0) 2900 goto out; 2901 2902 if (cfg->fc_flags & RTF_EXPIRES) 2903 fib6_set_expires(rt, jiffies + 2904 clock_t_to_jiffies(cfg->fc_expires)); 2905 else 2906 fib6_clean_expires(rt); 2907 2908 if (cfg->fc_protocol == RTPROT_UNSPEC) 2909 cfg->fc_protocol = RTPROT_BOOT; 2910 rt->fib6_protocol = cfg->fc_protocol; 2911 2912 addr_type = ipv6_addr_type(&cfg->fc_dst); 2913 2914 if (cfg->fc_encap) { 2915 struct lwtunnel_state *lwtstate; 2916 2917 err = lwtunnel_build_state(cfg->fc_encap_type, 2918 cfg->fc_encap, AF_INET6, cfg, 2919 &lwtstate, extack); 2920 if (err) 2921 goto out; 2922 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 2923 } 2924 2925 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 2926 rt->fib6_dst.plen = cfg->fc_dst_len; 2927 if (rt->fib6_dst.plen == 128) 2928 rt->dst_host = true; 2929 2930 #ifdef CONFIG_IPV6_SUBTREES 2931 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 2932 rt->fib6_src.plen = cfg->fc_src_len; 2933 #endif 2934 2935 rt->fib6_metric = cfg->fc_metric; 2936 rt->fib6_nh.nh_weight = 1; 2937 2938 rt->fib6_type = cfg->fc_type; 2939 2940 /* We cannot add true routes via loopback here, 2941 they would result in kernel looping; promote them to reject routes 2942 */ 2943 if ((cfg->fc_flags & RTF_REJECT) || 2944 (dev && (dev->flags & IFF_LOOPBACK) && 2945 !(addr_type & IPV6_ADDR_LOOPBACK) && 2946 !(cfg->fc_flags & RTF_LOCAL))) { 2947 /* hold loopback dev/idev if we haven't done so. */ 2948 if (dev != net->loopback_dev) { 2949 if (dev) { 2950 dev_put(dev); 2951 in6_dev_put(idev); 2952 } 2953 dev = net->loopback_dev; 2954 dev_hold(dev); 2955 idev = in6_dev_get(dev); 2956 if (!idev) { 2957 err = -ENODEV; 2958 goto out; 2959 } 2960 } 2961 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 2962 goto install_route; 2963 } 2964 2965 if (cfg->fc_flags & RTF_GATEWAY) { 2966 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 2967 if (err) 2968 goto out; 2969 2970 rt->fib6_nh.nh_gw = cfg->fc_gateway; 2971 } 2972 2973 err = -ENODEV; 2974 if (!dev) 2975 goto out; 2976 2977 if (idev->cnf.disable_ipv6) { 2978 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 2979 err = -EACCES; 2980 goto out; 2981 } 2982 2983 if (!(dev->flags & IFF_UP)) { 2984 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2985 err = -ENETDOWN; 2986 goto out; 2987 } 2988 2989 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 2990 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 2991 NL_SET_ERR_MSG(extack, "Invalid source address"); 2992 err = -EINVAL; 2993 goto out; 2994 } 2995 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 2996 rt->fib6_prefsrc.plen = 128; 2997 } else 2998 rt->fib6_prefsrc.plen = 0; 2999 3000 rt->fib6_flags = cfg->fc_flags; 3001 3002 install_route: 3003 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3004 !netif_carrier_ok(dev)) 3005 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3006 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3007 rt->fib6_nh.nh_dev = dev; 3008 rt->fib6_idev = idev; 3009 rt->fib6_table = table; 3010 3011 cfg->fc_nlinfo.nl_net = dev_net(dev); 3012 3013 return rt; 3014 out: 3015 if (dev) 3016 dev_put(dev); 3017 if (idev) 3018 in6_dev_put(idev); 3019 3020 fib6_info_release(rt); 3021 return ERR_PTR(err); 3022 } 3023 3024 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3025 struct netlink_ext_ack *extack) 3026 { 3027 struct fib6_info *rt; 3028 int err; 3029 3030 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3031 if (IS_ERR(rt)) 3032 return PTR_ERR(rt); 3033 3034 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3035 fib6_info_release(rt); 3036 3037 return err; 3038 } 3039 3040 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3041 { 3042 struct net *net = info->nl_net; 3043 struct fib6_table *table; 3044 int err; 3045 3046 if (rt == net->ipv6.fib6_null_entry) { 3047 err = -ENOENT; 3048 goto out; 3049 } 3050 3051 table = rt->fib6_table; 3052 spin_lock_bh(&table->tb6_lock); 3053 err = fib6_del(rt, info); 3054 spin_unlock_bh(&table->tb6_lock); 3055 3056 out: 3057 fib6_info_release(rt); 3058 return err; 3059 } 3060 3061 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3062 { 3063 struct nl_info info = { .nl_net = net }; 3064 3065 return __ip6_del_rt(rt, &info); 3066 } 3067 3068 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3069 { 3070 struct nl_info *info = &cfg->fc_nlinfo; 3071 struct net *net = info->nl_net; 3072 struct sk_buff *skb = NULL; 3073 struct fib6_table *table; 3074 int err = -ENOENT; 3075 3076 if (rt == net->ipv6.fib6_null_entry) 3077 goto out_put; 3078 table = rt->fib6_table; 3079 spin_lock_bh(&table->tb6_lock); 3080 3081 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3082 struct fib6_info *sibling, *next_sibling; 3083 3084 /* prefer to send a single notification with all hops */ 3085 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3086 if (skb) { 3087 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3088 3089 if (rt6_fill_node(net, skb, rt, NULL, 3090 NULL, NULL, 0, RTM_DELROUTE, 3091 info->portid, seq, 0) < 0) { 3092 kfree_skb(skb); 3093 skb = NULL; 3094 } else 3095 info->skip_notify = 1; 3096 } 3097 3098 list_for_each_entry_safe(sibling, next_sibling, 3099 &rt->fib6_siblings, 3100 fib6_siblings) { 3101 err = fib6_del(sibling, info); 3102 if (err) 3103 goto out_unlock; 3104 } 3105 } 3106 3107 err = fib6_del(rt, info); 3108 out_unlock: 3109 spin_unlock_bh(&table->tb6_lock); 3110 out_put: 3111 fib6_info_release(rt); 3112 3113 if (skb) { 3114 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3115 info->nlh, gfp_any()); 3116 } 3117 return err; 3118 } 3119 3120 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3121 { 3122 int rc = -ESRCH; 3123 3124 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3125 goto out; 3126 3127 if (cfg->fc_flags & RTF_GATEWAY && 3128 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3129 goto out; 3130 if (dst_hold_safe(&rt->dst)) 3131 rc = rt6_remove_exception_rt(rt); 3132 out: 3133 return rc; 3134 } 3135 3136 static int ip6_route_del(struct fib6_config *cfg, 3137 struct netlink_ext_ack *extack) 3138 { 3139 struct rt6_info *rt_cache; 3140 struct fib6_table *table; 3141 struct fib6_info *rt; 3142 struct fib6_node *fn; 3143 int err = -ESRCH; 3144 3145 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3146 if (!table) { 3147 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3148 return err; 3149 } 3150 3151 rcu_read_lock(); 3152 3153 fn = fib6_locate(&table->tb6_root, 3154 &cfg->fc_dst, cfg->fc_dst_len, 3155 &cfg->fc_src, cfg->fc_src_len, 3156 !(cfg->fc_flags & RTF_CACHE)); 3157 3158 if (fn) { 3159 for_each_fib6_node_rt_rcu(fn) { 3160 if (cfg->fc_flags & RTF_CACHE) { 3161 int rc; 3162 3163 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3164 &cfg->fc_src); 3165 if (rt_cache) { 3166 rc = ip6_del_cached_rt(rt_cache, cfg); 3167 if (rc != -ESRCH) 3168 return rc; 3169 } 3170 continue; 3171 } 3172 if (cfg->fc_ifindex && 3173 (!rt->fib6_nh.nh_dev || 3174 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3175 continue; 3176 if (cfg->fc_flags & RTF_GATEWAY && 3177 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3178 continue; 3179 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3180 continue; 3181 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3182 continue; 3183 fib6_info_hold(rt); 3184 rcu_read_unlock(); 3185 3186 /* if gateway was specified only delete the one hop */ 3187 if (cfg->fc_flags & RTF_GATEWAY) 3188 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3189 3190 return __ip6_del_rt_siblings(rt, cfg); 3191 } 3192 } 3193 rcu_read_unlock(); 3194 3195 return err; 3196 } 3197 3198 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3199 { 3200 struct netevent_redirect netevent; 3201 struct rt6_info *rt, *nrt = NULL; 3202 struct ndisc_options ndopts; 3203 struct inet6_dev *in6_dev; 3204 struct neighbour *neigh; 3205 struct rd_msg *msg; 3206 int optlen, on_link; 3207 u8 *lladdr; 3208 3209 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3210 optlen -= sizeof(*msg); 3211 3212 if (optlen < 0) { 3213 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3214 return; 3215 } 3216 3217 msg = (struct rd_msg *)icmp6_hdr(skb); 3218 3219 if (ipv6_addr_is_multicast(&msg->dest)) { 3220 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3221 return; 3222 } 3223 3224 on_link = 0; 3225 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3226 on_link = 1; 3227 } else if (ipv6_addr_type(&msg->target) != 3228 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3229 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3230 return; 3231 } 3232 3233 in6_dev = __in6_dev_get(skb->dev); 3234 if (!in6_dev) 3235 return; 3236 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3237 return; 3238 3239 /* RFC2461 8.1: 3240 * The IP source address of the Redirect MUST be the same as the current 3241 * first-hop router for the specified ICMP Destination Address. 3242 */ 3243 3244 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3245 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3246 return; 3247 } 3248 3249 lladdr = NULL; 3250 if (ndopts.nd_opts_tgt_lladdr) { 3251 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3252 skb->dev); 3253 if (!lladdr) { 3254 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3255 return; 3256 } 3257 } 3258 3259 rt = (struct rt6_info *) dst; 3260 if (rt->rt6i_flags & RTF_REJECT) { 3261 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3262 return; 3263 } 3264 3265 /* Redirect received -> path was valid. 3266 * Look, redirects are sent only in response to data packets, 3267 * so that this nexthop apparently is reachable. --ANK 3268 */ 3269 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3270 3271 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3272 if (!neigh) 3273 return; 3274 3275 /* 3276 * We have finally decided to accept it. 3277 */ 3278 3279 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3280 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3281 NEIGH_UPDATE_F_OVERRIDE| 3282 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3283 NEIGH_UPDATE_F_ISROUTER)), 3284 NDISC_REDIRECT, &ndopts); 3285 3286 nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL); 3287 if (!nrt) 3288 goto out; 3289 3290 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3291 if (on_link) 3292 nrt->rt6i_flags &= ~RTF_GATEWAY; 3293 3294 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3295 3296 /* No need to remove rt from the exception table if rt is 3297 * a cached route because rt6_insert_exception() will 3298 * takes care of it 3299 */ 3300 if (rt6_insert_exception(nrt, rt->from)) { 3301 dst_release_immediate(&nrt->dst); 3302 goto out; 3303 } 3304 3305 netevent.old = &rt->dst; 3306 netevent.new = &nrt->dst; 3307 netevent.daddr = &msg->dest; 3308 netevent.neigh = neigh; 3309 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3310 3311 out: 3312 neigh_release(neigh); 3313 } 3314 3315 #ifdef CONFIG_IPV6_ROUTE_INFO 3316 static struct fib6_info *rt6_get_route_info(struct net *net, 3317 const struct in6_addr *prefix, int prefixlen, 3318 const struct in6_addr *gwaddr, 3319 struct net_device *dev) 3320 { 3321 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3322 int ifindex = dev->ifindex; 3323 struct fib6_node *fn; 3324 struct fib6_info *rt = NULL; 3325 struct fib6_table *table; 3326 3327 table = fib6_get_table(net, tb_id); 3328 if (!table) 3329 return NULL; 3330 3331 rcu_read_lock(); 3332 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3333 if (!fn) 3334 goto out; 3335 3336 for_each_fib6_node_rt_rcu(fn) { 3337 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3338 continue; 3339 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3340 continue; 3341 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3342 continue; 3343 fib6_info_hold(rt); 3344 break; 3345 } 3346 out: 3347 rcu_read_unlock(); 3348 return rt; 3349 } 3350 3351 static struct fib6_info *rt6_add_route_info(struct net *net, 3352 const struct in6_addr *prefix, int prefixlen, 3353 const struct in6_addr *gwaddr, 3354 struct net_device *dev, 3355 unsigned int pref) 3356 { 3357 struct fib6_config cfg = { 3358 .fc_metric = IP6_RT_PRIO_USER, 3359 .fc_ifindex = dev->ifindex, 3360 .fc_dst_len = prefixlen, 3361 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3362 RTF_UP | RTF_PREF(pref), 3363 .fc_protocol = RTPROT_RA, 3364 .fc_type = RTN_UNICAST, 3365 .fc_nlinfo.portid = 0, 3366 .fc_nlinfo.nlh = NULL, 3367 .fc_nlinfo.nl_net = net, 3368 }; 3369 3370 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3371 cfg.fc_dst = *prefix; 3372 cfg.fc_gateway = *gwaddr; 3373 3374 /* We should treat it as a default route if prefix length is 0. */ 3375 if (!prefixlen) 3376 cfg.fc_flags |= RTF_DEFAULT; 3377 3378 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3379 3380 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3381 } 3382 #endif 3383 3384 struct fib6_info *rt6_get_dflt_router(struct net *net, 3385 const struct in6_addr *addr, 3386 struct net_device *dev) 3387 { 3388 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3389 struct fib6_info *rt; 3390 struct fib6_table *table; 3391 3392 table = fib6_get_table(net, tb_id); 3393 if (!table) 3394 return NULL; 3395 3396 rcu_read_lock(); 3397 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3398 if (dev == rt->fib6_nh.nh_dev && 3399 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3400 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3401 break; 3402 } 3403 if (rt) 3404 fib6_info_hold(rt); 3405 rcu_read_unlock(); 3406 return rt; 3407 } 3408 3409 struct fib6_info *rt6_add_dflt_router(struct net *net, 3410 const struct in6_addr *gwaddr, 3411 struct net_device *dev, 3412 unsigned int pref) 3413 { 3414 struct fib6_config cfg = { 3415 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3416 .fc_metric = IP6_RT_PRIO_USER, 3417 .fc_ifindex = dev->ifindex, 3418 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3419 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3420 .fc_protocol = RTPROT_RA, 3421 .fc_type = RTN_UNICAST, 3422 .fc_nlinfo.portid = 0, 3423 .fc_nlinfo.nlh = NULL, 3424 .fc_nlinfo.nl_net = net, 3425 }; 3426 3427 cfg.fc_gateway = *gwaddr; 3428 3429 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3430 struct fib6_table *table; 3431 3432 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3433 if (table) 3434 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3435 } 3436 3437 return rt6_get_dflt_router(net, gwaddr, dev); 3438 } 3439 3440 static void __rt6_purge_dflt_routers(struct net *net, 3441 struct fib6_table *table) 3442 { 3443 struct fib6_info *rt; 3444 3445 restart: 3446 rcu_read_lock(); 3447 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3448 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3449 (!rt->fib6_idev || rt->fib6_idev->cnf.accept_ra != 2)) { 3450 fib6_info_hold(rt); 3451 rcu_read_unlock(); 3452 ip6_del_rt(net, rt); 3453 goto restart; 3454 } 3455 } 3456 rcu_read_unlock(); 3457 3458 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3459 } 3460 3461 void rt6_purge_dflt_routers(struct net *net) 3462 { 3463 struct fib6_table *table; 3464 struct hlist_head *head; 3465 unsigned int h; 3466 3467 rcu_read_lock(); 3468 3469 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3470 head = &net->ipv6.fib_table_hash[h]; 3471 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3472 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3473 __rt6_purge_dflt_routers(net, table); 3474 } 3475 } 3476 3477 rcu_read_unlock(); 3478 } 3479 3480 static void rtmsg_to_fib6_config(struct net *net, 3481 struct in6_rtmsg *rtmsg, 3482 struct fib6_config *cfg) 3483 { 3484 memset(cfg, 0, sizeof(*cfg)); 3485 3486 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3487 : RT6_TABLE_MAIN; 3488 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3489 cfg->fc_metric = rtmsg->rtmsg_metric; 3490 cfg->fc_expires = rtmsg->rtmsg_info; 3491 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3492 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3493 cfg->fc_flags = rtmsg->rtmsg_flags; 3494 cfg->fc_type = rtmsg->rtmsg_type; 3495 3496 cfg->fc_nlinfo.nl_net = net; 3497 3498 cfg->fc_dst = rtmsg->rtmsg_dst; 3499 cfg->fc_src = rtmsg->rtmsg_src; 3500 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3501 } 3502 3503 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3504 { 3505 struct fib6_config cfg; 3506 struct in6_rtmsg rtmsg; 3507 int err; 3508 3509 switch (cmd) { 3510 case SIOCADDRT: /* Add a route */ 3511 case SIOCDELRT: /* Delete a route */ 3512 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3513 return -EPERM; 3514 err = copy_from_user(&rtmsg, arg, 3515 sizeof(struct in6_rtmsg)); 3516 if (err) 3517 return -EFAULT; 3518 3519 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3520 3521 rtnl_lock(); 3522 switch (cmd) { 3523 case SIOCADDRT: 3524 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3525 break; 3526 case SIOCDELRT: 3527 err = ip6_route_del(&cfg, NULL); 3528 break; 3529 default: 3530 err = -EINVAL; 3531 } 3532 rtnl_unlock(); 3533 3534 return err; 3535 } 3536 3537 return -EINVAL; 3538 } 3539 3540 /* 3541 * Drop the packet on the floor 3542 */ 3543 3544 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3545 { 3546 int type; 3547 struct dst_entry *dst = skb_dst(skb); 3548 switch (ipstats_mib_noroutes) { 3549 case IPSTATS_MIB_INNOROUTES: 3550 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3551 if (type == IPV6_ADDR_ANY) { 3552 IP6_INC_STATS(dev_net(dst->dev), 3553 __in6_dev_get_safely(skb->dev), 3554 IPSTATS_MIB_INADDRERRORS); 3555 break; 3556 } 3557 /* FALLTHROUGH */ 3558 case IPSTATS_MIB_OUTNOROUTES: 3559 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3560 ipstats_mib_noroutes); 3561 break; 3562 } 3563 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3564 kfree_skb(skb); 3565 return 0; 3566 } 3567 3568 static int ip6_pkt_discard(struct sk_buff *skb) 3569 { 3570 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3571 } 3572 3573 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3574 { 3575 skb->dev = skb_dst(skb)->dev; 3576 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3577 } 3578 3579 static int ip6_pkt_prohibit(struct sk_buff *skb) 3580 { 3581 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3582 } 3583 3584 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3585 { 3586 skb->dev = skb_dst(skb)->dev; 3587 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3588 } 3589 3590 /* 3591 * Allocate a dst for local (unicast / anycast) address. 3592 */ 3593 3594 struct fib6_info *addrconf_dst_alloc(struct net *net, 3595 struct inet6_dev *idev, 3596 const struct in6_addr *addr, 3597 bool anycast, gfp_t gfp_flags) 3598 { 3599 u32 tb_id; 3600 struct net_device *dev = idev->dev; 3601 struct fib6_info *rt; 3602 3603 rt = fib6_info_alloc(gfp_flags); 3604 if (!rt) 3605 return ERR_PTR(-ENOMEM); 3606 3607 rt->dst_nocount = true; 3608 3609 in6_dev_hold(idev); 3610 rt->fib6_idev = idev; 3611 3612 rt->dst_host = true; 3613 rt->fib6_protocol = RTPROT_KERNEL; 3614 rt->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3615 if (anycast) { 3616 rt->fib6_type = RTN_ANYCAST; 3617 rt->fib6_flags |= RTF_ANYCAST; 3618 } else { 3619 rt->fib6_type = RTN_LOCAL; 3620 rt->fib6_flags |= RTF_LOCAL; 3621 } 3622 3623 rt->fib6_nh.nh_gw = *addr; 3624 dev_hold(dev); 3625 rt->fib6_nh.nh_dev = dev; 3626 rt->fib6_dst.addr = *addr; 3627 rt->fib6_dst.plen = 128; 3628 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3629 rt->fib6_table = fib6_get_table(net, tb_id); 3630 3631 return rt; 3632 } 3633 3634 /* remove deleted ip from prefsrc entries */ 3635 struct arg_dev_net_ip { 3636 struct net_device *dev; 3637 struct net *net; 3638 struct in6_addr *addr; 3639 }; 3640 3641 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3642 { 3643 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3644 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3645 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3646 3647 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3648 rt != net->ipv6.fib6_null_entry && 3649 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3650 spin_lock_bh(&rt6_exception_lock); 3651 /* remove prefsrc entry */ 3652 rt->fib6_prefsrc.plen = 0; 3653 /* need to update cache as well */ 3654 rt6_exceptions_remove_prefsrc(rt); 3655 spin_unlock_bh(&rt6_exception_lock); 3656 } 3657 return 0; 3658 } 3659 3660 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3661 { 3662 struct net *net = dev_net(ifp->idev->dev); 3663 struct arg_dev_net_ip adni = { 3664 .dev = ifp->idev->dev, 3665 .net = net, 3666 .addr = &ifp->addr, 3667 }; 3668 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3669 } 3670 3671 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3672 3673 /* Remove routers and update dst entries when gateway turn into host. */ 3674 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3675 { 3676 struct in6_addr *gateway = (struct in6_addr *)arg; 3677 3678 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3679 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3680 return -1; 3681 } 3682 3683 /* Further clean up cached routes in exception table. 3684 * This is needed because cached route may have a different 3685 * gateway than its 'parent' in the case of an ip redirect. 3686 */ 3687 rt6_exceptions_clean_tohost(rt, gateway); 3688 3689 return 0; 3690 } 3691 3692 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3693 { 3694 fib6_clean_all(net, fib6_clean_tohost, gateway); 3695 } 3696 3697 struct arg_netdev_event { 3698 const struct net_device *dev; 3699 union { 3700 unsigned int nh_flags; 3701 unsigned long event; 3702 }; 3703 }; 3704 3705 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3706 { 3707 struct fib6_info *iter; 3708 struct fib6_node *fn; 3709 3710 fn = rcu_dereference_protected(rt->fib6_node, 3711 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3712 iter = rcu_dereference_protected(fn->leaf, 3713 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3714 while (iter) { 3715 if (iter->fib6_metric == rt->fib6_metric && 3716 rt6_qualify_for_ecmp(iter)) 3717 return iter; 3718 iter = rcu_dereference_protected(iter->rt6_next, 3719 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3720 } 3721 3722 return NULL; 3723 } 3724 3725 static bool rt6_is_dead(const struct fib6_info *rt) 3726 { 3727 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3728 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3729 rt->fib6_idev->cnf.ignore_routes_with_linkdown)) 3730 return true; 3731 3732 return false; 3733 } 3734 3735 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3736 { 3737 struct fib6_info *iter; 3738 int total = 0; 3739 3740 if (!rt6_is_dead(rt)) 3741 total += rt->fib6_nh.nh_weight; 3742 3743 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3744 if (!rt6_is_dead(iter)) 3745 total += iter->fib6_nh.nh_weight; 3746 } 3747 3748 return total; 3749 } 3750 3751 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3752 { 3753 int upper_bound = -1; 3754 3755 if (!rt6_is_dead(rt)) { 3756 *weight += rt->fib6_nh.nh_weight; 3757 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3758 total) - 1; 3759 } 3760 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3761 } 3762 3763 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3764 { 3765 struct fib6_info *iter; 3766 int weight = 0; 3767 3768 rt6_upper_bound_set(rt, &weight, total); 3769 3770 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3771 rt6_upper_bound_set(iter, &weight, total); 3772 } 3773 3774 void rt6_multipath_rebalance(struct fib6_info *rt) 3775 { 3776 struct fib6_info *first; 3777 int total; 3778 3779 /* In case the entire multipath route was marked for flushing, 3780 * then there is no need to rebalance upon the removal of every 3781 * sibling route. 3782 */ 3783 if (!rt->fib6_nsiblings || rt->should_flush) 3784 return; 3785 3786 /* During lookup routes are evaluated in order, so we need to 3787 * make sure upper bounds are assigned from the first sibling 3788 * onwards. 3789 */ 3790 first = rt6_multipath_first_sibling(rt); 3791 if (WARN_ON_ONCE(!first)) 3792 return; 3793 3794 total = rt6_multipath_total_weight(first); 3795 rt6_multipath_upper_bound_set(first, total); 3796 } 3797 3798 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3799 { 3800 const struct arg_netdev_event *arg = p_arg; 3801 struct net *net = dev_net(arg->dev); 3802 3803 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3804 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3805 fib6_update_sernum_upto_root(net, rt); 3806 rt6_multipath_rebalance(rt); 3807 } 3808 3809 return 0; 3810 } 3811 3812 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3813 { 3814 struct arg_netdev_event arg = { 3815 .dev = dev, 3816 { 3817 .nh_flags = nh_flags, 3818 }, 3819 }; 3820 3821 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3822 arg.nh_flags |= RTNH_F_LINKDOWN; 3823 3824 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3825 } 3826 3827 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3828 const struct net_device *dev) 3829 { 3830 struct fib6_info *iter; 3831 3832 if (rt->fib6_nh.nh_dev == dev) 3833 return true; 3834 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3835 if (iter->fib6_nh.nh_dev == dev) 3836 return true; 3837 3838 return false; 3839 } 3840 3841 static void rt6_multipath_flush(struct fib6_info *rt) 3842 { 3843 struct fib6_info *iter; 3844 3845 rt->should_flush = 1; 3846 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3847 iter->should_flush = 1; 3848 } 3849 3850 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3851 const struct net_device *down_dev) 3852 { 3853 struct fib6_info *iter; 3854 unsigned int dead = 0; 3855 3856 if (rt->fib6_nh.nh_dev == down_dev || 3857 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3858 dead++; 3859 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3860 if (iter->fib6_nh.nh_dev == down_dev || 3861 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3862 dead++; 3863 3864 return dead; 3865 } 3866 3867 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 3868 const struct net_device *dev, 3869 unsigned int nh_flags) 3870 { 3871 struct fib6_info *iter; 3872 3873 if (rt->fib6_nh.nh_dev == dev) 3874 rt->fib6_nh.nh_flags |= nh_flags; 3875 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3876 if (iter->fib6_nh.nh_dev == dev) 3877 iter->fib6_nh.nh_flags |= nh_flags; 3878 } 3879 3880 /* called with write lock held for table with rt */ 3881 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 3882 { 3883 const struct arg_netdev_event *arg = p_arg; 3884 const struct net_device *dev = arg->dev; 3885 struct net *net = dev_net(dev); 3886 3887 if (rt == net->ipv6.fib6_null_entry) 3888 return 0; 3889 3890 switch (arg->event) { 3891 case NETDEV_UNREGISTER: 3892 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 3893 case NETDEV_DOWN: 3894 if (rt->should_flush) 3895 return -1; 3896 if (!rt->fib6_nsiblings) 3897 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 3898 if (rt6_multipath_uses_dev(rt, dev)) { 3899 unsigned int count; 3900 3901 count = rt6_multipath_dead_count(rt, dev); 3902 if (rt->fib6_nsiblings + 1 == count) { 3903 rt6_multipath_flush(rt); 3904 return -1; 3905 } 3906 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 3907 RTNH_F_LINKDOWN); 3908 fib6_update_sernum(net, rt); 3909 rt6_multipath_rebalance(rt); 3910 } 3911 return -2; 3912 case NETDEV_CHANGE: 3913 if (rt->fib6_nh.nh_dev != dev || 3914 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 3915 break; 3916 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3917 rt6_multipath_rebalance(rt); 3918 break; 3919 } 3920 3921 return 0; 3922 } 3923 3924 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 3925 { 3926 struct arg_netdev_event arg = { 3927 .dev = dev, 3928 { 3929 .event = event, 3930 }, 3931 }; 3932 3933 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 3934 } 3935 3936 void rt6_disable_ip(struct net_device *dev, unsigned long event) 3937 { 3938 rt6_sync_down_dev(dev, event); 3939 rt6_uncached_list_flush_dev(dev_net(dev), dev); 3940 neigh_ifdown(&nd_tbl, dev); 3941 } 3942 3943 struct rt6_mtu_change_arg { 3944 struct net_device *dev; 3945 unsigned int mtu; 3946 }; 3947 3948 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 3949 { 3950 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 3951 struct inet6_dev *idev; 3952 3953 /* In IPv6 pmtu discovery is not optional, 3954 so that RTAX_MTU lock cannot disable it. 3955 We still use this lock to block changes 3956 caused by addrconf/ndisc. 3957 */ 3958 3959 idev = __in6_dev_get(arg->dev); 3960 if (!idev) 3961 return 0; 3962 3963 /* For administrative MTU increase, there is no way to discover 3964 IPv6 PMTU increase, so PMTU increase should be updated here. 3965 Since RFC 1981 doesn't include administrative MTU increase 3966 update PMTU increase is a MUST. (i.e. jumbo frame) 3967 */ 3968 if (rt->fib6_nh.nh_dev == arg->dev && 3969 !fib6_metric_locked(rt, RTAX_MTU)) { 3970 u32 mtu = rt->fib6_pmtu; 3971 3972 if (mtu >= arg->mtu || 3973 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 3974 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 3975 3976 spin_lock_bh(&rt6_exception_lock); 3977 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 3978 spin_unlock_bh(&rt6_exception_lock); 3979 } 3980 return 0; 3981 } 3982 3983 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 3984 { 3985 struct rt6_mtu_change_arg arg = { 3986 .dev = dev, 3987 .mtu = mtu, 3988 }; 3989 3990 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 3991 } 3992 3993 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 3994 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 3995 [RTA_OIF] = { .type = NLA_U32 }, 3996 [RTA_IIF] = { .type = NLA_U32 }, 3997 [RTA_PRIORITY] = { .type = NLA_U32 }, 3998 [RTA_METRICS] = { .type = NLA_NESTED }, 3999 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4000 [RTA_PREF] = { .type = NLA_U8 }, 4001 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4002 [RTA_ENCAP] = { .type = NLA_NESTED }, 4003 [RTA_EXPIRES] = { .type = NLA_U32 }, 4004 [RTA_UID] = { .type = NLA_U32 }, 4005 [RTA_MARK] = { .type = NLA_U32 }, 4006 }; 4007 4008 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4009 struct fib6_config *cfg, 4010 struct netlink_ext_ack *extack) 4011 { 4012 struct rtmsg *rtm; 4013 struct nlattr *tb[RTA_MAX+1]; 4014 unsigned int pref; 4015 int err; 4016 4017 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4018 NULL); 4019 if (err < 0) 4020 goto errout; 4021 4022 err = -EINVAL; 4023 rtm = nlmsg_data(nlh); 4024 memset(cfg, 0, sizeof(*cfg)); 4025 4026 cfg->fc_table = rtm->rtm_table; 4027 cfg->fc_dst_len = rtm->rtm_dst_len; 4028 cfg->fc_src_len = rtm->rtm_src_len; 4029 cfg->fc_flags = RTF_UP; 4030 cfg->fc_protocol = rtm->rtm_protocol; 4031 cfg->fc_type = rtm->rtm_type; 4032 4033 if (rtm->rtm_type == RTN_UNREACHABLE || 4034 rtm->rtm_type == RTN_BLACKHOLE || 4035 rtm->rtm_type == RTN_PROHIBIT || 4036 rtm->rtm_type == RTN_THROW) 4037 cfg->fc_flags |= RTF_REJECT; 4038 4039 if (rtm->rtm_type == RTN_LOCAL) 4040 cfg->fc_flags |= RTF_LOCAL; 4041 4042 if (rtm->rtm_flags & RTM_F_CLONED) 4043 cfg->fc_flags |= RTF_CACHE; 4044 4045 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4046 4047 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4048 cfg->fc_nlinfo.nlh = nlh; 4049 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4050 4051 if (tb[RTA_GATEWAY]) { 4052 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4053 cfg->fc_flags |= RTF_GATEWAY; 4054 } 4055 4056 if (tb[RTA_DST]) { 4057 int plen = (rtm->rtm_dst_len + 7) >> 3; 4058 4059 if (nla_len(tb[RTA_DST]) < plen) 4060 goto errout; 4061 4062 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4063 } 4064 4065 if (tb[RTA_SRC]) { 4066 int plen = (rtm->rtm_src_len + 7) >> 3; 4067 4068 if (nla_len(tb[RTA_SRC]) < plen) 4069 goto errout; 4070 4071 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4072 } 4073 4074 if (tb[RTA_PREFSRC]) 4075 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4076 4077 if (tb[RTA_OIF]) 4078 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4079 4080 if (tb[RTA_PRIORITY]) 4081 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4082 4083 if (tb[RTA_METRICS]) { 4084 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4085 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4086 } 4087 4088 if (tb[RTA_TABLE]) 4089 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4090 4091 if (tb[RTA_MULTIPATH]) { 4092 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4093 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4094 4095 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4096 cfg->fc_mp_len, extack); 4097 if (err < 0) 4098 goto errout; 4099 } 4100 4101 if (tb[RTA_PREF]) { 4102 pref = nla_get_u8(tb[RTA_PREF]); 4103 if (pref != ICMPV6_ROUTER_PREF_LOW && 4104 pref != ICMPV6_ROUTER_PREF_HIGH) 4105 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4106 cfg->fc_flags |= RTF_PREF(pref); 4107 } 4108 4109 if (tb[RTA_ENCAP]) 4110 cfg->fc_encap = tb[RTA_ENCAP]; 4111 4112 if (tb[RTA_ENCAP_TYPE]) { 4113 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4114 4115 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4116 if (err < 0) 4117 goto errout; 4118 } 4119 4120 if (tb[RTA_EXPIRES]) { 4121 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4122 4123 if (addrconf_finite_timeout(timeout)) { 4124 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4125 cfg->fc_flags |= RTF_EXPIRES; 4126 } 4127 } 4128 4129 err = 0; 4130 errout: 4131 return err; 4132 } 4133 4134 struct rt6_nh { 4135 struct fib6_info *fib6_info; 4136 struct fib6_config r_cfg; 4137 struct list_head next; 4138 }; 4139 4140 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 4141 { 4142 struct rt6_nh *nh; 4143 4144 list_for_each_entry(nh, rt6_nh_list, next) { 4145 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 4146 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 4147 nh->r_cfg.fc_ifindex); 4148 } 4149 } 4150 4151 static int ip6_route_info_append(struct net *net, 4152 struct list_head *rt6_nh_list, 4153 struct fib6_info *rt, 4154 struct fib6_config *r_cfg) 4155 { 4156 struct rt6_nh *nh; 4157 int err = -EEXIST; 4158 4159 list_for_each_entry(nh, rt6_nh_list, next) { 4160 /* check if fib6_info already exists */ 4161 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4162 return err; 4163 } 4164 4165 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4166 if (!nh) 4167 return -ENOMEM; 4168 nh->fib6_info = rt; 4169 err = ip6_convert_metrics(net, rt, r_cfg); 4170 if (err) { 4171 kfree(nh); 4172 return err; 4173 } 4174 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4175 list_add_tail(&nh->next, rt6_nh_list); 4176 4177 return 0; 4178 } 4179 4180 static void ip6_route_mpath_notify(struct fib6_info *rt, 4181 struct fib6_info *rt_last, 4182 struct nl_info *info, 4183 __u16 nlflags) 4184 { 4185 /* if this is an APPEND route, then rt points to the first route 4186 * inserted and rt_last points to last route inserted. Userspace 4187 * wants a consistent dump of the route which starts at the first 4188 * nexthop. Since sibling routes are always added at the end of 4189 * the list, find the first sibling of the last route appended 4190 */ 4191 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4192 rt = list_first_entry(&rt_last->fib6_siblings, 4193 struct fib6_info, 4194 fib6_siblings); 4195 } 4196 4197 if (rt) 4198 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4199 } 4200 4201 static int ip6_route_multipath_add(struct fib6_config *cfg, 4202 struct netlink_ext_ack *extack) 4203 { 4204 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4205 struct nl_info *info = &cfg->fc_nlinfo; 4206 struct fib6_config r_cfg; 4207 struct rtnexthop *rtnh; 4208 struct fib6_info *rt; 4209 struct rt6_nh *err_nh; 4210 struct rt6_nh *nh, *nh_safe; 4211 __u16 nlflags; 4212 int remaining; 4213 int attrlen; 4214 int err = 1; 4215 int nhn = 0; 4216 int replace = (cfg->fc_nlinfo.nlh && 4217 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4218 LIST_HEAD(rt6_nh_list); 4219 4220 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 4221 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4222 nlflags |= NLM_F_APPEND; 4223 4224 remaining = cfg->fc_mp_len; 4225 rtnh = (struct rtnexthop *)cfg->fc_mp; 4226 4227 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4228 * fib6_info structs per nexthop 4229 */ 4230 while (rtnh_ok(rtnh, remaining)) { 4231 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4232 if (rtnh->rtnh_ifindex) 4233 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4234 4235 attrlen = rtnh_attrlen(rtnh); 4236 if (attrlen > 0) { 4237 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4238 4239 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4240 if (nla) { 4241 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4242 r_cfg.fc_flags |= RTF_GATEWAY; 4243 } 4244 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4245 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4246 if (nla) 4247 r_cfg.fc_encap_type = nla_get_u16(nla); 4248 } 4249 4250 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4251 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4252 if (IS_ERR(rt)) { 4253 err = PTR_ERR(rt); 4254 rt = NULL; 4255 goto cleanup; 4256 } 4257 4258 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; 4259 4260 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4261 rt, &r_cfg); 4262 if (err) { 4263 fib6_info_release(rt); 4264 goto cleanup; 4265 } 4266 4267 rtnh = rtnh_next(rtnh, &remaining); 4268 } 4269 4270 /* for add and replace send one notification with all nexthops. 4271 * Skip the notification in fib6_add_rt2node and send one with 4272 * the full route when done 4273 */ 4274 info->skip_notify = 1; 4275 4276 err_nh = NULL; 4277 list_for_each_entry(nh, &rt6_nh_list, next) { 4278 rt_last = nh->fib6_info; 4279 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4280 fib6_info_release(nh->fib6_info); 4281 4282 /* save reference to first route for notification */ 4283 if (!rt_notif && !err) 4284 rt_notif = nh->fib6_info; 4285 4286 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 4287 nh->fib6_info = NULL; 4288 if (err) { 4289 if (replace && nhn) 4290 ip6_print_replace_route_err(&rt6_nh_list); 4291 err_nh = nh; 4292 goto add_errout; 4293 } 4294 4295 /* Because each route is added like a single route we remove 4296 * these flags after the first nexthop: if there is a collision, 4297 * we have already failed to add the first nexthop: 4298 * fib6_add_rt2node() has rejected it; when replacing, old 4299 * nexthops have been replaced by first new, the rest should 4300 * be added to it. 4301 */ 4302 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4303 NLM_F_REPLACE); 4304 nhn++; 4305 } 4306 4307 /* success ... tell user about new route */ 4308 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4309 goto cleanup; 4310 4311 add_errout: 4312 /* send notification for routes that were added so that 4313 * the delete notifications sent by ip6_route_del are 4314 * coherent 4315 */ 4316 if (rt_notif) 4317 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4318 4319 /* Delete routes that were already added */ 4320 list_for_each_entry(nh, &rt6_nh_list, next) { 4321 if (err_nh == nh) 4322 break; 4323 ip6_route_del(&nh->r_cfg, extack); 4324 } 4325 4326 cleanup: 4327 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4328 if (nh->fib6_info) 4329 fib6_info_release(nh->fib6_info); 4330 list_del(&nh->next); 4331 kfree(nh); 4332 } 4333 4334 return err; 4335 } 4336 4337 static int ip6_route_multipath_del(struct fib6_config *cfg, 4338 struct netlink_ext_ack *extack) 4339 { 4340 struct fib6_config r_cfg; 4341 struct rtnexthop *rtnh; 4342 int remaining; 4343 int attrlen; 4344 int err = 1, last_err = 0; 4345 4346 remaining = cfg->fc_mp_len; 4347 rtnh = (struct rtnexthop *)cfg->fc_mp; 4348 4349 /* Parse a Multipath Entry */ 4350 while (rtnh_ok(rtnh, remaining)) { 4351 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4352 if (rtnh->rtnh_ifindex) 4353 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4354 4355 attrlen = rtnh_attrlen(rtnh); 4356 if (attrlen > 0) { 4357 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4358 4359 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4360 if (nla) { 4361 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4362 r_cfg.fc_flags |= RTF_GATEWAY; 4363 } 4364 } 4365 err = ip6_route_del(&r_cfg, extack); 4366 if (err) 4367 last_err = err; 4368 4369 rtnh = rtnh_next(rtnh, &remaining); 4370 } 4371 4372 return last_err; 4373 } 4374 4375 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4376 struct netlink_ext_ack *extack) 4377 { 4378 struct fib6_config cfg; 4379 int err; 4380 4381 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4382 if (err < 0) 4383 return err; 4384 4385 if (cfg.fc_mp) 4386 return ip6_route_multipath_del(&cfg, extack); 4387 else { 4388 cfg.fc_delete_all_nh = 1; 4389 return ip6_route_del(&cfg, extack); 4390 } 4391 } 4392 4393 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4394 struct netlink_ext_ack *extack) 4395 { 4396 struct fib6_config cfg; 4397 int err; 4398 4399 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4400 if (err < 0) 4401 return err; 4402 4403 if (cfg.fc_mp) 4404 return ip6_route_multipath_add(&cfg, extack); 4405 else 4406 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4407 } 4408 4409 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4410 { 4411 int nexthop_len = 0; 4412 4413 if (rt->fib6_nsiblings) { 4414 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4415 + NLA_ALIGN(sizeof(struct rtnexthop)) 4416 + nla_total_size(16) /* RTA_GATEWAY */ 4417 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); 4418 4419 nexthop_len *= rt->fib6_nsiblings; 4420 } 4421 4422 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4423 + nla_total_size(16) /* RTA_SRC */ 4424 + nla_total_size(16) /* RTA_DST */ 4425 + nla_total_size(16) /* RTA_GATEWAY */ 4426 + nla_total_size(16) /* RTA_PREFSRC */ 4427 + nla_total_size(4) /* RTA_TABLE */ 4428 + nla_total_size(4) /* RTA_IIF */ 4429 + nla_total_size(4) /* RTA_OIF */ 4430 + nla_total_size(4) /* RTA_PRIORITY */ 4431 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4432 + nla_total_size(sizeof(struct rta_cacheinfo)) 4433 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4434 + nla_total_size(1) /* RTA_PREF */ 4435 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) 4436 + nexthop_len; 4437 } 4438 4439 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, 4440 unsigned int *flags, bool skip_oif) 4441 { 4442 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4443 *flags |= RTNH_F_DEAD; 4444 4445 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { 4446 *flags |= RTNH_F_LINKDOWN; 4447 if (rt->fib6_idev->cnf.ignore_routes_with_linkdown) 4448 *flags |= RTNH_F_DEAD; 4449 } 4450 4451 if (rt->fib6_flags & RTF_GATEWAY) { 4452 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) 4453 goto nla_put_failure; 4454 } 4455 4456 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); 4457 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) 4458 *flags |= RTNH_F_OFFLOAD; 4459 4460 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4461 if (!skip_oif && rt->fib6_nh.nh_dev && 4462 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) 4463 goto nla_put_failure; 4464 4465 if (rt->fib6_nh.nh_lwtstate && 4466 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) 4467 goto nla_put_failure; 4468 4469 return 0; 4470 4471 nla_put_failure: 4472 return -EMSGSIZE; 4473 } 4474 4475 /* add multipath next hop */ 4476 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) 4477 { 4478 const struct net_device *dev = rt->fib6_nh.nh_dev; 4479 struct rtnexthop *rtnh; 4480 unsigned int flags = 0; 4481 4482 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4483 if (!rtnh) 4484 goto nla_put_failure; 4485 4486 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; 4487 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; 4488 4489 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4490 goto nla_put_failure; 4491 4492 rtnh->rtnh_flags = flags; 4493 4494 /* length of rtnetlink header + attributes */ 4495 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4496 4497 return 0; 4498 4499 nla_put_failure: 4500 return -EMSGSIZE; 4501 } 4502 4503 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4504 struct fib6_info *rt, struct dst_entry *dst, 4505 struct in6_addr *dest, struct in6_addr *src, 4506 int iif, int type, u32 portid, u32 seq, 4507 unsigned int flags) 4508 { 4509 struct rtmsg *rtm; 4510 struct nlmsghdr *nlh; 4511 long expires = 0; 4512 u32 *pmetrics; 4513 u32 table; 4514 4515 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4516 if (!nlh) 4517 return -EMSGSIZE; 4518 4519 rtm = nlmsg_data(nlh); 4520 rtm->rtm_family = AF_INET6; 4521 rtm->rtm_dst_len = rt->fib6_dst.plen; 4522 rtm->rtm_src_len = rt->fib6_src.plen; 4523 rtm->rtm_tos = 0; 4524 if (rt->fib6_table) 4525 table = rt->fib6_table->tb6_id; 4526 else 4527 table = RT6_TABLE_UNSPEC; 4528 rtm->rtm_table = table; 4529 if (nla_put_u32(skb, RTA_TABLE, table)) 4530 goto nla_put_failure; 4531 4532 rtm->rtm_type = rt->fib6_type; 4533 rtm->rtm_flags = 0; 4534 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4535 rtm->rtm_protocol = rt->fib6_protocol; 4536 4537 if (rt->fib6_flags & RTF_CACHE) 4538 rtm->rtm_flags |= RTM_F_CLONED; 4539 4540 if (dest) { 4541 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4542 goto nla_put_failure; 4543 rtm->rtm_dst_len = 128; 4544 } else if (rtm->rtm_dst_len) 4545 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) 4546 goto nla_put_failure; 4547 #ifdef CONFIG_IPV6_SUBTREES 4548 if (src) { 4549 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4550 goto nla_put_failure; 4551 rtm->rtm_src_len = 128; 4552 } else if (rtm->rtm_src_len && 4553 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) 4554 goto nla_put_failure; 4555 #endif 4556 if (iif) { 4557 #ifdef CONFIG_IPV6_MROUTE 4558 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { 4559 int err = ip6mr_get_route(net, skb, rtm, portid); 4560 4561 if (err == 0) 4562 return 0; 4563 if (err < 0) 4564 goto nla_put_failure; 4565 } else 4566 #endif 4567 if (nla_put_u32(skb, RTA_IIF, iif)) 4568 goto nla_put_failure; 4569 } else if (dest) { 4570 struct in6_addr saddr_buf; 4571 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4572 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4573 goto nla_put_failure; 4574 } 4575 4576 if (rt->fib6_prefsrc.plen) { 4577 struct in6_addr saddr_buf; 4578 saddr_buf = rt->fib6_prefsrc.addr; 4579 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4580 goto nla_put_failure; 4581 } 4582 4583 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4584 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4585 goto nla_put_failure; 4586 4587 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4588 goto nla_put_failure; 4589 4590 /* For multipath routes, walk the siblings list and add 4591 * each as a nexthop within RTA_MULTIPATH. 4592 */ 4593 if (rt->fib6_nsiblings) { 4594 struct fib6_info *sibling, *next_sibling; 4595 struct nlattr *mp; 4596 4597 mp = nla_nest_start(skb, RTA_MULTIPATH); 4598 if (!mp) 4599 goto nla_put_failure; 4600 4601 if (rt6_add_nexthop(skb, rt) < 0) 4602 goto nla_put_failure; 4603 4604 list_for_each_entry_safe(sibling, next_sibling, 4605 &rt->fib6_siblings, fib6_siblings) { 4606 if (rt6_add_nexthop(skb, sibling) < 0) 4607 goto nla_put_failure; 4608 } 4609 4610 nla_nest_end(skb, mp); 4611 } else { 4612 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4613 goto nla_put_failure; 4614 } 4615 4616 if (rt->fib6_flags & RTF_EXPIRES) { 4617 expires = dst ? dst->expires : rt->expires; 4618 expires -= jiffies; 4619 } 4620 4621 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4622 goto nla_put_failure; 4623 4624 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) 4625 goto nla_put_failure; 4626 4627 4628 nlmsg_end(skb, nlh); 4629 return 0; 4630 4631 nla_put_failure: 4632 nlmsg_cancel(skb, nlh); 4633 return -EMSGSIZE; 4634 } 4635 4636 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4637 { 4638 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4639 struct net *net = arg->net; 4640 4641 if (rt == net->ipv6.fib6_null_entry) 4642 return 0; 4643 4644 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4645 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 4646 4647 /* user wants prefix routes only */ 4648 if (rtm->rtm_flags & RTM_F_PREFIX && 4649 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4650 /* success since this is not a prefix route */ 4651 return 1; 4652 } 4653 } 4654 4655 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4656 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4657 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); 4658 } 4659 4660 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4661 struct netlink_ext_ack *extack) 4662 { 4663 struct net *net = sock_net(in_skb->sk); 4664 struct nlattr *tb[RTA_MAX+1]; 4665 int err, iif = 0, oif = 0; 4666 struct dst_entry *dst; 4667 struct rt6_info *rt; 4668 struct sk_buff *skb; 4669 struct rtmsg *rtm; 4670 struct flowi6 fl6; 4671 bool fibmatch; 4672 4673 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4674 extack); 4675 if (err < 0) 4676 goto errout; 4677 4678 err = -EINVAL; 4679 memset(&fl6, 0, sizeof(fl6)); 4680 rtm = nlmsg_data(nlh); 4681 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4682 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4683 4684 if (tb[RTA_SRC]) { 4685 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4686 goto errout; 4687 4688 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4689 } 4690 4691 if (tb[RTA_DST]) { 4692 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4693 goto errout; 4694 4695 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4696 } 4697 4698 if (tb[RTA_IIF]) 4699 iif = nla_get_u32(tb[RTA_IIF]); 4700 4701 if (tb[RTA_OIF]) 4702 oif = nla_get_u32(tb[RTA_OIF]); 4703 4704 if (tb[RTA_MARK]) 4705 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4706 4707 if (tb[RTA_UID]) 4708 fl6.flowi6_uid = make_kuid(current_user_ns(), 4709 nla_get_u32(tb[RTA_UID])); 4710 else 4711 fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); 4712 4713 if (iif) { 4714 struct net_device *dev; 4715 int flags = 0; 4716 4717 rcu_read_lock(); 4718 4719 dev = dev_get_by_index_rcu(net, iif); 4720 if (!dev) { 4721 rcu_read_unlock(); 4722 err = -ENODEV; 4723 goto errout; 4724 } 4725 4726 fl6.flowi6_iif = iif; 4727 4728 if (!ipv6_addr_any(&fl6.saddr)) 4729 flags |= RT6_LOOKUP_F_HAS_SADDR; 4730 4731 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 4732 4733 rcu_read_unlock(); 4734 } else { 4735 fl6.flowi6_oif = oif; 4736 4737 dst = ip6_route_output(net, NULL, &fl6); 4738 } 4739 4740 4741 rt = container_of(dst, struct rt6_info, dst); 4742 if (rt->dst.error) { 4743 err = rt->dst.error; 4744 ip6_rt_put(rt); 4745 goto errout; 4746 } 4747 4748 if (rt == net->ipv6.ip6_null_entry) { 4749 err = rt->dst.error; 4750 ip6_rt_put(rt); 4751 goto errout; 4752 } 4753 4754 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 4755 if (!skb) { 4756 ip6_rt_put(rt); 4757 err = -ENOBUFS; 4758 goto errout; 4759 } 4760 4761 skb_dst_set(skb, &rt->dst); 4762 if (fibmatch) 4763 err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif, 4764 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4765 nlh->nlmsg_seq, 0); 4766 else 4767 err = rt6_fill_node(net, skb, rt->from, dst, 4768 &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE, 4769 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 4770 0); 4771 if (err < 0) { 4772 kfree_skb(skb); 4773 goto errout; 4774 } 4775 4776 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 4777 errout: 4778 return err; 4779 } 4780 4781 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 4782 unsigned int nlm_flags) 4783 { 4784 struct sk_buff *skb; 4785 struct net *net = info->nl_net; 4786 u32 seq; 4787 int err; 4788 4789 err = -ENOBUFS; 4790 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 4791 4792 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 4793 if (!skb) 4794 goto errout; 4795 4796 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 4797 event, info->portid, seq, nlm_flags); 4798 if (err < 0) { 4799 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 4800 WARN_ON(err == -EMSGSIZE); 4801 kfree_skb(skb); 4802 goto errout; 4803 } 4804 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 4805 info->nlh, gfp_any()); 4806 return; 4807 errout: 4808 if (err < 0) 4809 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 4810 } 4811 4812 static int ip6_route_dev_notify(struct notifier_block *this, 4813 unsigned long event, void *ptr) 4814 { 4815 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 4816 struct net *net = dev_net(dev); 4817 4818 if (!(dev->flags & IFF_LOOPBACK)) 4819 return NOTIFY_OK; 4820 4821 if (event == NETDEV_REGISTER) { 4822 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; 4823 net->ipv6.fib6_null_entry->fib6_idev = in6_dev_get(dev); 4824 net->ipv6.ip6_null_entry->dst.dev = dev; 4825 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 4826 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4827 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 4828 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 4829 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 4830 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 4831 #endif 4832 } else if (event == NETDEV_UNREGISTER && 4833 dev->reg_state != NETREG_UNREGISTERED) { 4834 /* NETDEV_UNREGISTER could be fired for multiple times by 4835 * netdev_wait_allrefs(). Make sure we only call this once. 4836 */ 4837 in6_dev_put_clear(&net->ipv6.fib6_null_entry->fib6_idev); 4838 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 4839 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4840 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 4841 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 4842 #endif 4843 } 4844 4845 return NOTIFY_OK; 4846 } 4847 4848 /* 4849 * /proc 4850 */ 4851 4852 #ifdef CONFIG_PROC_FS 4853 4854 static const struct file_operations ipv6_route_proc_fops = { 4855 .open = ipv6_route_open, 4856 .read = seq_read, 4857 .llseek = seq_lseek, 4858 .release = seq_release_net, 4859 }; 4860 4861 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 4862 { 4863 struct net *net = (struct net *)seq->private; 4864 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 4865 net->ipv6.rt6_stats->fib_nodes, 4866 net->ipv6.rt6_stats->fib_route_nodes, 4867 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 4868 net->ipv6.rt6_stats->fib_rt_entries, 4869 net->ipv6.rt6_stats->fib_rt_cache, 4870 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 4871 net->ipv6.rt6_stats->fib_discarded_routes); 4872 4873 return 0; 4874 } 4875 4876 static int rt6_stats_seq_open(struct inode *inode, struct file *file) 4877 { 4878 return single_open_net(inode, file, rt6_stats_seq_show); 4879 } 4880 4881 static const struct file_operations rt6_stats_seq_fops = { 4882 .open = rt6_stats_seq_open, 4883 .read = seq_read, 4884 .llseek = seq_lseek, 4885 .release = single_release_net, 4886 }; 4887 #endif /* CONFIG_PROC_FS */ 4888 4889 #ifdef CONFIG_SYSCTL 4890 4891 static 4892 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 4893 void __user *buffer, size_t *lenp, loff_t *ppos) 4894 { 4895 struct net *net; 4896 int delay; 4897 if (!write) 4898 return -EINVAL; 4899 4900 net = (struct net *)ctl->extra1; 4901 delay = net->ipv6.sysctl.flush_delay; 4902 proc_dointvec(ctl, write, buffer, lenp, ppos); 4903 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); 4904 return 0; 4905 } 4906 4907 struct ctl_table ipv6_route_table_template[] = { 4908 { 4909 .procname = "flush", 4910 .data = &init_net.ipv6.sysctl.flush_delay, 4911 .maxlen = sizeof(int), 4912 .mode = 0200, 4913 .proc_handler = ipv6_sysctl_rtcache_flush 4914 }, 4915 { 4916 .procname = "gc_thresh", 4917 .data = &ip6_dst_ops_template.gc_thresh, 4918 .maxlen = sizeof(int), 4919 .mode = 0644, 4920 .proc_handler = proc_dointvec, 4921 }, 4922 { 4923 .procname = "max_size", 4924 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 4925 .maxlen = sizeof(int), 4926 .mode = 0644, 4927 .proc_handler = proc_dointvec, 4928 }, 4929 { 4930 .procname = "gc_min_interval", 4931 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 4932 .maxlen = sizeof(int), 4933 .mode = 0644, 4934 .proc_handler = proc_dointvec_jiffies, 4935 }, 4936 { 4937 .procname = "gc_timeout", 4938 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 4939 .maxlen = sizeof(int), 4940 .mode = 0644, 4941 .proc_handler = proc_dointvec_jiffies, 4942 }, 4943 { 4944 .procname = "gc_interval", 4945 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 4946 .maxlen = sizeof(int), 4947 .mode = 0644, 4948 .proc_handler = proc_dointvec_jiffies, 4949 }, 4950 { 4951 .procname = "gc_elasticity", 4952 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 4953 .maxlen = sizeof(int), 4954 .mode = 0644, 4955 .proc_handler = proc_dointvec, 4956 }, 4957 { 4958 .procname = "mtu_expires", 4959 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 4960 .maxlen = sizeof(int), 4961 .mode = 0644, 4962 .proc_handler = proc_dointvec_jiffies, 4963 }, 4964 { 4965 .procname = "min_adv_mss", 4966 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 4967 .maxlen = sizeof(int), 4968 .mode = 0644, 4969 .proc_handler = proc_dointvec, 4970 }, 4971 { 4972 .procname = "gc_min_interval_ms", 4973 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 4974 .maxlen = sizeof(int), 4975 .mode = 0644, 4976 .proc_handler = proc_dointvec_ms_jiffies, 4977 }, 4978 { } 4979 }; 4980 4981 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 4982 { 4983 struct ctl_table *table; 4984 4985 table = kmemdup(ipv6_route_table_template, 4986 sizeof(ipv6_route_table_template), 4987 GFP_KERNEL); 4988 4989 if (table) { 4990 table[0].data = &net->ipv6.sysctl.flush_delay; 4991 table[0].extra1 = net; 4992 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 4993 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 4994 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 4995 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 4996 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 4997 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 4998 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 4999 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5000 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5001 5002 /* Don't export sysctls to unprivileged users */ 5003 if (net->user_ns != &init_user_ns) 5004 table[0].procname = NULL; 5005 } 5006 5007 return table; 5008 } 5009 #endif 5010 5011 static int __net_init ip6_route_net_init(struct net *net) 5012 { 5013 int ret = -ENOMEM; 5014 5015 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5016 sizeof(net->ipv6.ip6_dst_ops)); 5017 5018 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5019 goto out_ip6_dst_ops; 5020 5021 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5022 sizeof(*net->ipv6.fib6_null_entry), 5023 GFP_KERNEL); 5024 if (!net->ipv6.fib6_null_entry) 5025 goto out_ip6_dst_entries; 5026 5027 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5028 sizeof(*net->ipv6.ip6_null_entry), 5029 GFP_KERNEL); 5030 if (!net->ipv6.ip6_null_entry) 5031 goto out_fib6_null_entry; 5032 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5033 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5034 ip6_template_metrics, true); 5035 5036 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5037 net->ipv6.fib6_has_custom_rules = false; 5038 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5039 sizeof(*net->ipv6.ip6_prohibit_entry), 5040 GFP_KERNEL); 5041 if (!net->ipv6.ip6_prohibit_entry) 5042 goto out_ip6_null_entry; 5043 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5044 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5045 ip6_template_metrics, true); 5046 5047 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5048 sizeof(*net->ipv6.ip6_blk_hole_entry), 5049 GFP_KERNEL); 5050 if (!net->ipv6.ip6_blk_hole_entry) 5051 goto out_ip6_prohibit_entry; 5052 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5053 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5054 ip6_template_metrics, true); 5055 #endif 5056 5057 net->ipv6.sysctl.flush_delay = 0; 5058 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5059 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5060 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5061 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5062 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5063 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5064 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5065 5066 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5067 5068 ret = 0; 5069 out: 5070 return ret; 5071 5072 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5073 out_ip6_prohibit_entry: 5074 kfree(net->ipv6.ip6_prohibit_entry); 5075 out_ip6_null_entry: 5076 kfree(net->ipv6.ip6_null_entry); 5077 #endif 5078 out_fib6_null_entry: 5079 kfree(net->ipv6.fib6_null_entry); 5080 out_ip6_dst_entries: 5081 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5082 out_ip6_dst_ops: 5083 goto out; 5084 } 5085 5086 static void __net_exit ip6_route_net_exit(struct net *net) 5087 { 5088 kfree(net->ipv6.fib6_null_entry); 5089 kfree(net->ipv6.ip6_null_entry); 5090 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5091 kfree(net->ipv6.ip6_prohibit_entry); 5092 kfree(net->ipv6.ip6_blk_hole_entry); 5093 #endif 5094 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5095 } 5096 5097 static int __net_init ip6_route_net_init_late(struct net *net) 5098 { 5099 #ifdef CONFIG_PROC_FS 5100 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); 5101 proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops); 5102 #endif 5103 return 0; 5104 } 5105 5106 static void __net_exit ip6_route_net_exit_late(struct net *net) 5107 { 5108 #ifdef CONFIG_PROC_FS 5109 remove_proc_entry("ipv6_route", net->proc_net); 5110 remove_proc_entry("rt6_stats", net->proc_net); 5111 #endif 5112 } 5113 5114 static struct pernet_operations ip6_route_net_ops = { 5115 .init = ip6_route_net_init, 5116 .exit = ip6_route_net_exit, 5117 }; 5118 5119 static int __net_init ipv6_inetpeer_init(struct net *net) 5120 { 5121 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5122 5123 if (!bp) 5124 return -ENOMEM; 5125 inet_peer_base_init(bp); 5126 net->ipv6.peers = bp; 5127 return 0; 5128 } 5129 5130 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5131 { 5132 struct inet_peer_base *bp = net->ipv6.peers; 5133 5134 net->ipv6.peers = NULL; 5135 inetpeer_invalidate_tree(bp); 5136 kfree(bp); 5137 } 5138 5139 static struct pernet_operations ipv6_inetpeer_ops = { 5140 .init = ipv6_inetpeer_init, 5141 .exit = ipv6_inetpeer_exit, 5142 }; 5143 5144 static struct pernet_operations ip6_route_net_late_ops = { 5145 .init = ip6_route_net_init_late, 5146 .exit = ip6_route_net_exit_late, 5147 }; 5148 5149 static struct notifier_block ip6_route_dev_notifier = { 5150 .notifier_call = ip6_route_dev_notify, 5151 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5152 }; 5153 5154 void __init ip6_route_init_special_entries(void) 5155 { 5156 /* Registering of the loopback is done before this portion of code, 5157 * the loopback reference in rt6_info will not be taken, do it 5158 * manually for init_net */ 5159 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; 5160 init_net.ipv6.fib6_null_entry->fib6_idev = in6_dev_get(init_net.loopback_dev); 5161 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5162 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5163 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5164 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5165 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5166 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5167 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5168 #endif 5169 } 5170 5171 int __init ip6_route_init(void) 5172 { 5173 int ret; 5174 int cpu; 5175 5176 ret = -ENOMEM; 5177 ip6_dst_ops_template.kmem_cachep = 5178 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5179 SLAB_HWCACHE_ALIGN, NULL); 5180 if (!ip6_dst_ops_template.kmem_cachep) 5181 goto out; 5182 5183 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5184 if (ret) 5185 goto out_kmem_cache; 5186 5187 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5188 if (ret) 5189 goto out_dst_entries; 5190 5191 ret = register_pernet_subsys(&ip6_route_net_ops); 5192 if (ret) 5193 goto out_register_inetpeer; 5194 5195 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5196 5197 ret = fib6_init(); 5198 if (ret) 5199 goto out_register_subsys; 5200 5201 ret = xfrm6_init(); 5202 if (ret) 5203 goto out_fib6_init; 5204 5205 ret = fib6_rules_init(); 5206 if (ret) 5207 goto xfrm6_init; 5208 5209 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5210 if (ret) 5211 goto fib6_rules_init; 5212 5213 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5214 inet6_rtm_newroute, NULL, 0); 5215 if (ret < 0) 5216 goto out_register_late_subsys; 5217 5218 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5219 inet6_rtm_delroute, NULL, 0); 5220 if (ret < 0) 5221 goto out_register_late_subsys; 5222 5223 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5224 inet6_rtm_getroute, NULL, 5225 RTNL_FLAG_DOIT_UNLOCKED); 5226 if (ret < 0) 5227 goto out_register_late_subsys; 5228 5229 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5230 if (ret) 5231 goto out_register_late_subsys; 5232 5233 for_each_possible_cpu(cpu) { 5234 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 5235 5236 INIT_LIST_HEAD(&ul->head); 5237 spin_lock_init(&ul->lock); 5238 } 5239 5240 out: 5241 return ret; 5242 5243 out_register_late_subsys: 5244 rtnl_unregister_all(PF_INET6); 5245 unregister_pernet_subsys(&ip6_route_net_late_ops); 5246 fib6_rules_init: 5247 fib6_rules_cleanup(); 5248 xfrm6_init: 5249 xfrm6_fini(); 5250 out_fib6_init: 5251 fib6_gc_cleanup(); 5252 out_register_subsys: 5253 unregister_pernet_subsys(&ip6_route_net_ops); 5254 out_register_inetpeer: 5255 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5256 out_dst_entries: 5257 dst_entries_destroy(&ip6_dst_blackhole_ops); 5258 out_kmem_cache: 5259 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5260 goto out; 5261 } 5262 5263 void ip6_route_cleanup(void) 5264 { 5265 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5266 unregister_pernet_subsys(&ip6_route_net_late_ops); 5267 fib6_rules_cleanup(); 5268 xfrm6_fini(); 5269 fib6_gc_cleanup(); 5270 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5271 unregister_pernet_subsys(&ip6_route_net_ops); 5272 dst_entries_destroy(&ip6_dst_blackhole_ops); 5273 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5274 } 5275