/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);

static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
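
/* Both dst_ops templates below resolve neighbours through
 * ip6_neigh_lookup(): the lookup keys on the route's gateway when one
 * is set, and falls back to the skb's or caller's destination address
 * otherwise (see choose_neigh_daddr() above).
 */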

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->fib6_nsiblings || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = &match->fib6_nh;
}
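
/* The sibling walk above is hash-threshold multipath (RFC 2992): every
 * sibling owns a slice of the 31-bit hash space, delimited by
 * fib_nh_upper_bound, and the first sibling whose bound covers
 * fl6->mp_hash (and whose nexthop scores as usable) is selected.
 */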

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	const struct fib6_nh *nh;
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		nh = &sprt->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags))
			return sprt;
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, fib6_nh->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		fib6_nh->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
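
/* rt6_score_route() folds the checks into one integer: 2 when the
 * nexthop device matches the requested oif (or no oif constrains the
 * lookup), the decoded RFC 4191 route preference in the next bits, and
 * the negative RT6_NUD_* values from rt6_check_neigh() passed through
 * so callers can tell hard failures from round-robin candidates.
 */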

static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}

static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}

static void __find_rr_leaf(struct fib6_info *rt_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_info **match, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *rt;

	for (rt = rt_start;
	     rt && rt != nomatch;
	     rt = rcu_dereference(rt->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && rt->fib6_metric != metric) {
			*cont = rt;
			return;
		}

		if (fib6_check_expired(rt))
			continue;

		nh = &rt->fib6_nh;
		if (find_match(nh, rt->fib6_flags, oif, strict, mpri, do_rr))
			*match = rt;
	}
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *match = NULL, *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, &match, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, &match, &cont,
		       oif, strict, do_rr, &mpri);

	if (match || !cont)
		return match;

	__find_rr_leaf(cont, NULL, metric, &match, NULL,
		       oif, strict, do_rr, &mpri);

	return match;
}
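
/* rt6_select() below drives the round-robin part: when find_match()
 * reported RT6_NUD_FAIL_DO_RR, fn->rr_ptr is advanced to the next
 * sibling of equal metric so that consecutive lookups rotate through
 * routers whose reachability is unknown.
 */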

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;
}
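
/* RFC 4191 Route Information Option processing: rt6_route_rcv() below
 * validates the option, then adds, refreshes or withdraws the matching
 * RTF_ROUTEINFO route depending on the advertised route lifetime.
 */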

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	const struct fib6_info *f6i = res->f6i;

	if (f6i->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *ort = res->f6i;

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
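
/* Note on metrics: rt6_set_from() makes the clone share its parent's
 * fib6_metrics; dst_cow_metrics_generic() from the dst_ops un-shares
 * them only when someone writes a metric on the cached dst.
 */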

/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = f6i->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		res.f6i = rt6_device_match(net, res.f6i, &fl6->saddr,
					   fl6->flowi6_oif, flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, res.f6i, table, fl6);

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
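
/* rt6_lookup() is the legacy wrapper around the same policy lookup:
 * it returns NULL instead of a dst carrying an error code.
 */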

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}
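
/* Each fib6_info caches one rt6_info per CPU. The slot is filled
 * lazily with BHs disabled, so the cmpxchg() in rt6_make_pcpu_route()
 * can never lose a race against another filler on the same CPU; the
 * BUG_ON(prev) below documents that invariant.
 */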

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
}
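
/* Exceptions (RTF_CACHE clones) are keyed by destination address, plus
 * the source address when the parent route sits in a source-specific
 * subtree; rt6_exception_hash() above folds both into a bucket index.
 */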

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

	return ret;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
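
/* Worked example for the rule below: with a link MTU of 1500 and a
 * route PMTU of 1400, lowering the link MTU to 1280 is propagated to
 * the exception (the path can only have shrunk), but raising it to
 * 9000 is not, because the 1400 came from somewhere further along the
 * path.
 */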

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
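
/* Exception garbage collection: driven from fib6_run_gc(), the examine
 * function below drops entries unused for longer than gc_args->timeout,
 * entries whose RTF_EXPIRES lifetime has passed, and gateway entries
 * whose neighbour is no longer flagged as a router.
 */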

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
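
/* ip6_pol_route() turns the fib6_info from the table lookup into a dst
 * by one of three paths: a cached exception route, an uncached
 * RTF_CACHE clone for the FLOWI_FLAG_KNOWN_NH case, or the per-CPU
 * copy on the common path.
 */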

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	res.f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
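
/* For ICMPv6 errors, ip6_multipath_l3_keys() above hashes the offending
 * packet's embedded header instead of the outer one, so the error is
 * steered to the same sibling as the flow that triggered it.
 */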

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
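
/* Output route lookups always run with flowi6_iif set to the loopback
 * ifindex; the strictness flags below derive from the socket's device
 * binding and the scope of the addresses involved.
 */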
dst; 2111 } 2112 2113 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2114 2115 any_src = ipv6_addr_any(&fl6->saddr); 2116 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2117 (fl6->flowi6_oif && any_src)) 2118 flags |= RT6_LOOKUP_F_IFACE; 2119 2120 if (!any_src) 2121 flags |= RT6_LOOKUP_F_HAS_SADDR; 2122 else if (sk) 2123 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2124 2125 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2126 } 2127 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2128 2129 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2130 { 2131 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2132 struct net_device *loopback_dev = net->loopback_dev; 2133 struct dst_entry *new = NULL; 2134 2135 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2136 DST_OBSOLETE_DEAD, 0); 2137 if (rt) { 2138 rt6_info_init(rt); 2139 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2140 2141 new = &rt->dst; 2142 new->__use = 1; 2143 new->input = dst_discard; 2144 new->output = dst_discard_out; 2145 2146 dst_copy_metrics(new, &ort->dst); 2147 2148 rt->rt6i_idev = in6_dev_get(loopback_dev); 2149 rt->rt6i_gateway = ort->rt6i_gateway; 2150 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2151 2152 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2153 #ifdef CONFIG_IPV6_SUBTREES 2154 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2155 #endif 2156 } 2157 2158 dst_release(dst_orig); 2159 return new ? new : ERR_PTR(-ENOMEM); 2160 } 2161 2162 /* 2163 * Destination cache support functions 2164 */ 2165 2166 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2167 { 2168 u32 rt_cookie = 0; 2169 2170 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2171 return false; 2172 2173 if (fib6_check_expired(f6i)) 2174 return false; 2175 2176 return true; 2177 } 2178 2179 static struct dst_entry *rt6_check(struct rt6_info *rt, 2180 struct fib6_info *from, 2181 u32 cookie) 2182 { 2183 u32 rt_cookie = 0; 2184 2185 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2186 rt_cookie != cookie) 2187 return NULL; 2188 2189 if (rt6_check_expired(rt)) 2190 return NULL; 2191 2192 return &rt->dst; 2193 } 2194 2195 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2196 struct fib6_info *from, 2197 u32 cookie) 2198 { 2199 if (!__rt6_check_expired(rt) && 2200 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2201 fib6_check(from, cookie)) 2202 return &rt->dst; 2203 else 2204 return NULL; 2205 } 2206 2207 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2208 { 2209 struct dst_entry *dst_ret; 2210 struct fib6_info *from; 2211 struct rt6_info *rt; 2212 2213 rt = container_of(dst, struct rt6_info, dst); 2214 2215 rcu_read_lock(); 2216 2217 /* All IPV6 dsts are created with ->obsolete set to the value 2218 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2219 * into this function always. 
2220 */ 2221 2222 from = rcu_dereference(rt->from); 2223 2224 if (from && (rt->rt6i_flags & RTF_PCPU || 2225 unlikely(!list_empty(&rt->rt6i_uncached)))) 2226 dst_ret = rt6_dst_from_check(rt, from, cookie); 2227 else 2228 dst_ret = rt6_check(rt, from, cookie); 2229 2230 rcu_read_unlock(); 2231 2232 return dst_ret; 2233 } 2234 2235 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2236 { 2237 struct rt6_info *rt = (struct rt6_info *) dst; 2238 2239 if (rt) { 2240 if (rt->rt6i_flags & RTF_CACHE) { 2241 rcu_read_lock(); 2242 if (rt6_check_expired(rt)) { 2243 rt6_remove_exception_rt(rt); 2244 dst = NULL; 2245 } 2246 rcu_read_unlock(); 2247 } else { 2248 dst_release(dst); 2249 dst = NULL; 2250 } 2251 } 2252 return dst; 2253 } 2254 2255 static void ip6_link_failure(struct sk_buff *skb) 2256 { 2257 struct rt6_info *rt; 2258 2259 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2260 2261 rt = (struct rt6_info *) skb_dst(skb); 2262 if (rt) { 2263 rcu_read_lock(); 2264 if (rt->rt6i_flags & RTF_CACHE) { 2265 rt6_remove_exception_rt(rt); 2266 } else { 2267 struct fib6_info *from; 2268 struct fib6_node *fn; 2269 2270 from = rcu_dereference(rt->from); 2271 if (from) { 2272 fn = rcu_dereference(from->fib6_node); 2273 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2274 fn->fn_sernum = -1; 2275 } 2276 } 2277 rcu_read_unlock(); 2278 } 2279 } 2280 2281 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2282 { 2283 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2284 struct fib6_info *from; 2285 2286 rcu_read_lock(); 2287 from = rcu_dereference(rt0->from); 2288 if (from) 2289 rt0->dst.expires = from->expires; 2290 rcu_read_unlock(); 2291 } 2292 2293 dst_set_expires(&rt0->dst, timeout); 2294 rt0->rt6i_flags |= RTF_EXPIRES; 2295 } 2296 2297 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2298 { 2299 struct net *net = dev_net(rt->dst.dev); 2300 2301 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2302 rt->rt6i_flags |= RTF_MODIFIED; 2303 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2304 } 2305 2306 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2307 { 2308 return !(rt->rt6i_flags & RTF_CACHE) && 2309 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2310 } 2311 2312 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2313 const struct ipv6hdr *iph, u32 mtu) 2314 { 2315 const struct in6_addr *daddr, *saddr; 2316 struct rt6_info *rt6 = (struct rt6_info *)dst; 2317 2318 if (dst_metric_locked(dst, RTAX_MTU)) 2319 return; 2320 2321 if (iph) { 2322 daddr = &iph->daddr; 2323 saddr = &iph->saddr; 2324 } else if (sk) { 2325 daddr = &sk->sk_v6_daddr; 2326 saddr = &inet6_sk(sk)->saddr; 2327 } else { 2328 daddr = NULL; 2329 saddr = NULL; 2330 } 2331 dst_confirm_neigh(dst, daddr); 2332 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2333 if (mtu >= dst_mtu(dst)) 2334 return; 2335 2336 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2337 rt6_do_update_pmtu(rt6, mtu); 2338 /* update rt6_ex->stamp for cache */ 2339 if (rt6->rt6i_flags & RTF_CACHE) 2340 rt6_update_exception_stamp_rt(rt6); 2341 } else if (daddr) { 2342 struct fib6_result res = {}; 2343 struct rt6_info *nrt6; 2344 2345 rcu_read_lock(); 2346 res.f6i = rcu_dereference(rt6->from); 2347 if (!res.f6i) { 2348 rcu_read_unlock(); 2349 return; 2350 } 2351 res.nh = &res.f6i->fib6_nh; 2352 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); 2353 if (nrt6) { 2354 rt6_do_update_pmtu(nrt6, mtu); 2355 if (rt6_insert_exception(nrt6, res.f6i)) 2356 dst_release_immediate(&nrt6->dst); 2357 } 2358 
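/* The exception entry inserted above is what subsequent lookups return
 * as an RTF_CACHE clone carrying the reduced MTU. Callers typically
 * reach this path through the dst_ops hook; a minimal sketch (the
 * skb_dst_update_pmtu() helper in net/dst.h does essentially this):
 *
 *	struct dst_entry *dst = skb_dst(skb);
 *
 *	if (dst && dst->ops->update_pmtu)
 *		dst->ops->update_pmtu(dst, NULL, skb, mtu);
 *
 * For IPv6 dsts this resolves to ip6_rt_update_pmtu() just below,
 * which wraps __ip6_rt_update_pmtu().
 */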
rcu_read_unlock(); 2359 } 2360 } 2361 2362 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2363 struct sk_buff *skb, u32 mtu) 2364 { 2365 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2366 } 2367 2368 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2369 int oif, u32 mark, kuid_t uid) 2370 { 2371 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2372 struct dst_entry *dst; 2373 struct flowi6 fl6 = { 2374 .flowi6_oif = oif, 2375 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2376 .daddr = iph->daddr, 2377 .saddr = iph->saddr, 2378 .flowlabel = ip6_flowinfo(iph), 2379 .flowi6_uid = uid, 2380 }; 2381 2382 dst = ip6_route_output(net, NULL, &fl6); 2383 if (!dst->error) 2384 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2385 dst_release(dst); 2386 } 2387 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2388 2389 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2390 { 2391 int oif = sk->sk_bound_dev_if; 2392 struct dst_entry *dst; 2393 2394 if (!oif && skb->dev) 2395 oif = l3mdev_master_ifindex(skb->dev); 2396 2397 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2398 2399 dst = __sk_dst_get(sk); 2400 if (!dst || !dst->obsolete || 2401 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2402 return; 2403 2404 bh_lock_sock(sk); 2405 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2406 ip6_datagram_dst_update(sk, false); 2407 bh_unlock_sock(sk); 2408 } 2409 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2410 2411 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2412 const struct flowi6 *fl6) 2413 { 2414 #ifdef CONFIG_IPV6_SUBTREES 2415 struct ipv6_pinfo *np = inet6_sk(sk); 2416 #endif 2417 2418 ip6_dst_store(sk, dst, 2419 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2420 &sk->sk_v6_daddr : NULL, 2421 #ifdef CONFIG_IPV6_SUBTREES 2422 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2423 &np->saddr : 2424 #endif 2425 NULL); 2426 } 2427 2428 static bool ip6_redirect_nh_match(const struct fib6_result *res, 2429 struct flowi6 *fl6, 2430 const struct in6_addr *gw, 2431 struct rt6_info **ret) 2432 { 2433 const struct fib6_nh *nh = res->nh; 2434 2435 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || 2436 fl6->flowi6_oif != nh->fib_nh_dev->ifindex) 2437 return false; 2438 2439 /* rt_cache's gateway might be different from its 'parent' 2440 * in the case of an ip redirect. 2441 * So we keep searching in the exception table if the gateway 2442 * is different. 2443 */ 2444 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { 2445 struct rt6_info *rt_cache; 2446 2447 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); 2448 if (rt_cache && 2449 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { 2450 *ret = rt_cache; 2451 return true; 2452 } 2453 return false; 2454 } 2455 return true; 2456 } 2457 2458 /* Handle redirects */ 2459 struct ip6rd_flowi { 2460 struct flowi6 fl6; 2461 struct in6_addr gateway; 2462 }; 2463 2464 static struct rt6_info *__ip6_route_redirect(struct net *net, 2465 struct fib6_table *table, 2466 struct flowi6 *fl6, 2467 const struct sk_buff *skb, 2468 int flags) 2469 { 2470 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2471 struct rt6_info *ret = NULL; 2472 struct fib6_result res = {}; 2473 struct fib6_info *rt; 2474 struct fib6_node *fn; 2475 2476 /* Get the "current" route for this destination and 2477 * check if the redirect has come from appropriate router. 
2478 * 2479 * RFC 4861 specifies that redirects should only be 2480 * accepted if they come from the nexthop to the target. 2481 * Due to the way the routes are chosen, this notion 2482 * is a bit fuzzy and one might need to check all possible 2483 * routes. 2484 */ 2485 2486 rcu_read_lock(); 2487 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2488 restart: 2489 for_each_fib6_node_rt_rcu(fn) { 2490 res.f6i = rt; 2491 res.nh = &rt->fib6_nh; 2492 2493 if (fib6_check_expired(rt)) 2494 continue; 2495 if (rt->fib6_flags & RTF_REJECT) 2496 break; 2497 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) 2498 goto out; 2499 } 2500 2501 if (!rt) 2502 rt = net->ipv6.fib6_null_entry; 2503 else if (rt->fib6_flags & RTF_REJECT) { 2504 ret = net->ipv6.ip6_null_entry; 2505 goto out; 2506 } 2507 2508 if (rt == net->ipv6.fib6_null_entry) { 2509 fn = fib6_backtrack(fn, &fl6->saddr); 2510 if (fn) 2511 goto restart; 2512 } 2513 2514 res.f6i = rt; 2515 res.nh = &rt->fib6_nh; 2516 out: 2517 if (ret) 2518 ip6_hold_safe(net, &ret); 2519 else 2520 ret = ip6_create_rt_rcu(&res); 2521 2522 rcu_read_unlock(); 2523 2524 trace_fib6_table_lookup(net, rt, table, fl6); 2525 return ret; 2526 }; 2527 2528 static struct dst_entry *ip6_route_redirect(struct net *net, 2529 const struct flowi6 *fl6, 2530 const struct sk_buff *skb, 2531 const struct in6_addr *gateway) 2532 { 2533 int flags = RT6_LOOKUP_F_HAS_SADDR; 2534 struct ip6rd_flowi rdfl; 2535 2536 rdfl.fl6 = *fl6; 2537 rdfl.gateway = *gateway; 2538 2539 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2540 flags, __ip6_route_redirect); 2541 } 2542 2543 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2544 kuid_t uid) 2545 { 2546 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2547 struct dst_entry *dst; 2548 struct flowi6 fl6 = { 2549 .flowi6_iif = LOOPBACK_IFINDEX, 2550 .flowi6_oif = oif, 2551 .flowi6_mark = mark, 2552 .daddr = iph->daddr, 2553 .saddr = iph->saddr, 2554 .flowlabel = ip6_flowinfo(iph), 2555 .flowi6_uid = uid, 2556 }; 2557 2558 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2559 rt6_do_redirect(dst, NULL, skb); 2560 dst_release(dst); 2561 } 2562 EXPORT_SYMBOL_GPL(ip6_redirect); 2563 2564 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 2565 { 2566 const struct ipv6hdr *iph = ipv6_hdr(skb); 2567 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2568 struct dst_entry *dst; 2569 struct flowi6 fl6 = { 2570 .flowi6_iif = LOOPBACK_IFINDEX, 2571 .flowi6_oif = oif, 2572 .daddr = msg->dest, 2573 .saddr = iph->daddr, 2574 .flowi6_uid = sock_net_uid(net, NULL), 2575 }; 2576 2577 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2578 rt6_do_redirect(dst, NULL, skb); 2579 dst_release(dst); 2580 } 2581 2582 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2583 { 2584 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2585 sk->sk_uid); 2586 } 2587 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2588 2589 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2590 { 2591 struct net_device *dev = dst->dev; 2592 unsigned int mtu = dst_mtu(dst); 2593 struct net *net = dev_net(dev); 2594 2595 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2596 2597 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2598 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2599 2600 /* 2601 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2602 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
2603 * IPV6_MAXPLEN is also valid and means: "any MSS, 2604 * rely only on pmtu discovery" 2605 */ 2606 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2607 mtu = IPV6_MAXPLEN; 2608 return mtu; 2609 } 2610 2611 static unsigned int ip6_mtu(const struct dst_entry *dst) 2612 { 2613 struct inet6_dev *idev; 2614 unsigned int mtu; 2615 2616 mtu = dst_metric_raw(dst, RTAX_MTU); 2617 if (mtu) 2618 goto out; 2619 2620 mtu = IPV6_MIN_MTU; 2621 2622 rcu_read_lock(); 2623 idev = __in6_dev_get(dst->dev); 2624 if (idev) 2625 mtu = idev->cnf.mtu6; 2626 rcu_read_unlock(); 2627 2628 out: 2629 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2630 2631 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2632 } 2633 2634 /* MTU selection: 2635 * 1. mtu on route is locked - use it 2636 * 2. mtu from nexthop exception 2637 * 3. mtu from egress device 2638 * 2639 * based on ip6_dst_mtu_forward and exception logic of 2640 * rt6_find_cached_rt; called with rcu_read_lock 2641 */ 2642 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2643 struct in6_addr *saddr) 2644 { 2645 struct rt6_exception_bucket *bucket; 2646 struct rt6_exception *rt6_ex; 2647 struct in6_addr *src_key; 2648 struct inet6_dev *idev; 2649 u32 mtu = 0; 2650 2651 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2652 mtu = f6i->fib6_pmtu; 2653 if (mtu) 2654 goto out; 2655 } 2656 2657 src_key = NULL; 2658 #ifdef CONFIG_IPV6_SUBTREES 2659 if (f6i->fib6_src.plen) 2660 src_key = saddr; 2661 #endif 2662 2663 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2664 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2665 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2666 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2667 2668 if (likely(!mtu)) { 2669 struct net_device *dev = fib6_info_nh_dev(f6i); 2670 2671 mtu = IPV6_MIN_MTU; 2672 idev = __in6_dev_get(dev); 2673 if (idev && idev->cnf.mtu6 > mtu) 2674 mtu = idev->cnf.mtu6; 2675 } 2676 2677 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2678 out: 2679 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2680 } 2681 2682 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2683 struct flowi6 *fl6) 2684 { 2685 struct dst_entry *dst; 2686 struct rt6_info *rt; 2687 struct inet6_dev *idev = in6_dev_get(dev); 2688 struct net *net = dev_net(dev); 2689 2690 if (unlikely(!idev)) 2691 return ERR_PTR(-ENODEV); 2692 2693 rt = ip6_dst_alloc(net, dev, 0); 2694 if (unlikely(!rt)) { 2695 in6_dev_put(idev); 2696 dst = ERR_PTR(-ENOMEM); 2697 goto out; 2698 } 2699 2700 rt->dst.flags |= DST_HOST; 2701 rt->dst.input = ip6_input; 2702 rt->dst.output = ip6_output; 2703 rt->rt6i_gateway = fl6->daddr; 2704 rt->rt6i_dst.addr = fl6->daddr; 2705 rt->rt6i_dst.plen = 128; 2706 rt->rt6i_idev = idev; 2707 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2708 2709 /* Add this dst into uncached_list so that rt6_disable_ip() can 2710 * do proper release of the net_device 2711 */ 2712 rt6_uncached_list_add(rt); 2713 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2714 2715 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2716 2717 out: 2718 return dst; 2719 } 2720 2721 static int ip6_dst_gc(struct dst_ops *ops) 2722 { 2723 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2724 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2725 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2726 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2727 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2728 unsigned long rt_last_gc = 
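/* Illustrative walk through ip6_mtu_from_fib6() above (numbers
 * assumed): if the route's MTU metric is locked at 1400, 1400 is
 * returned regardless of device state; otherwise a non-expired
 * exception entry for the destination wins with its cached RTAX_MTU
 * (say 1280); failing that, the egress device's cnf.mtu6 is used,
 * never less than IPV6_MIN_MTU (1280) and clamped to IP6_MAX_MTU. In
 * every case the lwtunnel encap headroom is subtracted at the end.
 */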
net->ipv6.ip6_rt_last_gc; 2729 int entries; 2730 2731 entries = dst_entries_get_fast(ops); 2732 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2733 entries <= rt_max_size) 2734 goto out; 2735 2736 net->ipv6.ip6_rt_gc_expire++; 2737 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2738 entries = dst_entries_get_slow(ops); 2739 if (entries < ops->gc_thresh) 2740 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2741 out: 2742 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2743 return entries > rt_max_size; 2744 } 2745 2746 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2747 struct fib6_config *cfg, 2748 const struct in6_addr *gw_addr, 2749 u32 tbid, int flags) 2750 { 2751 struct flowi6 fl6 = { 2752 .flowi6_oif = cfg->fc_ifindex, 2753 .daddr = *gw_addr, 2754 .saddr = cfg->fc_prefsrc, 2755 }; 2756 struct fib6_table *table; 2757 struct rt6_info *rt; 2758 2759 table = fib6_get_table(net, tbid); 2760 if (!table) 2761 return NULL; 2762 2763 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2764 flags |= RT6_LOOKUP_F_HAS_SADDR; 2765 2766 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2767 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2768 2769 /* if table lookup failed, fall back to full lookup */ 2770 if (rt == net->ipv6.ip6_null_entry) { 2771 ip6_rt_put(rt); 2772 rt = NULL; 2773 } 2774 2775 return rt; 2776 } 2777 2778 static int ip6_route_check_nh_onlink(struct net *net, 2779 struct fib6_config *cfg, 2780 const struct net_device *dev, 2781 struct netlink_ext_ack *extack) 2782 { 2783 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2784 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2785 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2786 struct fib6_info *from; 2787 struct rt6_info *grt; 2788 int err; 2789 2790 err = 0; 2791 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2792 if (grt) { 2793 rcu_read_lock(); 2794 from = rcu_dereference(grt->from); 2795 if (!grt->dst.error && 2796 /* ignore match if it is the default route */ 2797 from && !ipv6_addr_any(&from->fib6_dst.addr) && 2798 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2799 NL_SET_ERR_MSG(extack, 2800 "Nexthop has invalid gateway or device mismatch"); 2801 err = -EINVAL; 2802 } 2803 rcu_read_unlock(); 2804 2805 ip6_rt_put(grt); 2806 } 2807 2808 return err; 2809 } 2810 2811 static int ip6_route_check_nh(struct net *net, 2812 struct fib6_config *cfg, 2813 struct net_device **_dev, 2814 struct inet6_dev **idev) 2815 { 2816 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2817 struct net_device *dev = _dev ? 
*_dev : NULL; 2818 struct rt6_info *grt = NULL; 2819 int err = -EHOSTUNREACH; 2820 2821 if (cfg->fc_table) { 2822 int flags = RT6_LOOKUP_F_IFACE; 2823 2824 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2825 cfg->fc_table, flags); 2826 if (grt) { 2827 if (grt->rt6i_flags & RTF_GATEWAY || 2828 (dev && dev != grt->dst.dev)) { 2829 ip6_rt_put(grt); 2830 grt = NULL; 2831 } 2832 } 2833 } 2834 2835 if (!grt) 2836 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2837 2838 if (!grt) 2839 goto out; 2840 2841 if (dev) { 2842 if (dev != grt->dst.dev) { 2843 ip6_rt_put(grt); 2844 goto out; 2845 } 2846 } else { 2847 *_dev = dev = grt->dst.dev; 2848 *idev = grt->rt6i_idev; 2849 dev_hold(dev); 2850 in6_dev_hold(grt->rt6i_idev); 2851 } 2852 2853 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2854 err = 0; 2855 2856 ip6_rt_put(grt); 2857 2858 out: 2859 return err; 2860 } 2861 2862 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2863 struct net_device **_dev, struct inet6_dev **idev, 2864 struct netlink_ext_ack *extack) 2865 { 2866 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2867 int gwa_type = ipv6_addr_type(gw_addr); 2868 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2869 const struct net_device *dev = *_dev; 2870 bool need_addr_check = !dev; 2871 int err = -EINVAL; 2872 2873 /* if gw_addr is local we will fail to detect this in case the 2874 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2875 * will return the already-added prefix route via the interface that 2876 * the prefix route was assigned to, which might be non-loopback. 2877 */ 2878 if (dev && 2879 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2880 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2881 goto out; 2882 } 2883 2884 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2885 /* IPv6 strictly inhibits using non-link-local 2886 * addresses as the nexthop address. 2887 * Otherwise, the router will not be able to send redirects. 2888 * It is very good, but in some (rare!) circumstances 2889 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2890 * some exceptions. --ANK 2891 * We allow IPv4-mapped nexthops to support RFC4798-type 2892 * addressing 2893 */ 2894 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2895 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2896 goto out; 2897 } 2898 2899 if (cfg->fc_flags & RTNH_F_ONLINK) 2900 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2901 else 2902 err = ip6_route_check_nh(net, cfg, _dev, idev); 2903 2904 if (err) 2905 goto out; 2906 } 2907 2908 /* reload in case device was changed */ 2909 dev = *_dev; 2910 2911 err = -EINVAL; 2912 if (!dev) { 2913 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2914 goto out; 2915 } else if (dev->flags & IFF_LOOPBACK) { 2916 NL_SET_ERR_MSG(extack, 2917 "Egress device can not be loopback device for this route"); 2918 goto out; 2919 } 2920 2921 /* if we did not check gw_addr above, do so now that the 2922 * egress device has been resolved.
2923 */ 2924 if (need_addr_check && 2925 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2926 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2927 goto out; 2928 } 2929 2930 err = 0; 2931 out: 2932 return err; 2933 } 2934 2935 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) 2936 { 2937 if ((flags & RTF_REJECT) || 2938 (dev && (dev->flags & IFF_LOOPBACK) && 2939 !(addr_type & IPV6_ADDR_LOOPBACK) && 2940 !(flags & RTF_LOCAL))) 2941 return true; 2942 2943 return false; 2944 } 2945 2946 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, 2947 struct fib6_config *cfg, gfp_t gfp_flags, 2948 struct netlink_ext_ack *extack) 2949 { 2950 struct net_device *dev = NULL; 2951 struct inet6_dev *idev = NULL; 2952 int addr_type; 2953 int err; 2954 2955 fib6_nh->fib_nh_family = AF_INET6; 2956 2957 err = -ENODEV; 2958 if (cfg->fc_ifindex) { 2959 dev = dev_get_by_index(net, cfg->fc_ifindex); 2960 if (!dev) 2961 goto out; 2962 idev = in6_dev_get(dev); 2963 if (!idev) 2964 goto out; 2965 } 2966 2967 if (cfg->fc_flags & RTNH_F_ONLINK) { 2968 if (!dev) { 2969 NL_SET_ERR_MSG(extack, 2970 "Nexthop device required for onlink"); 2971 goto out; 2972 } 2973 2974 if (!(dev->flags & IFF_UP)) { 2975 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2976 err = -ENETDOWN; 2977 goto out; 2978 } 2979 2980 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; 2981 } 2982 2983 fib6_nh->fib_nh_weight = 1; 2984 2985 /* We cannot add true routes via loopback here, 2986 * they would result in kernel looping; promote them to reject routes 2987 */ 2988 addr_type = ipv6_addr_type(&cfg->fc_dst); 2989 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { 2990 /* hold loopback dev/idev if we haven't done so. */ 2991 if (dev != net->loopback_dev) { 2992 if (dev) { 2993 dev_put(dev); 2994 in6_dev_put(idev); 2995 } 2996 dev = net->loopback_dev; 2997 dev_hold(dev); 2998 idev = in6_dev_get(dev); 2999 if (!idev) { 3000 err = -ENODEV; 3001 goto out; 3002 } 3003 } 3004 goto set_dev; 3005 } 3006 3007 if (cfg->fc_flags & RTF_GATEWAY) { 3008 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3009 if (err) 3010 goto out; 3011 3012 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3013 fib6_nh->fib_nh_gw_family = AF_INET6; 3014 } 3015 3016 err = -ENODEV; 3017 if (!dev) 3018 goto out; 3019 3020 if (idev->cnf.disable_ipv6) { 3021 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3022 err = -EACCES; 3023 goto out; 3024 } 3025 3026 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { 3027 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3028 err = -ENETDOWN; 3029 goto out; 3030 } 3031 3032 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3033 !netif_carrier_ok(dev)) 3034 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 3035 3036 err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap, 3037 cfg->fc_encap_type, cfg, gfp_flags, extack); 3038 if (err) 3039 goto out; 3040 set_dev: 3041 fib6_nh->fib_nh_dev = dev; 3042 fib6_nh->fib_nh_oif = dev->ifindex; 3043 err = 0; 3044 out: 3045 if (idev) 3046 in6_dev_put(idev); 3047 3048 if (err) { 3049 lwtstate_put(fib6_nh->fib_nh_lws); 3050 fib6_nh->fib_nh_lws = NULL; 3051 if (dev) 3052 dev_put(dev); 3053 } 3054 3055 return err; 3056 } 3057 3058 void fib6_nh_release(struct fib6_nh *fib6_nh) 3059 { 3060 fib_nh_common_release(&fib6_nh->nh_common); 3061 } 3062 3063 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3064 gfp_t gfp_flags, 3065 struct netlink_ext_ack *extack) 3066 { 3067 struct net *net = cfg->fc_nlinfo.nl_net; 3068 struct 
fib6_info *rt = NULL; 3069 struct fib6_table *table; 3070 int err = -EINVAL; 3071 int addr_type; 3072 3073 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3074 if (cfg->fc_flags & RTF_PCPU) { 3075 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3076 goto out; 3077 } 3078 3079 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3080 if (cfg->fc_flags & RTF_CACHE) { 3081 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3082 goto out; 3083 } 3084 3085 if (cfg->fc_type > RTN_MAX) { 3086 NL_SET_ERR_MSG(extack, "Invalid route type"); 3087 goto out; 3088 } 3089 3090 if (cfg->fc_dst_len > 128) { 3091 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3092 goto out; 3093 } 3094 if (cfg->fc_src_len > 128) { 3095 NL_SET_ERR_MSG(extack, "Invalid source address length"); 3096 goto out; 3097 } 3098 #ifndef CONFIG_IPV6_SUBTREES 3099 if (cfg->fc_src_len) { 3100 NL_SET_ERR_MSG(extack, 3101 "Specifying source address requires IPV6_SUBTREES to be enabled"); 3102 goto out; 3103 } 3104 #endif 3105 3106 err = -ENOBUFS; 3107 if (cfg->fc_nlinfo.nlh && 3108 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3109 table = fib6_get_table(net, cfg->fc_table); 3110 if (!table) { 3111 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3112 table = fib6_new_table(net, cfg->fc_table); 3113 } 3114 } else { 3115 table = fib6_new_table(net, cfg->fc_table); 3116 } 3117 3118 if (!table) 3119 goto out; 3120 3121 err = -ENOMEM; 3122 rt = fib6_info_alloc(gfp_flags); 3123 if (!rt) 3124 goto out; 3125 3126 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 3127 extack); 3128 if (IS_ERR(rt->fib6_metrics)) { 3129 err = PTR_ERR(rt->fib6_metrics); 3130 /* Do not leave garbage there. */ 3131 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3132 goto out; 3133 } 3134 3135 if (cfg->fc_flags & RTF_ADDRCONF) 3136 rt->dst_nocount = true; 3137 3138 if (cfg->fc_flags & RTF_EXPIRES) 3139 fib6_set_expires(rt, jiffies + 3140 clock_t_to_jiffies(cfg->fc_expires)); 3141 else 3142 fib6_clean_expires(rt); 3143 3144 if (cfg->fc_protocol == RTPROT_UNSPEC) 3145 cfg->fc_protocol = RTPROT_BOOT; 3146 rt->fib6_protocol = cfg->fc_protocol; 3147 3148 rt->fib6_table = table; 3149 rt->fib6_metric = cfg->fc_metric; 3150 rt->fib6_type = cfg->fc_type; 3151 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3152 3153 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3154 rt->fib6_dst.plen = cfg->fc_dst_len; 3155 if (rt->fib6_dst.plen == 128) 3156 rt->dst_host = true; 3157 3158 #ifdef CONFIG_IPV6_SUBTREES 3159 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3160 rt->fib6_src.plen = cfg->fc_src_len; 3161 #endif 3162 err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); 3163 if (err) 3164 goto out; 3165 3166 /* We cannot add true routes via loopback here, 3167 * they would result in kernel looping; promote them to reject routes 3168 */ 3169 addr_type = ipv6_addr_type(&cfg->fc_dst); 3170 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) 3171 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3172 3173 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3174 struct net_device *dev = fib6_info_nh_dev(rt); 3175 3176 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3177 NL_SET_ERR_MSG(extack, "Invalid source address"); 3178 err = -EINVAL; 3179 goto out; 3180 } 3181 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3182 rt->fib6_prefsrc.plen = 128; 3183 } else 3184 rt->fib6_prefsrc.plen = 0; 3185 3186 return rt; 3187 out: 3188 
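/* All failure paths in ip6_route_info_create() jump here with err set;
 * the half-built entry, if any, is released before returning.
 *
 * Illustrative in-kernel usage sketch (caller context assumed; net,
 * dev and prefix are the caller's namespace, egress device and
 * struct in6_addr): a /64 unicast route would be added through
 * ip6_route_add() below with a minimal config:
 *
 *	struct fib6_config cfg = {
 *		.fc_table	= RT6_TABLE_MAIN,
 *		.fc_metric	= IP6_RT_PRIO_USER,
 *		.fc_ifindex	= dev->ifindex,
 *		.fc_dst_len	= 64,
 *		.fc_flags	= RTF_UP,
 *		.fc_type	= RTN_UNICAST,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *	int err;
 *
 *	cfg.fc_dst = prefix;
 *	err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
 */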
fib6_info_release(rt); 3189 return ERR_PTR(err); 3190 } 3191 3192 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3193 struct netlink_ext_ack *extack) 3194 { 3195 struct fib6_info *rt; 3196 int err; 3197 3198 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3199 if (IS_ERR(rt)) 3200 return PTR_ERR(rt); 3201 3202 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3203 fib6_info_release(rt); 3204 3205 return err; 3206 } 3207 3208 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3209 { 3210 struct net *net = info->nl_net; 3211 struct fib6_table *table; 3212 int err; 3213 3214 if (rt == net->ipv6.fib6_null_entry) { 3215 err = -ENOENT; 3216 goto out; 3217 } 3218 3219 table = rt->fib6_table; 3220 spin_lock_bh(&table->tb6_lock); 3221 err = fib6_del(rt, info); 3222 spin_unlock_bh(&table->tb6_lock); 3223 3224 out: 3225 fib6_info_release(rt); 3226 return err; 3227 } 3228 3229 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3230 { 3231 struct nl_info info = { .nl_net = net }; 3232 3233 return __ip6_del_rt(rt, &info); 3234 } 3235 3236 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3237 { 3238 struct nl_info *info = &cfg->fc_nlinfo; 3239 struct net *net = info->nl_net; 3240 struct sk_buff *skb = NULL; 3241 struct fib6_table *table; 3242 int err = -ENOENT; 3243 3244 if (rt == net->ipv6.fib6_null_entry) 3245 goto out_put; 3246 table = rt->fib6_table; 3247 spin_lock_bh(&table->tb6_lock); 3248 3249 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3250 struct fib6_info *sibling, *next_sibling; 3251 3252 /* prefer to send a single notification with all hops */ 3253 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3254 if (skb) { 3255 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3256 3257 if (rt6_fill_node(net, skb, rt, NULL, 3258 NULL, NULL, 0, RTM_DELROUTE, 3259 info->portid, seq, 0) < 0) { 3260 kfree_skb(skb); 3261 skb = NULL; 3262 } else 3263 info->skip_notify = 1; 3264 } 3265 3266 list_for_each_entry_safe(sibling, next_sibling, 3267 &rt->fib6_siblings, 3268 fib6_siblings) { 3269 err = fib6_del(sibling, info); 3270 if (err) 3271 goto out_unlock; 3272 } 3273 } 3274 3275 err = fib6_del(rt, info); 3276 out_unlock: 3277 spin_unlock_bh(&table->tb6_lock); 3278 out_put: 3279 fib6_info_release(rt); 3280 3281 if (skb) { 3282 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3283 info->nlh, gfp_any()); 3284 } 3285 return err; 3286 } 3287 3288 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3289 { 3290 int rc = -ESRCH; 3291 3292 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3293 goto out; 3294 3295 if (cfg->fc_flags & RTF_GATEWAY && 3296 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3297 goto out; 3298 3299 rc = rt6_remove_exception_rt(rt); 3300 out: 3301 return rc; 3302 } 3303 3304 static int ip6_route_del(struct fib6_config *cfg, 3305 struct netlink_ext_ack *extack) 3306 { 3307 struct rt6_info *rt_cache; 3308 struct fib6_table *table; 3309 struct fib6_info *rt; 3310 struct fib6_node *fn; 3311 int err = -ESRCH; 3312 3313 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3314 if (!table) { 3315 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3316 return err; 3317 } 3318 3319 rcu_read_lock(); 3320 3321 fn = fib6_locate(&table->tb6_root, 3322 &cfg->fc_dst, cfg->fc_dst_len, 3323 &cfg->fc_src, cfg->fc_src_len, 3324 !(cfg->fc_flags & RTF_CACHE)); 3325 3326 if (fn) { 3327 for_each_fib6_node_rt_rcu(fn) { 3328 struct fib6_nh *nh; 3329 3330 if (cfg->fc_flags & RTF_CACHE) { 
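/* RTF_CACHE deletions target exception-table clones rather than FIB
 * entries proper; the fib6_info found in the tree is only used to
 * locate the cached entry. Userspace selects this branch by setting
 * RTM_F_CLONED in the request, which rtm_to_fib6_config() (later in
 * this file) maps to the internal flag:
 *
 *	if (rtm->rtm_flags & RTM_F_CLONED)
 *		cfg->fc_flags |= RTF_CACHE;
 */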
3331 struct fib6_result res = { 3332 .f6i = rt, 3333 }; 3334 int rc; 3335 3336 rt_cache = rt6_find_cached_rt(&res, 3337 &cfg->fc_dst, 3338 &cfg->fc_src); 3339 if (rt_cache) { 3340 rc = ip6_del_cached_rt(rt_cache, cfg); 3341 if (rc != -ESRCH) { 3342 rcu_read_unlock(); 3343 return rc; 3344 } 3345 } 3346 continue; 3347 } 3348 3349 nh = &rt->fib6_nh; 3350 if (cfg->fc_ifindex && 3351 (!nh->fib_nh_dev || 3352 nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) 3353 continue; 3354 if (cfg->fc_flags & RTF_GATEWAY && 3355 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) 3356 continue; 3357 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3358 continue; 3359 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3360 continue; 3361 if (!fib6_info_hold_safe(rt)) 3362 continue; 3363 rcu_read_unlock(); 3364 3365 /* if gateway was specified only delete the one hop */ 3366 if (cfg->fc_flags & RTF_GATEWAY) 3367 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3368 3369 return __ip6_del_rt_siblings(rt, cfg); 3370 } 3371 } 3372 rcu_read_unlock(); 3373 3374 return err; 3375 } 3376 3377 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3378 { 3379 struct netevent_redirect netevent; 3380 struct rt6_info *rt, *nrt = NULL; 3381 struct fib6_result res = {}; 3382 struct ndisc_options ndopts; 3383 struct inet6_dev *in6_dev; 3384 struct neighbour *neigh; 3385 struct rd_msg *msg; 3386 int optlen, on_link; 3387 u8 *lladdr; 3388 3389 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3390 optlen -= sizeof(*msg); 3391 3392 if (optlen < 0) { 3393 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3394 return; 3395 } 3396 3397 msg = (struct rd_msg *)icmp6_hdr(skb); 3398 3399 if (ipv6_addr_is_multicast(&msg->dest)) { 3400 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3401 return; 3402 } 3403 3404 on_link = 0; 3405 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3406 on_link = 1; 3407 } else if (ipv6_addr_type(&msg->target) != 3408 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3409 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3410 return; 3411 } 3412 3413 in6_dev = __in6_dev_get(skb->dev); 3414 if (!in6_dev) 3415 return; 3416 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3417 return; 3418 3419 /* RFC2461 8.1: 3420 * The IP source address of the Redirect MUST be the same as the current 3421 * first-hop router for the specified ICMP Destination Address. 3422 */ 3423 3424 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3425 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3426 return; 3427 } 3428 3429 lladdr = NULL; 3430 if (ndopts.nd_opts_tgt_lladdr) { 3431 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3432 skb->dev); 3433 if (!lladdr) { 3434 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3435 return; 3436 } 3437 } 3438 3439 rt = (struct rt6_info *) dst; 3440 if (rt->rt6i_flags & RTF_REJECT) { 3441 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3442 return; 3443 } 3444 3445 /* Redirect received -> path was valid. 3446 * Look, redirects are sent only in response to data packets, 3447 * so that this nexthop apparently is reachable. --ANK 3448 */ 3449 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3450 3451 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3452 if (!neigh) 3453 return; 3454 3455 /* 3456 * We have finally decided to accept it. 
3457 */ 3458 3459 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3460 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3461 NEIGH_UPDATE_F_OVERRIDE| 3462 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3463 NEIGH_UPDATE_F_ISROUTER)), 3464 NDISC_REDIRECT, &ndopts); 3465 3466 rcu_read_lock(); 3467 res.f6i = rcu_dereference(rt->from); 3468 /* This fib6_info_hold() is safe here because we hold reference to rt 3469 * and rt already holds reference to fib6_info. 3470 */ 3471 fib6_info_hold(res.f6i); 3472 rcu_read_unlock(); 3473 3474 res.nh = &res.f6i->fib6_nh; 3475 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); 3476 if (!nrt) 3477 goto out; 3478 3479 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3480 if (on_link) 3481 nrt->rt6i_flags &= ~RTF_GATEWAY; 3482 3483 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3484 3485 /* No need to remove rt from the exception table if rt is 3486 * a cached route because rt6_insert_exception() will 3487 * takes care of it 3488 */ 3489 if (rt6_insert_exception(nrt, res.f6i)) { 3490 dst_release_immediate(&nrt->dst); 3491 goto out; 3492 } 3493 3494 netevent.old = &rt->dst; 3495 netevent.new = &nrt->dst; 3496 netevent.daddr = &msg->dest; 3497 netevent.neigh = neigh; 3498 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3499 3500 out: 3501 fib6_info_release(res.f6i); 3502 neigh_release(neigh); 3503 } 3504 3505 #ifdef CONFIG_IPV6_ROUTE_INFO 3506 static struct fib6_info *rt6_get_route_info(struct net *net, 3507 const struct in6_addr *prefix, int prefixlen, 3508 const struct in6_addr *gwaddr, 3509 struct net_device *dev) 3510 { 3511 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3512 int ifindex = dev->ifindex; 3513 struct fib6_node *fn; 3514 struct fib6_info *rt = NULL; 3515 struct fib6_table *table; 3516 3517 table = fib6_get_table(net, tb_id); 3518 if (!table) 3519 return NULL; 3520 3521 rcu_read_lock(); 3522 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3523 if (!fn) 3524 goto out; 3525 3526 for_each_fib6_node_rt_rcu(fn) { 3527 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) 3528 continue; 3529 if (!(rt->fib6_flags & RTF_ROUTEINFO) || 3530 !rt->fib6_nh.fib_nh_gw_family) 3531 continue; 3532 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) 3533 continue; 3534 if (!fib6_info_hold_safe(rt)) 3535 continue; 3536 break; 3537 } 3538 out: 3539 rcu_read_unlock(); 3540 return rt; 3541 } 3542 3543 static struct fib6_info *rt6_add_route_info(struct net *net, 3544 const struct in6_addr *prefix, int prefixlen, 3545 const struct in6_addr *gwaddr, 3546 struct net_device *dev, 3547 unsigned int pref) 3548 { 3549 struct fib6_config cfg = { 3550 .fc_metric = IP6_RT_PRIO_USER, 3551 .fc_ifindex = dev->ifindex, 3552 .fc_dst_len = prefixlen, 3553 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3554 RTF_UP | RTF_PREF(pref), 3555 .fc_protocol = RTPROT_RA, 3556 .fc_type = RTN_UNICAST, 3557 .fc_nlinfo.portid = 0, 3558 .fc_nlinfo.nlh = NULL, 3559 .fc_nlinfo.nl_net = net, 3560 }; 3561 3562 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3563 cfg.fc_dst = *prefix; 3564 cfg.fc_gateway = *gwaddr; 3565 3566 /* We should treat it as a default route if prefix length is 0. 
*/ 3567 if (!prefixlen) 3568 cfg.fc_flags |= RTF_DEFAULT; 3569 3570 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3571 3572 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3573 } 3574 #endif 3575 3576 struct fib6_info *rt6_get_dflt_router(struct net *net, 3577 const struct in6_addr *addr, 3578 struct net_device *dev) 3579 { 3580 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3581 struct fib6_info *rt; 3582 struct fib6_table *table; 3583 3584 table = fib6_get_table(net, tb_id); 3585 if (!table) 3586 return NULL; 3587 3588 rcu_read_lock(); 3589 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3590 struct fib6_nh *nh = &rt->fib6_nh; 3591 3592 if (dev == nh->fib_nh_dev && 3593 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3594 ipv6_addr_equal(&nh->fib_nh_gw6, addr)) 3595 break; 3596 } 3597 if (rt && !fib6_info_hold_safe(rt)) 3598 rt = NULL; 3599 rcu_read_unlock(); 3600 return rt; 3601 } 3602 3603 struct fib6_info *rt6_add_dflt_router(struct net *net, 3604 const struct in6_addr *gwaddr, 3605 struct net_device *dev, 3606 unsigned int pref) 3607 { 3608 struct fib6_config cfg = { 3609 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3610 .fc_metric = IP6_RT_PRIO_USER, 3611 .fc_ifindex = dev->ifindex, 3612 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3613 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3614 .fc_protocol = RTPROT_RA, 3615 .fc_type = RTN_UNICAST, 3616 .fc_nlinfo.portid = 0, 3617 .fc_nlinfo.nlh = NULL, 3618 .fc_nlinfo.nl_net = net, 3619 }; 3620 3621 cfg.fc_gateway = *gwaddr; 3622 3623 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3624 struct fib6_table *table; 3625 3626 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3627 if (table) 3628 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3629 } 3630 3631 return rt6_get_dflt_router(net, gwaddr, dev); 3632 } 3633 3634 static void __rt6_purge_dflt_routers(struct net *net, 3635 struct fib6_table *table) 3636 { 3637 struct fib6_info *rt; 3638 3639 restart: 3640 rcu_read_lock(); 3641 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3642 struct net_device *dev = fib6_info_nh_dev(rt); 3643 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3644 3645 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3646 (!idev || idev->cnf.accept_ra != 2) && 3647 fib6_info_hold_safe(rt)) { 3648 rcu_read_unlock(); 3649 ip6_del_rt(net, rt); 3650 goto restart; 3651 } 3652 } 3653 rcu_read_unlock(); 3654 3655 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3656 } 3657 3658 void rt6_purge_dflt_routers(struct net *net) 3659 { 3660 struct fib6_table *table; 3661 struct hlist_head *head; 3662 unsigned int h; 3663 3664 rcu_read_lock(); 3665 3666 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3667 head = &net->ipv6.fib_table_hash[h]; 3668 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3669 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3670 __rt6_purge_dflt_routers(net, table); 3671 } 3672 } 3673 3674 rcu_read_unlock(); 3675 } 3676 3677 static void rtmsg_to_fib6_config(struct net *net, 3678 struct in6_rtmsg *rtmsg, 3679 struct fib6_config *cfg) 3680 { 3681 *cfg = (struct fib6_config){ 3682 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3683 : RT6_TABLE_MAIN, 3684 .fc_ifindex = rtmsg->rtmsg_ifindex, 3685 .fc_metric = rtmsg->rtmsg_metric ? 
: IP6_RT_PRIO_USER, 3686 .fc_expires = rtmsg->rtmsg_info, 3687 .fc_dst_len = rtmsg->rtmsg_dst_len, 3688 .fc_src_len = rtmsg->rtmsg_src_len, 3689 .fc_flags = rtmsg->rtmsg_flags, 3690 .fc_type = rtmsg->rtmsg_type, 3691 3692 .fc_nlinfo.nl_net = net, 3693 3694 .fc_dst = rtmsg->rtmsg_dst, 3695 .fc_src = rtmsg->rtmsg_src, 3696 .fc_gateway = rtmsg->rtmsg_gateway, 3697 }; 3698 } 3699 3700 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3701 { 3702 struct fib6_config cfg; 3703 struct in6_rtmsg rtmsg; 3704 int err; 3705 3706 switch (cmd) { 3707 case SIOCADDRT: /* Add a route */ 3708 case SIOCDELRT: /* Delete a route */ 3709 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3710 return -EPERM; 3711 err = copy_from_user(&rtmsg, arg, 3712 sizeof(struct in6_rtmsg)); 3713 if (err) 3714 return -EFAULT; 3715 3716 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3717 3718 rtnl_lock(); 3719 switch (cmd) { 3720 case SIOCADDRT: 3721 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3722 break; 3723 case SIOCDELRT: 3724 err = ip6_route_del(&cfg, NULL); 3725 break; 3726 default: 3727 err = -EINVAL; 3728 } 3729 rtnl_unlock(); 3730 3731 return err; 3732 } 3733 3734 return -EINVAL; 3735 } 3736 3737 /* 3738 * Drop the packet on the floor 3739 */ 3740 3741 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3742 { 3743 int type; 3744 struct dst_entry *dst = skb_dst(skb); 3745 switch (ipstats_mib_noroutes) { 3746 case IPSTATS_MIB_INNOROUTES: 3747 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3748 if (type == IPV6_ADDR_ANY) { 3749 IP6_INC_STATS(dev_net(dst->dev), 3750 __in6_dev_get_safely(skb->dev), 3751 IPSTATS_MIB_INADDRERRORS); 3752 break; 3753 } 3754 /* FALLTHROUGH */ 3755 case IPSTATS_MIB_OUTNOROUTES: 3756 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3757 ipstats_mib_noroutes); 3758 break; 3759 } 3760 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3761 kfree_skb(skb); 3762 return 0; 3763 } 3764 3765 static int ip6_pkt_discard(struct sk_buff *skb) 3766 { 3767 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3768 } 3769 3770 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3771 { 3772 skb->dev = skb_dst(skb)->dev; 3773 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3774 } 3775 3776 static int ip6_pkt_prohibit(struct sk_buff *skb) 3777 { 3778 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3779 } 3780 3781 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3782 { 3783 skb->dev = skb_dst(skb)->dev; 3784 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3785 } 3786 3787 /* 3788 * Allocate a dst for local (unicast / anycast) address. 3789 */ 3790 3791 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3792 struct inet6_dev *idev, 3793 const struct in6_addr *addr, 3794 bool anycast, gfp_t gfp_flags) 3795 { 3796 struct fib6_config cfg = { 3797 .fc_table = l3mdev_fib_table(idev->dev) ? 
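/* Illustrative userspace counterpart to ipv6_route_ioctl() above
 * (assumed example program, not part of this file): the legacy
 * SIOCADDRT/SIOCDELRT ioctls take a struct in6_rtmsg on an AF_INET6
 * socket, e.g. to add 2001:db8::/32 via eth0:
 *
 *	struct in6_rtmsg rtm;
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *	memset(&rtm, 0, sizeof(rtm));
 *	inet_pton(AF_INET6, "2001:db8::", &rtm.rtmsg_dst);
 *	rtm.rtmsg_dst_len = 32;
 *	rtm.rtmsg_ifindex = if_nametoindex("eth0");
 *	rtm.rtmsg_flags = RTF_UP;
 *	rtm.rtmsg_metric = 1;
 *	if (ioctl(fd, SIOCADDRT, &rtm) < 0)
 *		perror("SIOCADDRT");
 *
 * The kernel side copies the struct in, converts it with
 * rtmsg_to_fib6_config() and dispatches to ip6_route_add() or
 * ip6_route_del() under rtnl_lock().
 */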
: RT6_TABLE_LOCAL, 3798 .fc_ifindex = idev->dev->ifindex, 3799 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP, 3800 .fc_dst = *addr, 3801 .fc_dst_len = 128, 3802 .fc_protocol = RTPROT_KERNEL, 3803 .fc_nlinfo.nl_net = net, 3804 .fc_ignore_dev_down = true, 3805 }; 3806 3807 if (anycast) { 3808 cfg.fc_type = RTN_ANYCAST; 3809 cfg.fc_flags |= RTF_ANYCAST; 3810 } else { 3811 cfg.fc_type = RTN_LOCAL; 3812 cfg.fc_flags |= RTF_LOCAL; 3813 } 3814 3815 return ip6_route_info_create(&cfg, gfp_flags, NULL); 3816 } 3817 3818 /* remove a deleted IP from prefsrc entries */ 3819 struct arg_dev_net_ip { 3820 struct net_device *dev; 3821 struct net *net; 3822 struct in6_addr *addr; 3823 }; 3824 3825 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3826 { 3827 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3828 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3829 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3830 3831 if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && 3832 rt != net->ipv6.fib6_null_entry && 3833 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3834 spin_lock_bh(&rt6_exception_lock); 3835 /* remove prefsrc entry */ 3836 rt->fib6_prefsrc.plen = 0; 3837 spin_unlock_bh(&rt6_exception_lock); 3838 } 3839 return 0; 3840 } 3841 3842 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3843 { 3844 struct net *net = dev_net(ifp->idev->dev); 3845 struct arg_dev_net_ip adni = { 3846 .dev = ifp->idev->dev, 3847 .net = net, 3848 .addr = &ifp->addr, 3849 }; 3850 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3851 } 3852 3853 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) 3854 3855 /* Remove routers and update dst entries when a gateway turns into a host. */ 3856 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3857 { 3858 struct in6_addr *gateway = (struct in6_addr *)arg; 3859 3860 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3861 rt->fib6_nh.fib_nh_gw_family && 3862 ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { 3863 return -1; 3864 } 3865 3866 /* Further clean up cached routes in the exception table. 3867 * This is needed because a cached route may have a different 3868 * gateway than its 'parent' in the case of an IP redirect.
3869 */ 3870 rt6_exceptions_clean_tohost(rt, gateway); 3871 3872 return 0; 3873 } 3874 3875 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3876 { 3877 fib6_clean_all(net, fib6_clean_tohost, gateway); 3878 } 3879 3880 struct arg_netdev_event { 3881 const struct net_device *dev; 3882 union { 3883 unsigned int nh_flags; 3884 unsigned long event; 3885 }; 3886 }; 3887 3888 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3889 { 3890 struct fib6_info *iter; 3891 struct fib6_node *fn; 3892 3893 fn = rcu_dereference_protected(rt->fib6_node, 3894 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3895 iter = rcu_dereference_protected(fn->leaf, 3896 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3897 while (iter) { 3898 if (iter->fib6_metric == rt->fib6_metric && 3899 rt6_qualify_for_ecmp(iter)) 3900 return iter; 3901 iter = rcu_dereference_protected(iter->fib6_next, 3902 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3903 } 3904 3905 return NULL; 3906 } 3907 3908 static bool rt6_is_dead(const struct fib6_info *rt) 3909 { 3910 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || 3911 (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && 3912 ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) 3913 return true; 3914 3915 return false; 3916 } 3917 3918 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3919 { 3920 struct fib6_info *iter; 3921 int total = 0; 3922 3923 if (!rt6_is_dead(rt)) 3924 total += rt->fib6_nh.fib_nh_weight; 3925 3926 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3927 if (!rt6_is_dead(iter)) 3928 total += iter->fib6_nh.fib_nh_weight; 3929 } 3930 3931 return total; 3932 } 3933 3934 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3935 { 3936 int upper_bound = -1; 3937 3938 if (!rt6_is_dead(rt)) { 3939 *weight += rt->fib6_nh.fib_nh_weight; 3940 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3941 total) - 1; 3942 } 3943 atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); 3944 } 3945 3946 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3947 { 3948 struct fib6_info *iter; 3949 int weight = 0; 3950 3951 rt6_upper_bound_set(rt, &weight, total); 3952 3953 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3954 rt6_upper_bound_set(iter, &weight, total); 3955 } 3956 3957 void rt6_multipath_rebalance(struct fib6_info *rt) 3958 { 3959 struct fib6_info *first; 3960 int total; 3961 3962 /* In case the entire multipath route was marked for flushing, 3963 * then there is no need to rebalance upon the removal of every 3964 * sibling route. 3965 */ 3966 if (!rt->fib6_nsiblings || rt->should_flush) 3967 return; 3968 3969 /* During lookup routes are evaluated in order, so we need to 3970 * make sure upper bounds are assigned from the first sibling 3971 * onwards. 
3972 */ 3973 first = rt6_multipath_first_sibling(rt); 3974 if (WARN_ON_ONCE(!first)) 3975 return; 3976 3977 total = rt6_multipath_total_weight(first); 3978 rt6_multipath_upper_bound_set(first, total); 3979 } 3980 3981 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3982 { 3983 const struct arg_netdev_event *arg = p_arg; 3984 struct net *net = dev_net(arg->dev); 3985 3986 if (rt != net->ipv6.fib6_null_entry && 3987 rt->fib6_nh.fib_nh_dev == arg->dev) { 3988 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; 3989 fib6_update_sernum_upto_root(net, rt); 3990 rt6_multipath_rebalance(rt); 3991 } 3992 3993 return 0; 3994 } 3995 3996 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3997 { 3998 struct arg_netdev_event arg = { 3999 .dev = dev, 4000 { 4001 .nh_flags = nh_flags, 4002 }, 4003 }; 4004 4005 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 4006 arg.nh_flags |= RTNH_F_LINKDOWN; 4007 4008 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 4009 } 4010 4011 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 4012 const struct net_device *dev) 4013 { 4014 struct fib6_info *iter; 4015 4016 if (rt->fib6_nh.fib_nh_dev == dev) 4017 return true; 4018 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4019 if (iter->fib6_nh.fib_nh_dev == dev) 4020 return true; 4021 4022 return false; 4023 } 4024 4025 static void rt6_multipath_flush(struct fib6_info *rt) 4026 { 4027 struct fib6_info *iter; 4028 4029 rt->should_flush = 1; 4030 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4031 iter->should_flush = 1; 4032 } 4033 4034 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 4035 const struct net_device *down_dev) 4036 { 4037 struct fib6_info *iter; 4038 unsigned int dead = 0; 4039 4040 if (rt->fib6_nh.fib_nh_dev == down_dev || 4041 rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 4042 dead++; 4043 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4044 if (iter->fib6_nh.fib_nh_dev == down_dev || 4045 iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 4046 dead++; 4047 4048 return dead; 4049 } 4050 4051 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4052 const struct net_device *dev, 4053 unsigned int nh_flags) 4054 { 4055 struct fib6_info *iter; 4056 4057 if (rt->fib6_nh.fib_nh_dev == dev) 4058 rt->fib6_nh.fib_nh_flags |= nh_flags; 4059 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4060 if (iter->fib6_nh.fib_nh_dev == dev) 4061 iter->fib6_nh.fib_nh_flags |= nh_flags; 4062 } 4063 4064 /* called with write lock held for table with rt */ 4065 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4066 { 4067 const struct arg_netdev_event *arg = p_arg; 4068 const struct net_device *dev = arg->dev; 4069 struct net *net = dev_net(dev); 4070 4071 if (rt == net->ipv6.fib6_null_entry) 4072 return 0; 4073 4074 switch (arg->event) { 4075 case NETDEV_UNREGISTER: 4076 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; 4077 case NETDEV_DOWN: 4078 if (rt->should_flush) 4079 return -1; 4080 if (!rt->fib6_nsiblings) 4081 return rt->fib6_nh.fib_nh_dev == dev ? 
-1 : 0; 4082 if (rt6_multipath_uses_dev(rt, dev)) { 4083 unsigned int count; 4084 4085 count = rt6_multipath_dead_count(rt, dev); 4086 if (rt->fib6_nsiblings + 1 == count) { 4087 rt6_multipath_flush(rt); 4088 return -1; 4089 } 4090 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4091 RTNH_F_LINKDOWN); 4092 fib6_update_sernum(net, rt); 4093 rt6_multipath_rebalance(rt); 4094 } 4095 return -2; 4096 case NETDEV_CHANGE: 4097 if (rt->fib6_nh.fib_nh_dev != dev || 4098 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4099 break; 4100 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; 4101 rt6_multipath_rebalance(rt); 4102 break; 4103 } 4104 4105 return 0; 4106 } 4107 4108 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4109 { 4110 struct arg_netdev_event arg = { 4111 .dev = dev, 4112 { 4113 .event = event, 4114 }, 4115 }; 4116 struct net *net = dev_net(dev); 4117 4118 if (net->ipv6.sysctl.skip_notify_on_dev_down) 4119 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); 4120 else 4121 fib6_clean_all(net, fib6_ifdown, &arg); 4122 } 4123 4124 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4125 { 4126 rt6_sync_down_dev(dev, event); 4127 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4128 neigh_ifdown(&nd_tbl, dev); 4129 } 4130 4131 struct rt6_mtu_change_arg { 4132 struct net_device *dev; 4133 unsigned int mtu; 4134 }; 4135 4136 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4137 { 4138 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4139 struct inet6_dev *idev; 4140 4141 /* In IPv6, PMTU discovery is not optional, 4142 so the RTAX_MTU lock cannot disable it. 4143 We still use this lock to block changes 4144 caused by addrconf/ndisc. 4145 */ 4146 4147 idev = __in6_dev_get(arg->dev); 4148 if (!idev) 4149 return 0; 4150 4151 /* For an administrative MTU increase, there is no way to discover 4152 the IPv6 PMTU increase, so the PMTU increase should be updated here. 4153 Since RFC 1981 doesn't cover administrative MTU increases, 4154 updating the PMTU on an increase is a MUST. (i.e.
jumbo frame) 4155 */ 4156 if (rt->fib6_nh.fib_nh_dev == arg->dev && 4157 !fib6_metric_locked(rt, RTAX_MTU)) { 4158 u32 mtu = rt->fib6_pmtu; 4159 4160 if (mtu >= arg->mtu || 4161 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4162 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4163 4164 spin_lock_bh(&rt6_exception_lock); 4165 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4166 spin_unlock_bh(&rt6_exception_lock); 4167 } 4168 return 0; 4169 } 4170 4171 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4172 { 4173 struct rt6_mtu_change_arg arg = { 4174 .dev = dev, 4175 .mtu = mtu, 4176 }; 4177 4178 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4179 } 4180 4181 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4182 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4183 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4184 [RTA_OIF] = { .type = NLA_U32 }, 4185 [RTA_IIF] = { .type = NLA_U32 }, 4186 [RTA_PRIORITY] = { .type = NLA_U32 }, 4187 [RTA_METRICS] = { .type = NLA_NESTED }, 4188 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4189 [RTA_PREF] = { .type = NLA_U8 }, 4190 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4191 [RTA_ENCAP] = { .type = NLA_NESTED }, 4192 [RTA_EXPIRES] = { .type = NLA_U32 }, 4193 [RTA_UID] = { .type = NLA_U32 }, 4194 [RTA_MARK] = { .type = NLA_U32 }, 4195 [RTA_TABLE] = { .type = NLA_U32 }, 4196 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4197 [RTA_SPORT] = { .type = NLA_U16 }, 4198 [RTA_DPORT] = { .type = NLA_U16 }, 4199 }; 4200 4201 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4202 struct fib6_config *cfg, 4203 struct netlink_ext_ack *extack) 4204 { 4205 struct rtmsg *rtm; 4206 struct nlattr *tb[RTA_MAX+1]; 4207 unsigned int pref; 4208 int err; 4209 4210 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4211 extack); 4212 if (err < 0) 4213 goto errout; 4214 4215 err = -EINVAL; 4216 rtm = nlmsg_data(nlh); 4217 4218 *cfg = (struct fib6_config){ 4219 .fc_table = rtm->rtm_table, 4220 .fc_dst_len = rtm->rtm_dst_len, 4221 .fc_src_len = rtm->rtm_src_len, 4222 .fc_flags = RTF_UP, 4223 .fc_protocol = rtm->rtm_protocol, 4224 .fc_type = rtm->rtm_type, 4225 4226 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 4227 .fc_nlinfo.nlh = nlh, 4228 .fc_nlinfo.nl_net = sock_net(skb->sk), 4229 }; 4230 4231 if (rtm->rtm_type == RTN_UNREACHABLE || 4232 rtm->rtm_type == RTN_BLACKHOLE || 4233 rtm->rtm_type == RTN_PROHIBIT || 4234 rtm->rtm_type == RTN_THROW) 4235 cfg->fc_flags |= RTF_REJECT; 4236 4237 if (rtm->rtm_type == RTN_LOCAL) 4238 cfg->fc_flags |= RTF_LOCAL; 4239 4240 if (rtm->rtm_flags & RTM_F_CLONED) 4241 cfg->fc_flags |= RTF_CACHE; 4242 4243 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4244 4245 if (tb[RTA_GATEWAY]) { 4246 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4247 cfg->fc_flags |= RTF_GATEWAY; 4248 } 4249 if (tb[RTA_VIA]) { 4250 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 4251 goto errout; 4252 } 4253 4254 if (tb[RTA_DST]) { 4255 int plen = (rtm->rtm_dst_len + 7) >> 3; 4256 4257 if (nla_len(tb[RTA_DST]) < plen) 4258 goto errout; 4259 4260 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4261 } 4262 4263 if (tb[RTA_SRC]) { 4264 int plen = (rtm->rtm_src_len + 7) >> 3; 4265 4266 if (nla_len(tb[RTA_SRC]) < plen) 4267 goto errout; 4268 4269 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4270 } 4271 4272 if (tb[RTA_PREFSRC]) 4273 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4274 4275 if (tb[RTA_OIF]) 4276 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4277 4278 if 
(tb[RTA_PRIORITY]) 4279 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4280 4281 if (tb[RTA_METRICS]) { 4282 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4283 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4284 } 4285 4286 if (tb[RTA_TABLE]) 4287 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4288 4289 if (tb[RTA_MULTIPATH]) { 4290 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4291 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4292 4293 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4294 cfg->fc_mp_len, extack); 4295 if (err < 0) 4296 goto errout; 4297 } 4298 4299 if (tb[RTA_PREF]) { 4300 pref = nla_get_u8(tb[RTA_PREF]); 4301 if (pref != ICMPV6_ROUTER_PREF_LOW && 4302 pref != ICMPV6_ROUTER_PREF_HIGH) 4303 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4304 cfg->fc_flags |= RTF_PREF(pref); 4305 } 4306 4307 if (tb[RTA_ENCAP]) 4308 cfg->fc_encap = tb[RTA_ENCAP]; 4309 4310 if (tb[RTA_ENCAP_TYPE]) { 4311 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4312 4313 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4314 if (err < 0) 4315 goto errout; 4316 } 4317 4318 if (tb[RTA_EXPIRES]) { 4319 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4320 4321 if (addrconf_finite_timeout(timeout)) { 4322 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4323 cfg->fc_flags |= RTF_EXPIRES; 4324 } 4325 } 4326 4327 err = 0; 4328 errout: 4329 return err; 4330 } 4331 4332 struct rt6_nh { 4333 struct fib6_info *fib6_info; 4334 struct fib6_config r_cfg; 4335 struct list_head next; 4336 }; 4337 4338 static int ip6_route_info_append(struct net *net, 4339 struct list_head *rt6_nh_list, 4340 struct fib6_info *rt, 4341 struct fib6_config *r_cfg) 4342 { 4343 struct rt6_nh *nh; 4344 int err = -EEXIST; 4345 4346 list_for_each_entry(nh, rt6_nh_list, next) { 4347 /* check if fib6_info already exists */ 4348 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4349 return err; 4350 } 4351 4352 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4353 if (!nh) 4354 return -ENOMEM; 4355 nh->fib6_info = rt; 4356 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4357 list_add_tail(&nh->next, rt6_nh_list); 4358 4359 return 0; 4360 } 4361 4362 static void ip6_route_mpath_notify(struct fib6_info *rt, 4363 struct fib6_info *rt_last, 4364 struct nl_info *info, 4365 __u16 nlflags) 4366 { 4367 /* if this is an APPEND route, then rt points to the first route 4368 * inserted and rt_last points to last route inserted. Userspace 4369 * wants a consistent dump of the route which starts at the first 4370 * nexthop. Since sibling routes are always added at the end of 4371 * the list, find the first sibling of the last route appended 4372 */ 4373 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4374 rt = list_first_entry(&rt_last->fib6_siblings, 4375 struct fib6_info, 4376 fib6_siblings); 4377 } 4378 4379 if (rt) 4380 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4381 } 4382 4383 static int ip6_route_multipath_add(struct fib6_config *cfg, 4384 struct netlink_ext_ack *extack) 4385 { 4386 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4387 struct nl_info *info = &cfg->fc_nlinfo; 4388 struct fib6_config r_cfg; 4389 struct rtnexthop *rtnh; 4390 struct fib6_info *rt; 4391 struct rt6_nh *err_nh; 4392 struct rt6_nh *nh, *nh_safe; 4393 __u16 nlflags; 4394 int remaining; 4395 int attrlen; 4396 int err = 1; 4397 int nhn = 0; 4398 int replace = (cfg->fc_nlinfo.nlh && 4399 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4400 LIST_HEAD(rt6_nh_list); 4401 4402 nlflags = replace ? 
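/* Hedged sketch of the userspace side (addresses and devices assumed):
 * a multipath add such as
 *
 *   ip -6 route add 2001:db8::/64 \
 *           nexthop via fe80::1 dev eth0 weight 1 \
 *           nexthop via fe80::2 dev eth1 weight 2
 *
 * arrives as one RTA_MULTIPATH attribute holding a struct rtnexthop
 * per nexthop, each optionally followed by per-nexthop attributes
 * (RTA_GATEWAY, RTA_ENCAP, RTA_ENCAP_TYPE).  iproute2 encodes weight
 * as rtnh_hops = weight - 1, which is why fib_nh_weight is set to
 * rtnh_hops + 1 further down.
 */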
NLM_F_REPLACE : NLM_F_CREATE; 4403 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4404 nlflags |= NLM_F_APPEND; 4405 4406 remaining = cfg->fc_mp_len; 4407 rtnh = (struct rtnexthop *)cfg->fc_mp; 4408 4409 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4410 * fib6_info structs per nexthop 4411 */ 4412 while (rtnh_ok(rtnh, remaining)) { 4413 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4414 if (rtnh->rtnh_ifindex) 4415 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4416 4417 attrlen = rtnh_attrlen(rtnh); 4418 if (attrlen > 0) { 4419 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4420 4421 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4422 if (nla) { 4423 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4424 r_cfg.fc_flags |= RTF_GATEWAY; 4425 } 4426 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4427 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4428 if (nla) 4429 r_cfg.fc_encap_type = nla_get_u16(nla); 4430 } 4431 4432 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4433 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4434 if (IS_ERR(rt)) { 4435 err = PTR_ERR(rt); 4436 rt = NULL; 4437 goto cleanup; 4438 } 4439 if (!rt6_qualify_for_ecmp(rt)) { 4440 err = -EINVAL; 4441 NL_SET_ERR_MSG(extack, 4442 "Device only routes can not be added for IPv6 using the multipath API."); 4443 fib6_info_release(rt); 4444 goto cleanup; 4445 } 4446 4447 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; 4448 4449 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4450 rt, &r_cfg); 4451 if (err) { 4452 fib6_info_release(rt); 4453 goto cleanup; 4454 } 4455 4456 rtnh = rtnh_next(rtnh, &remaining); 4457 } 4458 4459 /* for add and replace send one notification with all nexthops. 4460 * Skip the notification in fib6_add_rt2node and send one with 4461 * the full route when done 4462 */ 4463 info->skip_notify = 1; 4464 4465 err_nh = NULL; 4466 list_for_each_entry(nh, &rt6_nh_list, next) { 4467 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4468 fib6_info_release(nh->fib6_info); 4469 4470 if (!err) { 4471 /* save reference to last route successfully inserted */ 4472 rt_last = nh->fib6_info; 4473 4474 /* save reference to first route for notification */ 4475 if (!rt_notif) 4476 rt_notif = nh->fib6_info; 4477 } 4478 4479 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 4480 nh->fib6_info = NULL; 4481 if (err) { 4482 if (replace && nhn) 4483 NL_SET_ERR_MSG_MOD(extack, 4484 "multipath route replace failed (check consistency of installed routes)"); 4485 err_nh = nh; 4486 goto add_errout; 4487 } 4488 4489 /* Because each route is added like a single route we remove 4490 * these flags after the first nexthop: if there is a collision, 4491 * we have already failed to add the first nexthop: 4492 * fib6_add_rt2node() has rejected it; when replacing, old 4493 * nexthops have been replaced by first new, the rest should 4494 * be added to it. 4495 */ 4496 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4497 NLM_F_REPLACE); 4498 nhn++; 4499 } 4500 4501 /* success ... 
tell user about new route */ 4502 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4503 goto cleanup; 4504 4505 add_errout: 4506 /* send notification for routes that were added so that 4507 * the delete notifications sent by ip6_route_del are 4508 * coherent 4509 */ 4510 if (rt_notif) 4511 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4512 4513 /* Delete routes that were already added */ 4514 list_for_each_entry(nh, &rt6_nh_list, next) { 4515 if (err_nh == nh) 4516 break; 4517 ip6_route_del(&nh->r_cfg, extack); 4518 } 4519 4520 cleanup: 4521 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4522 if (nh->fib6_info) 4523 fib6_info_release(nh->fib6_info); 4524 list_del(&nh->next); 4525 kfree(nh); 4526 } 4527 4528 return err; 4529 } 4530 4531 static int ip6_route_multipath_del(struct fib6_config *cfg, 4532 struct netlink_ext_ack *extack) 4533 { 4534 struct fib6_config r_cfg; 4535 struct rtnexthop *rtnh; 4536 int remaining; 4537 int attrlen; 4538 int err = 1, last_err = 0; 4539 4540 remaining = cfg->fc_mp_len; 4541 rtnh = (struct rtnexthop *)cfg->fc_mp; 4542 4543 /* Parse a Multipath Entry */ 4544 while (rtnh_ok(rtnh, remaining)) { 4545 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4546 if (rtnh->rtnh_ifindex) 4547 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4548 4549 attrlen = rtnh_attrlen(rtnh); 4550 if (attrlen > 0) { 4551 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4552 4553 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4554 if (nla) { 4555 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4556 r_cfg.fc_flags |= RTF_GATEWAY; 4557 } 4558 } 4559 err = ip6_route_del(&r_cfg, extack); 4560 if (err) 4561 last_err = err; 4562 4563 rtnh = rtnh_next(rtnh, &remaining); 4564 } 4565 4566 return last_err; 4567 } 4568 4569 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4570 struct netlink_ext_ack *extack) 4571 { 4572 struct fib6_config cfg; 4573 int err; 4574 4575 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4576 if (err < 0) 4577 return err; 4578 4579 if (cfg.fc_mp) 4580 return ip6_route_multipath_del(&cfg, extack); 4581 else { 4582 cfg.fc_delete_all_nh = 1; 4583 return ip6_route_del(&cfg, extack); 4584 } 4585 } 4586 4587 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4588 struct netlink_ext_ack *extack) 4589 { 4590 struct fib6_config cfg; 4591 int err; 4592 4593 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4594 if (err < 0) 4595 return err; 4596 4597 if (cfg.fc_metric == 0) 4598 cfg.fc_metric = IP6_RT_PRIO_USER; 4599 4600 if (cfg.fc_mp) 4601 return ip6_route_multipath_add(&cfg, extack); 4602 else 4603 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4604 } 4605 4606 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4607 { 4608 int nexthop_len = 0; 4609 4610 if (rt->fib6_nsiblings) { 4611 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4612 + NLA_ALIGN(sizeof(struct rtnexthop)) 4613 + nla_total_size(16) /* RTA_GATEWAY */ 4614 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); 4615 4616 nexthop_len *= rt->fib6_nsiblings; 4617 } 4618 4619 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4620 + nla_total_size(16) /* RTA_SRC */ 4621 + nla_total_size(16) /* RTA_DST */ 4622 + nla_total_size(16) /* RTA_GATEWAY */ 4623 + nla_total_size(16) /* RTA_PREFSRC */ 4624 + nla_total_size(4) /* RTA_TABLE */ 4625 + nla_total_size(4) /* RTA_IIF */ 4626 + nla_total_size(4) /* RTA_OIF */ 4627 + nla_total_size(4) /* RTA_PRIORITY */ 4628 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4629 + nla_total_size(sizeof(struct rta_cacheinfo)) 4630 + 
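/* remaining worst-case terms; rt6_nlmsg_size() must never under-count,
 * since inet6_rt_notify() treats -EMSGSIZE from rt6_fill_node() as a
 * bug in this estimate rather than retrying with a larger skb
 */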
nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4631 + nla_total_size(1) /* RTA_PREF */ 4632 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) 4633 + nexthop_len; 4634 } 4635 4636 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4637 struct fib6_info *rt, struct dst_entry *dst, 4638 struct in6_addr *dest, struct in6_addr *src, 4639 int iif, int type, u32 portid, u32 seq, 4640 unsigned int flags) 4641 { 4642 struct rt6_info *rt6 = (struct rt6_info *)dst; 4643 struct rt6key *rt6_dst, *rt6_src; 4644 u32 *pmetrics, table, rt6_flags; 4645 struct nlmsghdr *nlh; 4646 struct rtmsg *rtm; 4647 long expires = 0; 4648 4649 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4650 if (!nlh) 4651 return -EMSGSIZE; 4652 4653 if (rt6) { 4654 rt6_dst = &rt6->rt6i_dst; 4655 rt6_src = &rt6->rt6i_src; 4656 rt6_flags = rt6->rt6i_flags; 4657 } else { 4658 rt6_dst = &rt->fib6_dst; 4659 rt6_src = &rt->fib6_src; 4660 rt6_flags = rt->fib6_flags; 4661 } 4662 4663 rtm = nlmsg_data(nlh); 4664 rtm->rtm_family = AF_INET6; 4665 rtm->rtm_dst_len = rt6_dst->plen; 4666 rtm->rtm_src_len = rt6_src->plen; 4667 rtm->rtm_tos = 0; 4668 if (rt->fib6_table) 4669 table = rt->fib6_table->tb6_id; 4670 else 4671 table = RT6_TABLE_UNSPEC; 4672 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; 4673 if (nla_put_u32(skb, RTA_TABLE, table)) 4674 goto nla_put_failure; 4675 4676 rtm->rtm_type = rt->fib6_type; 4677 rtm->rtm_flags = 0; 4678 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4679 rtm->rtm_protocol = rt->fib6_protocol; 4680 4681 if (rt6_flags & RTF_CACHE) 4682 rtm->rtm_flags |= RTM_F_CLONED; 4683 4684 if (dest) { 4685 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4686 goto nla_put_failure; 4687 rtm->rtm_dst_len = 128; 4688 } else if (rtm->rtm_dst_len) 4689 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 4690 goto nla_put_failure; 4691 #ifdef CONFIG_IPV6_SUBTREES 4692 if (src) { 4693 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4694 goto nla_put_failure; 4695 rtm->rtm_src_len = 128; 4696 } else if (rtm->rtm_src_len && 4697 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 4698 goto nla_put_failure; 4699 #endif 4700 if (iif) { 4701 #ifdef CONFIG_IPV6_MROUTE 4702 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 4703 int err = ip6mr_get_route(net, skb, rtm, portid); 4704 4705 if (err == 0) 4706 return 0; 4707 if (err < 0) 4708 goto nla_put_failure; 4709 } else 4710 #endif 4711 if (nla_put_u32(skb, RTA_IIF, iif)) 4712 goto nla_put_failure; 4713 } else if (dest) { 4714 struct in6_addr saddr_buf; 4715 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4716 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4717 goto nla_put_failure; 4718 } 4719 4720 if (rt->fib6_prefsrc.plen) { 4721 struct in6_addr saddr_buf; 4722 saddr_buf = rt->fib6_prefsrc.addr; 4723 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4724 goto nla_put_failure; 4725 } 4726 4727 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4728 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4729 goto nla_put_failure; 4730 4731 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4732 goto nla_put_failure; 4733 4734 /* For multipath routes, walk the siblings list and add 4735 * each as a nexthop within RTA_MULTIPATH. 
4736 */ 4737 if (rt6) { 4738 if (rt6_flags & RTF_GATEWAY && 4739 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 4740 goto nla_put_failure; 4741 4742 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 4743 goto nla_put_failure; 4744 } else if (rt->fib6_nsiblings) { 4745 struct fib6_info *sibling, *next_sibling; 4746 struct nlattr *mp; 4747 4748 mp = nla_nest_start(skb, RTA_MULTIPATH); 4749 if (!mp) 4750 goto nla_put_failure; 4751 4752 if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, 4753 rt->fib6_nh.fib_nh_weight) < 0) 4754 goto nla_put_failure; 4755 4756 list_for_each_entry_safe(sibling, next_sibling, 4757 &rt->fib6_siblings, fib6_siblings) { 4758 if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, 4759 sibling->fib6_nh.fib_nh_weight) < 0) 4760 goto nla_put_failure; 4761 } 4762 4763 nla_nest_end(skb, mp); 4764 } else { 4765 if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, 4766 &rtm->rtm_flags, false) < 0) 4767 goto nla_put_failure; 4768 } 4769 4770 if (rt6_flags & RTF_EXPIRES) { 4771 expires = dst ? dst->expires : rt->expires; 4772 expires -= jiffies; 4773 } 4774 4775 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4776 goto nla_put_failure; 4777 4778 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 4779 goto nla_put_failure; 4780 4781 4782 nlmsg_end(skb, nlh); 4783 return 0; 4784 4785 nla_put_failure: 4786 nlmsg_cancel(skb, nlh); 4787 return -EMSGSIZE; 4788 } 4789 4790 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 4791 const struct net_device *dev) 4792 { 4793 if (f6i->fib6_nh.fib_nh_dev == dev) 4794 return true; 4795 4796 if (f6i->fib6_nsiblings) { 4797 struct fib6_info *sibling, *next_sibling; 4798 4799 list_for_each_entry_safe(sibling, next_sibling, 4800 &f6i->fib6_siblings, fib6_siblings) { 4801 if (sibling->fib6_nh.fib_nh_dev == dev) 4802 return true; 4803 } 4804 } 4805 4806 return false; 4807 } 4808 4809 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4810 { 4811 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4812 struct fib_dump_filter *filter = &arg->filter; 4813 unsigned int flags = NLM_F_MULTI; 4814 struct net *net = arg->net; 4815 4816 if (rt == net->ipv6.fib6_null_entry) 4817 return 0; 4818 4819 if ((filter->flags & RTM_F_PREFIX) && 4820 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4821 /* success since this is not a prefix route */ 4822 return 1; 4823 } 4824 if (filter->filter_set) { 4825 if ((filter->rt_type && rt->fib6_type != filter->rt_type) || 4826 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 4827 (filter->protocol && rt->fib6_protocol != filter->protocol)) { 4828 return 1; 4829 } 4830 flags |= NLM_F_DUMP_FILTERED; 4831 } 4832 4833 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4834 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4835 arg->cb->nlh->nlmsg_seq, flags); 4836 } 4837 4838 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, 4839 const struct nlmsghdr *nlh, 4840 struct nlattr **tb, 4841 struct netlink_ext_ack *extack) 4842 { 4843 struct rtmsg *rtm; 4844 int i, err; 4845 4846 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 4847 NL_SET_ERR_MSG_MOD(extack, 4848 "Invalid header for get route request"); 4849 return -EINVAL; 4850 } 4851 4852 if (!netlink_strict_get_check(skb)) 4853 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, 4854 rtm_ipv6_policy, extack); 4855 4856 rtm = nlmsg_data(nlh); 4857 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || 4858 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || 4859 rtm->rtm_table || 
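/* continued: under strict checking every rtmsg header field that a
 * GETROUTE request cannot legitimately use must be zero
 */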
rtm->rtm_protocol || rtm->rtm_scope || 4860 rtm->rtm_type) { 4861 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); 4862 return -EINVAL; 4863 } 4864 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { 4865 NL_SET_ERR_MSG_MOD(extack, 4866 "Invalid flags for get route request"); 4867 return -EINVAL; 4868 } 4869 4870 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 4871 rtm_ipv6_policy, extack); 4872 if (err) 4873 return err; 4874 4875 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 4876 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 4877 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); 4878 return -EINVAL; 4879 } 4880 4881 for (i = 0; i <= RTA_MAX; i++) { 4882 if (!tb[i]) 4883 continue; 4884 4885 switch (i) { 4886 case RTA_SRC: 4887 case RTA_DST: 4888 case RTA_IIF: 4889 case RTA_OIF: 4890 case RTA_MARK: 4891 case RTA_UID: 4892 case RTA_SPORT: 4893 case RTA_DPORT: 4894 case RTA_IP_PROTO: 4895 break; 4896 default: 4897 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); 4898 return -EINVAL; 4899 } 4900 } 4901 4902 return 0; 4903 } 4904 4905 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4906 struct netlink_ext_ack *extack) 4907 { 4908 struct net *net = sock_net(in_skb->sk); 4909 struct nlattr *tb[RTA_MAX+1]; 4910 int err, iif = 0, oif = 0; 4911 struct fib6_info *from; 4912 struct dst_entry *dst; 4913 struct rt6_info *rt; 4914 struct sk_buff *skb; 4915 struct rtmsg *rtm; 4916 struct flowi6 fl6 = {}; 4917 bool fibmatch; 4918 4919 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 4920 if (err < 0) 4921 goto errout; 4922 4923 err = -EINVAL; 4924 rtm = nlmsg_data(nlh); 4925 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4926 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4927 4928 if (tb[RTA_SRC]) { 4929 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4930 goto errout; 4931 4932 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4933 } 4934 4935 if (tb[RTA_DST]) { 4936 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4937 goto errout; 4938 4939 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4940 } 4941 4942 if (tb[RTA_IIF]) 4943 iif = nla_get_u32(tb[RTA_IIF]); 4944 4945 if (tb[RTA_OIF]) 4946 oif = nla_get_u32(tb[RTA_OIF]); 4947 4948 if (tb[RTA_MARK]) 4949 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4950 4951 if (tb[RTA_UID]) 4952 fl6.flowi6_uid = make_kuid(current_user_ns(), 4953 nla_get_u32(tb[RTA_UID])); 4954 else 4955 fl6.flowi6_uid = iif ? 
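/* no RTA_UID attribute: a forwarded lookup (iif set) has no local
 * socket owner, so use INVALID_UID; an output lookup falls back to the
 * requesting task's UID so uid-range routing rules behave roughly as
 * they would for that task's own traffic
 */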
INVALID_UID : current_uid(); 4956 4957 if (tb[RTA_SPORT]) 4958 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 4959 4960 if (tb[RTA_DPORT]) 4961 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 4962 4963 if (tb[RTA_IP_PROTO]) { 4964 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 4965 &fl6.flowi6_proto, AF_INET6, 4966 extack); 4967 if (err) 4968 goto errout; 4969 } 4970 4971 if (iif) { 4972 struct net_device *dev; 4973 int flags = 0; 4974 4975 rcu_read_lock(); 4976 4977 dev = dev_get_by_index_rcu(net, iif); 4978 if (!dev) { 4979 rcu_read_unlock(); 4980 err = -ENODEV; 4981 goto errout; 4982 } 4983 4984 fl6.flowi6_iif = iif; 4985 4986 if (!ipv6_addr_any(&fl6.saddr)) 4987 flags |= RT6_LOOKUP_F_HAS_SADDR; 4988 4989 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 4990 4991 rcu_read_unlock(); 4992 } else { 4993 fl6.flowi6_oif = oif; 4994 4995 dst = ip6_route_output(net, NULL, &fl6); 4996 } 4997 4998 4999 rt = container_of(dst, struct rt6_info, dst); 5000 if (rt->dst.error) { 5001 err = rt->dst.error; 5002 ip6_rt_put(rt); 5003 goto errout; 5004 } 5005 5006 if (rt == net->ipv6.ip6_null_entry) { 5007 err = rt->dst.error; 5008 ip6_rt_put(rt); 5009 goto errout; 5010 } 5011 5012 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 5013 if (!skb) { 5014 ip6_rt_put(rt); 5015 err = -ENOBUFS; 5016 goto errout; 5017 } 5018 5019 skb_dst_set(skb, &rt->dst); 5020 5021 rcu_read_lock(); 5022 from = rcu_dereference(rt->from); 5023 5024 if (fibmatch) 5025 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 5026 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 5027 nlh->nlmsg_seq, 0); 5028 else 5029 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 5030 &fl6.saddr, iif, RTM_NEWROUTE, 5031 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 5032 0); 5033 rcu_read_unlock(); 5034 5035 if (err < 0) { 5036 kfree_skb(skb); 5037 goto errout; 5038 } 5039 5040 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 5041 errout: 5042 return err; 5043 } 5044 5045 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 5046 unsigned int nlm_flags) 5047 { 5048 struct sk_buff *skb; 5049 struct net *net = info->nl_net; 5050 u32 seq; 5051 int err; 5052 5053 err = -ENOBUFS; 5054 seq = info->nlh ? 
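/* echo the triggering request's netlink sequence number so userspace
 * can match notifications to its own requests; kernel-originated
 * changes carry no nlh and use sequence 0
 */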
info->nlh->nlmsg_seq : 0; 5055 5056 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 5057 if (!skb) 5058 goto errout; 5059 5060 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 5061 event, info->portid, seq, nlm_flags); 5062 if (err < 0) { 5063 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 5064 WARN_ON(err == -EMSGSIZE); 5065 kfree_skb(skb); 5066 goto errout; 5067 } 5068 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 5069 info->nlh, gfp_any()); 5070 return; 5071 errout: 5072 if (err < 0) 5073 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 5074 } 5075 5076 static int ip6_route_dev_notify(struct notifier_block *this, 5077 unsigned long event, void *ptr) 5078 { 5079 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 5080 struct net *net = dev_net(dev); 5081 5082 if (!(dev->flags & IFF_LOOPBACK)) 5083 return NOTIFY_OK; 5084 5085 if (event == NETDEV_REGISTER) { 5086 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; 5087 net->ipv6.ip6_null_entry->dst.dev = dev; 5088 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 5089 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5090 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 5091 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 5092 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 5093 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 5094 #endif 5095 } else if (event == NETDEV_UNREGISTER && 5096 dev->reg_state != NETREG_UNREGISTERED) { 5097 /* NETDEV_UNREGISTER could be fired for multiple times by 5098 * netdev_wait_allrefs(). Make sure we only call this once. 5099 */ 5100 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 5101 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5102 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 5103 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 5104 #endif 5105 } 5106 5107 return NOTIFY_OK; 5108 } 5109 5110 /* 5111 * /proc 5112 */ 5113 5114 #ifdef CONFIG_PROC_FS 5115 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 5116 { 5117 struct net *net = (struct net *)seq->private; 5118 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 5119 net->ipv6.rt6_stats->fib_nodes, 5120 net->ipv6.rt6_stats->fib_route_nodes, 5121 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 5122 net->ipv6.rt6_stats->fib_rt_entries, 5123 net->ipv6.rt6_stats->fib_rt_cache, 5124 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 5125 net->ipv6.rt6_stats->fib_discarded_routes); 5126 5127 return 0; 5128 } 5129 #endif /* CONFIG_PROC_FS */ 5130 5131 #ifdef CONFIG_SYSCTL 5132 5133 static 5134 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 5135 void __user *buffer, size_t *lenp, loff_t *ppos) 5136 { 5137 struct net *net; 5138 int delay; 5139 int ret; 5140 if (!write) 5141 return -EINVAL; 5142 5143 net = (struct net *)ctl->extra1; 5144 delay = net->ipv6.sysctl.flush_delay; 5145 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 5146 if (ret) 5147 return ret; 5148 5149 fib6_run_gc(delay <= 0 ? 
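/* a positive value written to /proc/sys/net/ipv6/route/flush is used
 * as the GC expiry and forces the pass; zero or a negative value runs
 * a non-forced pass.  Illustrative usage (value assumed):
 *
 *   echo 1 > /proc/sys/net/ipv6/route/flush
 */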
0 : (unsigned long)delay, net, delay > 0); 5150 return 0; 5151 } 5152 5153 static int zero; 5154 static int one = 1; 5155 5156 static struct ctl_table ipv6_route_table_template[] = { 5157 { 5158 .procname = "flush", 5159 .data = &init_net.ipv6.sysctl.flush_delay, 5160 .maxlen = sizeof(int), 5161 .mode = 0200, 5162 .proc_handler = ipv6_sysctl_rtcache_flush 5163 }, 5164 { 5165 .procname = "gc_thresh", 5166 .data = &ip6_dst_ops_template.gc_thresh, 5167 .maxlen = sizeof(int), 5168 .mode = 0644, 5169 .proc_handler = proc_dointvec, 5170 }, 5171 { 5172 .procname = "max_size", 5173 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5174 .maxlen = sizeof(int), 5175 .mode = 0644, 5176 .proc_handler = proc_dointvec, 5177 }, 5178 { 5179 .procname = "gc_min_interval", 5180 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5181 .maxlen = sizeof(int), 5182 .mode = 0644, 5183 .proc_handler = proc_dointvec_jiffies, 5184 }, 5185 { 5186 .procname = "gc_timeout", 5187 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5188 .maxlen = sizeof(int), 5189 .mode = 0644, 5190 .proc_handler = proc_dointvec_jiffies, 5191 }, 5192 { 5193 .procname = "gc_interval", 5194 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5195 .maxlen = sizeof(int), 5196 .mode = 0644, 5197 .proc_handler = proc_dointvec_jiffies, 5198 }, 5199 { 5200 .procname = "gc_elasticity", 5201 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5202 .maxlen = sizeof(int), 5203 .mode = 0644, 5204 .proc_handler = proc_dointvec, 5205 }, 5206 { 5207 .procname = "mtu_expires", 5208 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5209 .maxlen = sizeof(int), 5210 .mode = 0644, 5211 .proc_handler = proc_dointvec_jiffies, 5212 }, 5213 { 5214 .procname = "min_adv_mss", 5215 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5216 .maxlen = sizeof(int), 5217 .mode = 0644, 5218 .proc_handler = proc_dointvec, 5219 }, 5220 { 5221 .procname = "gc_min_interval_ms", 5222 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5223 .maxlen = sizeof(int), 5224 .mode = 0644, 5225 .proc_handler = proc_dointvec_ms_jiffies, 5226 }, 5227 { 5228 .procname = "skip_notify_on_dev_down", 5229 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, 5230 .maxlen = sizeof(int), 5231 .mode = 0644, 5232 .proc_handler = proc_dointvec, 5233 .extra1 = &zero, 5234 .extra2 = &one, 5235 }, 5236 { } 5237 }; 5238 5239 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5240 { 5241 struct ctl_table *table; 5242 5243 table = kmemdup(ipv6_route_table_template, 5244 sizeof(ipv6_route_table_template), 5245 GFP_KERNEL); 5246 5247 if (table) { 5248 table[0].data = &net->ipv6.sysctl.flush_delay; 5249 table[0].extra1 = net; 5250 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5251 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5252 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5253 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5254 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5255 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5256 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5257 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5258 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5259 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; 5260 5261 /* Don't export sysctls to unprivileged users */ 5262 if (net->user_ns != &init_user_ns) 5263 table[0].procname = NULL; 5264 } 5265 5266 return table; 5267 } 5268 #endif 5269 5270 static int __net_init ip6_route_net_init(struct net *net) 5271 { 5272 int ret = -ENOMEM; 5273 
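/* per-netns bring-up: copy the dst_ops template, then allocate this
 * namespace's copies of the null (and, with multiple tables, prohibit
 * and blackhole) template routes before seeding the sysctl defaults;
 * the labels at the end unwind the allocations in reverse order on
 * failure
 */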
5274 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5275 sizeof(net->ipv6.ip6_dst_ops)); 5276 5277 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5278 goto out_ip6_dst_ops; 5279 5280 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5281 sizeof(*net->ipv6.fib6_null_entry), 5282 GFP_KERNEL); 5283 if (!net->ipv6.fib6_null_entry) 5284 goto out_ip6_dst_entries; 5285 5286 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5287 sizeof(*net->ipv6.ip6_null_entry), 5288 GFP_KERNEL); 5289 if (!net->ipv6.ip6_null_entry) 5290 goto out_fib6_null_entry; 5291 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5292 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5293 ip6_template_metrics, true); 5294 5295 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5296 net->ipv6.fib6_has_custom_rules = false; 5297 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5298 sizeof(*net->ipv6.ip6_prohibit_entry), 5299 GFP_KERNEL); 5300 if (!net->ipv6.ip6_prohibit_entry) 5301 goto out_ip6_null_entry; 5302 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5303 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5304 ip6_template_metrics, true); 5305 5306 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5307 sizeof(*net->ipv6.ip6_blk_hole_entry), 5308 GFP_KERNEL); 5309 if (!net->ipv6.ip6_blk_hole_entry) 5310 goto out_ip6_prohibit_entry; 5311 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5312 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5313 ip6_template_metrics, true); 5314 #endif 5315 5316 net->ipv6.sysctl.flush_delay = 0; 5317 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5318 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5319 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5320 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5321 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5322 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5323 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5324 net->ipv6.sysctl.skip_notify_on_dev_down = 0; 5325 5326 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5327 5328 ret = 0; 5329 out: 5330 return ret; 5331 5332 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5333 out_ip6_prohibit_entry: 5334 kfree(net->ipv6.ip6_prohibit_entry); 5335 out_ip6_null_entry: 5336 kfree(net->ipv6.ip6_null_entry); 5337 #endif 5338 out_fib6_null_entry: 5339 kfree(net->ipv6.fib6_null_entry); 5340 out_ip6_dst_entries: 5341 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5342 out_ip6_dst_ops: 5343 goto out; 5344 } 5345 5346 static void __net_exit ip6_route_net_exit(struct net *net) 5347 { 5348 kfree(net->ipv6.fib6_null_entry); 5349 kfree(net->ipv6.ip6_null_entry); 5350 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5351 kfree(net->ipv6.ip6_prohibit_entry); 5352 kfree(net->ipv6.ip6_blk_hole_entry); 5353 #endif 5354 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5355 } 5356 5357 static int __net_init ip6_route_net_init_late(struct net *net) 5358 { 5359 #ifdef CONFIG_PROC_FS 5360 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5361 sizeof(struct ipv6_route_iter)); 5362 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5363 rt6_stats_seq_show, NULL); 5364 #endif 5365 return 0; 5366 } 5367 5368 static void __net_exit ip6_route_net_exit_late(struct net *net) 5369 { 5370 #ifdef CONFIG_PROC_FS 5371 remove_proc_entry("ipv6_route", net->proc_net); 5372 remove_proc_entry("rt6_stats", net->proc_net); 5373 #endif 5374 } 5375 5376 static struct pernet_operations ip6_route_net_ops = { 5377 .init = ip6_route_net_init, 5378 .exit = 
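/* paired teardown for ip6_route_net_init; runs once per namespace */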
ip6_route_net_exit, 5379 }; 5380 5381 static int __net_init ipv6_inetpeer_init(struct net *net) 5382 { 5383 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5384 5385 if (!bp) 5386 return -ENOMEM; 5387 inet_peer_base_init(bp); 5388 net->ipv6.peers = bp; 5389 return 0; 5390 } 5391 5392 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5393 { 5394 struct inet_peer_base *bp = net->ipv6.peers; 5395 5396 net->ipv6.peers = NULL; 5397 inetpeer_invalidate_tree(bp); 5398 kfree(bp); 5399 } 5400 5401 static struct pernet_operations ipv6_inetpeer_ops = { 5402 .init = ipv6_inetpeer_init, 5403 .exit = ipv6_inetpeer_exit, 5404 }; 5405 5406 static struct pernet_operations ip6_route_net_late_ops = { 5407 .init = ip6_route_net_init_late, 5408 .exit = ip6_route_net_exit_late, 5409 }; 5410 5411 static struct notifier_block ip6_route_dev_notifier = { 5412 .notifier_call = ip6_route_dev_notify, 5413 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5414 }; 5415 5416 void __init ip6_route_init_special_entries(void) 5417 { 5418 /* Registering of the loopback is done before this portion of code, 5419 * the loopback reference in rt6_info will not be taken, do it 5420 * manually for init_net */ 5421 init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; 5422 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5423 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5424 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5425 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5426 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5427 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5428 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5429 #endif 5430 } 5431 5432 int __init ip6_route_init(void) 5433 { 5434 int ret; 5435 int cpu; 5436 5437 ret = -ENOMEM; 5438 ip6_dst_ops_template.kmem_cachep = 5439 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5440 SLAB_HWCACHE_ALIGN, NULL); 5441 if (!ip6_dst_ops_template.kmem_cachep) 5442 goto out; 5443 5444 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5445 if (ret) 5446 goto out_kmem_cache; 5447 5448 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5449 if (ret) 5450 goto out_dst_entries; 5451 5452 ret = register_pernet_subsys(&ip6_route_net_ops); 5453 if (ret) 5454 goto out_register_inetpeer; 5455 5456 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5457 5458 ret = fib6_init(); 5459 if (ret) 5460 goto out_register_subsys; 5461 5462 ret = xfrm6_init(); 5463 if (ret) 5464 goto out_fib6_init; 5465 5466 ret = fib6_rules_init(); 5467 if (ret) 5468 goto xfrm6_init; 5469 5470 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5471 if (ret) 5472 goto fib6_rules_init; 5473 5474 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5475 inet6_rtm_newroute, NULL, 0); 5476 if (ret < 0) 5477 goto out_register_late_subsys; 5478 5479 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5480 inet6_rtm_delroute, NULL, 0); 5481 if (ret < 0) 5482 goto out_register_late_subsys; 5483 5484 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5485 inet6_rtm_getroute, NULL, 5486 RTNL_FLAG_DOIT_UNLOCKED); 5487 if (ret < 0) 5488 goto out_register_late_subsys; 5489 5490 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5491 if (ret) 5492 goto out_register_late_subsys; 5493 5494 for_each_possible_cpu(cpu) { 5495 struct uncached_list *ul = 
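/* give each possible CPU its own uncached-list head and lock, so
 * rt6_uncached_list_flush_dev() can walk them all while additions
 * stay CPU-local and uncontended
 */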
per_cpu_ptr(&rt6_uncached_list, cpu);
5496 
5497 		INIT_LIST_HEAD(&ul->head);
5498 		spin_lock_init(&ul->lock);
5499 	}
5500 
5501 out:
5502 	return ret;
5503 
5504 out_register_late_subsys:
5505 	rtnl_unregister_all(PF_INET6);
5506 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5507 fib6_rules_init:
5508 	fib6_rules_cleanup();
5509 xfrm6_init:
5510 	xfrm6_fini();
5511 out_fib6_init:
5512 	fib6_gc_cleanup();
5513 out_register_subsys:
5514 	unregister_pernet_subsys(&ip6_route_net_ops);
5515 out_register_inetpeer:
5516 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5517 out_dst_entries:
5518 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5519 out_kmem_cache:
5520 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5521 	goto out;
5522 }
5523 
5524 void ip6_route_cleanup(void)
5525 {
5526 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5527 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5528 	fib6_rules_cleanup();
5529 	xfrm6_fini();
5530 	fib6_gc_cleanup();
5531 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5532 	unregister_pernet_subsys(&ip6_route_net_ops);
5533 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5534 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5535 }
5536
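/* Usage sketch (illustrative, not part of this file): the RTM_GETROUTE
 * handler registered in ip6_route_init() is what "ip -6 route get"
 * talks to, e.g.
 *
 *   ip -6 route get 2001:db8::1 fibmatch
 *
 * where "fibmatch" sets RTM_F_FIB_MATCH and makes inet6_rtm_getroute()
 * return the matching FIB entry instead of the resolved dst.
 */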