/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix,
					    int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix,
					    int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
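/* Uncached routes (RTF_CACHE clones that are not anchored in the fib
 * tree, e.g. the FLOWI_FLAG_KNOWN_NH clones created in ip6_pol_route())
 * are kept on a per-cpu list so that rt6_uncached_list_flush_dev() can
 * retarget their device and inet6_dev pointers to the loopback device
 * when the underlying netdevice goes away.
 */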
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
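/* The dst_ops below wire rt6_info into the generic dst cache layer.
 * ip6_dst_blackhole_ops is the stripped-down variant installed by
 * ip6_blackhole_route() further down (used e.g. when a policy lookup
 * must neuter a route): its update_pmtu and redirect hooks are
 * deliberately empty.
 */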
static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->fib6_nsiblings || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In
	 * such a case it will always be non-zero. Otherwise now is the
	 * time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = &match->fib6_nh;
}
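/* Illustration of the threshold selection above (not normative: the
 * exact bounds are computed when the multipath route is installed):
 * with two equal-weight siblings, the first nexthop's
 * fib_nh_upper_bound sits roughly halfway through the 31-bit hash
 * space, so an mp_hash at or below it keeps the first nexthop and a
 * larger hash falls through to the sibling walk.
 */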
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}

static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	if (!oif && ipv6_addr_any(saddr)) {
		nh = &f6i->fib6_nh;
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		nh = &spf6i->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
			res->f6i = spf6i;
			goto out;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
		goto out;
	}

	nh = &f6i->fib6_nh;
	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, fib6_nh->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		fib6_nh->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}
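/* Score layout used by find_match()/rt6_select() below: bit 1 (m = 2)
 * means the nexthop device matches the requested oif, and with
 * CONFIG_IPV6_ROUTER_PREF the decoded RFC 4191 router preference lands
 * in the two bits above it, so an oif match combined with a preferred
 * router outranks either property alone.  Negative RT6_NUD_* values
 * propagate up as hard or soft failures.
 */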
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}

static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		nh = &f6i->fib6_nh;
		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}

static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}

static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = &res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;
}
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
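/* rt6_route_rcv() above digests the RFC 4191 Route Information Option
 * carried in a router advertisement: the option length is counted in
 * 8-octet units (length 3 carries the full 16-byte prefix), a zero
 * lifetime deletes the learned route, and a finite one arms
 * fib6_set_expires().  For example, an option advertising
 * 2001:db8::/64 with a 1800s lifetime installs an RTF_ROUTEINFO route
 * that expires after 30 minutes unless refreshed.
 */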
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called without holding table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
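/* Exception handling below: each fib6_info can own a hash table of
 * RTF_CACHE clones created by PMTU discovery or redirects.  Lookups
 * key on the destination address, plus the source address when the
 * route lives in a source-specific subtree (CONFIG_IPV6_SUBTREES).
 * Writers serialize on rt6_exception_lock; readers walk the buckets
 * under rcu_read_lock().
 */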
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
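/* A worked view of the hashing above: the bucket index is
 * hash_32(jhash(daddr[, saddr], seed), FIB6_EXCEPTION_BUCKET_SIZE_SHIFT),
 * i.e. the folded jhash reduced to the bucket-array size of
 * 1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT entries.  The per-boot random
 * seed keeps remote hosts from deliberately grinding all exceptions
 * into a single chain.
 */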
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *f6i = res->f6i;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (f6i->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find the cached rt in the hash table inside the passed-in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

	return ret;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
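/* Worked example for the update rules below: if a route carries a PMTU
 * of 1400 and the device MTU drops to 1280, the decrease is always
 * accepted.  If the device MTU is instead raised to 9000, the route
 * PMTU is only lifted when it equalled the old local MTU, since a
 * lower value must have been learned from a remote hop and PMTU
 * discovery will rediscover it if that constraint has gone away.
 */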
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still have references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
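/* ip6_pol_route() below resolves a fib6 lookup into a dst in one of
 * three ways: a cached exception route if one matches, an uncached
 * RTF_CACHE clone for FLOWI_FLAG_KNOWN_NH lookups on gatewayless
 * nexthops (tracked on the per-cpu uncached list), or, on the common
 * fast path, the per-cpu copy of the fib entry.
 */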
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
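/* Policy 0 above hashes on L3 fields only (addresses, flow label,
 * protocol), policy 1 on the L4 five-tuple; the mode is selected per
 * netns via the net.ipv6.fib_multipath_hash_policy sysctl.  For ICMPv6
 * errors, ip6_multipath_l3_keys() hashes the embedded (inner) header,
 * so the error takes the same multipath leg as the flow that
 * triggered it.
 */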
2128 2129 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2130 2131 any_src = ipv6_addr_any(&fl6->saddr); 2132 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2133 (fl6->flowi6_oif && any_src)) 2134 flags |= RT6_LOOKUP_F_IFACE; 2135 2136 if (!any_src) 2137 flags |= RT6_LOOKUP_F_HAS_SADDR; 2138 else if (sk) 2139 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2140 2141 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2142 } 2143 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2144 2145 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2146 { 2147 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2148 struct net_device *loopback_dev = net->loopback_dev; 2149 struct dst_entry *new = NULL; 2150 2151 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2152 DST_OBSOLETE_DEAD, 0); 2153 if (rt) { 2154 rt6_info_init(rt); 2155 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2156 2157 new = &rt->dst; 2158 new->__use = 1; 2159 new->input = dst_discard; 2160 new->output = dst_discard_out; 2161 2162 dst_copy_metrics(new, &ort->dst); 2163 2164 rt->rt6i_idev = in6_dev_get(loopback_dev); 2165 rt->rt6i_gateway = ort->rt6i_gateway; 2166 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2167 2168 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2169 #ifdef CONFIG_IPV6_SUBTREES 2170 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2171 #endif 2172 } 2173 2174 dst_release(dst_orig); 2175 return new ? new : ERR_PTR(-ENOMEM); 2176 } 2177 2178 /* 2179 * Destination cache support functions 2180 */ 2181 2182 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2183 { 2184 u32 rt_cookie = 0; 2185 2186 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2187 return false; 2188 2189 if (fib6_check_expired(f6i)) 2190 return false; 2191 2192 return true; 2193 } 2194 2195 static struct dst_entry *rt6_check(struct rt6_info *rt, 2196 struct fib6_info *from, 2197 u32 cookie) 2198 { 2199 u32 rt_cookie = 0; 2200 2201 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2202 rt_cookie != cookie) 2203 return NULL; 2204 2205 if (rt6_check_expired(rt)) 2206 return NULL; 2207 2208 return &rt->dst; 2209 } 2210 2211 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2212 struct fib6_info *from, 2213 u32 cookie) 2214 { 2215 if (!__rt6_check_expired(rt) && 2216 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2217 fib6_check(from, cookie)) 2218 return &rt->dst; 2219 else 2220 return NULL; 2221 } 2222 2223 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2224 { 2225 struct dst_entry *dst_ret; 2226 struct fib6_info *from; 2227 struct rt6_info *rt; 2228 2229 rt = container_of(dst, struct rt6_info, dst); 2230 2231 rcu_read_lock(); 2232 2233 /* All IPV6 dsts are created with ->obsolete set to the value 2234 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2235 * into this function always. 
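 *
 * A minimal caller-side sketch (mirroring ip6_sk_update_pmtu() later in
 * this file): a dst stored on a socket is revalidated against the cookie
 * saved when it was cached, and reset when the check fails:
 *
 *	dst = __sk_dst_get(sk);
 *	if (dst && dst->obsolete &&
 *	    !dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
 *		sk_dst_reset(sk);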
2236 */ 2237 2238 from = rcu_dereference(rt->from); 2239 2240 if (from && (rt->rt6i_flags & RTF_PCPU || 2241 unlikely(!list_empty(&rt->rt6i_uncached)))) 2242 dst_ret = rt6_dst_from_check(rt, from, cookie); 2243 else 2244 dst_ret = rt6_check(rt, from, cookie); 2245 2246 rcu_read_unlock(); 2247 2248 return dst_ret; 2249 } 2250 2251 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2252 { 2253 struct rt6_info *rt = (struct rt6_info *) dst; 2254 2255 if (rt) { 2256 if (rt->rt6i_flags & RTF_CACHE) { 2257 rcu_read_lock(); 2258 if (rt6_check_expired(rt)) { 2259 rt6_remove_exception_rt(rt); 2260 dst = NULL; 2261 } 2262 rcu_read_unlock(); 2263 } else { 2264 dst_release(dst); 2265 dst = NULL; 2266 } 2267 } 2268 return dst; 2269 } 2270 2271 static void ip6_link_failure(struct sk_buff *skb) 2272 { 2273 struct rt6_info *rt; 2274 2275 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2276 2277 rt = (struct rt6_info *) skb_dst(skb); 2278 if (rt) { 2279 rcu_read_lock(); 2280 if (rt->rt6i_flags & RTF_CACHE) { 2281 rt6_remove_exception_rt(rt); 2282 } else { 2283 struct fib6_info *from; 2284 struct fib6_node *fn; 2285 2286 from = rcu_dereference(rt->from); 2287 if (from) { 2288 fn = rcu_dereference(from->fib6_node); 2289 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2290 fn->fn_sernum = -1; 2291 } 2292 } 2293 rcu_read_unlock(); 2294 } 2295 } 2296 2297 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2298 { 2299 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2300 struct fib6_info *from; 2301 2302 rcu_read_lock(); 2303 from = rcu_dereference(rt0->from); 2304 if (from) 2305 rt0->dst.expires = from->expires; 2306 rcu_read_unlock(); 2307 } 2308 2309 dst_set_expires(&rt0->dst, timeout); 2310 rt0->rt6i_flags |= RTF_EXPIRES; 2311 } 2312 2313 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2314 { 2315 struct net *net = dev_net(rt->dst.dev); 2316 2317 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2318 rt->rt6i_flags |= RTF_MODIFIED; 2319 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2320 } 2321 2322 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2323 { 2324 return !(rt->rt6i_flags & RTF_CACHE) && 2325 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2326 } 2327 2328 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2329 const struct ipv6hdr *iph, u32 mtu) 2330 { 2331 const struct in6_addr *daddr, *saddr; 2332 struct rt6_info *rt6 = (struct rt6_info *)dst; 2333 2334 if (dst_metric_locked(dst, RTAX_MTU)) 2335 return; 2336 2337 if (iph) { 2338 daddr = &iph->daddr; 2339 saddr = &iph->saddr; 2340 } else if (sk) { 2341 daddr = &sk->sk_v6_daddr; 2342 saddr = &inet6_sk(sk)->saddr; 2343 } else { 2344 daddr = NULL; 2345 saddr = NULL; 2346 } 2347 dst_confirm_neigh(dst, daddr); 2348 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2349 if (mtu >= dst_mtu(dst)) 2350 return; 2351 2352 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2353 rt6_do_update_pmtu(rt6, mtu); 2354 /* update rt6_ex->stamp for cache */ 2355 if (rt6->rt6i_flags & RTF_CACHE) 2356 rt6_update_exception_stamp_rt(rt6); 2357 } else if (daddr) { 2358 struct fib6_result res = {}; 2359 struct rt6_info *nrt6; 2360 2361 rcu_read_lock(); 2362 res.f6i = rcu_dereference(rt6->from); 2363 if (!res.f6i) { 2364 rcu_read_unlock(); 2365 return; 2366 } 2367 res.nh = &res.f6i->fib6_nh; 2368 res.fib6_flags = res.f6i->fib6_flags; 2369 res.fib6_type = res.f6i->fib6_type; 2370 2371 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); 2372 if (nrt6) { 2373 rt6_do_update_pmtu(nrt6, mtu); 2374 if 
(rt6_insert_exception(nrt6, &res)) 2375 dst_release_immediate(&nrt6->dst); 2376 } 2377 rcu_read_unlock(); 2378 } 2379 } 2380 2381 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2382 struct sk_buff *skb, u32 mtu) 2383 { 2384 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2385 } 2386 2387 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2388 int oif, u32 mark, kuid_t uid) 2389 { 2390 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2391 struct dst_entry *dst; 2392 struct flowi6 fl6 = { 2393 .flowi6_oif = oif, 2394 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2395 .daddr = iph->daddr, 2396 .saddr = iph->saddr, 2397 .flowlabel = ip6_flowinfo(iph), 2398 .flowi6_uid = uid, 2399 }; 2400 2401 dst = ip6_route_output(net, NULL, &fl6); 2402 if (!dst->error) 2403 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2404 dst_release(dst); 2405 } 2406 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2407 2408 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2409 { 2410 int oif = sk->sk_bound_dev_if; 2411 struct dst_entry *dst; 2412 2413 if (!oif && skb->dev) 2414 oif = l3mdev_master_ifindex(skb->dev); 2415 2416 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2417 2418 dst = __sk_dst_get(sk); 2419 if (!dst || !dst->obsolete || 2420 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2421 return; 2422 2423 bh_lock_sock(sk); 2424 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2425 ip6_datagram_dst_update(sk, false); 2426 bh_unlock_sock(sk); 2427 } 2428 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2429 2430 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2431 const struct flowi6 *fl6) 2432 { 2433 #ifdef CONFIG_IPV6_SUBTREES 2434 struct ipv6_pinfo *np = inet6_sk(sk); 2435 #endif 2436 2437 ip6_dst_store(sk, dst, 2438 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2439 &sk->sk_v6_daddr : NULL, 2440 #ifdef CONFIG_IPV6_SUBTREES 2441 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2442 &np->saddr : 2443 #endif 2444 NULL); 2445 } 2446 2447 static bool ip6_redirect_nh_match(const struct fib6_result *res, 2448 struct flowi6 *fl6, 2449 const struct in6_addr *gw, 2450 struct rt6_info **ret) 2451 { 2452 const struct fib6_nh *nh = res->nh; 2453 2454 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || 2455 fl6->flowi6_oif != nh->fib_nh_dev->ifindex) 2456 return false; 2457 2458 /* rt_cache's gateway might be different from its 'parent' 2459 * in the case of an ip redirect. 2460 * So we keep searching in the exception table if the gateway 2461 * is different. 
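 *
 * Concrete example (hypothetical addresses): the parent route says
 * "default via fe80::1", a redirect from fe80::1 moved the flow to
 * fe80::2, so the RTF_CACHE clone created by rt6_do_redirect() stores
 * fe80::2 in rt6i_gateway.  A later match against gateway fe80::2 can
 * then only succeed through the exception table, not through the
 * parent nexthop.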
2462 */ 2463 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { 2464 struct rt6_info *rt_cache; 2465 2466 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); 2467 if (rt_cache && 2468 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { 2469 *ret = rt_cache; 2470 return true; 2471 } 2472 return false; 2473 } 2474 return true; 2475 } 2476 2477 /* Handle redirects */ 2478 struct ip6rd_flowi { 2479 struct flowi6 fl6; 2480 struct in6_addr gateway; 2481 }; 2482 2483 static struct rt6_info *__ip6_route_redirect(struct net *net, 2484 struct fib6_table *table, 2485 struct flowi6 *fl6, 2486 const struct sk_buff *skb, 2487 int flags) 2488 { 2489 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2490 struct rt6_info *ret = NULL; 2491 struct fib6_result res = {}; 2492 struct fib6_info *rt; 2493 struct fib6_node *fn; 2494 2495 /* Get the "current" route for this destination and 2496 * check if the redirect has come from appropriate router. 2497 * 2498 * RFC 4861 specifies that redirects should only be 2499 * accepted if they come from the nexthop to the target. 2500 * Due to the way the routes are chosen, this notion 2501 * is a bit fuzzy and one might need to check all possible 2502 * routes. 2503 */ 2504 2505 rcu_read_lock(); 2506 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2507 restart: 2508 for_each_fib6_node_rt_rcu(fn) { 2509 res.f6i = rt; 2510 res.nh = &rt->fib6_nh; 2511 2512 if (fib6_check_expired(rt)) 2513 continue; 2514 if (rt->fib6_flags & RTF_REJECT) 2515 break; 2516 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) 2517 goto out; 2518 } 2519 2520 if (!rt) 2521 rt = net->ipv6.fib6_null_entry; 2522 else if (rt->fib6_flags & RTF_REJECT) { 2523 ret = net->ipv6.ip6_null_entry; 2524 goto out; 2525 } 2526 2527 if (rt == net->ipv6.fib6_null_entry) { 2528 fn = fib6_backtrack(fn, &fl6->saddr); 2529 if (fn) 2530 goto restart; 2531 } 2532 2533 res.f6i = rt; 2534 res.nh = &rt->fib6_nh; 2535 out: 2536 if (ret) { 2537 ip6_hold_safe(net, &ret); 2538 } else { 2539 res.fib6_flags = res.f6i->fib6_flags; 2540 res.fib6_type = res.f6i->fib6_type; 2541 ret = ip6_create_rt_rcu(&res); 2542 } 2543 2544 rcu_read_unlock(); 2545 2546 trace_fib6_table_lookup(net, &res, table, fl6); 2547 return ret; 2548 }; 2549 2550 static struct dst_entry *ip6_route_redirect(struct net *net, 2551 const struct flowi6 *fl6, 2552 const struct sk_buff *skb, 2553 const struct in6_addr *gateway) 2554 { 2555 int flags = RT6_LOOKUP_F_HAS_SADDR; 2556 struct ip6rd_flowi rdfl; 2557 2558 rdfl.fl6 = *fl6; 2559 rdfl.gateway = *gateway; 2560 2561 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2562 flags, __ip6_route_redirect); 2563 } 2564 2565 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2566 kuid_t uid) 2567 { 2568 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2569 struct dst_entry *dst; 2570 struct flowi6 fl6 = { 2571 .flowi6_iif = LOOPBACK_IFINDEX, 2572 .flowi6_oif = oif, 2573 .flowi6_mark = mark, 2574 .daddr = iph->daddr, 2575 .saddr = iph->saddr, 2576 .flowlabel = ip6_flowinfo(iph), 2577 .flowi6_uid = uid, 2578 }; 2579 2580 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2581 rt6_do_redirect(dst, NULL, skb); 2582 dst_release(dst); 2583 } 2584 EXPORT_SYMBOL_GPL(ip6_redirect); 2585 2586 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 2587 { 2588 const struct ipv6hdr *iph = ipv6_hdr(skb); 2589 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2590 struct dst_entry *dst; 2591 struct flowi6 fl6 = { 2592 .flowi6_iif = 
LOOPBACK_IFINDEX, 2593 .flowi6_oif = oif, 2594 .daddr = msg->dest, 2595 .saddr = iph->daddr, 2596 .flowi6_uid = sock_net_uid(net, NULL), 2597 }; 2598 2599 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2600 rt6_do_redirect(dst, NULL, skb); 2601 dst_release(dst); 2602 } 2603 2604 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2605 { 2606 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2607 sk->sk_uid); 2608 } 2609 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2610 2611 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2612 { 2613 struct net_device *dev = dst->dev; 2614 unsigned int mtu = dst_mtu(dst); 2615 struct net *net = dev_net(dev); 2616 2617 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2618 2619 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2620 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2621 2622 /* 2623 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2624 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 2625 * IPV6_MAXPLEN is also valid and means: "any MSS, 2626 * rely only on pmtu discovery" 2627 */ 2628 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2629 mtu = IPV6_MAXPLEN; 2630 return mtu; 2631 } 2632 2633 static unsigned int ip6_mtu(const struct dst_entry *dst) 2634 { 2635 struct inet6_dev *idev; 2636 unsigned int mtu; 2637 2638 mtu = dst_metric_raw(dst, RTAX_MTU); 2639 if (mtu) 2640 goto out; 2641 2642 mtu = IPV6_MIN_MTU; 2643 2644 rcu_read_lock(); 2645 idev = __in6_dev_get(dst->dev); 2646 if (idev) 2647 mtu = idev->cnf.mtu6; 2648 rcu_read_unlock(); 2649 2650 out: 2651 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2652 2653 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2654 } 2655 2656 /* MTU selection: 2657 * 1. mtu on route is locked - use it 2658 * 2. mtu from nexthop exception 2659 * 3. 
mtu from egress device 2660 * 2661 * based on ip6_dst_mtu_forward and exception logic of 2662 * rt6_find_cached_rt; called with rcu_read_lock 2663 */ 2664 u32 ip6_mtu_from_fib6(const struct fib6_result *res, 2665 const struct in6_addr *daddr, 2666 const struct in6_addr *saddr) 2667 { 2668 struct rt6_exception_bucket *bucket; 2669 const struct fib6_nh *nh = res->nh; 2670 struct fib6_info *f6i = res->f6i; 2671 const struct in6_addr *src_key; 2672 struct rt6_exception *rt6_ex; 2673 struct inet6_dev *idev; 2674 u32 mtu = 0; 2675 2676 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2677 mtu = f6i->fib6_pmtu; 2678 if (mtu) 2679 goto out; 2680 } 2681 2682 src_key = NULL; 2683 #ifdef CONFIG_IPV6_SUBTREES 2684 if (f6i->fib6_src.plen) 2685 src_key = saddr; 2686 #endif 2687 2688 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2689 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2690 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2691 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2692 2693 if (likely(!mtu)) { 2694 struct net_device *dev = nh->fib_nh_dev; 2695 2696 mtu = IPV6_MIN_MTU; 2697 idev = __in6_dev_get(dev); 2698 if (idev && idev->cnf.mtu6 > mtu) 2699 mtu = idev->cnf.mtu6; 2700 } 2701 2702 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2703 out: 2704 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); 2705 } 2706 2707 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2708 struct flowi6 *fl6) 2709 { 2710 struct dst_entry *dst; 2711 struct rt6_info *rt; 2712 struct inet6_dev *idev = in6_dev_get(dev); 2713 struct net *net = dev_net(dev); 2714 2715 if (unlikely(!idev)) 2716 return ERR_PTR(-ENODEV); 2717 2718 rt = ip6_dst_alloc(net, dev, 0); 2719 if (unlikely(!rt)) { 2720 in6_dev_put(idev); 2721 dst = ERR_PTR(-ENOMEM); 2722 goto out; 2723 } 2724 2725 rt->dst.flags |= DST_HOST; 2726 rt->dst.input = ip6_input; 2727 rt->dst.output = ip6_output; 2728 rt->rt6i_gateway = fl6->daddr; 2729 rt->rt6i_dst.addr = fl6->daddr; 2730 rt->rt6i_dst.plen = 128; 2731 rt->rt6i_idev = idev; 2732 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2733 2734 /* Add this dst into uncached_list so that rt6_disable_ip() can 2735 * do proper release of the net_device 2736 */ 2737 rt6_uncached_list_add(rt); 2738 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2739 2740 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2741 2742 out: 2743 return dst; 2744 } 2745 2746 static int ip6_dst_gc(struct dst_ops *ops) 2747 { 2748 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2749 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2750 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2751 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2752 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2753 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2754 int entries; 2755 2756 entries = dst_entries_get_fast(ops); 2757 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2758 entries <= rt_max_size) 2759 goto out; 2760 2761 net->ipv6.ip6_rt_gc_expire++; 2762 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2763 entries = dst_entries_get_slow(ops); 2764 if (entries < ops->gc_thresh) 2765 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2766 out: 2767 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2768 return entries > rt_max_size; 2769 } 2770 2771 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2772 struct fib6_config *cfg, 2773 const struct in6_addr *gw_addr, 2774 u32 tbid, int flags) 2775 { 2776 struct 
flowi6 fl6 = { 2777 .flowi6_oif = cfg->fc_ifindex, 2778 .daddr = *gw_addr, 2779 .saddr = cfg->fc_prefsrc, 2780 }; 2781 struct fib6_table *table; 2782 struct rt6_info *rt; 2783 2784 table = fib6_get_table(net, tbid); 2785 if (!table) 2786 return NULL; 2787 2788 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2789 flags |= RT6_LOOKUP_F_HAS_SADDR; 2790 2791 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2792 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2793 2794 /* if table lookup failed, fall back to full lookup */ 2795 if (rt == net->ipv6.ip6_null_entry) { 2796 ip6_rt_put(rt); 2797 rt = NULL; 2798 } 2799 2800 return rt; 2801 } 2802 2803 static int ip6_route_check_nh_onlink(struct net *net, 2804 struct fib6_config *cfg, 2805 const struct net_device *dev, 2806 struct netlink_ext_ack *extack) 2807 { 2808 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2809 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2810 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2811 struct fib6_info *from; 2812 struct rt6_info *grt; 2813 int err; 2814 2815 err = 0; 2816 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2817 if (grt) { 2818 rcu_read_lock(); 2819 from = rcu_dereference(grt->from); 2820 if (!grt->dst.error && 2821 /* ignore match if it is the default route */ 2822 from && !ipv6_addr_any(&from->fib6_dst.addr) && 2823 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2824 NL_SET_ERR_MSG(extack, 2825 "Nexthop has invalid gateway or device mismatch"); 2826 err = -EINVAL; 2827 } 2828 rcu_read_unlock(); 2829 2830 ip6_rt_put(grt); 2831 } 2832 2833 return err; 2834 } 2835 2836 static int ip6_route_check_nh(struct net *net, 2837 struct fib6_config *cfg, 2838 struct net_device **_dev, 2839 struct inet6_dev **idev) 2840 { 2841 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2842 struct net_device *dev = _dev ? *_dev : NULL; 2843 struct rt6_info *grt = NULL; 2844 int err = -EHOSTUNREACH; 2845 2846 if (cfg->fc_table) { 2847 int flags = RT6_LOOKUP_F_IFACE; 2848 2849 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2850 cfg->fc_table, flags); 2851 if (grt) { 2852 if (grt->rt6i_flags & RTF_GATEWAY || 2853 (dev && dev != grt->dst.dev)) { 2854 ip6_rt_put(grt); 2855 grt = NULL; 2856 } 2857 } 2858 } 2859 2860 if (!grt) 2861 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2862 2863 if (!grt) 2864 goto out; 2865 2866 if (dev) { 2867 if (dev != grt->dst.dev) { 2868 ip6_rt_put(grt); 2869 goto out; 2870 } 2871 } else { 2872 *_dev = dev = grt->dst.dev; 2873 *idev = grt->rt6i_idev; 2874 dev_hold(dev); 2875 in6_dev_hold(grt->rt6i_idev); 2876 } 2877 2878 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2879 err = 0; 2880 2881 ip6_rt_put(grt); 2882 2883 out: 2884 return err; 2885 } 2886 2887 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2888 struct net_device **_dev, struct inet6_dev **idev, 2889 struct netlink_ext_ack *extack) 2890 { 2891 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2892 int gwa_type = ipv6_addr_type(gw_addr); 2893 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2894 const struct net_device *dev = *_dev; 2895 bool need_addr_check = !dev; 2896 int err = -EINVAL; 2897 2898 /* if gw_addr is local we will fail to detect this in case 2899 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2900 * will return already-added prefix route via interface that 2901 * prefix route was assigned to, which might be non-loopback. 
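 *
 * Example (hypothetical): "ip -6 route add 2001:db8::/64 via X" where
 * X was just configured locally and is still undergoing DAD.  The
 * nexthop lookup then resolves a seemingly valid egress device, which
 * is why need_addr_check forces a second ipv6_chk_addr_and_flags()
 * below once the device is known.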
*/ 2903 if (dev && 2904 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2905 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2906 goto out; 2907 } 2908 2909 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2910 /* IPv6 strictly inhibits using non-link-local 2911 * addresses as a nexthop address. 2912 * Otherwise, the router will not be able to send redirects. 2913 * It is very good, but in some (rare!) circumstances 2914 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2915 * some exceptions. --ANK 2916 * We allow IPv4-mapped nexthops to support RFC4798-type 2917 * addressing. 2918 */ 2919 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2920 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2921 goto out; 2922 } 2923 2924 if (cfg->fc_flags & RTNH_F_ONLINK) 2925 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2926 else 2927 err = ip6_route_check_nh(net, cfg, _dev, idev); 2928 2929 if (err) 2930 goto out; 2931 } 2932 2933 /* reload in case device was changed */ 2934 dev = *_dev; 2935 2936 err = -EINVAL; 2937 if (!dev) { 2938 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2939 goto out; 2940 } else if (dev->flags & IFF_LOOPBACK) { 2941 NL_SET_ERR_MSG(extack, 2942 "Egress device can not be loopback device for this route"); 2943 goto out; 2944 } 2945 2946 /* if we did not check gw_addr above, do so now that the 2947 * egress device has been resolved. 2948 */ 2949 if (need_addr_check && 2950 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2951 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2952 goto out; 2953 } 2954 2955 err = 0; 2956 out: 2957 return err; 2958 } 2959 2960 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) 2961 { 2962 if ((flags & RTF_REJECT) || 2963 (dev && (dev->flags & IFF_LOOPBACK) && 2964 !(addr_type & IPV6_ADDR_LOOPBACK) && 2965 !(flags & RTF_LOCAL))) 2966 return true; 2967 2968 return false; 2969 } 2970 2971 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, 2972 struct fib6_config *cfg, gfp_t gfp_flags, 2973 struct netlink_ext_ack *extack) 2974 { 2975 struct net_device *dev = NULL; 2976 struct inet6_dev *idev = NULL; 2977 int addr_type; 2978 int err; 2979 2980 fib6_nh->fib_nh_family = AF_INET6; 2981 2982 err = -ENODEV; 2983 if (cfg->fc_ifindex) { 2984 dev = dev_get_by_index(net, cfg->fc_ifindex); 2985 if (!dev) 2986 goto out; 2987 idev = in6_dev_get(dev); 2988 if (!idev) 2989 goto out; 2990 } 2991 2992 if (cfg->fc_flags & RTNH_F_ONLINK) { 2993 if (!dev) { 2994 NL_SET_ERR_MSG(extack, 2995 "Nexthop device required for onlink"); 2996 goto out; 2997 } 2998 2999 if (!(dev->flags & IFF_UP)) { 3000 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3001 err = -ENETDOWN; 3002 goto out; 3003 } 3004 3005 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; 3006 } 3007 3008 fib6_nh->fib_nh_weight = 1; 3009 3010 /* We cannot add true routes via loopback here, 3011 * they would result in kernel looping; promote them to reject routes 3012 */ 3013 addr_type = ipv6_addr_type(&cfg->fc_dst); 3014 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { 3015 /* hold loopback dev/idev if we haven't done so.
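 * A reject route is deliberately bound to the loopback device: within
 * its namespace loopback is not expected to go away, so the nexthop
 * keeps a valid device even if the interface named in the request is
 * later unregistered.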
*/ 3016 if (dev != net->loopback_dev) { 3017 if (dev) { 3018 dev_put(dev); 3019 in6_dev_put(idev); 3020 } 3021 dev = net->loopback_dev; 3022 dev_hold(dev); 3023 idev = in6_dev_get(dev); 3024 if (!idev) { 3025 err = -ENODEV; 3026 goto out; 3027 } 3028 } 3029 goto set_dev; 3030 } 3031 3032 if (cfg->fc_flags & RTF_GATEWAY) { 3033 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3034 if (err) 3035 goto out; 3036 3037 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3038 fib6_nh->fib_nh_gw_family = AF_INET6; 3039 } 3040 3041 err = -ENODEV; 3042 if (!dev) 3043 goto out; 3044 3045 if (idev->cnf.disable_ipv6) { 3046 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3047 err = -EACCES; 3048 goto out; 3049 } 3050 3051 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { 3052 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3053 err = -ENETDOWN; 3054 goto out; 3055 } 3056 3057 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3058 !netif_carrier_ok(dev)) 3059 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 3060 3061 err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap, 3062 cfg->fc_encap_type, cfg, gfp_flags, extack); 3063 if (err) 3064 goto out; 3065 set_dev: 3066 fib6_nh->fib_nh_dev = dev; 3067 fib6_nh->fib_nh_oif = dev->ifindex; 3068 err = 0; 3069 out: 3070 if (idev) 3071 in6_dev_put(idev); 3072 3073 if (err) { 3074 lwtstate_put(fib6_nh->fib_nh_lws); 3075 fib6_nh->fib_nh_lws = NULL; 3076 if (dev) 3077 dev_put(dev); 3078 } 3079 3080 return err; 3081 } 3082 3083 void fib6_nh_release(struct fib6_nh *fib6_nh) 3084 { 3085 fib_nh_common_release(&fib6_nh->nh_common); 3086 } 3087 3088 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3089 gfp_t gfp_flags, 3090 struct netlink_ext_ack *extack) 3091 { 3092 struct net *net = cfg->fc_nlinfo.nl_net; 3093 struct fib6_info *rt = NULL; 3094 struct fib6_table *table; 3095 int err = -EINVAL; 3096 int addr_type; 3097 3098 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3099 if (cfg->fc_flags & RTF_PCPU) { 3100 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3101 goto out; 3102 } 3103 3104 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3105 if (cfg->fc_flags & RTF_CACHE) { 3106 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3107 goto out; 3108 } 3109 3110 if (cfg->fc_type > RTN_MAX) { 3111 NL_SET_ERR_MSG(extack, "Invalid route type"); 3112 goto out; 3113 } 3114 3115 if (cfg->fc_dst_len > 128) { 3116 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3117 goto out; 3118 } 3119 if (cfg->fc_src_len > 128) { 3120 NL_SET_ERR_MSG(extack, "Invalid source address length"); 3121 goto out; 3122 } 3123 #ifndef CONFIG_IPV6_SUBTREES 3124 if (cfg->fc_src_len) { 3125 NL_SET_ERR_MSG(extack, 3126 "Specifying source address requires IPV6_SUBTREES to be enabled"); 3127 goto out; 3128 } 3129 #endif 3130 3131 err = -ENOBUFS; 3132 if (cfg->fc_nlinfo.nlh && 3133 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3134 table = fib6_get_table(net, cfg->fc_table); 3135 if (!table) { 3136 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3137 table = fib6_new_table(net, cfg->fc_table); 3138 } 3139 } else { 3140 table = fib6_new_table(net, cfg->fc_table); 3141 } 3142 3143 if (!table) 3144 goto out; 3145 3146 err = -ENOMEM; 3147 rt = fib6_info_alloc(gfp_flags); 3148 if (!rt) 3149 goto out; 3150 3151 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 3152 extack); 3153 if (IS_ERR(rt->fib6_metrics)) { 3154 err = PTR_ERR(rt->fib6_metrics); 3155 /* Do not leave 
garbage there. */ 3156 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3157 goto out; 3158 } 3159 3160 if (cfg->fc_flags & RTF_ADDRCONF) 3161 rt->dst_nocount = true; 3162 3163 if (cfg->fc_flags & RTF_EXPIRES) 3164 fib6_set_expires(rt, jiffies + 3165 clock_t_to_jiffies(cfg->fc_expires)); 3166 else 3167 fib6_clean_expires(rt); 3168 3169 if (cfg->fc_protocol == RTPROT_UNSPEC) 3170 cfg->fc_protocol = RTPROT_BOOT; 3171 rt->fib6_protocol = cfg->fc_protocol; 3172 3173 rt->fib6_table = table; 3174 rt->fib6_metric = cfg->fc_metric; 3175 rt->fib6_type = cfg->fc_type; 3176 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3177 3178 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3179 rt->fib6_dst.plen = cfg->fc_dst_len; 3180 if (rt->fib6_dst.plen == 128) 3181 rt->dst_host = true; 3182 3183 #ifdef CONFIG_IPV6_SUBTREES 3184 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3185 rt->fib6_src.plen = cfg->fc_src_len; 3186 #endif 3187 err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); 3188 if (err) 3189 goto out; 3190 3191 /* We cannot add true routes via loopback here, 3192 * they would result in kernel looping; promote them to reject routes 3193 */ 3194 addr_type = ipv6_addr_type(&cfg->fc_dst); 3195 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) 3196 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3197 3198 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3199 struct net_device *dev = fib6_info_nh_dev(rt); 3200 3201 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3202 NL_SET_ERR_MSG(extack, "Invalid source address"); 3203 err = -EINVAL; 3204 goto out; 3205 } 3206 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3207 rt->fib6_prefsrc.plen = 128; 3208 } else 3209 rt->fib6_prefsrc.plen = 0; 3210 3211 return rt; 3212 out: 3213 fib6_info_release(rt); 3214 return ERR_PTR(err); 3215 } 3216 3217 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3218 struct netlink_ext_ack *extack) 3219 { 3220 struct fib6_info *rt; 3221 int err; 3222 3223 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3224 if (IS_ERR(rt)) 3225 return PTR_ERR(rt); 3226 3227 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3228 fib6_info_release(rt); 3229 3230 return err; 3231 } 3232 3233 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3234 { 3235 struct net *net = info->nl_net; 3236 struct fib6_table *table; 3237 int err; 3238 3239 if (rt == net->ipv6.fib6_null_entry) { 3240 err = -ENOENT; 3241 goto out; 3242 } 3243 3244 table = rt->fib6_table; 3245 spin_lock_bh(&table->tb6_lock); 3246 err = fib6_del(rt, info); 3247 spin_unlock_bh(&table->tb6_lock); 3248 3249 out: 3250 fib6_info_release(rt); 3251 return err; 3252 } 3253 3254 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3255 { 3256 struct nl_info info = { .nl_net = net }; 3257 3258 return __ip6_del_rt(rt, &info); 3259 } 3260 3261 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3262 { 3263 struct nl_info *info = &cfg->fc_nlinfo; 3264 struct net *net = info->nl_net; 3265 struct sk_buff *skb = NULL; 3266 struct fib6_table *table; 3267 int err = -ENOENT; 3268 3269 if (rt == net->ipv6.fib6_null_entry) 3270 goto out_put; 3271 table = rt->fib6_table; 3272 spin_lock_bh(&table->tb6_lock); 3273 3274 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3275 struct fib6_info *sibling, *next_sibling; 3276 3277 /* prefer to send a single notification with all hops */ 3278 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3279 if (skb) { 3280 u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; 3281 3282 if (rt6_fill_node(net, skb, rt, NULL, 3283 NULL, NULL, 0, RTM_DELROUTE, 3284 info->portid, seq, 0) < 0) { 3285 kfree_skb(skb); 3286 skb = NULL; 3287 } else 3288 info->skip_notify = 1; 3289 } 3290 3291 list_for_each_entry_safe(sibling, next_sibling, 3292 &rt->fib6_siblings, 3293 fib6_siblings) { 3294 err = fib6_del(sibling, info); 3295 if (err) 3296 goto out_unlock; 3297 } 3298 } 3299 3300 err = fib6_del(rt, info); 3301 out_unlock: 3302 spin_unlock_bh(&table->tb6_lock); 3303 out_put: 3304 fib6_info_release(rt); 3305 3306 if (skb) { 3307 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3308 info->nlh, gfp_any()); 3309 } 3310 return err; 3311 } 3312 3313 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3314 { 3315 int rc = -ESRCH; 3316 3317 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3318 goto out; 3319 3320 if (cfg->fc_flags & RTF_GATEWAY && 3321 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3322 goto out; 3323 3324 rc = rt6_remove_exception_rt(rt); 3325 out: 3326 return rc; 3327 } 3328 3329 static int ip6_route_del(struct fib6_config *cfg, 3330 struct netlink_ext_ack *extack) 3331 { 3332 struct rt6_info *rt_cache; 3333 struct fib6_table *table; 3334 struct fib6_info *rt; 3335 struct fib6_node *fn; 3336 int err = -ESRCH; 3337 3338 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3339 if (!table) { 3340 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3341 return err; 3342 } 3343 3344 rcu_read_lock(); 3345 3346 fn = fib6_locate(&table->tb6_root, 3347 &cfg->fc_dst, cfg->fc_dst_len, 3348 &cfg->fc_src, cfg->fc_src_len, 3349 !(cfg->fc_flags & RTF_CACHE)); 3350 3351 if (fn) { 3352 for_each_fib6_node_rt_rcu(fn) { 3353 struct fib6_nh *nh; 3354 3355 if (cfg->fc_flags & RTF_CACHE) { 3356 struct fib6_result res = { 3357 .f6i = rt, 3358 }; 3359 int rc; 3360 3361 rt_cache = rt6_find_cached_rt(&res, 3362 &cfg->fc_dst, 3363 &cfg->fc_src); 3364 if (rt_cache) { 3365 rc = ip6_del_cached_rt(rt_cache, cfg); 3366 if (rc != -ESRCH) { 3367 rcu_read_unlock(); 3368 return rc; 3369 } 3370 } 3371 continue; 3372 } 3373 3374 nh = &rt->fib6_nh; 3375 if (cfg->fc_ifindex && 3376 (!nh->fib_nh_dev || 3377 nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) 3378 continue; 3379 if (cfg->fc_flags & RTF_GATEWAY && 3380 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) 3381 continue; 3382 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3383 continue; 3384 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3385 continue; 3386 if (!fib6_info_hold_safe(rt)) 3387 continue; 3388 rcu_read_unlock(); 3389 3390 /* if gateway was specified only delete the one hop */ 3391 if (cfg->fc_flags & RTF_GATEWAY) 3392 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3393 3394 return __ip6_del_rt_siblings(rt, cfg); 3395 } 3396 } 3397 rcu_read_unlock(); 3398 3399 return err; 3400 } 3401 3402 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3403 { 3404 struct netevent_redirect netevent; 3405 struct rt6_info *rt, *nrt = NULL; 3406 struct fib6_result res = {}; 3407 struct ndisc_options ndopts; 3408 struct inet6_dev *in6_dev; 3409 struct neighbour *neigh; 3410 struct rd_msg *msg; 3411 int optlen, on_link; 3412 u8 *lladdr; 3413 3414 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3415 optlen -= sizeof(*msg); 3416 3417 if (optlen < 0) { 3418 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3419 return; 3420 } 3421 3422 msg = (struct rd_msg *)icmp6_hdr(skb); 3423 3424 if 
(ipv6_addr_is_multicast(&msg->dest)) { 3425 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3426 return; 3427 } 3428 3429 on_link = 0; 3430 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3431 on_link = 1; 3432 } else if (ipv6_addr_type(&msg->target) != 3433 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3434 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3435 return; 3436 } 3437 3438 in6_dev = __in6_dev_get(skb->dev); 3439 if (!in6_dev) 3440 return; 3441 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3442 return; 3443 3444 /* RFC2461 8.1: 3445 * The IP source address of the Redirect MUST be the same as the current 3446 * first-hop router for the specified ICMP Destination Address. 3447 */ 3448 3449 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3450 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3451 return; 3452 } 3453 3454 lladdr = NULL; 3455 if (ndopts.nd_opts_tgt_lladdr) { 3456 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3457 skb->dev); 3458 if (!lladdr) { 3459 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3460 return; 3461 } 3462 } 3463 3464 rt = (struct rt6_info *) dst; 3465 if (rt->rt6i_flags & RTF_REJECT) { 3466 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3467 return; 3468 } 3469 3470 /* Redirect received -> path was valid. 3471 * Look, redirects are sent only in response to data packets, 3472 * so that this nexthop apparently is reachable. --ANK 3473 */ 3474 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3475 3476 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3477 if (!neigh) 3478 return; 3479 3480 /* 3481 * We have finally decided to accept it. 3482 */ 3483 3484 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3485 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3486 NEIGH_UPDATE_F_OVERRIDE| 3487 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3488 NEIGH_UPDATE_F_ISROUTER)), 3489 NDISC_REDIRECT, &ndopts); 3490 3491 rcu_read_lock(); 3492 res.f6i = rcu_dereference(rt->from); 3493 if (!res.f6i) 3494 goto out; 3495 3496 res.nh = &res.f6i->fib6_nh; 3497 res.fib6_flags = res.f6i->fib6_flags; 3498 res.fib6_type = res.f6i->fib6_type; 3499 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); 3500 if (!nrt) 3501 goto out; 3502 3503 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3504 if (on_link) 3505 nrt->rt6i_flags &= ~RTF_GATEWAY; 3506 3507 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3508 3509 /* rt6_insert_exception() will take care of duplicated exceptions */ 3510 if (rt6_insert_exception(nrt, &res)) { 3511 dst_release_immediate(&nrt->dst); 3512 goto out; 3513 } 3514 3515 netevent.old = &rt->dst; 3516 netevent.new = &nrt->dst; 3517 netevent.daddr = &msg->dest; 3518 netevent.neigh = neigh; 3519 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3520 3521 out: 3522 rcu_read_unlock(); 3523 neigh_release(neigh); 3524 } 3525 3526 #ifdef CONFIG_IPV6_ROUTE_INFO 3527 static struct fib6_info *rt6_get_route_info(struct net *net, 3528 const struct in6_addr *prefix, int prefixlen, 3529 const struct in6_addr *gwaddr, 3530 struct net_device *dev) 3531 { 3532 u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_INFO; 3533 int ifindex = dev->ifindex; 3534 struct fib6_node *fn; 3535 struct fib6_info *rt = NULL; 3536 struct fib6_table *table; 3537 3538 table = fib6_get_table(net, tb_id); 3539 if (!table) 3540 return NULL; 3541 3542 rcu_read_lock(); 3543 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3544 if (!fn) 3545 goto out; 3546 3547 for_each_fib6_node_rt_rcu(fn) { 3548 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) 3549 continue; 3550 if (!(rt->fib6_flags & RTF_ROUTEINFO) || 3551 !rt->fib6_nh.fib_nh_gw_family) 3552 continue; 3553 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) 3554 continue; 3555 if (!fib6_info_hold_safe(rt)) 3556 continue; 3557 break; 3558 } 3559 out: 3560 rcu_read_unlock(); 3561 return rt; 3562 } 3563 3564 static struct fib6_info *rt6_add_route_info(struct net *net, 3565 const struct in6_addr *prefix, int prefixlen, 3566 const struct in6_addr *gwaddr, 3567 struct net_device *dev, 3568 unsigned int pref) 3569 { 3570 struct fib6_config cfg = { 3571 .fc_metric = IP6_RT_PRIO_USER, 3572 .fc_ifindex = dev->ifindex, 3573 .fc_dst_len = prefixlen, 3574 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3575 RTF_UP | RTF_PREF(pref), 3576 .fc_protocol = RTPROT_RA, 3577 .fc_type = RTN_UNICAST, 3578 .fc_nlinfo.portid = 0, 3579 .fc_nlinfo.nlh = NULL, 3580 .fc_nlinfo.nl_net = net, 3581 }; 3582 3583 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3584 cfg.fc_dst = *prefix; 3585 cfg.fc_gateway = *gwaddr; 3586 3587 /* We should treat it as a default route if prefix length is 0. */ 3588 if (!prefixlen) 3589 cfg.fc_flags |= RTF_DEFAULT; 3590 3591 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3592 3593 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3594 } 3595 #endif 3596 3597 struct fib6_info *rt6_get_dflt_router(struct net *net, 3598 const struct in6_addr *addr, 3599 struct net_device *dev) 3600 { 3601 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3602 struct fib6_info *rt; 3603 struct fib6_table *table; 3604 3605 table = fib6_get_table(net, tb_id); 3606 if (!table) 3607 return NULL; 3608 3609 rcu_read_lock(); 3610 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3611 struct fib6_nh *nh = &rt->fib6_nh; 3612 3613 if (dev == nh->fib_nh_dev && 3614 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3615 ipv6_addr_equal(&nh->fib_nh_gw6, addr)) 3616 break; 3617 } 3618 if (rt && !fib6_info_hold_safe(rt)) 3619 rt = NULL; 3620 rcu_read_unlock(); 3621 return rt; 3622 } 3623 3624 struct fib6_info *rt6_add_dflt_router(struct net *net, 3625 const struct in6_addr *gwaddr, 3626 struct net_device *dev, 3627 unsigned int pref) 3628 { 3629 struct fib6_config cfg = { 3630 .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT, 3631 .fc_metric = IP6_RT_PRIO_USER, 3632 .fc_ifindex = dev->ifindex, 3633 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3634 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3635 .fc_protocol = RTPROT_RA, 3636 .fc_type = RTN_UNICAST, 3637 .fc_nlinfo.portid = 0, 3638 .fc_nlinfo.nlh = NULL, 3639 .fc_nlinfo.nl_net = net, 3640 }; 3641 3642 cfg.fc_gateway = *gwaddr; 3643 3644 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3645 struct fib6_table *table; 3646 3647 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3648 if (table) 3649 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3650 } 3651 3652 return rt6_get_dflt_router(net, gwaddr, dev); 3653 } 3654 3655 static void __rt6_purge_dflt_routers(struct net *net, 3656 struct fib6_table *table) 3657 { 3658 struct fib6_info *rt; 3659 3660 restart: 3661 rcu_read_lock(); 3662 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3663 struct net_device *dev = fib6_info_nh_dev(rt); 3664 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3665 3666 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3667 (!idev || idev->cnf.accept_ra != 2) && 3668 fib6_info_hold_safe(rt)) { 3669 rcu_read_unlock(); 3670 ip6_del_rt(net, rt); 3671 goto restart; 3672 } 3673 } 3674 rcu_read_unlock(); 3675 3676 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3677 } 3678 3679 void rt6_purge_dflt_routers(struct net *net) 3680 { 3681 struct fib6_table *table; 3682 struct hlist_head *head; 3683 unsigned int h; 3684 3685 rcu_read_lock(); 3686 3687 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3688 head = &net->ipv6.fib_table_hash[h]; 3689 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3690 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3691 __rt6_purge_dflt_routers(net, table); 3692 } 3693 } 3694 3695 rcu_read_unlock(); 3696 } 3697 3698 static void rtmsg_to_fib6_config(struct net *net, 3699 struct in6_rtmsg *rtmsg, 3700 struct fib6_config *cfg) 3701 { 3702 *cfg = (struct fib6_config){ 3703 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3704 : RT6_TABLE_MAIN, 3705 .fc_ifindex = rtmsg->rtmsg_ifindex, 3706 .fc_metric = rtmsg->rtmsg_metric ? 
: IP6_RT_PRIO_USER, 3707 .fc_expires = rtmsg->rtmsg_info, 3708 .fc_dst_len = rtmsg->rtmsg_dst_len, 3709 .fc_src_len = rtmsg->rtmsg_src_len, 3710 .fc_flags = rtmsg->rtmsg_flags, 3711 .fc_type = rtmsg->rtmsg_type, 3712 3713 .fc_nlinfo.nl_net = net, 3714 3715 .fc_dst = rtmsg->rtmsg_dst, 3716 .fc_src = rtmsg->rtmsg_src, 3717 .fc_gateway = rtmsg->rtmsg_gateway, 3718 }; 3719 } 3720 3721 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3722 { 3723 struct fib6_config cfg; 3724 struct in6_rtmsg rtmsg; 3725 int err; 3726 3727 switch (cmd) { 3728 case SIOCADDRT: /* Add a route */ 3729 case SIOCDELRT: /* Delete a route */ 3730 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3731 return -EPERM; 3732 err = copy_from_user(&rtmsg, arg, 3733 sizeof(struct in6_rtmsg)); 3734 if (err) 3735 return -EFAULT; 3736 3737 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3738 3739 rtnl_lock(); 3740 switch (cmd) { 3741 case SIOCADDRT: 3742 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3743 break; 3744 case SIOCDELRT: 3745 err = ip6_route_del(&cfg, NULL); 3746 break; 3747 default: 3748 err = -EINVAL; 3749 } 3750 rtnl_unlock(); 3751 3752 return err; 3753 } 3754 3755 return -EINVAL; 3756 } 3757 3758 /* 3759 * Drop the packet on the floor 3760 */ 3761 3762 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3763 { 3764 struct dst_entry *dst = skb_dst(skb); 3765 struct net *net = dev_net(dst->dev); 3766 struct inet6_dev *idev; 3767 int type; 3768 3769 if (netif_is_l3_master(skb->dev) && 3770 dst->dev == net->loopback_dev) 3771 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); 3772 else 3773 idev = ip6_dst_idev(dst); 3774 3775 switch (ipstats_mib_noroutes) { 3776 case IPSTATS_MIB_INNOROUTES: 3777 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3778 if (type == IPV6_ADDR_ANY) { 3779 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 3780 break; 3781 } 3782 /* FALLTHROUGH */ 3783 case IPSTATS_MIB_OUTNOROUTES: 3784 IP6_INC_STATS(net, idev, ipstats_mib_noroutes); 3785 break; 3786 } 3787 3788 /* Start over by dropping the dst for l3mdev case */ 3789 if (netif_is_l3_master(skb->dev)) 3790 skb_dst_drop(skb); 3791 3792 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3793 kfree_skb(skb); 3794 return 0; 3795 } 3796 3797 static int ip6_pkt_discard(struct sk_buff *skb) 3798 { 3799 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3800 } 3801 3802 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3803 { 3804 skb->dev = skb_dst(skb)->dev; 3805 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3806 } 3807 3808 static int ip6_pkt_prohibit(struct sk_buff *skb) 3809 { 3810 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3811 } 3812 3813 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3814 { 3815 skb->dev = skb_dst(skb)->dev; 3816 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3817 } 3818 3819 /* 3820 * Allocate a dst for local (unicast / anycast) address. 3821 */ 3822 3823 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3824 struct inet6_dev *idev, 3825 const struct in6_addr *addr, 3826 bool anycast, gfp_t gfp_flags) 3827 { 3828 struct fib6_config cfg = { 3829 .fc_table = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL, 3830 .fc_ifindex = idev->dev->ifindex, 3831 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP, 3832 .fc_dst = *addr, 3833 .fc_dst_len = 128, 3834 .fc_protocol = RTPROT_KERNEL, 3835 .fc_nlinfo.nl_net = net, 3836 .fc_ignore_dev_down = true, 3837 }; 3838 3839 if (anycast) { 3840 cfg.fc_type = RTN_ANYCAST; 3841 cfg.fc_flags |= RTF_ANYCAST; 3842 } else { 3843 cfg.fc_type = RTN_LOCAL; 3844 cfg.fc_flags |= RTF_LOCAL; 3845 } 3846 3847 return ip6_route_info_create(&cfg, gfp_flags, NULL); 3848 } 3849 3850 /* remove a deleted IP from prefsrc entries */ 3851 struct arg_dev_net_ip { 3852 struct net_device *dev; 3853 struct net *net; 3854 struct in6_addr *addr; 3855 }; 3856 3857 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3858 { 3859 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3860 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3861 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3862 3863 if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && 3864 rt != net->ipv6.fib6_null_entry && 3865 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3866 spin_lock_bh(&rt6_exception_lock); 3867 /* remove prefsrc entry */ 3868 rt->fib6_prefsrc.plen = 0; 3869 spin_unlock_bh(&rt6_exception_lock); 3870 } 3871 return 0; 3872 } 3873 3874 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3875 { 3876 struct net *net = dev_net(ifp->idev->dev); 3877 struct arg_dev_net_ip adni = { 3878 .dev = ifp->idev->dev, 3879 .net = net, 3880 .addr = &ifp->addr, 3881 }; 3882 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3883 } 3884 3885 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) 3886 3887 /* Remove routers and update dst entries when a gateway turns into a host. */ 3888 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3889 { 3890 struct in6_addr *gateway = (struct in6_addr *)arg; 3891 3892 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3893 rt->fib6_nh.fib_nh_gw_family && 3894 ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { 3895 return -1; 3896 } 3897 3898 /* Further clean up cached routes in exception table. 3899 * This is needed because a cached route may have a different 3900 * gateway than its 'parent' in the case of an ip redirect.
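 * Even when the parent route itself survives the check above, an
 * exception entry may still point at the address that just stopped
 * being a router, so rt6_exceptions_clean_tohost() drops those clones
 * as well.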
3901 */ 3902 rt6_exceptions_clean_tohost(rt, gateway); 3903 3904 return 0; 3905 } 3906 3907 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3908 { 3909 fib6_clean_all(net, fib6_clean_tohost, gateway); 3910 } 3911 3912 struct arg_netdev_event { 3913 const struct net_device *dev; 3914 union { 3915 unsigned char nh_flags; 3916 unsigned long event; 3917 }; 3918 }; 3919 3920 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3921 { 3922 struct fib6_info *iter; 3923 struct fib6_node *fn; 3924 3925 fn = rcu_dereference_protected(rt->fib6_node, 3926 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3927 iter = rcu_dereference_protected(fn->leaf, 3928 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3929 while (iter) { 3930 if (iter->fib6_metric == rt->fib6_metric && 3931 rt6_qualify_for_ecmp(iter)) 3932 return iter; 3933 iter = rcu_dereference_protected(iter->fib6_next, 3934 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3935 } 3936 3937 return NULL; 3938 } 3939 3940 static bool rt6_is_dead(const struct fib6_info *rt) 3941 { 3942 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || 3943 (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && 3944 ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) 3945 return true; 3946 3947 return false; 3948 } 3949 3950 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3951 { 3952 struct fib6_info *iter; 3953 int total = 0; 3954 3955 if (!rt6_is_dead(rt)) 3956 total += rt->fib6_nh.fib_nh_weight; 3957 3958 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3959 if (!rt6_is_dead(iter)) 3960 total += iter->fib6_nh.fib_nh_weight; 3961 } 3962 3963 return total; 3964 } 3965 3966 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3967 { 3968 int upper_bound = -1; 3969 3970 if (!rt6_is_dead(rt)) { 3971 *weight += rt->fib6_nh.fib_nh_weight; 3972 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3973 total) - 1; 3974 } 3975 atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); 3976 } 3977 3978 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3979 { 3980 struct fib6_info *iter; 3981 int weight = 0; 3982 3983 rt6_upper_bound_set(rt, &weight, total); 3984 3985 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3986 rt6_upper_bound_set(iter, &weight, total); 3987 } 3988 3989 void rt6_multipath_rebalance(struct fib6_info *rt) 3990 { 3991 struct fib6_info *first; 3992 int total; 3993 3994 /* In case the entire multipath route was marked for flushing, 3995 * then there is no need to rebalance upon the removal of every 3996 * sibling route. 3997 */ 3998 if (!rt->fib6_nsiblings || rt->should_flush) 3999 return; 4000 4001 /* During lookup routes are evaluated in order, so we need to 4002 * make sure upper bounds are assigned from the first sibling 4003 * onwards. 
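 *
 * Worked example (illustrative): two siblings with weights 1 and 2
 * give total = 3, and rt6_upper_bound_set() assigns the cumulative
 * bounds
 *
 *	round((1 << 31) / 3) - 1 = 715827882
 *	round((3 << 31) / 3) - 1 = 2147483647
 *
 * A 31-bit flow hash (rt6_multipath_hash() returns mhash >> 1) at or
 * below the first bound selects the first nexthop, i.e. roughly one
 * third of flows; the rest fall through to the second.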
4004 */ 4005 first = rt6_multipath_first_sibling(rt); 4006 if (WARN_ON_ONCE(!first)) 4007 return; 4008 4009 total = rt6_multipath_total_weight(first); 4010 rt6_multipath_upper_bound_set(first, total); 4011 } 4012 4013 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 4014 { 4015 const struct arg_netdev_event *arg = p_arg; 4016 struct net *net = dev_net(arg->dev); 4017 4018 if (rt != net->ipv6.fib6_null_entry && 4019 rt->fib6_nh.fib_nh_dev == arg->dev) { 4020 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; 4021 fib6_update_sernum_upto_root(net, rt); 4022 rt6_multipath_rebalance(rt); 4023 } 4024 4025 return 0; 4026 } 4027 4028 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) 4029 { 4030 struct arg_netdev_event arg = { 4031 .dev = dev, 4032 { 4033 .nh_flags = nh_flags, 4034 }, 4035 }; 4036 4037 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 4038 arg.nh_flags |= RTNH_F_LINKDOWN; 4039 4040 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 4041 } 4042 4043 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 4044 const struct net_device *dev) 4045 { 4046 struct fib6_info *iter; 4047 4048 if (rt->fib6_nh.fib_nh_dev == dev) 4049 return true; 4050 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4051 if (iter->fib6_nh.fib_nh_dev == dev) 4052 return true; 4053 4054 return false; 4055 } 4056 4057 static void rt6_multipath_flush(struct fib6_info *rt) 4058 { 4059 struct fib6_info *iter; 4060 4061 rt->should_flush = 1; 4062 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4063 iter->should_flush = 1; 4064 } 4065 4066 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 4067 const struct net_device *down_dev) 4068 { 4069 struct fib6_info *iter; 4070 unsigned int dead = 0; 4071 4072 if (rt->fib6_nh.fib_nh_dev == down_dev || 4073 rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 4074 dead++; 4075 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4076 if (iter->fib6_nh.fib_nh_dev == down_dev || 4077 iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 4078 dead++; 4079 4080 return dead; 4081 } 4082 4083 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4084 const struct net_device *dev, 4085 unsigned char nh_flags) 4086 { 4087 struct fib6_info *iter; 4088 4089 if (rt->fib6_nh.fib_nh_dev == dev) 4090 rt->fib6_nh.fib_nh_flags |= nh_flags; 4091 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4092 if (iter->fib6_nh.fib_nh_dev == dev) 4093 iter->fib6_nh.fib_nh_flags |= nh_flags; 4094 } 4095 4096 /* called with write lock held for table with rt */ 4097 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4098 { 4099 const struct arg_netdev_event *arg = p_arg; 4100 const struct net_device *dev = arg->dev; 4101 struct net *net = dev_net(dev); 4102 4103 if (rt == net->ipv6.fib6_null_entry) 4104 return 0; 4105 4106 switch (arg->event) { 4107 case NETDEV_UNREGISTER: 4108 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; 4109 case NETDEV_DOWN: 4110 if (rt->should_flush) 4111 return -1; 4112 if (!rt->fib6_nsiblings) 4113 return rt->fib6_nh.fib_nh_dev == dev ? 
-1 : 0; 4114 if (rt6_multipath_uses_dev(rt, dev)) { 4115 unsigned int count; 4116 4117 count = rt6_multipath_dead_count(rt, dev); 4118 if (rt->fib6_nsiblings + 1 == count) { 4119 rt6_multipath_flush(rt); 4120 return -1; 4121 } 4122 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4123 RTNH_F_LINKDOWN); 4124 fib6_update_sernum(net, rt); 4125 rt6_multipath_rebalance(rt); 4126 } 4127 return -2; 4128 case NETDEV_CHANGE: 4129 if (rt->fib6_nh.fib_nh_dev != dev || 4130 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4131 break; 4132 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; 4133 rt6_multipath_rebalance(rt); 4134 break; 4135 } 4136 4137 return 0; 4138 } 4139 4140 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4141 { 4142 struct arg_netdev_event arg = { 4143 .dev = dev, 4144 { 4145 .event = event, 4146 }, 4147 }; 4148 struct net *net = dev_net(dev); 4149 4150 if (net->ipv6.sysctl.skip_notify_on_dev_down) 4151 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); 4152 else 4153 fib6_clean_all(net, fib6_ifdown, &arg); 4154 } 4155 4156 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4157 { 4158 rt6_sync_down_dev(dev, event); 4159 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4160 neigh_ifdown(&nd_tbl, dev); 4161 } 4162 4163 struct rt6_mtu_change_arg { 4164 struct net_device *dev; 4165 unsigned int mtu; 4166 }; 4167 4168 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4169 { 4170 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4171 struct inet6_dev *idev; 4172 4173 /* In IPv6 pmtu discovery is not optional, 4174 so the RTAX_MTU lock cannot disable it. 4175 We still use this lock to block changes 4176 caused by addrconf/ndisc. 4177 */ 4178 4179 idev = __in6_dev_get(arg->dev); 4180 if (!idev) 4181 return 0; 4182 4183 /* For an administrative MTU increase there is no way to discover 4184 the IPv6 PMTU increase, so the PMTU has to be updated here. 4185 Since RFC 1981 doesn't cover administrative MTU increases, 4186 updating the PMTU is a MUST for such an increase (i.e.
jumbo frame) 4187 */ 4188 if (rt->fib6_nh.fib_nh_dev == arg->dev && 4189 !fib6_metric_locked(rt, RTAX_MTU)) { 4190 u32 mtu = rt->fib6_pmtu; 4191 4192 if (mtu >= arg->mtu || 4193 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4194 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4195 4196 spin_lock_bh(&rt6_exception_lock); 4197 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4198 spin_unlock_bh(&rt6_exception_lock); 4199 } 4200 return 0; 4201 } 4202 4203 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4204 { 4205 struct rt6_mtu_change_arg arg = { 4206 .dev = dev, 4207 .mtu = mtu, 4208 }; 4209 4210 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4211 } 4212 4213 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4214 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4215 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4216 [RTA_OIF] = { .type = NLA_U32 }, 4217 [RTA_IIF] = { .type = NLA_U32 }, 4218 [RTA_PRIORITY] = { .type = NLA_U32 }, 4219 [RTA_METRICS] = { .type = NLA_NESTED }, 4220 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4221 [RTA_PREF] = { .type = NLA_U8 }, 4222 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4223 [RTA_ENCAP] = { .type = NLA_NESTED }, 4224 [RTA_EXPIRES] = { .type = NLA_U32 }, 4225 [RTA_UID] = { .type = NLA_U32 }, 4226 [RTA_MARK] = { .type = NLA_U32 }, 4227 [RTA_TABLE] = { .type = NLA_U32 }, 4228 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4229 [RTA_SPORT] = { .type = NLA_U16 }, 4230 [RTA_DPORT] = { .type = NLA_U16 }, 4231 }; 4232 4233 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4234 struct fib6_config *cfg, 4235 struct netlink_ext_ack *extack) 4236 { 4237 struct rtmsg *rtm; 4238 struct nlattr *tb[RTA_MAX+1]; 4239 unsigned int pref; 4240 int err; 4241 4242 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 4243 rtm_ipv6_policy, extack); 4244 if (err < 0) 4245 goto errout; 4246 4247 err = -EINVAL; 4248 rtm = nlmsg_data(nlh); 4249 4250 *cfg = (struct fib6_config){ 4251 .fc_table = rtm->rtm_table, 4252 .fc_dst_len = rtm->rtm_dst_len, 4253 .fc_src_len = rtm->rtm_src_len, 4254 .fc_flags = RTF_UP, 4255 .fc_protocol = rtm->rtm_protocol, 4256 .fc_type = rtm->rtm_type, 4257 4258 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 4259 .fc_nlinfo.nlh = nlh, 4260 .fc_nlinfo.nl_net = sock_net(skb->sk), 4261 }; 4262 4263 if (rtm->rtm_type == RTN_UNREACHABLE || 4264 rtm->rtm_type == RTN_BLACKHOLE || 4265 rtm->rtm_type == RTN_PROHIBIT || 4266 rtm->rtm_type == RTN_THROW) 4267 cfg->fc_flags |= RTF_REJECT; 4268 4269 if (rtm->rtm_type == RTN_LOCAL) 4270 cfg->fc_flags |= RTF_LOCAL; 4271 4272 if (rtm->rtm_flags & RTM_F_CLONED) 4273 cfg->fc_flags |= RTF_CACHE; 4274 4275 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4276 4277 if (tb[RTA_GATEWAY]) { 4278 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4279 cfg->fc_flags |= RTF_GATEWAY; 4280 } 4281 if (tb[RTA_VIA]) { 4282 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 4283 goto errout; 4284 } 4285 4286 if (tb[RTA_DST]) { 4287 int plen = (rtm->rtm_dst_len + 7) >> 3; 4288 4289 if (nla_len(tb[RTA_DST]) < plen) 4290 goto errout; 4291 4292 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4293 } 4294 4295 if (tb[RTA_SRC]) { 4296 int plen = (rtm->rtm_src_len + 7) >> 3; 4297 4298 if (nla_len(tb[RTA_SRC]) < plen) 4299 goto errout; 4300 4301 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4302 } 4303 4304 if (tb[RTA_PREFSRC]) 4305 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4306 4307 if (tb[RTA_OIF]) 4308 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4309 4310 if 
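/* Illustration (addresses and device are made up): a request such as
 *
 *	ip -6 route add 2001:db8::/64 via fe80::1 dev eth0 metric 1024
 *
 * arrives here with rtm_dst_len = 64 and RTA_DST, RTA_GATEWAY,
 * RTA_OIF and RTA_PRIORITY attributes, which land in fc_dst,
 * fc_gateway, fc_ifindex and fc_metric respectively.
 */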
(tb[RTA_PRIORITY]) 4311 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4312 4313 if (tb[RTA_METRICS]) { 4314 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4315 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4316 } 4317 4318 if (tb[RTA_TABLE]) 4319 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4320 4321 if (tb[RTA_MULTIPATH]) { 4322 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4323 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4324 4325 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4326 cfg->fc_mp_len, extack); 4327 if (err < 0) 4328 goto errout; 4329 } 4330 4331 if (tb[RTA_PREF]) { 4332 pref = nla_get_u8(tb[RTA_PREF]); 4333 if (pref != ICMPV6_ROUTER_PREF_LOW && 4334 pref != ICMPV6_ROUTER_PREF_HIGH) 4335 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4336 cfg->fc_flags |= RTF_PREF(pref); 4337 } 4338 4339 if (tb[RTA_ENCAP]) 4340 cfg->fc_encap = tb[RTA_ENCAP]; 4341 4342 if (tb[RTA_ENCAP_TYPE]) { 4343 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4344 4345 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4346 if (err < 0) 4347 goto errout; 4348 } 4349 4350 if (tb[RTA_EXPIRES]) { 4351 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4352 4353 if (addrconf_finite_timeout(timeout)) { 4354 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4355 cfg->fc_flags |= RTF_EXPIRES; 4356 } 4357 } 4358 4359 err = 0; 4360 errout: 4361 return err; 4362 } 4363 4364 struct rt6_nh { 4365 struct fib6_info *fib6_info; 4366 struct fib6_config r_cfg; 4367 struct list_head next; 4368 }; 4369 4370 static int ip6_route_info_append(struct net *net, 4371 struct list_head *rt6_nh_list, 4372 struct fib6_info *rt, 4373 struct fib6_config *r_cfg) 4374 { 4375 struct rt6_nh *nh; 4376 int err = -EEXIST; 4377 4378 list_for_each_entry(nh, rt6_nh_list, next) { 4379 /* check if fib6_info already exists */ 4380 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4381 return err; 4382 } 4383 4384 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4385 if (!nh) 4386 return -ENOMEM; 4387 nh->fib6_info = rt; 4388 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4389 list_add_tail(&nh->next, rt6_nh_list); 4390 4391 return 0; 4392 } 4393 4394 static void ip6_route_mpath_notify(struct fib6_info *rt, 4395 struct fib6_info *rt_last, 4396 struct nl_info *info, 4397 __u16 nlflags) 4398 { 4399 /* if this is an APPEND route, then rt points to the first route 4400 * inserted and rt_last points to last route inserted. Userspace 4401 * wants a consistent dump of the route which starts at the first 4402 * nexthop. Since sibling routes are always added at the end of 4403 * the list, find the first sibling of the last route appended 4404 */ 4405 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4406 rt = list_first_entry(&rt_last->fib6_siblings, 4407 struct fib6_info, 4408 fib6_siblings); 4409 } 4410 4411 if (rt) 4412 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4413 } 4414 4415 static int ip6_route_multipath_add(struct fib6_config *cfg, 4416 struct netlink_ext_ack *extack) 4417 { 4418 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4419 struct nl_info *info = &cfg->fc_nlinfo; 4420 struct fib6_config r_cfg; 4421 struct rtnexthop *rtnh; 4422 struct fib6_info *rt; 4423 struct rt6_nh *err_nh; 4424 struct rt6_nh *nh, *nh_safe; 4425 __u16 nlflags; 4426 int remaining; 4427 int attrlen; 4428 int err = 1; 4429 int nhn = 0; 4430 int replace = (cfg->fc_nlinfo.nlh && 4431 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4432 LIST_HEAD(rt6_nh_list); 4433 4434 nlflags = replace ? 
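/* RTA_MULTIPATH payload layout, as consumed by the parse loop below:
 * a stream of struct rtnexthop headers, each optionally followed by
 * its own nested per-hop attributes, roughly
 *
 *	| rtnexthop | RTA_GATEWAY ... | rtnexthop | RTA_GATEWAY ... |
 *
 * rtnh_ok()/rtnh_next() walk this stream and one fib6_info is built
 * per hop.
 */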
NLM_F_REPLACE : NLM_F_CREATE; 4435 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4436 nlflags |= NLM_F_APPEND; 4437 4438 remaining = cfg->fc_mp_len; 4439 rtnh = (struct rtnexthop *)cfg->fc_mp; 4440 4441 /* Parse a multipath entry and build a list (rt6_nh_list) of 4442 * fib6_info structs, one per nexthop 4443 */ 4444 while (rtnh_ok(rtnh, remaining)) { 4445 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4446 if (rtnh->rtnh_ifindex) 4447 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4448 4449 attrlen = rtnh_attrlen(rtnh); 4450 if (attrlen > 0) { 4451 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4452 4453 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4454 if (nla) { 4455 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4456 r_cfg.fc_flags |= RTF_GATEWAY; 4457 } 4458 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4459 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4460 if (nla) 4461 r_cfg.fc_encap_type = nla_get_u16(nla); 4462 } 4463 4464 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4465 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4466 if (IS_ERR(rt)) { 4467 err = PTR_ERR(rt); 4468 rt = NULL; 4469 goto cleanup; 4470 } 4471 if (!rt6_qualify_for_ecmp(rt)) { 4472 err = -EINVAL; 4473 NL_SET_ERR_MSG(extack, 4474 "Device only routes can not be added for IPv6 using the multipath API."); 4475 fib6_info_release(rt); 4476 goto cleanup; 4477 } 4478 4479 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; 4480 4481 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4482 rt, &r_cfg); 4483 if (err) { 4484 fib6_info_release(rt); 4485 goto cleanup; 4486 } 4487 4488 rtnh = rtnh_next(rtnh, &remaining); 4489 } 4490 4491 /* For add and replace, send one notification with all nexthops. 4492 * Skip the notification in fib6_add_rt2node and send one with 4493 * the full route when done 4494 */ 4495 info->skip_notify = 1; 4496 4497 err_nh = NULL; 4498 list_for_each_entry(nh, &rt6_nh_list, next) { 4499 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4500 fib6_info_release(nh->fib6_info); 4501 4502 if (!err) { 4503 /* save reference to last route successfully inserted */ 4504 rt_last = nh->fib6_info; 4505 4506 /* save reference to first route for notification */ 4507 if (!rt_notif) 4508 rt_notif = nh->fib6_info; 4509 } 4510 4511 /* nh->fib6_info is used or freed at this point, reset to NULL */ 4512 nh->fib6_info = NULL; 4513 if (err) { 4514 if (replace && nhn) 4515 NL_SET_ERR_MSG_MOD(extack, 4516 "multipath route replace failed (check consistency of installed routes)"); 4517 err_nh = nh; 4518 goto add_errout; 4519 } 4520 4521 /* Because each route is added like a single route, we remove 4522 * these flags after the first nexthop: if there is a collision, 4523 * we have already failed to add the first nexthop, since 4524 * fib6_add_rt2node() has rejected it; when replacing, the old 4525 * nexthops have been replaced by the first new one, and the 4526 * rest should be appended to it. 4527 */ 4528 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4529 NLM_F_REPLACE); 4530 nhn++; 4531 } 4532 4533 /* success ...
tell user about new route */ 4534 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4535 goto cleanup; 4536 4537 add_errout: 4538 /* send notification for routes that were added so that 4539 * the delete notifications sent by ip6_route_del are 4540 * coherent 4541 */ 4542 if (rt_notif) 4543 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4544 4545 /* Delete routes that were already added */ 4546 list_for_each_entry(nh, &rt6_nh_list, next) { 4547 if (err_nh == nh) 4548 break; 4549 ip6_route_del(&nh->r_cfg, extack); 4550 } 4551 4552 cleanup: 4553 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4554 if (nh->fib6_info) 4555 fib6_info_release(nh->fib6_info); 4556 list_del(&nh->next); 4557 kfree(nh); 4558 } 4559 4560 return err; 4561 } 4562 4563 static int ip6_route_multipath_del(struct fib6_config *cfg, 4564 struct netlink_ext_ack *extack) 4565 { 4566 struct fib6_config r_cfg; 4567 struct rtnexthop *rtnh; 4568 int remaining; 4569 int attrlen; 4570 int err = 1, last_err = 0; 4571 4572 remaining = cfg->fc_mp_len; 4573 rtnh = (struct rtnexthop *)cfg->fc_mp; 4574 4575 /* Parse a Multipath Entry */ 4576 while (rtnh_ok(rtnh, remaining)) { 4577 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4578 if (rtnh->rtnh_ifindex) 4579 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4580 4581 attrlen = rtnh_attrlen(rtnh); 4582 if (attrlen > 0) { 4583 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4584 4585 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4586 if (nla) { 4587 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4588 r_cfg.fc_flags |= RTF_GATEWAY; 4589 } 4590 } 4591 err = ip6_route_del(&r_cfg, extack); 4592 if (err) 4593 last_err = err; 4594 4595 rtnh = rtnh_next(rtnh, &remaining); 4596 } 4597 4598 return last_err; 4599 } 4600 4601 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4602 struct netlink_ext_ack *extack) 4603 { 4604 struct fib6_config cfg; 4605 int err; 4606 4607 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4608 if (err < 0) 4609 return err; 4610 4611 if (cfg.fc_mp) 4612 return ip6_route_multipath_del(&cfg, extack); 4613 else { 4614 cfg.fc_delete_all_nh = 1; 4615 return ip6_route_del(&cfg, extack); 4616 } 4617 } 4618 4619 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4620 struct netlink_ext_ack *extack) 4621 { 4622 struct fib6_config cfg; 4623 int err; 4624 4625 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4626 if (err < 0) 4627 return err; 4628 4629 if (cfg.fc_metric == 0) 4630 cfg.fc_metric = IP6_RT_PRIO_USER; 4631 4632 if (cfg.fc_mp) 4633 return ip6_route_multipath_add(&cfg, extack); 4634 else 4635 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4636 } 4637 4638 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4639 { 4640 int nexthop_len = 0; 4641 4642 if (rt->fib6_nsiblings) { 4643 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4644 + NLA_ALIGN(sizeof(struct rtnexthop)) 4645 + nla_total_size(16) /* RTA_GATEWAY */ 4646 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); 4647 4648 nexthop_len *= rt->fib6_nsiblings; 4649 } 4650 4651 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4652 + nla_total_size(16) /* RTA_SRC */ 4653 + nla_total_size(16) /* RTA_DST */ 4654 + nla_total_size(16) /* RTA_GATEWAY */ 4655 + nla_total_size(16) /* RTA_PREFSRC */ 4656 + nla_total_size(4) /* RTA_TABLE */ 4657 + nla_total_size(4) /* RTA_IIF */ 4658 + nla_total_size(4) /* RTA_OIF */ 4659 + nla_total_size(4) /* RTA_PRIORITY */ 4660 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4661 + nla_total_size(sizeof(struct rta_cacheinfo)) 4662 + 
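/* This sum is a conservative upper bound rather than an exact length;
 * it sizes the skb allocated in inet6_rt_notify(), where -EMSGSIZE
 * from rt6_fill_node() is treated as a bug in this estimate.
 */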
nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4663 + nla_total_size(1) /* RTA_PREF */ 4664 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) 4665 + nexthop_len; 4666 } 4667 4668 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4669 struct fib6_info *rt, struct dst_entry *dst, 4670 struct in6_addr *dest, struct in6_addr *src, 4671 int iif, int type, u32 portid, u32 seq, 4672 unsigned int flags) 4673 { 4674 struct rt6_info *rt6 = (struct rt6_info *)dst; 4675 struct rt6key *rt6_dst, *rt6_src; 4676 u32 *pmetrics, table, rt6_flags; 4677 struct nlmsghdr *nlh; 4678 struct rtmsg *rtm; 4679 long expires = 0; 4680 4681 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4682 if (!nlh) 4683 return -EMSGSIZE; 4684 4685 if (rt6) { 4686 rt6_dst = &rt6->rt6i_dst; 4687 rt6_src = &rt6->rt6i_src; 4688 rt6_flags = rt6->rt6i_flags; 4689 } else { 4690 rt6_dst = &rt->fib6_dst; 4691 rt6_src = &rt->fib6_src; 4692 rt6_flags = rt->fib6_flags; 4693 } 4694 4695 rtm = nlmsg_data(nlh); 4696 rtm->rtm_family = AF_INET6; 4697 rtm->rtm_dst_len = rt6_dst->plen; 4698 rtm->rtm_src_len = rt6_src->plen; 4699 rtm->rtm_tos = 0; 4700 if (rt->fib6_table) 4701 table = rt->fib6_table->tb6_id; 4702 else 4703 table = RT6_TABLE_UNSPEC; 4704 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; 4705 if (nla_put_u32(skb, RTA_TABLE, table)) 4706 goto nla_put_failure; 4707 4708 rtm->rtm_type = rt->fib6_type; 4709 rtm->rtm_flags = 0; 4710 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4711 rtm->rtm_protocol = rt->fib6_protocol; 4712 4713 if (rt6_flags & RTF_CACHE) 4714 rtm->rtm_flags |= RTM_F_CLONED; 4715 4716 if (dest) { 4717 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4718 goto nla_put_failure; 4719 rtm->rtm_dst_len = 128; 4720 } else if (rtm->rtm_dst_len) 4721 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 4722 goto nla_put_failure; 4723 #ifdef CONFIG_IPV6_SUBTREES 4724 if (src) { 4725 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4726 goto nla_put_failure; 4727 rtm->rtm_src_len = 128; 4728 } else if (rtm->rtm_src_len && 4729 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 4730 goto nla_put_failure; 4731 #endif 4732 if (iif) { 4733 #ifdef CONFIG_IPV6_MROUTE 4734 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 4735 int err = ip6mr_get_route(net, skb, rtm, portid); 4736 4737 if (err == 0) 4738 return 0; 4739 if (err < 0) 4740 goto nla_put_failure; 4741 } else 4742 #endif 4743 if (nla_put_u32(skb, RTA_IIF, iif)) 4744 goto nla_put_failure; 4745 } else if (dest) { 4746 struct in6_addr saddr_buf; 4747 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4748 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4749 goto nla_put_failure; 4750 } 4751 4752 if (rt->fib6_prefsrc.plen) { 4753 struct in6_addr saddr_buf; 4754 saddr_buf = rt->fib6_prefsrc.addr; 4755 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4756 goto nla_put_failure; 4757 } 4758 4759 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4760 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4761 goto nla_put_failure; 4762 4763 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4764 goto nla_put_failure; 4765 4766 /* For multipath routes, walk the siblings list and add 4767 * each as a nexthop within RTA_MULTIPATH. 
4768 */ 4769 if (rt6) { 4770 if (rt6_flags & RTF_GATEWAY && 4771 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 4772 goto nla_put_failure; 4773 4774 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 4775 goto nla_put_failure; 4776 } else if (rt->fib6_nsiblings) { 4777 struct fib6_info *sibling, *next_sibling; 4778 struct nlattr *mp; 4779 4780 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 4781 if (!mp) 4782 goto nla_put_failure; 4783 4784 if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, 4785 rt->fib6_nh.fib_nh_weight) < 0) 4786 goto nla_put_failure; 4787 4788 list_for_each_entry_safe(sibling, next_sibling, 4789 &rt->fib6_siblings, fib6_siblings) { 4790 if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, 4791 sibling->fib6_nh.fib_nh_weight) < 0) 4792 goto nla_put_failure; 4793 } 4794 4795 nla_nest_end(skb, mp); 4796 } else { 4797 unsigned char nh_flags = 0; 4798 4799 if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, 4800 &nh_flags, false) < 0) 4801 goto nla_put_failure; 4802 4803 rtm->rtm_flags |= nh_flags; 4804 } 4805 4806 if (rt6_flags & RTF_EXPIRES) { 4807 expires = dst ? dst->expires : rt->expires; 4808 expires -= jiffies; 4809 } 4810 4811 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4812 goto nla_put_failure; 4813 4814 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 4815 goto nla_put_failure; 4816 4817 4818 nlmsg_end(skb, nlh); 4819 return 0; 4820 4821 nla_put_failure: 4822 nlmsg_cancel(skb, nlh); 4823 return -EMSGSIZE; 4824 } 4825 4826 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 4827 const struct net_device *dev) 4828 { 4829 if (f6i->fib6_nh.fib_nh_dev == dev) 4830 return true; 4831 4832 if (f6i->fib6_nsiblings) { 4833 struct fib6_info *sibling, *next_sibling; 4834 4835 list_for_each_entry_safe(sibling, next_sibling, 4836 &f6i->fib6_siblings, fib6_siblings) { 4837 if (sibling->fib6_nh.fib_nh_dev == dev) 4838 return true; 4839 } 4840 } 4841 4842 return false; 4843 } 4844 4845 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4846 { 4847 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4848 struct fib_dump_filter *filter = &arg->filter; 4849 unsigned int flags = NLM_F_MULTI; 4850 struct net *net = arg->net; 4851 4852 if (rt == net->ipv6.fib6_null_entry) 4853 return 0; 4854 4855 if ((filter->flags & RTM_F_PREFIX) && 4856 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4857 /* success since this is not a prefix route */ 4858 return 1; 4859 } 4860 if (filter->filter_set) { 4861 if ((filter->rt_type && rt->fib6_type != filter->rt_type) || 4862 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 4863 (filter->protocol && rt->fib6_protocol != filter->protocol)) { 4864 return 1; 4865 } 4866 flags |= NLM_F_DUMP_FILTERED; 4867 } 4868 4869 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4870 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4871 arg->cb->nlh->nlmsg_seq, flags); 4872 } 4873 4874 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, 4875 const struct nlmsghdr *nlh, 4876 struct nlattr **tb, 4877 struct netlink_ext_ack *extack) 4878 { 4879 struct rtmsg *rtm; 4880 int i, err; 4881 4882 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 4883 NL_SET_ERR_MSG_MOD(extack, 4884 "Invalid header for get route request"); 4885 return -EINVAL; 4886 } 4887 4888 if (!netlink_strict_get_check(skb)) 4889 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 4890 rtm_ipv6_policy, extack); 4891 4892 rtm = nlmsg_data(nlh); 4893 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || 
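/* Under strict checking a get request must describe exactly one flow:
 * each prefix length is either 0 (attribute absent) or 128 (a full
 * host address), and every header field this handler does not consume
 * must be zero so it can be given a meaning later without breaking
 * old userspace.
 */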
4894 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || 4895 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || 4896 rtm->rtm_type) { 4897 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); 4898 return -EINVAL; 4899 } 4900 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { 4901 NL_SET_ERR_MSG_MOD(extack, 4902 "Invalid flags for get route request"); 4903 return -EINVAL; 4904 } 4905 4906 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 4907 rtm_ipv6_policy, extack); 4908 if (err) 4909 return err; 4910 4911 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 4912 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 4913 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); 4914 return -EINVAL; 4915 } 4916 4917 for (i = 0; i <= RTA_MAX; i++) { 4918 if (!tb[i]) 4919 continue; 4920 4921 switch (i) { 4922 case RTA_SRC: 4923 case RTA_DST: 4924 case RTA_IIF: 4925 case RTA_OIF: 4926 case RTA_MARK: 4927 case RTA_UID: 4928 case RTA_SPORT: 4929 case RTA_DPORT: 4930 case RTA_IP_PROTO: 4931 break; 4932 default: 4933 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); 4934 return -EINVAL; 4935 } 4936 } 4937 4938 return 0; 4939 } 4940 4941 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4942 struct netlink_ext_ack *extack) 4943 { 4944 struct net *net = sock_net(in_skb->sk); 4945 struct nlattr *tb[RTA_MAX+1]; 4946 int err, iif = 0, oif = 0; 4947 struct fib6_info *from; 4948 struct dst_entry *dst; 4949 struct rt6_info *rt; 4950 struct sk_buff *skb; 4951 struct rtmsg *rtm; 4952 struct flowi6 fl6 = {}; 4953 bool fibmatch; 4954 4955 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 4956 if (err < 0) 4957 goto errout; 4958 4959 err = -EINVAL; 4960 rtm = nlmsg_data(nlh); 4961 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4962 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4963 4964 if (tb[RTA_SRC]) { 4965 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4966 goto errout; 4967 4968 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4969 } 4970 4971 if (tb[RTA_DST]) { 4972 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4973 goto errout; 4974 4975 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4976 } 4977 4978 if (tb[RTA_IIF]) 4979 iif = nla_get_u32(tb[RTA_IIF]); 4980 4981 if (tb[RTA_OIF]) 4982 oif = nla_get_u32(tb[RTA_OIF]); 4983 4984 if (tb[RTA_MARK]) 4985 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4986 4987 if (tb[RTA_UID]) 4988 fl6.flowi6_uid = make_kuid(current_user_ns(), 4989 nla_get_u32(tb[RTA_UID])); 4990 else 4991 fl6.flowi6_uid = iif ? 
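/* With no explicit RTA_UID, output lookups (no iif) are attributed to
 * the requesting task's uid so that uid-range routing rules match as
 * they would for that task's own traffic; input lookups have no local
 * user, hence INVALID_UID.
 */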
INVALID_UID : current_uid(); 4992 4993 if (tb[RTA_SPORT]) 4994 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 4995 4996 if (tb[RTA_DPORT]) 4997 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 4998 4999 if (tb[RTA_IP_PROTO]) { 5000 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 5001 &fl6.flowi6_proto, AF_INET6, 5002 extack); 5003 if (err) 5004 goto errout; 5005 } 5006 5007 if (iif) { 5008 struct net_device *dev; 5009 int flags = 0; 5010 5011 rcu_read_lock(); 5012 5013 dev = dev_get_by_index_rcu(net, iif); 5014 if (!dev) { 5015 rcu_read_unlock(); 5016 err = -ENODEV; 5017 goto errout; 5018 } 5019 5020 fl6.flowi6_iif = iif; 5021 5022 if (!ipv6_addr_any(&fl6.saddr)) 5023 flags |= RT6_LOOKUP_F_HAS_SADDR; 5024 5025 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 5026 5027 rcu_read_unlock(); 5028 } else { 5029 fl6.flowi6_oif = oif; 5030 5031 dst = ip6_route_output(net, NULL, &fl6); 5032 } 5033 5034 5035 rt = container_of(dst, struct rt6_info, dst); 5036 if (rt->dst.error) { 5037 err = rt->dst.error; 5038 ip6_rt_put(rt); 5039 goto errout; 5040 } 5041 5042 if (rt == net->ipv6.ip6_null_entry) { 5043 err = rt->dst.error; 5044 ip6_rt_put(rt); 5045 goto errout; 5046 } 5047 5048 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 5049 if (!skb) { 5050 ip6_rt_put(rt); 5051 err = -ENOBUFS; 5052 goto errout; 5053 } 5054 5055 skb_dst_set(skb, &rt->dst); 5056 5057 rcu_read_lock(); 5058 from = rcu_dereference(rt->from); 5059 if (from) { 5060 if (fibmatch) 5061 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, 5062 iif, RTM_NEWROUTE, 5063 NETLINK_CB(in_skb).portid, 5064 nlh->nlmsg_seq, 0); 5065 else 5066 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 5067 &fl6.saddr, iif, RTM_NEWROUTE, 5068 NETLINK_CB(in_skb).portid, 5069 nlh->nlmsg_seq, 0); 5070 } else { 5071 err = -ENETUNREACH; 5072 } 5073 rcu_read_unlock(); 5074 5075 if (err < 0) { 5076 kfree_skb(skb); 5077 goto errout; 5078 } 5079 5080 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 5081 errout: 5082 return err; 5083 } 5084 5085 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 5086 unsigned int nlm_flags) 5087 { 5088 struct sk_buff *skb; 5089 struct net *net = info->nl_net; 5090 u32 seq; 5091 int err; 5092 5093 err = -ENOBUFS; 5094 seq = info->nlh ? 
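/* Echo the sequence number of the triggering request, if any, so
 * userspace can correlate the notification with it; asynchronous
 * notifications carry seq 0.
 */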
info->nlh->nlmsg_seq : 0; 5095 5096 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 5097 if (!skb) 5098 goto errout; 5099 5100 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 5101 event, info->portid, seq, nlm_flags); 5102 if (err < 0) { 5103 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 5104 WARN_ON(err == -EMSGSIZE); 5105 kfree_skb(skb); 5106 goto errout; 5107 } 5108 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 5109 info->nlh, gfp_any()); 5110 return; 5111 errout: 5112 if (err < 0) 5113 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 5114 } 5115 5116 static int ip6_route_dev_notify(struct notifier_block *this, 5117 unsigned long event, void *ptr) 5118 { 5119 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 5120 struct net *net = dev_net(dev); 5121 5122 if (!(dev->flags & IFF_LOOPBACK)) 5123 return NOTIFY_OK; 5124 5125 if (event == NETDEV_REGISTER) { 5126 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; 5127 net->ipv6.ip6_null_entry->dst.dev = dev; 5128 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 5129 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5130 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 5131 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 5132 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 5133 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 5134 #endif 5135 } else if (event == NETDEV_UNREGISTER && 5136 dev->reg_state != NETREG_UNREGISTERED) { 5137 /* NETDEV_UNREGISTER can be fired multiple times by 5138 * netdev_wait_allrefs(). Make sure we only call this once. 5139 */ 5140 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 5141 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5142 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 5143 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 5144 #endif 5145 } 5146 5147 return NOTIFY_OK; 5148 } 5149 5150 /* 5151 * /proc 5152 */ 5153 5154 #ifdef CONFIG_PROC_FS 5155 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 5156 { 5157 struct net *net = (struct net *)seq->private; 5158 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 5159 net->ipv6.rt6_stats->fib_nodes, 5160 net->ipv6.rt6_stats->fib_route_nodes, 5161 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 5162 net->ipv6.rt6_stats->fib_rt_entries, 5163 net->ipv6.rt6_stats->fib_rt_cache, 5164 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 5165 net->ipv6.rt6_stats->fib_discarded_routes); 5166 5167 return 0; 5168 } 5169 #endif /* CONFIG_PROC_FS */ 5170 5171 #ifdef CONFIG_SYSCTL 5172 5173 static 5174 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 5175 void __user *buffer, size_t *lenp, loff_t *ppos) 5176 { 5177 struct net *net; 5178 int delay; 5179 int ret; 5180 if (!write) 5181 return -EINVAL; 5182 5183 net = (struct net *)ctl->extra1; 5184 delay = net->ipv6.sysctl.flush_delay; 5185 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 5186 if (ret) 5187 return ret; 5188 5189 fib6_run_gc(delay <= 0 ?
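/* Illustration: this write-only sysctl is usually driven as
 *
 *	echo 0 > /proc/sys/net/ipv6/route/flush
 *
 * Note that 'delay' is sampled above, before proc_dointvec() stores
 * the newly written value, so this fib6_run_gc() call uses the
 * previous setting.
 */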
0 : (unsigned long)delay, net, delay > 0); 5190 return 0; 5191 } 5192 5193 static int zero; 5194 static int one = 1; 5195 5196 static struct ctl_table ipv6_route_table_template[] = { 5197 { 5198 .procname = "flush", 5199 .data = &init_net.ipv6.sysctl.flush_delay, 5200 .maxlen = sizeof(int), 5201 .mode = 0200, 5202 .proc_handler = ipv6_sysctl_rtcache_flush 5203 }, 5204 { 5205 .procname = "gc_thresh", 5206 .data = &ip6_dst_ops_template.gc_thresh, 5207 .maxlen = sizeof(int), 5208 .mode = 0644, 5209 .proc_handler = proc_dointvec, 5210 }, 5211 { 5212 .procname = "max_size", 5213 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5214 .maxlen = sizeof(int), 5215 .mode = 0644, 5216 .proc_handler = proc_dointvec, 5217 }, 5218 { 5219 .procname = "gc_min_interval", 5220 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5221 .maxlen = sizeof(int), 5222 .mode = 0644, 5223 .proc_handler = proc_dointvec_jiffies, 5224 }, 5225 { 5226 .procname = "gc_timeout", 5227 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5228 .maxlen = sizeof(int), 5229 .mode = 0644, 5230 .proc_handler = proc_dointvec_jiffies, 5231 }, 5232 { 5233 .procname = "gc_interval", 5234 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5235 .maxlen = sizeof(int), 5236 .mode = 0644, 5237 .proc_handler = proc_dointvec_jiffies, 5238 }, 5239 { 5240 .procname = "gc_elasticity", 5241 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5242 .maxlen = sizeof(int), 5243 .mode = 0644, 5244 .proc_handler = proc_dointvec, 5245 }, 5246 { 5247 .procname = "mtu_expires", 5248 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5249 .maxlen = sizeof(int), 5250 .mode = 0644, 5251 .proc_handler = proc_dointvec_jiffies, 5252 }, 5253 { 5254 .procname = "min_adv_mss", 5255 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5256 .maxlen = sizeof(int), 5257 .mode = 0644, 5258 .proc_handler = proc_dointvec, 5259 }, 5260 { 5261 .procname = "gc_min_interval_ms", 5262 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5263 .maxlen = sizeof(int), 5264 .mode = 0644, 5265 .proc_handler = proc_dointvec_ms_jiffies, 5266 }, 5267 { 5268 .procname = "skip_notify_on_dev_down", 5269 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, 5270 .maxlen = sizeof(int), 5271 .mode = 0644, 5272 .proc_handler = proc_dointvec, 5273 .extra1 = &zero, 5274 .extra2 = &one, 5275 }, 5276 { } 5277 }; 5278 5279 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5280 { 5281 struct ctl_table *table; 5282 5283 table = kmemdup(ipv6_route_table_template, 5284 sizeof(ipv6_route_table_template), 5285 GFP_KERNEL); 5286 5287 if (table) { 5288 table[0].data = &net->ipv6.sysctl.flush_delay; 5289 table[0].extra1 = net; 5290 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5291 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5292 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5293 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5294 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5295 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5296 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5297 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5298 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5299 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; 5300 5301 /* Don't export sysctls to unprivileged users */ 5302 if (net->user_ns != &init_user_ns) 5303 table[0].procname = NULL; 5304 } 5305 5306 return table; 5307 } 5308 #endif 5309 5310 static int __net_init ip6_route_net_init(struct net *net) 5311 { 5312 int ret = -ENOMEM; 5313 
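/* Per-namespace bring-up: clone the dst_ops template, then allocate
 * this namespace's fib6/ip6 null entries (and, with multiple tables,
 * the prohibit and blackhole entries) and set the sysctl defaults;
 * each failure unwinds through the labels at the bottom in reverse
 * order of setup.
 */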
5314 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5315 sizeof(net->ipv6.ip6_dst_ops)); 5316 5317 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5318 goto out_ip6_dst_ops; 5319 5320 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5321 sizeof(*net->ipv6.fib6_null_entry), 5322 GFP_KERNEL); 5323 if (!net->ipv6.fib6_null_entry) 5324 goto out_ip6_dst_entries; 5325 5326 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5327 sizeof(*net->ipv6.ip6_null_entry), 5328 GFP_KERNEL); 5329 if (!net->ipv6.ip6_null_entry) 5330 goto out_fib6_null_entry; 5331 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5332 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5333 ip6_template_metrics, true); 5334 5335 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5336 net->ipv6.fib6_has_custom_rules = false; 5337 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5338 sizeof(*net->ipv6.ip6_prohibit_entry), 5339 GFP_KERNEL); 5340 if (!net->ipv6.ip6_prohibit_entry) 5341 goto out_ip6_null_entry; 5342 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5343 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5344 ip6_template_metrics, true); 5345 5346 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5347 sizeof(*net->ipv6.ip6_blk_hole_entry), 5348 GFP_KERNEL); 5349 if (!net->ipv6.ip6_blk_hole_entry) 5350 goto out_ip6_prohibit_entry; 5351 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5352 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5353 ip6_template_metrics, true); 5354 #endif 5355 5356 net->ipv6.sysctl.flush_delay = 0; 5357 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5358 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5359 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5360 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5361 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5362 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5363 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5364 net->ipv6.sysctl.skip_notify_on_dev_down = 0; 5365 5366 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5367 5368 ret = 0; 5369 out: 5370 return ret; 5371 5372 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5373 out_ip6_prohibit_entry: 5374 kfree(net->ipv6.ip6_prohibit_entry); 5375 out_ip6_null_entry: 5376 kfree(net->ipv6.ip6_null_entry); 5377 #endif 5378 out_fib6_null_entry: 5379 kfree(net->ipv6.fib6_null_entry); 5380 out_ip6_dst_entries: 5381 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5382 out_ip6_dst_ops: 5383 goto out; 5384 } 5385 5386 static void __net_exit ip6_route_net_exit(struct net *net) 5387 { 5388 kfree(net->ipv6.fib6_null_entry); 5389 kfree(net->ipv6.ip6_null_entry); 5390 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5391 kfree(net->ipv6.ip6_prohibit_entry); 5392 kfree(net->ipv6.ip6_blk_hole_entry); 5393 #endif 5394 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5395 } 5396 5397 static int __net_init ip6_route_net_init_late(struct net *net) 5398 { 5399 #ifdef CONFIG_PROC_FS 5400 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5401 sizeof(struct ipv6_route_iter)); 5402 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5403 rt6_stats_seq_show, NULL); 5404 #endif 5405 return 0; 5406 } 5407 5408 static void __net_exit ip6_route_net_exit_late(struct net *net) 5409 { 5410 #ifdef CONFIG_PROC_FS 5411 remove_proc_entry("ipv6_route", net->proc_net); 5412 remove_proc_entry("rt6_stats", net->proc_net); 5413 #endif 5414 } 5415 5416 static struct pernet_operations ip6_route_net_ops = { 5417 .init = ip6_route_net_init, 5418 .exit = 
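/* Standard pernet contract: .init runs once for every network
 * namespace as it is created (including init_net at boot) and .exit
 * as it is torn down, keeping the state above strictly per-netns.
 */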
ip6_route_net_exit, 5419 }; 5420 5421 static int __net_init ipv6_inetpeer_init(struct net *net) 5422 { 5423 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5424 5425 if (!bp) 5426 return -ENOMEM; 5427 inet_peer_base_init(bp); 5428 net->ipv6.peers = bp; 5429 return 0; 5430 } 5431 5432 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5433 { 5434 struct inet_peer_base *bp = net->ipv6.peers; 5435 5436 net->ipv6.peers = NULL; 5437 inetpeer_invalidate_tree(bp); 5438 kfree(bp); 5439 } 5440 5441 static struct pernet_operations ipv6_inetpeer_ops = { 5442 .init = ipv6_inetpeer_init, 5443 .exit = ipv6_inetpeer_exit, 5444 }; 5445 5446 static struct pernet_operations ip6_route_net_late_ops = { 5447 .init = ip6_route_net_init_late, 5448 .exit = ip6_route_net_exit_late, 5449 }; 5450 5451 static struct notifier_block ip6_route_dev_notifier = { 5452 .notifier_call = ip6_route_dev_notify, 5453 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5454 }; 5455 5456 void __init ip6_route_init_special_entries(void) 5457 { 5458 /* The loopback device is registered before this code runs, but 5459 * the loopback reference in rt6_info is not taken at that point; 5460 * take it manually for init_net */ 5461 init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; 5462 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5463 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5464 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5465 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5466 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5467 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5468 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5469 #endif 5470 } 5471 5472 int __init ip6_route_init(void) 5473 { 5474 int ret; 5475 int cpu; 5476 5477 ret = -ENOMEM; 5478 ip6_dst_ops_template.kmem_cachep = 5479 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5480 SLAB_HWCACHE_ALIGN, NULL); 5481 if (!ip6_dst_ops_template.kmem_cachep) 5482 goto out; 5483 5484 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5485 if (ret) 5486 goto out_kmem_cache; 5487 5488 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5489 if (ret) 5490 goto out_dst_entries; 5491 5492 ret = register_pernet_subsys(&ip6_route_net_ops); 5493 if (ret) 5494 goto out_register_inetpeer; 5495 5496 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5497 5498 ret = fib6_init(); 5499 if (ret) 5500 goto out_register_subsys; 5501 5502 ret = xfrm6_init(); 5503 if (ret) 5504 goto out_fib6_init; 5505 5506 ret = fib6_rules_init(); 5507 if (ret) 5508 goto xfrm6_init; 5509 5510 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5511 if (ret) 5512 goto fib6_rules_init; 5513 5514 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5515 inet6_rtm_newroute, NULL, 0); 5516 if (ret < 0) 5517 goto out_register_late_subsys; 5518 5519 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5520 inet6_rtm_delroute, NULL, 0); 5521 if (ret < 0) 5522 goto out_register_late_subsys; 5523 5524 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5525 inet6_rtm_getroute, NULL, 5526 RTNL_FLAG_DOIT_UNLOCKED); 5527 if (ret < 0) 5528 goto out_register_late_subsys; 5529 5530 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5531 if (ret) 5532 goto out_register_late_subsys; 5533 5534 for_each_possible_cpu(cpu) { 5535 struct uncached_list *ul = 
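/* Each CPU keeps a list of rt6_info that live outside the fib tree;
 * rt6_uncached_list_flush_dev() (used by rt6_disable_ip() above)
 * walks these lists so that device references are dropped when an
 * interface goes away.
 */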
per_cpu_ptr(&rt6_uncached_list, cpu); 5536 5537 INIT_LIST_HEAD(&ul->head); 5538 spin_lock_init(&ul->lock); 5539 } 5540 5541 out: 5542 return ret; 5543 5544 out_register_late_subsys: 5545 rtnl_unregister_all(PF_INET6); 5546 unregister_pernet_subsys(&ip6_route_net_late_ops); 5547 fib6_rules_init: 5548 fib6_rules_cleanup(); 5549 xfrm6_init: 5550 xfrm6_fini(); 5551 out_fib6_init: 5552 fib6_gc_cleanup(); 5553 out_register_subsys: 5554 unregister_pernet_subsys(&ip6_route_net_ops); 5555 out_register_inetpeer: 5556 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5557 out_dst_entries: 5558 dst_entries_destroy(&ip6_dst_blackhole_ops); 5559 out_kmem_cache: 5560 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5561 goto out; 5562 } 5563 5564 void ip6_route_cleanup(void) 5565 { 5566 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5567 unregister_pernet_subsys(&ip6_route_net_late_ops); 5568 fib6_rules_cleanup(); 5569 xfrm6_fini(); 5570 fib6_gc_cleanup(); 5571 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5572 unregister_pernet_subsys(&ip6_route_net_ops); 5573 dst_entries_destroy(&ip6_dst_blackhole_ops); 5574 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5575 } 5576
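/* Note on ordering: ip6_route_init() runs during IPv6 stack bring-up
 * (it is called from inet6_init() in net/ipv6/af_inet6.c), and
 * ip6_route_cleanup() undoes the registrations in reverse order. A
 * quick exercise of the RTM_GETROUTE handler registered above, with
 * an illustrative address:
 *
 *	ip -6 route get 2001:db8::1
 */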