/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
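
/* A note on ip6_neigh_lookup() above: it prefers an explicit gateway,
 * then the daddr of an in-flight skb, then the caller-supplied daddr
 * (see choose_neigh_daddr()), and falls back to creating the neighbour
 * entry on a cache miss. A hypothetical caller resolving the next hop
 * of a known gateway could do, as a sketch (and must neigh_release()
 * the result):
 *
 *	struct neighbour *n = ip6_neigh_lookup(&gw6, dev, NULL, &gw6);
 */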

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->fib6_nsiblings || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = &match->fib6_nh;
}
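
/* fib6_select_path() above is hash-threshold multipath selection in the
 * spirit of RFC 2992: fl6->mp_hash is compared against each sibling's
 * precomputed fib_nh_upper_bound, and the first nexthop whose bound is
 * not exceeded wins. Illustrative sketch (bounds are not taken from
 * this file): with three equal-weight siblings the 31-bit hash space
 * splits roughly at 0x2aaaaaaa and 0x55555555, so a flow hashing to
 * 0x30000000 selects the second sibling.
 */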

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}

static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	if (!oif && ipv6_addr_any(saddr)) {
		nh = &f6i->fib6_nh;
		if (!(nh->fib_nh_flags & RTNH_F_DEAD)) {
			res->nh = nh;
			return;
		}
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		nh = &spf6i->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
			res->f6i = spf6i;
			res->nh = nh;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = &res->f6i->fib6_nh;
		return;
	}

	res->nh = &f6i->fib6_nh;
	if (res->nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = &res->f6i->fib6_nh;
	}
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, fib6_nh->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		fib6_nh->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
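
/* rt6_score_route() below folds the interface match and (under
 * CONFIG_IPV6_ROUTER_PREF) the RFC 4191 route preference into a single
 * score: bit 1 is set on an oif match and the decoded preference lands
 * in bits 2 and up, so e.g. a route on the requested oif scores
 * m = 2 | (pref << 2). Negative scores are the rt6_nud_state values
 * reported by rt6_check_neigh() above.
 */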

static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}

static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}

static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		nh = &f6i->fib6_nh;
		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->f6i = f6i;
			res->nh = nh;
		}
	}
}

static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}
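
/* rt6_select() below implements the round-robin default router
 * selection described in the changelog at the top of this file: the
 * last choice is cached in fn->rr_ptr, and when find_match() reports
 * RT6_NUD_FAIL_DO_RR the pointer is advanced to the next sibling of
 * equal metric, so equally good (probably reachable) routers are used
 * in turn.
 */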

static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = &res->f6i->fib6_nh;
	}
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2)
			return -EINVAL;
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1)
			return -EINVAL;
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	const struct fib6_info *f6i = res->f6i;

	if (f6i->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *ort = res->f6i;

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = f6i->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
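
/* rt6_lookup() below is a convenience wrapper around fib6_rule_lookup()
 * that converts error routes to NULL. A hypothetical caller might do,
 * as a sketch (the reference must be dropped with ip6_rt_put()):
 *
 *	struct rt6_info *rt = rt6_lookup(net, &daddr, NULL, oif, NULL, 0);
 *
 *	if (rt) {
 *		...
 *		ip6_rt_put(rt);
 *	}
 */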

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
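
/* Per-cpu dst caching: each fib6_info carries an rt6i_pcpu array, and
 * ip6_pol_route() tries rt6_get_pcpu_route() before paying for a fresh
 * allocation. The cmpxchg() in rt6_make_pcpu_route() above is expected
 * to find the slot still NULL (callers run with BHs disabled on this
 * CPU), hence the BUG_ON(prev).
 */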

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
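
/* The RCU variant below is the read-side twin of
 * __rt6_find_exception_spinlock() above: it walks the same chain with
 * hlist_for_each_entry_rcu() under rcu_read_lock() instead of under
 * rt6_exception_lock, so the lookup fast path never takes the spinlock.
 */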

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}

static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *f6i = res->f6i;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (f6i->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

	return ret;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
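
/* Worked example for rt6_mtu_change_route_allowed() below (numbers are
 * illustrative, not from this file): with idev->cnf.mtu6 still at 1500,
 * raising the device MTU to 9000 updates only exceptions whose PMTU is
 * exactly 1500 (the local MTU was the path minimum) and leaves a
 * discovered PMTU of 1400 alone, while lowering the device MTU to 1280
 * updates both, since a PMTU may always shrink with the link.
 */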

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still have references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
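
/* ip6_pol_route() below resolves a flow to a dst in three ways: take a
 * matching cached exception (e.g. a PMTU clone) if one exists,
 * otherwise hand out the per-cpu copy of the FIB entry, and only for
 * the FLOWI_FLAG_KNOWN_NH corner case build an uncached RTF_CACHE
 * clone that lives on rt6_uncached_list rather than in the tree.
 */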

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
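
/* Hash policy for rt6_multipath_hash() below is selected with the
 * net.ipv6.fib_multipath_hash_policy sysctl: 0 hashes the L3 header
 * only (using the inner header of ICMPv6 errors, see
 * ip6_multipath_l3_keys() above), 1 hashes the 5-tuple. E.g. under
 * policy 1 two TCP flows to the same host may take different sibling
 * routes; under policy 0 they always share one.
 */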

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
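
/* ip6_route_output_flags() below is the usual entry point for locally
 * generated traffic; ip6_route_output() in include/net/ip6_route.h is
 * the inline wrapper passing flags == 0. A hypothetical caller might
 * do, as a sketch:
 *
 *	struct flowi6 fl6 = { .daddr = *daddr, .flowi6_oif = oif };
 *	struct dst_entry *dst = ip6_route_output(net, sk, &fl6);
 *	int err = dst->error;
 *
 *	if (err) {
 *		dst_release(dst);
 *		return err;
 *	}
 */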
2128 2129 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2130 2131 any_src = ipv6_addr_any(&fl6->saddr); 2132 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2133 (fl6->flowi6_oif && any_src)) 2134 flags |= RT6_LOOKUP_F_IFACE; 2135 2136 if (!any_src) 2137 flags |= RT6_LOOKUP_F_HAS_SADDR; 2138 else if (sk) 2139 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2140 2141 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2142 } 2143 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2144 2145 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2146 { 2147 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2148 struct net_device *loopback_dev = net->loopback_dev; 2149 struct dst_entry *new = NULL; 2150 2151 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2152 DST_OBSOLETE_DEAD, 0); 2153 if (rt) { 2154 rt6_info_init(rt); 2155 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2156 2157 new = &rt->dst; 2158 new->__use = 1; 2159 new->input = dst_discard; 2160 new->output = dst_discard_out; 2161 2162 dst_copy_metrics(new, &ort->dst); 2163 2164 rt->rt6i_idev = in6_dev_get(loopback_dev); 2165 rt->rt6i_gateway = ort->rt6i_gateway; 2166 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2167 2168 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2169 #ifdef CONFIG_IPV6_SUBTREES 2170 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2171 #endif 2172 } 2173 2174 dst_release(dst_orig); 2175 return new ? new : ERR_PTR(-ENOMEM); 2176 } 2177 2178 /* 2179 * Destination cache support functions 2180 */ 2181 2182 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2183 { 2184 u32 rt_cookie = 0; 2185 2186 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2187 return false; 2188 2189 if (fib6_check_expired(f6i)) 2190 return false; 2191 2192 return true; 2193 } 2194 2195 static struct dst_entry *rt6_check(struct rt6_info *rt, 2196 struct fib6_info *from, 2197 u32 cookie) 2198 { 2199 u32 rt_cookie = 0; 2200 2201 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2202 rt_cookie != cookie) 2203 return NULL; 2204 2205 if (rt6_check_expired(rt)) 2206 return NULL; 2207 2208 return &rt->dst; 2209 } 2210 2211 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2212 struct fib6_info *from, 2213 u32 cookie) 2214 { 2215 if (!__rt6_check_expired(rt) && 2216 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2217 fib6_check(from, cookie)) 2218 return &rt->dst; 2219 else 2220 return NULL; 2221 } 2222 2223 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2224 { 2225 struct dst_entry *dst_ret; 2226 struct fib6_info *from; 2227 struct rt6_info *rt; 2228 2229 rt = container_of(dst, struct rt6_info, dst); 2230 2231 rcu_read_lock(); 2232 2233 /* All IPV6 dsts are created with ->obsolete set to the value 2234 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2235 * into this function always. 
2236 */ 2237 2238 from = rcu_dereference(rt->from); 2239 2240 if (from && (rt->rt6i_flags & RTF_PCPU || 2241 unlikely(!list_empty(&rt->rt6i_uncached)))) 2242 dst_ret = rt6_dst_from_check(rt, from, cookie); 2243 else 2244 dst_ret = rt6_check(rt, from, cookie); 2245 2246 rcu_read_unlock(); 2247 2248 return dst_ret; 2249 } 2250 2251 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2252 { 2253 struct rt6_info *rt = (struct rt6_info *) dst; 2254 2255 if (rt) { 2256 if (rt->rt6i_flags & RTF_CACHE) { 2257 rcu_read_lock(); 2258 if (rt6_check_expired(rt)) { 2259 rt6_remove_exception_rt(rt); 2260 dst = NULL; 2261 } 2262 rcu_read_unlock(); 2263 } else { 2264 dst_release(dst); 2265 dst = NULL; 2266 } 2267 } 2268 return dst; 2269 } 2270 2271 static void ip6_link_failure(struct sk_buff *skb) 2272 { 2273 struct rt6_info *rt; 2274 2275 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2276 2277 rt = (struct rt6_info *) skb_dst(skb); 2278 if (rt) { 2279 rcu_read_lock(); 2280 if (rt->rt6i_flags & RTF_CACHE) { 2281 rt6_remove_exception_rt(rt); 2282 } else { 2283 struct fib6_info *from; 2284 struct fib6_node *fn; 2285 2286 from = rcu_dereference(rt->from); 2287 if (from) { 2288 fn = rcu_dereference(from->fib6_node); 2289 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2290 fn->fn_sernum = -1; 2291 } 2292 } 2293 rcu_read_unlock(); 2294 } 2295 } 2296 2297 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2298 { 2299 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2300 struct fib6_info *from; 2301 2302 rcu_read_lock(); 2303 from = rcu_dereference(rt0->from); 2304 if (from) 2305 rt0->dst.expires = from->expires; 2306 rcu_read_unlock(); 2307 } 2308 2309 dst_set_expires(&rt0->dst, timeout); 2310 rt0->rt6i_flags |= RTF_EXPIRES; 2311 } 2312 2313 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2314 { 2315 struct net *net = dev_net(rt->dst.dev); 2316 2317 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2318 rt->rt6i_flags |= RTF_MODIFIED; 2319 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2320 } 2321 2322 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2323 { 2324 return !(rt->rt6i_flags & RTF_CACHE) && 2325 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2326 } 2327 2328 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2329 const struct ipv6hdr *iph, u32 mtu) 2330 { 2331 const struct in6_addr *daddr, *saddr; 2332 struct rt6_info *rt6 = (struct rt6_info *)dst; 2333 2334 if (dst_metric_locked(dst, RTAX_MTU)) 2335 return; 2336 2337 if (iph) { 2338 daddr = &iph->daddr; 2339 saddr = &iph->saddr; 2340 } else if (sk) { 2341 daddr = &sk->sk_v6_daddr; 2342 saddr = &inet6_sk(sk)->saddr; 2343 } else { 2344 daddr = NULL; 2345 saddr = NULL; 2346 } 2347 dst_confirm_neigh(dst, daddr); 2348 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2349 if (mtu >= dst_mtu(dst)) 2350 return; 2351 2352 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2353 rt6_do_update_pmtu(rt6, mtu); 2354 /* update rt6_ex->stamp for cache */ 2355 if (rt6->rt6i_flags & RTF_CACHE) 2356 rt6_update_exception_stamp_rt(rt6); 2357 } else if (daddr) { 2358 struct fib6_result res = {}; 2359 struct rt6_info *nrt6; 2360 2361 rcu_read_lock(); 2362 res.f6i = rcu_dereference(rt6->from); 2363 if (!res.f6i) { 2364 rcu_read_unlock(); 2365 return; 2366 } 2367 res.nh = &res.f6i->fib6_nh; 2368 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); 2369 if (nrt6) { 2370 rt6_do_update_pmtu(nrt6, mtu); 2371 if (rt6_insert_exception(nrt6, &res)) 2372 dst_release_immediate(&nrt6->dst); 2373 } 2374 
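		/* Either the clone now lives in the exception table or it
		 * was released immediately above; the parent fib6_info
		 * itself is left unmodified on this path.
		 */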
rcu_read_unlock(); 2375 } 2376 } 2377 2378 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2379 struct sk_buff *skb, u32 mtu) 2380 { 2381 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2382 } 2383 2384 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2385 int oif, u32 mark, kuid_t uid) 2386 { 2387 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2388 struct dst_entry *dst; 2389 struct flowi6 fl6 = { 2390 .flowi6_oif = oif, 2391 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2392 .daddr = iph->daddr, 2393 .saddr = iph->saddr, 2394 .flowlabel = ip6_flowinfo(iph), 2395 .flowi6_uid = uid, 2396 }; 2397 2398 dst = ip6_route_output(net, NULL, &fl6); 2399 if (!dst->error) 2400 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2401 dst_release(dst); 2402 } 2403 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2404 2405 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2406 { 2407 int oif = sk->sk_bound_dev_if; 2408 struct dst_entry *dst; 2409 2410 if (!oif && skb->dev) 2411 oif = l3mdev_master_ifindex(skb->dev); 2412 2413 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2414 2415 dst = __sk_dst_get(sk); 2416 if (!dst || !dst->obsolete || 2417 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2418 return; 2419 2420 bh_lock_sock(sk); 2421 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2422 ip6_datagram_dst_update(sk, false); 2423 bh_unlock_sock(sk); 2424 } 2425 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2426 2427 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2428 const struct flowi6 *fl6) 2429 { 2430 #ifdef CONFIG_IPV6_SUBTREES 2431 struct ipv6_pinfo *np = inet6_sk(sk); 2432 #endif 2433 2434 ip6_dst_store(sk, dst, 2435 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2436 &sk->sk_v6_daddr : NULL, 2437 #ifdef CONFIG_IPV6_SUBTREES 2438 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2439 &np->saddr : 2440 #endif 2441 NULL); 2442 } 2443 2444 static bool ip6_redirect_nh_match(const struct fib6_result *res, 2445 struct flowi6 *fl6, 2446 const struct in6_addr *gw, 2447 struct rt6_info **ret) 2448 { 2449 const struct fib6_nh *nh = res->nh; 2450 2451 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || 2452 fl6->flowi6_oif != nh->fib_nh_dev->ifindex) 2453 return false; 2454 2455 /* rt_cache's gateway might be different from its 'parent' 2456 * in the case of an ip redirect. 2457 * So we keep searching in the exception table if the gateway 2458 * is different. 2459 */ 2460 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { 2461 struct rt6_info *rt_cache; 2462 2463 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); 2464 if (rt_cache && 2465 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { 2466 *ret = rt_cache; 2467 return true; 2468 } 2469 return false; 2470 } 2471 return true; 2472 } 2473 2474 /* Handle redirects */ 2475 struct ip6rd_flowi { 2476 struct flowi6 fl6; 2477 struct in6_addr gateway; 2478 }; 2479 2480 static struct rt6_info *__ip6_route_redirect(struct net *net, 2481 struct fib6_table *table, 2482 struct flowi6 *fl6, 2483 const struct sk_buff *skb, 2484 int flags) 2485 { 2486 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2487 struct rt6_info *ret = NULL; 2488 struct fib6_result res = {}; 2489 struct fib6_info *rt; 2490 struct fib6_node *fn; 2491 2492 /* Get the "current" route for this destination and 2493 * check if the redirect has come from appropriate router. 
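 * (The per-nexthop comparison is done by ip6_redirect_nh_match() in the
 * loop below; it also searches the exception table, since a cached
 * clone may carry the redirected gateway rather than the parent's.)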
2494 * 2495 * RFC 4861 specifies that redirects should only be 2496 * accepted if they come from the nexthop to the target. 2497 * Due to the way the routes are chosen, this notion 2498 * is a bit fuzzy and one might need to check all possible 2499 * routes. 2500 */ 2501 2502 rcu_read_lock(); 2503 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2504 restart: 2505 for_each_fib6_node_rt_rcu(fn) { 2506 res.f6i = rt; 2507 res.nh = &rt->fib6_nh; 2508 2509 if (fib6_check_expired(rt)) 2510 continue; 2511 if (rt->fib6_flags & RTF_REJECT) 2512 break; 2513 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) 2514 goto out; 2515 } 2516 2517 if (!rt) 2518 rt = net->ipv6.fib6_null_entry; 2519 else if (rt->fib6_flags & RTF_REJECT) { 2520 ret = net->ipv6.ip6_null_entry; 2521 goto out; 2522 } 2523 2524 if (rt == net->ipv6.fib6_null_entry) { 2525 fn = fib6_backtrack(fn, &fl6->saddr); 2526 if (fn) 2527 goto restart; 2528 } 2529 2530 res.f6i = rt; 2531 res.nh = &rt->fib6_nh; 2532 out: 2533 if (ret) 2534 ip6_hold_safe(net, &ret); 2535 else 2536 ret = ip6_create_rt_rcu(&res); 2537 2538 rcu_read_unlock(); 2539 2540 trace_fib6_table_lookup(net, &res, table, fl6); 2541 return ret; 2542 }; 2543 2544 static struct dst_entry *ip6_route_redirect(struct net *net, 2545 const struct flowi6 *fl6, 2546 const struct sk_buff *skb, 2547 const struct in6_addr *gateway) 2548 { 2549 int flags = RT6_LOOKUP_F_HAS_SADDR; 2550 struct ip6rd_flowi rdfl; 2551 2552 rdfl.fl6 = *fl6; 2553 rdfl.gateway = *gateway; 2554 2555 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2556 flags, __ip6_route_redirect); 2557 } 2558 2559 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2560 kuid_t uid) 2561 { 2562 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2563 struct dst_entry *dst; 2564 struct flowi6 fl6 = { 2565 .flowi6_iif = LOOPBACK_IFINDEX, 2566 .flowi6_oif = oif, 2567 .flowi6_mark = mark, 2568 .daddr = iph->daddr, 2569 .saddr = iph->saddr, 2570 .flowlabel = ip6_flowinfo(iph), 2571 .flowi6_uid = uid, 2572 }; 2573 2574 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2575 rt6_do_redirect(dst, NULL, skb); 2576 dst_release(dst); 2577 } 2578 EXPORT_SYMBOL_GPL(ip6_redirect); 2579 2580 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 2581 { 2582 const struct ipv6hdr *iph = ipv6_hdr(skb); 2583 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2584 struct dst_entry *dst; 2585 struct flowi6 fl6 = { 2586 .flowi6_iif = LOOPBACK_IFINDEX, 2587 .flowi6_oif = oif, 2588 .daddr = msg->dest, 2589 .saddr = iph->daddr, 2590 .flowi6_uid = sock_net_uid(net, NULL), 2591 }; 2592 2593 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2594 rt6_do_redirect(dst, NULL, skb); 2595 dst_release(dst); 2596 } 2597 2598 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2599 { 2600 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2601 sk->sk_uid); 2602 } 2603 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2604 2605 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2606 { 2607 struct net_device *dev = dst->dev; 2608 unsigned int mtu = dst_mtu(dst); 2609 struct net *net = dev_net(dev); 2610 2611 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2612 2613 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2614 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2615 2616 /* 2617 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2618 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
2619 * IPV6_MAXPLEN is also valid and means: "any MSS, 2620 * rely only on pmtu discovery" 2621 */ 2622 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2623 mtu = IPV6_MAXPLEN; 2624 return mtu; 2625 } 2626 2627 static unsigned int ip6_mtu(const struct dst_entry *dst) 2628 { 2629 struct inet6_dev *idev; 2630 unsigned int mtu; 2631 2632 mtu = dst_metric_raw(dst, RTAX_MTU); 2633 if (mtu) 2634 goto out; 2635 2636 mtu = IPV6_MIN_MTU; 2637 2638 rcu_read_lock(); 2639 idev = __in6_dev_get(dst->dev); 2640 if (idev) 2641 mtu = idev->cnf.mtu6; 2642 rcu_read_unlock(); 2643 2644 out: 2645 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2646 2647 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2648 } 2649 2650 /* MTU selection: 2651 * 1. mtu on route is locked - use it 2652 * 2. mtu from nexthop exception 2653 * 3. mtu from egress device 2654 * 2655 * based on ip6_dst_mtu_forward and exception logic of 2656 * rt6_find_cached_rt; called with rcu_read_lock 2657 */ 2658 u32 ip6_mtu_from_fib6(const struct fib6_result *res, 2659 const struct in6_addr *daddr, 2660 const struct in6_addr *saddr) 2661 { 2662 struct rt6_exception_bucket *bucket; 2663 const struct fib6_nh *nh = res->nh; 2664 struct fib6_info *f6i = res->f6i; 2665 const struct in6_addr *src_key; 2666 struct rt6_exception *rt6_ex; 2667 struct inet6_dev *idev; 2668 u32 mtu = 0; 2669 2670 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2671 mtu = f6i->fib6_pmtu; 2672 if (mtu) 2673 goto out; 2674 } 2675 2676 src_key = NULL; 2677 #ifdef CONFIG_IPV6_SUBTREES 2678 if (f6i->fib6_src.plen) 2679 src_key = saddr; 2680 #endif 2681 2682 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2683 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2684 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2685 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2686 2687 if (likely(!mtu)) { 2688 struct net_device *dev = nh->fib_nh_dev; 2689 2690 mtu = IPV6_MIN_MTU; 2691 idev = __in6_dev_get(dev); 2692 if (idev && idev->cnf.mtu6 > mtu) 2693 mtu = idev->cnf.mtu6; 2694 } 2695 2696 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2697 out: 2698 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); 2699 } 2700 2701 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2702 struct flowi6 *fl6) 2703 { 2704 struct dst_entry *dst; 2705 struct rt6_info *rt; 2706 struct inet6_dev *idev = in6_dev_get(dev); 2707 struct net *net = dev_net(dev); 2708 2709 if (unlikely(!idev)) 2710 return ERR_PTR(-ENODEV); 2711 2712 rt = ip6_dst_alloc(net, dev, 0); 2713 if (unlikely(!rt)) { 2714 in6_dev_put(idev); 2715 dst = ERR_PTR(-ENOMEM); 2716 goto out; 2717 } 2718 2719 rt->dst.flags |= DST_HOST; 2720 rt->dst.input = ip6_input; 2721 rt->dst.output = ip6_output; 2722 rt->rt6i_gateway = fl6->daddr; 2723 rt->rt6i_dst.addr = fl6->daddr; 2724 rt->rt6i_dst.plen = 128; 2725 rt->rt6i_idev = idev; 2726 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2727 2728 /* Add this dst into uncached_list so that rt6_disable_ip() can 2729 * do proper release of the net_device 2730 */ 2731 rt6_uncached_list_add(rt); 2732 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2733 2734 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2735 2736 out: 2737 return dst; 2738 } 2739 2740 static int ip6_dst_gc(struct dst_ops *ops) 2741 { 2742 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2743 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2744 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2745 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2746 int 
rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2747 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2748 int entries; 2749 2750 entries = dst_entries_get_fast(ops); 2751 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2752 entries <= rt_max_size) 2753 goto out; 2754 2755 net->ipv6.ip6_rt_gc_expire++; 2756 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2757 entries = dst_entries_get_slow(ops); 2758 if (entries < ops->gc_thresh) 2759 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2760 out: 2761 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2762 return entries > rt_max_size; 2763 } 2764 2765 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2766 struct fib6_config *cfg, 2767 const struct in6_addr *gw_addr, 2768 u32 tbid, int flags) 2769 { 2770 struct flowi6 fl6 = { 2771 .flowi6_oif = cfg->fc_ifindex, 2772 .daddr = *gw_addr, 2773 .saddr = cfg->fc_prefsrc, 2774 }; 2775 struct fib6_table *table; 2776 struct rt6_info *rt; 2777 2778 table = fib6_get_table(net, tbid); 2779 if (!table) 2780 return NULL; 2781 2782 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2783 flags |= RT6_LOOKUP_F_HAS_SADDR; 2784 2785 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2786 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2787 2788 /* if table lookup failed, fall back to full lookup */ 2789 if (rt == net->ipv6.ip6_null_entry) { 2790 ip6_rt_put(rt); 2791 rt = NULL; 2792 } 2793 2794 return rt; 2795 } 2796 2797 static int ip6_route_check_nh_onlink(struct net *net, 2798 struct fib6_config *cfg, 2799 const struct net_device *dev, 2800 struct netlink_ext_ack *extack) 2801 { 2802 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2803 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2804 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2805 struct fib6_info *from; 2806 struct rt6_info *grt; 2807 int err; 2808 2809 err = 0; 2810 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2811 if (grt) { 2812 rcu_read_lock(); 2813 from = rcu_dereference(grt->from); 2814 if (!grt->dst.error && 2815 /* ignore match if it is the default route */ 2816 from && !ipv6_addr_any(&from->fib6_dst.addr) && 2817 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2818 NL_SET_ERR_MSG(extack, 2819 "Nexthop has invalid gateway or device mismatch"); 2820 err = -EINVAL; 2821 } 2822 rcu_read_unlock(); 2823 2824 ip6_rt_put(grt); 2825 } 2826 2827 return err; 2828 } 2829 2830 static int ip6_route_check_nh(struct net *net, 2831 struct fib6_config *cfg, 2832 struct net_device **_dev, 2833 struct inet6_dev **idev) 2834 { 2835 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2836 struct net_device *dev = _dev ? 
*_dev : NULL; 2837 struct rt6_info *grt = NULL; 2838 int err = -EHOSTUNREACH; 2839 2840 if (cfg->fc_table) { 2841 int flags = RT6_LOOKUP_F_IFACE; 2842 2843 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2844 cfg->fc_table, flags); 2845 if (grt) { 2846 if (grt->rt6i_flags & RTF_GATEWAY || 2847 (dev && dev != grt->dst.dev)) { 2848 ip6_rt_put(grt); 2849 grt = NULL; 2850 } 2851 } 2852 } 2853 2854 if (!grt) 2855 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2856 2857 if (!grt) 2858 goto out; 2859 2860 if (dev) { 2861 if (dev != grt->dst.dev) { 2862 ip6_rt_put(grt); 2863 goto out; 2864 } 2865 } else { 2866 *_dev = dev = grt->dst.dev; 2867 *idev = grt->rt6i_idev; 2868 dev_hold(dev); 2869 in6_dev_hold(grt->rt6i_idev); 2870 } 2871 2872 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2873 err = 0; 2874 2875 ip6_rt_put(grt); 2876 2877 out: 2878 return err; 2879 } 2880 2881 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2882 struct net_device **_dev, struct inet6_dev **idev, 2883 struct netlink_ext_ack *extack) 2884 { 2885 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2886 int gwa_type = ipv6_addr_type(gw_addr); 2887 bool skip_dev = !(gwa_type & IPV6_ADDR_LINKLOCAL); 2888 const struct net_device *dev = *_dev; 2889 bool need_addr_check = !dev; 2890 int err = -EINVAL; 2891 2892 /* If gw_addr is local we can fail to detect this while the 2893 * address is still TENTATIVE (DAD in progress): rt6_lookup() 2894 * will return the already-added prefix route via the interface 2895 * the prefix route was assigned to, which might not be loopback. 2896 */ 2897 if (dev && 2898 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2899 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2900 goto out; 2901 } 2902 2903 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2904 /* IPv6 strictly inhibits using non-link-local 2905 * addresses as a nexthop address; otherwise a router 2906 * would not be able to send redirects. That is usually 2907 * desirable, but in some (rare!) circumstances 2908 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2909 * some exceptions. --ANK 2910 * We allow IPv4-mapped nexthops to support RFC 4798-style 2911 * addressing. 2912 */ 2913 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2914 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2915 goto out; 2916 } 2917 2918 if (cfg->fc_flags & RTNH_F_ONLINK) 2919 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2920 else 2921 err = ip6_route_check_nh(net, cfg, _dev, idev); 2922 2923 if (err) 2924 goto out; 2925 } 2926 2927 /* reload in case the device was changed */ 2928 dev = *_dev; 2929 2930 err = -EINVAL; 2931 if (!dev) { 2932 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2933 goto out; 2934 } else if (dev->flags & IFF_LOOPBACK) { 2935 NL_SET_ERR_MSG(extack, 2936 "Egress device can not be loopback device for this route"); 2937 goto out; 2938 } 2939 2940 /* if we did not check gw_addr above, do so now that the 2941 * egress device has been resolved.
2942 */ 2943 if (need_addr_check && 2944 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2945 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2946 goto out; 2947 } 2948 2949 err = 0; 2950 out: 2951 return err; 2952 } 2953 2954 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) 2955 { 2956 if ((flags & RTF_REJECT) || 2957 (dev && (dev->flags & IFF_LOOPBACK) && 2958 !(addr_type & IPV6_ADDR_LOOPBACK) && 2959 !(flags & RTF_LOCAL))) 2960 return true; 2961 2962 return false; 2963 } 2964 2965 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, 2966 struct fib6_config *cfg, gfp_t gfp_flags, 2967 struct netlink_ext_ack *extack) 2968 { 2969 struct net_device *dev = NULL; 2970 struct inet6_dev *idev = NULL; 2971 int addr_type; 2972 int err; 2973 2974 fib6_nh->fib_nh_family = AF_INET6; 2975 2976 err = -ENODEV; 2977 if (cfg->fc_ifindex) { 2978 dev = dev_get_by_index(net, cfg->fc_ifindex); 2979 if (!dev) 2980 goto out; 2981 idev = in6_dev_get(dev); 2982 if (!idev) 2983 goto out; 2984 } 2985 2986 if (cfg->fc_flags & RTNH_F_ONLINK) { 2987 if (!dev) { 2988 NL_SET_ERR_MSG(extack, 2989 "Nexthop device required for onlink"); 2990 goto out; 2991 } 2992 2993 if (!(dev->flags & IFF_UP)) { 2994 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2995 err = -ENETDOWN; 2996 goto out; 2997 } 2998 2999 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; 3000 } 3001 3002 fib6_nh->fib_nh_weight = 1; 3003 3004 /* We cannot add true routes via loopback here, 3005 * they would result in kernel looping; promote them to reject routes 3006 */ 3007 addr_type = ipv6_addr_type(&cfg->fc_dst); 3008 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { 3009 /* hold loopback dev/idev if we haven't done so. */ 3010 if (dev != net->loopback_dev) { 3011 if (dev) { 3012 dev_put(dev); 3013 in6_dev_put(idev); 3014 } 3015 dev = net->loopback_dev; 3016 dev_hold(dev); 3017 idev = in6_dev_get(dev); 3018 if (!idev) { 3019 err = -ENODEV; 3020 goto out; 3021 } 3022 } 3023 goto set_dev; 3024 } 3025 3026 if (cfg->fc_flags & RTF_GATEWAY) { 3027 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3028 if (err) 3029 goto out; 3030 3031 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3032 fib6_nh->fib_nh_gw_family = AF_INET6; 3033 } 3034 3035 err = -ENODEV; 3036 if (!dev) 3037 goto out; 3038 3039 if (idev->cnf.disable_ipv6) { 3040 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3041 err = -EACCES; 3042 goto out; 3043 } 3044 3045 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { 3046 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3047 err = -ENETDOWN; 3048 goto out; 3049 } 3050 3051 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3052 !netif_carrier_ok(dev)) 3053 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 3054 3055 err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap, 3056 cfg->fc_encap_type, cfg, gfp_flags, extack); 3057 if (err) 3058 goto out; 3059 set_dev: 3060 fib6_nh->fib_nh_dev = dev; 3061 fib6_nh->fib_nh_oif = dev->ifindex; 3062 err = 0; 3063 out: 3064 if (idev) 3065 in6_dev_put(idev); 3066 3067 if (err) { 3068 lwtstate_put(fib6_nh->fib_nh_lws); 3069 fib6_nh->fib_nh_lws = NULL; 3070 if (dev) 3071 dev_put(dev); 3072 } 3073 3074 return err; 3075 } 3076 3077 void fib6_nh_release(struct fib6_nh *fib6_nh) 3078 { 3079 fib_nh_common_release(&fib6_nh->nh_common); 3080 } 3081 3082 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3083 gfp_t gfp_flags, 3084 struct netlink_ext_ack *extack) 3085 { 3086 struct net *net = cfg->fc_nlinfo.nl_net; 3087 struct 
fib6_info *rt = NULL; 3088 struct fib6_table *table; 3089 int err = -EINVAL; 3090 int addr_type; 3091 3092 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3093 if (cfg->fc_flags & RTF_PCPU) { 3094 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3095 goto out; 3096 } 3097 3098 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3099 if (cfg->fc_flags & RTF_CACHE) { 3100 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3101 goto out; 3102 } 3103 3104 if (cfg->fc_type > RTN_MAX) { 3105 NL_SET_ERR_MSG(extack, "Invalid route type"); 3106 goto out; 3107 } 3108 3109 if (cfg->fc_dst_len > 128) { 3110 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3111 goto out; 3112 } 3113 if (cfg->fc_src_len > 128) { 3114 NL_SET_ERR_MSG(extack, "Invalid source address length"); 3115 goto out; 3116 } 3117 #ifndef CONFIG_IPV6_SUBTREES 3118 if (cfg->fc_src_len) { 3119 NL_SET_ERR_MSG(extack, 3120 "Specifying source address requires IPV6_SUBTREES to be enabled"); 3121 goto out; 3122 } 3123 #endif 3124 3125 err = -ENOBUFS; 3126 if (cfg->fc_nlinfo.nlh && 3127 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3128 table = fib6_get_table(net, cfg->fc_table); 3129 if (!table) { 3130 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3131 table = fib6_new_table(net, cfg->fc_table); 3132 } 3133 } else { 3134 table = fib6_new_table(net, cfg->fc_table); 3135 } 3136 3137 if (!table) 3138 goto out; 3139 3140 err = -ENOMEM; 3141 rt = fib6_info_alloc(gfp_flags); 3142 if (!rt) 3143 goto out; 3144 3145 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 3146 extack); 3147 if (IS_ERR(rt->fib6_metrics)) { 3148 err = PTR_ERR(rt->fib6_metrics); 3149 /* Do not leave garbage there. */ 3150 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3151 goto out; 3152 } 3153 3154 if (cfg->fc_flags & RTF_ADDRCONF) 3155 rt->dst_nocount = true; 3156 3157 if (cfg->fc_flags & RTF_EXPIRES) 3158 fib6_set_expires(rt, jiffies + 3159 clock_t_to_jiffies(cfg->fc_expires)); 3160 else 3161 fib6_clean_expires(rt); 3162 3163 if (cfg->fc_protocol == RTPROT_UNSPEC) 3164 cfg->fc_protocol = RTPROT_BOOT; 3165 rt->fib6_protocol = cfg->fc_protocol; 3166 3167 rt->fib6_table = table; 3168 rt->fib6_metric = cfg->fc_metric; 3169 rt->fib6_type = cfg->fc_type; 3170 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3171 3172 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3173 rt->fib6_dst.plen = cfg->fc_dst_len; 3174 if (rt->fib6_dst.plen == 128) 3175 rt->dst_host = true; 3176 3177 #ifdef CONFIG_IPV6_SUBTREES 3178 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3179 rt->fib6_src.plen = cfg->fc_src_len; 3180 #endif 3181 err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); 3182 if (err) 3183 goto out; 3184 3185 /* We cannot add true routes via loopback here, 3186 * they would result in kernel looping; promote them to reject routes 3187 */ 3188 addr_type = ipv6_addr_type(&cfg->fc_dst); 3189 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) 3190 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3191 3192 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3193 struct net_device *dev = fib6_info_nh_dev(rt); 3194 3195 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3196 NL_SET_ERR_MSG(extack, "Invalid source address"); 3197 err = -EINVAL; 3198 goto out; 3199 } 3200 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3201 rt->fib6_prefsrc.plen = 128; 3202 } else 3203 rt->fib6_prefsrc.plen = 0; 3204 3205 return rt; 3206 out: 3207 
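	/* rt may still be NULL if we failed before the allocation;
	 * fib6_info_release() is expected to tolerate a NULL argument.
	 */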
fib6_info_release(rt); 3208 return ERR_PTR(err); 3209 } 3210 3211 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3212 struct netlink_ext_ack *extack) 3213 { 3214 struct fib6_info *rt; 3215 int err; 3216 3217 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3218 if (IS_ERR(rt)) 3219 return PTR_ERR(rt); 3220 3221 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3222 fib6_info_release(rt); 3223 3224 return err; 3225 } 3226 3227 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3228 { 3229 struct net *net = info->nl_net; 3230 struct fib6_table *table; 3231 int err; 3232 3233 if (rt == net->ipv6.fib6_null_entry) { 3234 err = -ENOENT; 3235 goto out; 3236 } 3237 3238 table = rt->fib6_table; 3239 spin_lock_bh(&table->tb6_lock); 3240 err = fib6_del(rt, info); 3241 spin_unlock_bh(&table->tb6_lock); 3242 3243 out: 3244 fib6_info_release(rt); 3245 return err; 3246 } 3247 3248 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3249 { 3250 struct nl_info info = { .nl_net = net }; 3251 3252 return __ip6_del_rt(rt, &info); 3253 } 3254 3255 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3256 { 3257 struct nl_info *info = &cfg->fc_nlinfo; 3258 struct net *net = info->nl_net; 3259 struct sk_buff *skb = NULL; 3260 struct fib6_table *table; 3261 int err = -ENOENT; 3262 3263 if (rt == net->ipv6.fib6_null_entry) 3264 goto out_put; 3265 table = rt->fib6_table; 3266 spin_lock_bh(&table->tb6_lock); 3267 3268 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3269 struct fib6_info *sibling, *next_sibling; 3270 3271 /* prefer to send a single notification with all hops */ 3272 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3273 if (skb) { 3274 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3275 3276 if (rt6_fill_node(net, skb, rt, NULL, 3277 NULL, NULL, 0, RTM_DELROUTE, 3278 info->portid, seq, 0) < 0) { 3279 kfree_skb(skb); 3280 skb = NULL; 3281 } else 3282 info->skip_notify = 1; 3283 } 3284 3285 list_for_each_entry_safe(sibling, next_sibling, 3286 &rt->fib6_siblings, 3287 fib6_siblings) { 3288 err = fib6_del(sibling, info); 3289 if (err) 3290 goto out_unlock; 3291 } 3292 } 3293 3294 err = fib6_del(rt, info); 3295 out_unlock: 3296 spin_unlock_bh(&table->tb6_lock); 3297 out_put: 3298 fib6_info_release(rt); 3299 3300 if (skb) { 3301 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3302 info->nlh, gfp_any()); 3303 } 3304 return err; 3305 } 3306 3307 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3308 { 3309 int rc = -ESRCH; 3310 3311 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3312 goto out; 3313 3314 if (cfg->fc_flags & RTF_GATEWAY && 3315 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3316 goto out; 3317 3318 rc = rt6_remove_exception_rt(rt); 3319 out: 3320 return rc; 3321 } 3322 3323 static int ip6_route_del(struct fib6_config *cfg, 3324 struct netlink_ext_ack *extack) 3325 { 3326 struct rt6_info *rt_cache; 3327 struct fib6_table *table; 3328 struct fib6_info *rt; 3329 struct fib6_node *fn; 3330 int err = -ESRCH; 3331 3332 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3333 if (!table) { 3334 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3335 return err; 3336 } 3337 3338 rcu_read_lock(); 3339 3340 fn = fib6_locate(&table->tb6_root, 3341 &cfg->fc_dst, cfg->fc_dst_len, 3342 &cfg->fc_src, cfg->fc_src_len, 3343 !(cfg->fc_flags & RTF_CACHE)); 3344 3345 if (fn) { 3346 for_each_fib6_node_rt_rcu(fn) { 3347 struct fib6_nh *nh; 3348 3349 if (cfg->fc_flags & RTF_CACHE) { 
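			/* An RTM_F_CLONED request targets a cached clone
			 * in the exception table rather than the fib
			 * entry itself.
			 */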
3350 struct fib6_result res = { 3351 .f6i = rt, 3352 }; 3353 int rc; 3354 3355 rt_cache = rt6_find_cached_rt(&res, 3356 &cfg->fc_dst, 3357 &cfg->fc_src); 3358 if (rt_cache) { 3359 rc = ip6_del_cached_rt(rt_cache, cfg); 3360 if (rc != -ESRCH) { 3361 rcu_read_unlock(); 3362 return rc; 3363 } 3364 } 3365 continue; 3366 } 3367 3368 nh = &rt->fib6_nh; 3369 if (cfg->fc_ifindex && 3370 (!nh->fib_nh_dev || 3371 nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) 3372 continue; 3373 if (cfg->fc_flags & RTF_GATEWAY && 3374 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) 3375 continue; 3376 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3377 continue; 3378 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3379 continue; 3380 if (!fib6_info_hold_safe(rt)) 3381 continue; 3382 rcu_read_unlock(); 3383 3384 /* if gateway was specified only delete the one hop */ 3385 if (cfg->fc_flags & RTF_GATEWAY) 3386 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3387 3388 return __ip6_del_rt_siblings(rt, cfg); 3389 } 3390 } 3391 rcu_read_unlock(); 3392 3393 return err; 3394 } 3395 3396 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3397 { 3398 struct netevent_redirect netevent; 3399 struct rt6_info *rt, *nrt = NULL; 3400 struct fib6_result res = {}; 3401 struct ndisc_options ndopts; 3402 struct inet6_dev *in6_dev; 3403 struct neighbour *neigh; 3404 struct rd_msg *msg; 3405 int optlen, on_link; 3406 u8 *lladdr; 3407 3408 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3409 optlen -= sizeof(*msg); 3410 3411 if (optlen < 0) { 3412 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3413 return; 3414 } 3415 3416 msg = (struct rd_msg *)icmp6_hdr(skb); 3417 3418 if (ipv6_addr_is_multicast(&msg->dest)) { 3419 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3420 return; 3421 } 3422 3423 on_link = 0; 3424 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3425 on_link = 1; 3426 } else if (ipv6_addr_type(&msg->target) != 3427 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3428 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3429 return; 3430 } 3431 3432 in6_dev = __in6_dev_get(skb->dev); 3433 if (!in6_dev) 3434 return; 3435 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3436 return; 3437 3438 /* RFC2461 8.1: 3439 * The IP source address of the Redirect MUST be the same as the current 3440 * first-hop router for the specified ICMP Destination Address. 3441 */ 3442 3443 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3444 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3445 return; 3446 } 3447 3448 lladdr = NULL; 3449 if (ndopts.nd_opts_tgt_lladdr) { 3450 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3451 skb->dev); 3452 if (!lladdr) { 3453 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3454 return; 3455 } 3456 } 3457 3458 rt = (struct rt6_info *) dst; 3459 if (rt->rt6i_flags & RTF_REJECT) { 3460 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3461 return; 3462 } 3463 3464 /* Redirect received -> path was valid. 3465 * Look, redirects are sent only in response to data packets, 3466 * so that this nexthop apparently is reachable. --ANK 3467 */ 3468 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3469 3470 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3471 if (!neigh) 3472 return; 3473 3474 /* 3475 * We have finally decided to accept it. 
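 * From here on the neighbour entry is refreshed from the redirect's
 * target link-layer address option, and a RTF_CACHE clone pointing at
 * the new gateway is inserted into the parent fib6_info's exception
 * table.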
3476 */ 3477 3478 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3479 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3480 NEIGH_UPDATE_F_OVERRIDE| 3481 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3482 NEIGH_UPDATE_F_ISROUTER)), 3483 NDISC_REDIRECT, &ndopts); 3484 3485 rcu_read_lock(); 3486 res.f6i = rcu_dereference(rt->from); 3487 /* This fib6_info_hold() is safe here because we hold a reference 3488 * to rt, and rt already holds a reference to its fib6_info. 3489 */ 3490 fib6_info_hold(res.f6i); 3491 rcu_read_unlock(); 3492 3493 res.nh = &res.f6i->fib6_nh; 3494 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); 3495 if (!nrt) 3496 goto out; 3497 3498 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3499 if (on_link) 3500 nrt->rt6i_flags &= ~RTF_GATEWAY; 3501 3502 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3503 3504 /* No need to remove rt from the exception table if rt is 3505 * a cached route, because rt6_insert_exception() will 3506 * take care of it. 3507 */ 3508 if (rt6_insert_exception(nrt, &res)) { 3509 dst_release_immediate(&nrt->dst); 3510 goto out; 3511 } 3512 3513 netevent.old = &rt->dst; 3514 netevent.new = &nrt->dst; 3515 netevent.daddr = &msg->dest; 3516 netevent.neigh = neigh; 3517 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3518 3519 out: 3520 fib6_info_release(res.f6i); 3521 neigh_release(neigh); 3522 } 3523 3524 #ifdef CONFIG_IPV6_ROUTE_INFO 3525 static struct fib6_info *rt6_get_route_info(struct net *net, 3526 const struct in6_addr *prefix, int prefixlen, 3527 const struct in6_addr *gwaddr, 3528 struct net_device *dev) 3529 { 3530 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3531 int ifindex = dev->ifindex; 3532 struct fib6_node *fn; 3533 struct fib6_info *rt = NULL; 3534 struct fib6_table *table; 3535 3536 table = fib6_get_table(net, tb_id); 3537 if (!table) 3538 return NULL; 3539 3540 rcu_read_lock(); 3541 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3542 if (!fn) 3543 goto out; 3544 3545 for_each_fib6_node_rt_rcu(fn) { 3546 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) 3547 continue; 3548 if (!(rt->fib6_flags & RTF_ROUTEINFO) || 3549 !rt->fib6_nh.fib_nh_gw_family) 3550 continue; 3551 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) 3552 continue; 3553 if (!fib6_info_hold_safe(rt)) 3554 continue; 3555 break; 3556 } 3557 out: 3558 rcu_read_unlock(); 3559 return rt; 3560 } 3561 3562 static struct fib6_info *rt6_add_route_info(struct net *net, 3563 const struct in6_addr *prefix, int prefixlen, 3564 const struct in6_addr *gwaddr, 3565 struct net_device *dev, 3566 unsigned int pref) 3567 { 3568 struct fib6_config cfg = { 3569 .fc_metric = IP6_RT_PRIO_USER, 3570 .fc_ifindex = dev->ifindex, 3571 .fc_dst_len = prefixlen, 3572 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3573 RTF_UP | RTF_PREF(pref), 3574 .fc_protocol = RTPROT_RA, 3575 .fc_type = RTN_UNICAST, 3576 .fc_nlinfo.portid = 0, 3577 .fc_nlinfo.nlh = NULL, 3578 .fc_nlinfo.nl_net = net, 3579 }; 3580 3581 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3582 cfg.fc_dst = *prefix; 3583 cfg.fc_gateway = *gwaddr; 3584 3585 /* We should treat it as a default route if prefix length is 0.
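 * (RTF_DEFAULT is the flag that rt6_get_dflt_router() and the purge
 * helpers key on, so a zero-length route information prefix behaves
 * like an RA default route.)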
*/ 3586 if (!prefixlen) 3587 cfg.fc_flags |= RTF_DEFAULT; 3588 3589 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3590 3591 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3592 } 3593 #endif 3594 3595 struct fib6_info *rt6_get_dflt_router(struct net *net, 3596 const struct in6_addr *addr, 3597 struct net_device *dev) 3598 { 3599 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3600 struct fib6_info *rt; 3601 struct fib6_table *table; 3602 3603 table = fib6_get_table(net, tb_id); 3604 if (!table) 3605 return NULL; 3606 3607 rcu_read_lock(); 3608 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3609 struct fib6_nh *nh = &rt->fib6_nh; 3610 3611 if (dev == nh->fib_nh_dev && 3612 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3613 ipv6_addr_equal(&nh->fib_nh_gw6, addr)) 3614 break; 3615 } 3616 if (rt && !fib6_info_hold_safe(rt)) 3617 rt = NULL; 3618 rcu_read_unlock(); 3619 return rt; 3620 } 3621 3622 struct fib6_info *rt6_add_dflt_router(struct net *net, 3623 const struct in6_addr *gwaddr, 3624 struct net_device *dev, 3625 unsigned int pref) 3626 { 3627 struct fib6_config cfg = { 3628 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3629 .fc_metric = IP6_RT_PRIO_USER, 3630 .fc_ifindex = dev->ifindex, 3631 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3632 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3633 .fc_protocol = RTPROT_RA, 3634 .fc_type = RTN_UNICAST, 3635 .fc_nlinfo.portid = 0, 3636 .fc_nlinfo.nlh = NULL, 3637 .fc_nlinfo.nl_net = net, 3638 }; 3639 3640 cfg.fc_gateway = *gwaddr; 3641 3642 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3643 struct fib6_table *table; 3644 3645 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3646 if (table) 3647 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3648 } 3649 3650 return rt6_get_dflt_router(net, gwaddr, dev); 3651 } 3652 3653 static void __rt6_purge_dflt_routers(struct net *net, 3654 struct fib6_table *table) 3655 { 3656 struct fib6_info *rt; 3657 3658 restart: 3659 rcu_read_lock(); 3660 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3661 struct net_device *dev = fib6_info_nh_dev(rt); 3662 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3663 3664 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3665 (!idev || idev->cnf.accept_ra != 2) && 3666 fib6_info_hold_safe(rt)) { 3667 rcu_read_unlock(); 3668 ip6_del_rt(net, rt); 3669 goto restart; 3670 } 3671 } 3672 rcu_read_unlock(); 3673 3674 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3675 } 3676 3677 void rt6_purge_dflt_routers(struct net *net) 3678 { 3679 struct fib6_table *table; 3680 struct hlist_head *head; 3681 unsigned int h; 3682 3683 rcu_read_lock(); 3684 3685 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3686 head = &net->ipv6.fib_table_hash[h]; 3687 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3688 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3689 __rt6_purge_dflt_routers(net, table); 3690 } 3691 } 3692 3693 rcu_read_unlock(); 3694 } 3695 3696 static void rtmsg_to_fib6_config(struct net *net, 3697 struct in6_rtmsg *rtmsg, 3698 struct fib6_config *cfg) 3699 { 3700 *cfg = (struct fib6_config){ 3701 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3702 : RT6_TABLE_MAIN, 3703 .fc_ifindex = rtmsg->rtmsg_ifindex, 3704 .fc_metric = rtmsg->rtmsg_metric ? 
: IP6_RT_PRIO_USER, 3705 .fc_expires = rtmsg->rtmsg_info, 3706 .fc_dst_len = rtmsg->rtmsg_dst_len, 3707 .fc_src_len = rtmsg->rtmsg_src_len, 3708 .fc_flags = rtmsg->rtmsg_flags, 3709 .fc_type = rtmsg->rtmsg_type, 3710 3711 .fc_nlinfo.nl_net = net, 3712 3713 .fc_dst = rtmsg->rtmsg_dst, 3714 .fc_src = rtmsg->rtmsg_src, 3715 .fc_gateway = rtmsg->rtmsg_gateway, 3716 }; 3717 } 3718 3719 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3720 { 3721 struct fib6_config cfg; 3722 struct in6_rtmsg rtmsg; 3723 int err; 3724 3725 switch (cmd) { 3726 case SIOCADDRT: /* Add a route */ 3727 case SIOCDELRT: /* Delete a route */ 3728 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3729 return -EPERM; 3730 err = copy_from_user(&rtmsg, arg, 3731 sizeof(struct in6_rtmsg)); 3732 if (err) 3733 return -EFAULT; 3734 3735 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3736 3737 rtnl_lock(); 3738 switch (cmd) { 3739 case SIOCADDRT: 3740 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3741 break; 3742 case SIOCDELRT: 3743 err = ip6_route_del(&cfg, NULL); 3744 break; 3745 default: 3746 err = -EINVAL; 3747 } 3748 rtnl_unlock(); 3749 3750 return err; 3751 } 3752 3753 return -EINVAL; 3754 } 3755 3756 /* 3757 * Drop the packet on the floor 3758 */ 3759 3760 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3761 { 3762 int type; 3763 struct dst_entry *dst = skb_dst(skb); 3764 switch (ipstats_mib_noroutes) { 3765 case IPSTATS_MIB_INNOROUTES: 3766 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3767 if (type == IPV6_ADDR_ANY) { 3768 IP6_INC_STATS(dev_net(dst->dev), 3769 __in6_dev_get_safely(skb->dev), 3770 IPSTATS_MIB_INADDRERRORS); 3771 break; 3772 } 3773 /* FALLTHROUGH */ 3774 case IPSTATS_MIB_OUTNOROUTES: 3775 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3776 ipstats_mib_noroutes); 3777 break; 3778 } 3779 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3780 kfree_skb(skb); 3781 return 0; 3782 } 3783 3784 static int ip6_pkt_discard(struct sk_buff *skb) 3785 { 3786 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3787 } 3788 3789 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3790 { 3791 skb->dev = skb_dst(skb)->dev; 3792 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3793 } 3794 3795 static int ip6_pkt_prohibit(struct sk_buff *skb) 3796 { 3797 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3798 } 3799 3800 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3801 { 3802 skb->dev = skb_dst(skb)->dev; 3803 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3804 } 3805 3806 /* 3807 * Allocate a dst for local (unicast / anycast) address. 3808 */ 3809 3810 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3811 struct inet6_dev *idev, 3812 const struct in6_addr *addr, 3813 bool anycast, gfp_t gfp_flags) 3814 { 3815 struct fib6_config cfg = { 3816 .fc_table = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL, 3817 .fc_ifindex = idev->dev->ifindex, 3818 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP, 3819 .fc_dst = *addr, 3820 .fc_dst_len = 128, 3821 .fc_protocol = RTPROT_KERNEL, 3822 .fc_nlinfo.nl_net = net, 3823 .fc_ignore_dev_down = true, 3824 }; 3825 3826 if (anycast) { 3827 cfg.fc_type = RTN_ANYCAST; 3828 cfg.fc_flags |= RTF_ANYCAST; 3829 } else { 3830 cfg.fc_type = RTN_LOCAL; 3831 cfg.fc_flags |= RTF_LOCAL; 3832 } 3833 3834 return ip6_route_info_create(&cfg, gfp_flags, NULL); 3835 } 3836 3837 /* remove a deleted address from the prefsrc entries */ 3838 struct arg_dev_net_ip { 3839 struct net_device *dev; 3840 struct net *net; 3841 struct in6_addr *addr; 3842 }; 3843 3844 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3845 { 3846 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3847 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3848 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3849 3850 if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && 3851 rt != net->ipv6.fib6_null_entry && 3852 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3853 spin_lock_bh(&rt6_exception_lock); 3854 /* remove prefsrc entry */ 3855 rt->fib6_prefsrc.plen = 0; 3856 spin_unlock_bh(&rt6_exception_lock); 3857 } 3858 return 0; 3859 } 3860 3861 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3862 { 3863 struct net *net = dev_net(ifp->idev->dev); 3864 struct arg_dev_net_ip adni = { 3865 .dev = ifp->idev->dev, 3866 .net = net, 3867 .addr = &ifp->addr, 3868 }; 3869 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3870 } 3871 3872 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) 3873 3874 /* Remove routers and update dst entries when a gateway turns into a host. */ 3875 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3876 { 3877 struct in6_addr *gateway = (struct in6_addr *)arg; 3878 3879 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3880 rt->fib6_nh.fib_nh_gw_family && 3881 ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { 3882 return -1; 3883 } 3884 3885 /* Further clean up cached routes in the exception table. 3886 * This is needed because a cached route may have a different 3887 * gateway than its 'parent' in the case of an ip redirect.
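 * (rt6_exceptions_clean_tohost() below is expected to drop exactly
 * those cached entries whose gateway matches the demoted router.)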
3888 */ 3889 rt6_exceptions_clean_tohost(rt, gateway); 3890 3891 return 0; 3892 } 3893 3894 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3895 { 3896 fib6_clean_all(net, fib6_clean_tohost, gateway); 3897 } 3898 3899 struct arg_netdev_event { 3900 const struct net_device *dev; 3901 union { 3902 unsigned int nh_flags; 3903 unsigned long event; 3904 }; 3905 }; 3906 3907 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3908 { 3909 struct fib6_info *iter; 3910 struct fib6_node *fn; 3911 3912 fn = rcu_dereference_protected(rt->fib6_node, 3913 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3914 iter = rcu_dereference_protected(fn->leaf, 3915 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3916 while (iter) { 3917 if (iter->fib6_metric == rt->fib6_metric && 3918 rt6_qualify_for_ecmp(iter)) 3919 return iter; 3920 iter = rcu_dereference_protected(iter->fib6_next, 3921 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3922 } 3923 3924 return NULL; 3925 } 3926 3927 static bool rt6_is_dead(const struct fib6_info *rt) 3928 { 3929 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || 3930 (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && 3931 ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) 3932 return true; 3933 3934 return false; 3935 } 3936 3937 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3938 { 3939 struct fib6_info *iter; 3940 int total = 0; 3941 3942 if (!rt6_is_dead(rt)) 3943 total += rt->fib6_nh.fib_nh_weight; 3944 3945 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3946 if (!rt6_is_dead(iter)) 3947 total += iter->fib6_nh.fib_nh_weight; 3948 } 3949 3950 return total; 3951 } 3952 3953 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3954 { 3955 int upper_bound = -1; 3956 3957 if (!rt6_is_dead(rt)) { 3958 *weight += rt->fib6_nh.fib_nh_weight; 3959 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3960 total) - 1; 3961 } 3962 atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); 3963 } 3964 3965 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3966 { 3967 struct fib6_info *iter; 3968 int weight = 0; 3969 3970 rt6_upper_bound_set(rt, &weight, total); 3971 3972 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3973 rt6_upper_bound_set(iter, &weight, total); 3974 } 3975 3976 void rt6_multipath_rebalance(struct fib6_info *rt) 3977 { 3978 struct fib6_info *first; 3979 int total; 3980 3981 /* In case the entire multipath route was marked for flushing, 3982 * then there is no need to rebalance upon the removal of every 3983 * sibling route. 3984 */ 3985 if (!rt->fib6_nsiblings || rt->should_flush) 3986 return; 3987 3988 /* During lookup routes are evaluated in order, so we need to 3989 * make sure upper bounds are assigned from the first sibling 3990 * onwards. 
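 * As an illustration with made-up weights: two live siblings weighted
 * 1 and 3 get cumulative upper bounds at roughly 1/4 and 4/4 of the
 * 31-bit hash space, so hashed flows split about 1:3 between them.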
3991 */ 3992 first = rt6_multipath_first_sibling(rt); 3993 if (WARN_ON_ONCE(!first)) 3994 return; 3995 3996 total = rt6_multipath_total_weight(first); 3997 rt6_multipath_upper_bound_set(first, total); 3998 } 3999 4000 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 4001 { 4002 const struct arg_netdev_event *arg = p_arg; 4003 struct net *net = dev_net(arg->dev); 4004 4005 if (rt != net->ipv6.fib6_null_entry && 4006 rt->fib6_nh.fib_nh_dev == arg->dev) { 4007 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; 4008 fib6_update_sernum_upto_root(net, rt); 4009 rt6_multipath_rebalance(rt); 4010 } 4011 4012 return 0; 4013 } 4014 4015 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 4016 { 4017 struct arg_netdev_event arg = { 4018 .dev = dev, 4019 { 4020 .nh_flags = nh_flags, 4021 }, 4022 }; 4023 4024 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 4025 arg.nh_flags |= RTNH_F_LINKDOWN; 4026 4027 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 4028 } 4029 4030 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 4031 const struct net_device *dev) 4032 { 4033 struct fib6_info *iter; 4034 4035 if (rt->fib6_nh.fib_nh_dev == dev) 4036 return true; 4037 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4038 if (iter->fib6_nh.fib_nh_dev == dev) 4039 return true; 4040 4041 return false; 4042 } 4043 4044 static void rt6_multipath_flush(struct fib6_info *rt) 4045 { 4046 struct fib6_info *iter; 4047 4048 rt->should_flush = 1; 4049 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4050 iter->should_flush = 1; 4051 } 4052 4053 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 4054 const struct net_device *down_dev) 4055 { 4056 struct fib6_info *iter; 4057 unsigned int dead = 0; 4058 4059 if (rt->fib6_nh.fib_nh_dev == down_dev || 4060 rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 4061 dead++; 4062 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4063 if (iter->fib6_nh.fib_nh_dev == down_dev || 4064 iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 4065 dead++; 4066 4067 return dead; 4068 } 4069 4070 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4071 const struct net_device *dev, 4072 unsigned int nh_flags) 4073 { 4074 struct fib6_info *iter; 4075 4076 if (rt->fib6_nh.fib_nh_dev == dev) 4077 rt->fib6_nh.fib_nh_flags |= nh_flags; 4078 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4079 if (iter->fib6_nh.fib_nh_dev == dev) 4080 iter->fib6_nh.fib_nh_flags |= nh_flags; 4081 } 4082 4083 /* called with write lock held for table with rt */ 4084 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4085 { 4086 const struct arg_netdev_event *arg = p_arg; 4087 const struct net_device *dev = arg->dev; 4088 struct net *net = dev_net(dev); 4089 4090 if (rt == net->ipv6.fib6_null_entry) 4091 return 0; 4092 4093 switch (arg->event) { 4094 case NETDEV_UNREGISTER: 4095 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; 4096 case NETDEV_DOWN: 4097 if (rt->should_flush) 4098 return -1; 4099 if (!rt->fib6_nsiblings) 4100 return rt->fib6_nh.fib_nh_dev == dev ? 
-1 : 0; 4101 if (rt6_multipath_uses_dev(rt, dev)) { 4102 unsigned int count; 4103 4104 count = rt6_multipath_dead_count(rt, dev); 4105 if (rt->fib6_nsiblings + 1 == count) { 4106 rt6_multipath_flush(rt); 4107 return -1; 4108 } 4109 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4110 RTNH_F_LINKDOWN); 4111 fib6_update_sernum(net, rt); 4112 rt6_multipath_rebalance(rt); 4113 } 4114 return -2; 4115 case NETDEV_CHANGE: 4116 if (rt->fib6_nh.fib_nh_dev != dev || 4117 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4118 break; 4119 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; 4120 rt6_multipath_rebalance(rt); 4121 break; 4122 } 4123 4124 return 0; 4125 } 4126 4127 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4128 { 4129 struct arg_netdev_event arg = { 4130 .dev = dev, 4131 { 4132 .event = event, 4133 }, 4134 }; 4135 struct net *net = dev_net(dev); 4136 4137 if (net->ipv6.sysctl.skip_notify_on_dev_down) 4138 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); 4139 else 4140 fib6_clean_all(net, fib6_ifdown, &arg); 4141 } 4142 4143 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4144 { 4145 rt6_sync_down_dev(dev, event); 4146 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4147 neigh_ifdown(&nd_tbl, dev); 4148 } 4149 4150 struct rt6_mtu_change_arg { 4151 struct net_device *dev; 4152 unsigned int mtu; 4153 }; 4154 4155 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4156 { 4157 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4158 struct inet6_dev *idev; 4159 4160 /* In IPv6, pmtu discovery is not optional, 4161 so the RTAX_MTU metric lock cannot disable it. 4162 We still use this lock to block changes 4163 caused by addrconf/ndisc. 4164 */ 4165 4166 idev = __in6_dev_get(arg->dev); 4167 if (!idev) 4168 return 0; 4169 4170 /* There is no way to discover an IPv6 PMTU increase after an 4171 administrative MTU increase, so the PMTU must be raised here. 4172 Since RFC 1981 doesn't cover the administrative MTU increase 4173 case, updating the PMTU on such an increase is a MUST. (i.e.
jumbo frame) 4174 */ 4175 if (rt->fib6_nh.fib_nh_dev == arg->dev && 4176 !fib6_metric_locked(rt, RTAX_MTU)) { 4177 u32 mtu = rt->fib6_pmtu; 4178 4179 if (mtu >= arg->mtu || 4180 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4181 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4182 4183 spin_lock_bh(&rt6_exception_lock); 4184 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4185 spin_unlock_bh(&rt6_exception_lock); 4186 } 4187 return 0; 4188 } 4189 4190 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4191 { 4192 struct rt6_mtu_change_arg arg = { 4193 .dev = dev, 4194 .mtu = mtu, 4195 }; 4196 4197 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4198 } 4199 4200 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4201 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4202 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4203 [RTA_OIF] = { .type = NLA_U32 }, 4204 [RTA_IIF] = { .type = NLA_U32 }, 4205 [RTA_PRIORITY] = { .type = NLA_U32 }, 4206 [RTA_METRICS] = { .type = NLA_NESTED }, 4207 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4208 [RTA_PREF] = { .type = NLA_U8 }, 4209 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4210 [RTA_ENCAP] = { .type = NLA_NESTED }, 4211 [RTA_EXPIRES] = { .type = NLA_U32 }, 4212 [RTA_UID] = { .type = NLA_U32 }, 4213 [RTA_MARK] = { .type = NLA_U32 }, 4214 [RTA_TABLE] = { .type = NLA_U32 }, 4215 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4216 [RTA_SPORT] = { .type = NLA_U16 }, 4217 [RTA_DPORT] = { .type = NLA_U16 }, 4218 }; 4219 4220 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4221 struct fib6_config *cfg, 4222 struct netlink_ext_ack *extack) 4223 { 4224 struct rtmsg *rtm; 4225 struct nlattr *tb[RTA_MAX+1]; 4226 unsigned int pref; 4227 int err; 4228 4229 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4230 extack); 4231 if (err < 0) 4232 goto errout; 4233 4234 err = -EINVAL; 4235 rtm = nlmsg_data(nlh); 4236 4237 *cfg = (struct fib6_config){ 4238 .fc_table = rtm->rtm_table, 4239 .fc_dst_len = rtm->rtm_dst_len, 4240 .fc_src_len = rtm->rtm_src_len, 4241 .fc_flags = RTF_UP, 4242 .fc_protocol = rtm->rtm_protocol, 4243 .fc_type = rtm->rtm_type, 4244 4245 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 4246 .fc_nlinfo.nlh = nlh, 4247 .fc_nlinfo.nl_net = sock_net(skb->sk), 4248 }; 4249 4250 if (rtm->rtm_type == RTN_UNREACHABLE || 4251 rtm->rtm_type == RTN_BLACKHOLE || 4252 rtm->rtm_type == RTN_PROHIBIT || 4253 rtm->rtm_type == RTN_THROW) 4254 cfg->fc_flags |= RTF_REJECT; 4255 4256 if (rtm->rtm_type == RTN_LOCAL) 4257 cfg->fc_flags |= RTF_LOCAL; 4258 4259 if (rtm->rtm_flags & RTM_F_CLONED) 4260 cfg->fc_flags |= RTF_CACHE; 4261 4262 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4263 4264 if (tb[RTA_GATEWAY]) { 4265 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4266 cfg->fc_flags |= RTF_GATEWAY; 4267 } 4268 if (tb[RTA_VIA]) { 4269 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 4270 goto errout; 4271 } 4272 4273 if (tb[RTA_DST]) { 4274 int plen = (rtm->rtm_dst_len + 7) >> 3; 4275 4276 if (nla_len(tb[RTA_DST]) < plen) 4277 goto errout; 4278 4279 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4280 } 4281 4282 if (tb[RTA_SRC]) { 4283 int plen = (rtm->rtm_src_len + 7) >> 3; 4284 4285 if (nla_len(tb[RTA_SRC]) < plen) 4286 goto errout; 4287 4288 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4289 } 4290 4291 if (tb[RTA_PREFSRC]) 4292 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4293 4294 if (tb[RTA_OIF]) 4295 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4296 4297 if 
(tb[RTA_PRIORITY]) 4298 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4299 4300 if (tb[RTA_METRICS]) { 4301 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4302 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4303 } 4304 4305 if (tb[RTA_TABLE]) 4306 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4307 4308 if (tb[RTA_MULTIPATH]) { 4309 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4310 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4311 4312 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4313 cfg->fc_mp_len, extack); 4314 if (err < 0) 4315 goto errout; 4316 } 4317 4318 if (tb[RTA_PREF]) { 4319 pref = nla_get_u8(tb[RTA_PREF]); 4320 if (pref != ICMPV6_ROUTER_PREF_LOW && 4321 pref != ICMPV6_ROUTER_PREF_HIGH) 4322 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4323 cfg->fc_flags |= RTF_PREF(pref); 4324 } 4325 4326 if (tb[RTA_ENCAP]) 4327 cfg->fc_encap = tb[RTA_ENCAP]; 4328 4329 if (tb[RTA_ENCAP_TYPE]) { 4330 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4331 4332 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4333 if (err < 0) 4334 goto errout; 4335 } 4336 4337 if (tb[RTA_EXPIRES]) { 4338 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4339 4340 if (addrconf_finite_timeout(timeout)) { 4341 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4342 cfg->fc_flags |= RTF_EXPIRES; 4343 } 4344 } 4345 4346 err = 0; 4347 errout: 4348 return err; 4349 } 4350 4351 struct rt6_nh { 4352 struct fib6_info *fib6_info; 4353 struct fib6_config r_cfg; 4354 struct list_head next; 4355 }; 4356 4357 static int ip6_route_info_append(struct net *net, 4358 struct list_head *rt6_nh_list, 4359 struct fib6_info *rt, 4360 struct fib6_config *r_cfg) 4361 { 4362 struct rt6_nh *nh; 4363 int err = -EEXIST; 4364 4365 list_for_each_entry(nh, rt6_nh_list, next) { 4366 /* check if fib6_info already exists */ 4367 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4368 return err; 4369 } 4370 4371 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4372 if (!nh) 4373 return -ENOMEM; 4374 nh->fib6_info = rt; 4375 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4376 list_add_tail(&nh->next, rt6_nh_list); 4377 4378 return 0; 4379 } 4380 4381 static void ip6_route_mpath_notify(struct fib6_info *rt, 4382 struct fib6_info *rt_last, 4383 struct nl_info *info, 4384 __u16 nlflags) 4385 { 4386 /* if this is an APPEND route, then rt points to the first route 4387 * inserted and rt_last points to last route inserted. Userspace 4388 * wants a consistent dump of the route which starts at the first 4389 * nexthop. Since sibling routes are always added at the end of 4390 * the list, find the first sibling of the last route appended 4391 */ 4392 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4393 rt = list_first_entry(&rt_last->fib6_siblings, 4394 struct fib6_info, 4395 fib6_siblings); 4396 } 4397 4398 if (rt) 4399 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4400 } 4401 4402 static int ip6_route_multipath_add(struct fib6_config *cfg, 4403 struct netlink_ext_ack *extack) 4404 { 4405 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4406 struct nl_info *info = &cfg->fc_nlinfo; 4407 struct fib6_config r_cfg; 4408 struct rtnexthop *rtnh; 4409 struct fib6_info *rt; 4410 struct rt6_nh *err_nh; 4411 struct rt6_nh *nh, *nh_safe; 4412 __u16 nlflags; 4413 int remaining; 4414 int attrlen; 4415 int err = 1; 4416 int nhn = 0; 4417 int replace = (cfg->fc_nlinfo.nlh && 4418 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4419 LIST_HEAD(rt6_nh_list); 4420 4421 nlflags = replace ? 
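/* Example (illustrative): "ip -6 route replace 2001:db8::/64 nexthop
 * via fe80::1 dev eth0 nexthop via fe80::2 dev eth1" arrives with
 * NLM_F_REPLACE set, so the first new nexthop replaces the old route
 * and the remaining ones are appended to it (see the flag handling in
 * the insert loop below); "ip -6 route append" sets NLM_F_APPEND.
 */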
NLM_F_REPLACE : NLM_F_CREATE; 4422 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4423 nlflags |= NLM_F_APPEND; 4424 4425 remaining = cfg->fc_mp_len; 4426 rtnh = (struct rtnexthop *)cfg->fc_mp; 4427 4428 /* Parse a multipath entry and build a list (rt6_nh_list) of 4429 * fib6_info structs, one per nexthop 4430 */ 4431 while (rtnh_ok(rtnh, remaining)) { 4432 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4433 if (rtnh->rtnh_ifindex) 4434 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4435 4436 attrlen = rtnh_attrlen(rtnh); 4437 if (attrlen > 0) { 4438 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4439 4440 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4441 if (nla) { 4442 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4443 r_cfg.fc_flags |= RTF_GATEWAY; 4444 } 4445 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4446 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4447 if (nla) 4448 r_cfg.fc_encap_type = nla_get_u16(nla); 4449 } 4450 4451 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4452 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4453 if (IS_ERR(rt)) { 4454 err = PTR_ERR(rt); 4455 rt = NULL; 4456 goto cleanup; 4457 } 4458 if (!rt6_qualify_for_ecmp(rt)) { 4459 err = -EINVAL; 4460 NL_SET_ERR_MSG(extack, 4461 "Device-only routes cannot be added for IPv6 using the multipath API."); 4462 fib6_info_release(rt); 4463 goto cleanup; 4464 } 4465 4466 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; 4467 4468 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4469 rt, &r_cfg); 4470 if (err) { 4471 fib6_info_release(rt); 4472 goto cleanup; 4473 } 4474 4475 rtnh = rtnh_next(rtnh, &remaining); 4476 } 4477 4478 /* for add and replace send one notification with all nexthops. 4479 * Skip the notification in fib6_add_rt2node and send one with 4480 * the full route when done 4481 */ 4482 info->skip_notify = 1; 4483 4484 err_nh = NULL; 4485 list_for_each_entry(nh, &rt6_nh_list, next) { 4486 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4487 fib6_info_release(nh->fib6_info); 4488 4489 if (!err) { 4490 /* save reference to last route successfully inserted */ 4491 rt_last = nh->fib6_info; 4492 4493 /* save reference to first route for notification */ 4494 if (!rt_notif) 4495 rt_notif = nh->fib6_info; 4496 } 4497 4498 /* nh->fib6_info is used or freed at this point, reset to NULL */ 4499 nh->fib6_info = NULL; 4500 if (err) { 4501 if (replace && nhn) 4502 NL_SET_ERR_MSG_MOD(extack, 4503 "multipath route replace failed (check consistency of installed routes)"); 4504 err_nh = nh; 4505 goto add_errout; 4506 } 4507 4508 /* Because each route is added as if it were a single route, we 4509 * clear these flags after the first nexthop: on a collision we 4510 * have already failed to add the first nexthop, since 4511 * fib6_add_rt2node() has rejected it; when replacing, the old 4512 * nexthops have been replaced by the first new one, so the rest 4513 * should be appended to it. 4514 */ 4515 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4516 NLM_F_REPLACE); 4517 nhn++; 4518 } 4519 4520 /* success ...
tell user about new route */ 4521 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4522 goto cleanup; 4523 4524 add_errout: 4525 /* send notification for routes that were added so that 4526 * the delete notifications sent by ip6_route_del are 4527 * coherent 4528 */ 4529 if (rt_notif) 4530 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4531 4532 /* Delete routes that were already added */ 4533 list_for_each_entry(nh, &rt6_nh_list, next) { 4534 if (err_nh == nh) 4535 break; 4536 ip6_route_del(&nh->r_cfg, extack); 4537 } 4538 4539 cleanup: 4540 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4541 if (nh->fib6_info) 4542 fib6_info_release(nh->fib6_info); 4543 list_del(&nh->next); 4544 kfree(nh); 4545 } 4546 4547 return err; 4548 } 4549 4550 static int ip6_route_multipath_del(struct fib6_config *cfg, 4551 struct netlink_ext_ack *extack) 4552 { 4553 struct fib6_config r_cfg; 4554 struct rtnexthop *rtnh; 4555 int remaining; 4556 int attrlen; 4557 int err = 1, last_err = 0; 4558 4559 remaining = cfg->fc_mp_len; 4560 rtnh = (struct rtnexthop *)cfg->fc_mp; 4561 4562 /* Parse a Multipath Entry */ 4563 while (rtnh_ok(rtnh, remaining)) { 4564 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4565 if (rtnh->rtnh_ifindex) 4566 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4567 4568 attrlen = rtnh_attrlen(rtnh); 4569 if (attrlen > 0) { 4570 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4571 4572 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4573 if (nla) { 4574 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4575 r_cfg.fc_flags |= RTF_GATEWAY; 4576 } 4577 } 4578 err = ip6_route_del(&r_cfg, extack); 4579 if (err) 4580 last_err = err; 4581 4582 rtnh = rtnh_next(rtnh, &remaining); 4583 } 4584 4585 return last_err; 4586 } 4587 4588 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4589 struct netlink_ext_ack *extack) 4590 { 4591 struct fib6_config cfg; 4592 int err; 4593 4594 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4595 if (err < 0) 4596 return err; 4597 4598 if (cfg.fc_mp) 4599 return ip6_route_multipath_del(&cfg, extack); 4600 else { 4601 cfg.fc_delete_all_nh = 1; 4602 return ip6_route_del(&cfg, extack); 4603 } 4604 } 4605 4606 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4607 struct netlink_ext_ack *extack) 4608 { 4609 struct fib6_config cfg; 4610 int err; 4611 4612 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4613 if (err < 0) 4614 return err; 4615 4616 if (cfg.fc_metric == 0) 4617 cfg.fc_metric = IP6_RT_PRIO_USER; 4618 4619 if (cfg.fc_mp) 4620 return ip6_route_multipath_add(&cfg, extack); 4621 else 4622 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4623 } 4624 4625 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4626 { 4627 int nexthop_len = 0; 4628 4629 if (rt->fib6_nsiblings) { 4630 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4631 + NLA_ALIGN(sizeof(struct rtnexthop)) 4632 + nla_total_size(16) /* RTA_GATEWAY */ 4633 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); 4634 4635 nexthop_len *= rt->fib6_nsiblings; 4636 } 4637 4638 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4639 + nla_total_size(16) /* RTA_SRC */ 4640 + nla_total_size(16) /* RTA_DST */ 4641 + nla_total_size(16) /* RTA_GATEWAY */ 4642 + nla_total_size(16) /* RTA_PREFSRC */ 4643 + nla_total_size(4) /* RTA_TABLE */ 4644 + nla_total_size(4) /* RTA_IIF */ 4645 + nla_total_size(4) /* RTA_OIF */ 4646 + nla_total_size(4) /* RTA_PRIORITY */ 4647 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4648 + nla_total_size(sizeof(struct rta_cacheinfo)) 4649 + 
nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4650 + nla_total_size(1) /* RTA_PREF */ 4651 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) 4652 + nexthop_len; 4653 } 4654 4655 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4656 struct fib6_info *rt, struct dst_entry *dst, 4657 struct in6_addr *dest, struct in6_addr *src, 4658 int iif, int type, u32 portid, u32 seq, 4659 unsigned int flags) 4660 { 4661 struct rt6_info *rt6 = (struct rt6_info *)dst; 4662 struct rt6key *rt6_dst, *rt6_src; 4663 u32 *pmetrics, table, rt6_flags; 4664 struct nlmsghdr *nlh; 4665 struct rtmsg *rtm; 4666 long expires = 0; 4667 4668 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4669 if (!nlh) 4670 return -EMSGSIZE; 4671 4672 if (rt6) { 4673 rt6_dst = &rt6->rt6i_dst; 4674 rt6_src = &rt6->rt6i_src; 4675 rt6_flags = rt6->rt6i_flags; 4676 } else { 4677 rt6_dst = &rt->fib6_dst; 4678 rt6_src = &rt->fib6_src; 4679 rt6_flags = rt->fib6_flags; 4680 } 4681 4682 rtm = nlmsg_data(nlh); 4683 rtm->rtm_family = AF_INET6; 4684 rtm->rtm_dst_len = rt6_dst->plen; 4685 rtm->rtm_src_len = rt6_src->plen; 4686 rtm->rtm_tos = 0; 4687 if (rt->fib6_table) 4688 table = rt->fib6_table->tb6_id; 4689 else 4690 table = RT6_TABLE_UNSPEC; 4691 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; 4692 if (nla_put_u32(skb, RTA_TABLE, table)) 4693 goto nla_put_failure; 4694 4695 rtm->rtm_type = rt->fib6_type; 4696 rtm->rtm_flags = 0; 4697 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4698 rtm->rtm_protocol = rt->fib6_protocol; 4699 4700 if (rt6_flags & RTF_CACHE) 4701 rtm->rtm_flags |= RTM_F_CLONED; 4702 4703 if (dest) { 4704 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4705 goto nla_put_failure; 4706 rtm->rtm_dst_len = 128; 4707 } else if (rtm->rtm_dst_len) 4708 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 4709 goto nla_put_failure; 4710 #ifdef CONFIG_IPV6_SUBTREES 4711 if (src) { 4712 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4713 goto nla_put_failure; 4714 rtm->rtm_src_len = 128; 4715 } else if (rtm->rtm_src_len && 4716 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 4717 goto nla_put_failure; 4718 #endif 4719 if (iif) { 4720 #ifdef CONFIG_IPV6_MROUTE 4721 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 4722 int err = ip6mr_get_route(net, skb, rtm, portid); 4723 4724 if (err == 0) 4725 return 0; 4726 if (err < 0) 4727 goto nla_put_failure; 4728 } else 4729 #endif 4730 if (nla_put_u32(skb, RTA_IIF, iif)) 4731 goto nla_put_failure; 4732 } else if (dest) { 4733 struct in6_addr saddr_buf; 4734 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4735 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4736 goto nla_put_failure; 4737 } 4738 4739 if (rt->fib6_prefsrc.plen) { 4740 struct in6_addr saddr_buf; 4741 saddr_buf = rt->fib6_prefsrc.addr; 4742 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4743 goto nla_put_failure; 4744 } 4745 4746 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4747 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4748 goto nla_put_failure; 4749 4750 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4751 goto nla_put_failure; 4752 4753 /* For multipath routes, walk the siblings list and add 4754 * each as a nexthop within RTA_MULTIPATH. 
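* As an illustration (a layout sketch, not an exact dump; attribute
* order follows fib_add_nexthop()), a two-nexthop route is encoded as:
*   RTA_MULTIPATH {
*           rtnexthop(eth0) { RTA_GATEWAY fe80::1 },
*           rtnexthop(eth1) { RTA_GATEWAY fe80::2 }
*   }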
4755 */ 4756 if (rt6) { 4757 if (rt6_flags & RTF_GATEWAY && 4758 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 4759 goto nla_put_failure; 4760 4761 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 4762 goto nla_put_failure; 4763 } else if (rt->fib6_nsiblings) { 4764 struct fib6_info *sibling, *next_sibling; 4765 struct nlattr *mp; 4766 4767 mp = nla_nest_start(skb, RTA_MULTIPATH); 4768 if (!mp) 4769 goto nla_put_failure; 4770 4771 if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, 4772 rt->fib6_nh.fib_nh_weight) < 0) 4773 goto nla_put_failure; 4774 4775 list_for_each_entry_safe(sibling, next_sibling, 4776 &rt->fib6_siblings, fib6_siblings) { 4777 if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, 4778 sibling->fib6_nh.fib_nh_weight) < 0) 4779 goto nla_put_failure; 4780 } 4781 4782 nla_nest_end(skb, mp); 4783 } else { 4784 if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, 4785 &rtm->rtm_flags, false) < 0) 4786 goto nla_put_failure; 4787 } 4788 4789 if (rt6_flags & RTF_EXPIRES) { 4790 expires = dst ? dst->expires : rt->expires; 4791 expires -= jiffies; 4792 } 4793 4794 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4795 goto nla_put_failure; 4796 4797 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 4798 goto nla_put_failure; 4799 4800 4801 nlmsg_end(skb, nlh); 4802 return 0; 4803 4804 nla_put_failure: 4805 nlmsg_cancel(skb, nlh); 4806 return -EMSGSIZE; 4807 } 4808 4809 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 4810 const struct net_device *dev) 4811 { 4812 if (f6i->fib6_nh.fib_nh_dev == dev) 4813 return true; 4814 4815 if (f6i->fib6_nsiblings) { 4816 struct fib6_info *sibling, *next_sibling; 4817 4818 list_for_each_entry_safe(sibling, next_sibling, 4819 &f6i->fib6_siblings, fib6_siblings) { 4820 if (sibling->fib6_nh.fib_nh_dev == dev) 4821 return true; 4822 } 4823 } 4824 4825 return false; 4826 } 4827 4828 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4829 { 4830 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4831 struct fib_dump_filter *filter = &arg->filter; 4832 unsigned int flags = NLM_F_MULTI; 4833 struct net *net = arg->net; 4834 4835 if (rt == net->ipv6.fib6_null_entry) 4836 return 0; 4837 4838 if ((filter->flags & RTM_F_PREFIX) && 4839 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4840 /* success since this is not a prefix route */ 4841 return 1; 4842 } 4843 if (filter->filter_set) { 4844 if ((filter->rt_type && rt->fib6_type != filter->rt_type) || 4845 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 4846 (filter->protocol && rt->fib6_protocol != filter->protocol)) { 4847 return 1; 4848 } 4849 flags |= NLM_F_DUMP_FILTERED; 4850 } 4851 4852 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4853 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4854 arg->cb->nlh->nlmsg_seq, flags); 4855 } 4856 4857 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, 4858 const struct nlmsghdr *nlh, 4859 struct nlattr **tb, 4860 struct netlink_ext_ack *extack) 4861 { 4862 struct rtmsg *rtm; 4863 int i, err; 4864 4865 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 4866 NL_SET_ERR_MSG_MOD(extack, 4867 "Invalid header for get route request"); 4868 return -EINVAL; 4869 } 4870 4871 if (!netlink_strict_get_check(skb)) 4872 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, 4873 rtm_ipv6_policy, extack); 4874 4875 rtm = nlmsg_data(nlh); 4876 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || 4877 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || 4878 rtm->rtm_table || 
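/* Strict checking (illustrative): header fields that do not describe
 * the flow being queried must be zero, so e.g. a request carrying
 * rtm_table = RT_TABLE_MAIN is rejected with -EINVAL here.
 */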
rtm->rtm_protocol || rtm->rtm_scope || 4879 rtm->rtm_type) { 4880 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); 4881 return -EINVAL; 4882 } 4883 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { 4884 NL_SET_ERR_MSG_MOD(extack, 4885 "Invalid flags for get route request"); 4886 return -EINVAL; 4887 } 4888 4889 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 4890 rtm_ipv6_policy, extack); 4891 if (err) 4892 return err; 4893 4894 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 4895 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 4896 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); 4897 return -EINVAL; 4898 } 4899 4900 for (i = 0; i <= RTA_MAX; i++) { 4901 if (!tb[i]) 4902 continue; 4903 4904 switch (i) { 4905 case RTA_SRC: 4906 case RTA_DST: 4907 case RTA_IIF: 4908 case RTA_OIF: 4909 case RTA_MARK: 4910 case RTA_UID: 4911 case RTA_SPORT: 4912 case RTA_DPORT: 4913 case RTA_IP_PROTO: 4914 break; 4915 default: 4916 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); 4917 return -EINVAL; 4918 } 4919 } 4920 4921 return 0; 4922 } 4923 4924 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4925 struct netlink_ext_ack *extack) 4926 { 4927 struct net *net = sock_net(in_skb->sk); 4928 struct nlattr *tb[RTA_MAX+1]; 4929 int err, iif = 0, oif = 0; 4930 struct fib6_info *from; 4931 struct dst_entry *dst; 4932 struct rt6_info *rt; 4933 struct sk_buff *skb; 4934 struct rtmsg *rtm; 4935 struct flowi6 fl6 = {}; 4936 bool fibmatch; 4937 4938 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 4939 if (err < 0) 4940 goto errout; 4941 4942 err = -EINVAL; 4943 rtm = nlmsg_data(nlh); 4944 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4945 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4946 4947 if (tb[RTA_SRC]) { 4948 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4949 goto errout; 4950 4951 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4952 } 4953 4954 if (tb[RTA_DST]) { 4955 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4956 goto errout; 4957 4958 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4959 } 4960 4961 if (tb[RTA_IIF]) 4962 iif = nla_get_u32(tb[RTA_IIF]); 4963 4964 if (tb[RTA_OIF]) 4965 oif = nla_get_u32(tb[RTA_OIF]); 4966 4967 if (tb[RTA_MARK]) 4968 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4969 4970 if (tb[RTA_UID]) 4971 fl6.flowi6_uid = make_kuid(current_user_ns(), 4972 nla_get_u32(tb[RTA_UID])); 4973 else 4974 fl6.flowi6_uid = iif ? 
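/* A lookup for a received packet (iif set) is not done on behalf of
 * the requesting task, so uid-based routing must not match on it;
 * output lookups default to the caller's uid unless RTA_UID was given.
 */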
INVALID_UID : current_uid(); 4975 4976 if (tb[RTA_SPORT]) 4977 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 4978 4979 if (tb[RTA_DPORT]) 4980 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 4981 4982 if (tb[RTA_IP_PROTO]) { 4983 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 4984 &fl6.flowi6_proto, AF_INET6, 4985 extack); 4986 if (err) 4987 goto errout; 4988 } 4989 4990 if (iif) { 4991 struct net_device *dev; 4992 int flags = 0; 4993 4994 rcu_read_lock(); 4995 4996 dev = dev_get_by_index_rcu(net, iif); 4997 if (!dev) { 4998 rcu_read_unlock(); 4999 err = -ENODEV; 5000 goto errout; 5001 } 5002 5003 fl6.flowi6_iif = iif; 5004 5005 if (!ipv6_addr_any(&fl6.saddr)) 5006 flags |= RT6_LOOKUP_F_HAS_SADDR; 5007 5008 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 5009 5010 rcu_read_unlock(); 5011 } else { 5012 fl6.flowi6_oif = oif; 5013 5014 dst = ip6_route_output(net, NULL, &fl6); 5015 } 5016 5017 5018 rt = container_of(dst, struct rt6_info, dst); 5019 if (rt->dst.error) { 5020 err = rt->dst.error; 5021 ip6_rt_put(rt); 5022 goto errout; 5023 } 5024 5025 if (rt == net->ipv6.ip6_null_entry) { 5026 err = rt->dst.error; 5027 ip6_rt_put(rt); 5028 goto errout; 5029 } 5030 5031 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 5032 if (!skb) { 5033 ip6_rt_put(rt); 5034 err = -ENOBUFS; 5035 goto errout; 5036 } 5037 5038 skb_dst_set(skb, &rt->dst); 5039 5040 rcu_read_lock(); 5041 from = rcu_dereference(rt->from); 5042 5043 if (fibmatch) 5044 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 5045 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 5046 nlh->nlmsg_seq, 0); 5047 else 5048 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 5049 &fl6.saddr, iif, RTM_NEWROUTE, 5050 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 5051 0); 5052 rcu_read_unlock(); 5053 5054 if (err < 0) { 5055 kfree_skb(skb); 5056 goto errout; 5057 } 5058 5059 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 5060 errout: 5061 return err; 5062 } 5063 5064 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 5065 unsigned int nlm_flags) 5066 { 5067 struct sk_buff *skb; 5068 struct net *net = info->nl_net; 5069 u32 seq; 5070 int err; 5071 5072 err = -ENOBUFS; 5073 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; 5074 5075 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 5076 if (!skb) 5077 goto errout; 5078 5079 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 5080 event, info->portid, seq, nlm_flags); 5081 if (err < 0) { 5082 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 5083 WARN_ON(err == -EMSGSIZE); 5084 kfree_skb(skb); 5085 goto errout; 5086 } 5087 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 5088 info->nlh, gfp_any()); 5089 return; 5090 errout: 5091 if (err < 0) 5092 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 5093 } 5094 5095 static int ip6_route_dev_notify(struct notifier_block *this, 5096 unsigned long event, void *ptr) 5097 { 5098 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 5099 struct net *net = dev_net(dev); 5100 5101 if (!(dev->flags & IFF_LOOPBACK)) 5102 return NOTIFY_OK; 5103 5104 if (event == NETDEV_REGISTER) { 5105 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; 5106 net->ipv6.ip6_null_entry->dst.dev = dev; 5107 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 5108 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5109 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 5110 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 5111 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 5112 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 5113 #endif 5114 } else if (event == NETDEV_UNREGISTER && 5115 dev->reg_state != NETREG_UNREGISTERED) { 5116 /* NETDEV_UNREGISTER can be fired multiple times by 5117 * netdev_wait_allrefs(). Make sure we only do this once. 5118 */ 5119 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 5120 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5121 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 5122 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 5123 #endif 5124 } 5125 5126 return NOTIFY_OK; 5127 } 5128 5129 /* 5130 * /proc 5131 */ 5132 5133 #ifdef CONFIG_PROC_FS 5134 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 5135 { 5136 struct net *net = (struct net *)seq->private; 5137 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 5138 net->ipv6.rt6_stats->fib_nodes, 5139 net->ipv6.rt6_stats->fib_route_nodes, 5140 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 5141 net->ipv6.rt6_stats->fib_rt_entries, 5142 net->ipv6.rt6_stats->fib_rt_cache, 5143 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 5144 net->ipv6.rt6_stats->fib_discarded_routes); 5145 5146 return 0; 5147 } 5148 #endif /* CONFIG_PROC_FS */ 5149 5150 #ifdef CONFIG_SYSCTL 5151 5152 static 5153 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 5154 void __user *buffer, size_t *lenp, loff_t *ppos) 5155 { 5156 struct net *net; 5157 int delay; 5158 int ret; 5159 if (!write) 5160 return -EINVAL; 5161 5162 net = (struct net *)ctl->extra1; 5163 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 5164 if (ret) 5165 return ret; 5166 5167 delay = net->ipv6.sysctl.flush_delay; /* use the value just written */ 5168 fib6_run_gc(delay <= 0 ?
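/* Illustrative summary (see fib6_run_gc()): a positive written value
 * forces an immediate GC pass with that timeout in jiffies, e.g.
 * "echo 1 > /proc/sys/net/ipv6/route/flush"; zero or a negative value
 * runs a non-forced pass with the default gc interval.
 */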
0 : (unsigned long)delay, net, delay > 0); 5169 return 0; 5170 } 5171 5172 static int zero; 5173 static int one = 1; 5174 5175 static struct ctl_table ipv6_route_table_template[] = { 5176 { 5177 .procname = "flush", 5178 .data = &init_net.ipv6.sysctl.flush_delay, 5179 .maxlen = sizeof(int), 5180 .mode = 0200, 5181 .proc_handler = ipv6_sysctl_rtcache_flush 5182 }, 5183 { 5184 .procname = "gc_thresh", 5185 .data = &ip6_dst_ops_template.gc_thresh, 5186 .maxlen = sizeof(int), 5187 .mode = 0644, 5188 .proc_handler = proc_dointvec, 5189 }, 5190 { 5191 .procname = "max_size", 5192 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5193 .maxlen = sizeof(int), 5194 .mode = 0644, 5195 .proc_handler = proc_dointvec, 5196 }, 5197 { 5198 .procname = "gc_min_interval", 5199 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5200 .maxlen = sizeof(int), 5201 .mode = 0644, 5202 .proc_handler = proc_dointvec_jiffies, 5203 }, 5204 { 5205 .procname = "gc_timeout", 5206 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5207 .maxlen = sizeof(int), 5208 .mode = 0644, 5209 .proc_handler = proc_dointvec_jiffies, 5210 }, 5211 { 5212 .procname = "gc_interval", 5213 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5214 .maxlen = sizeof(int), 5215 .mode = 0644, 5216 .proc_handler = proc_dointvec_jiffies, 5217 }, 5218 { 5219 .procname = "gc_elasticity", 5220 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5221 .maxlen = sizeof(int), 5222 .mode = 0644, 5223 .proc_handler = proc_dointvec, 5224 }, 5225 { 5226 .procname = "mtu_expires", 5227 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5228 .maxlen = sizeof(int), 5229 .mode = 0644, 5230 .proc_handler = proc_dointvec_jiffies, 5231 }, 5232 { 5233 .procname = "min_adv_mss", 5234 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5235 .maxlen = sizeof(int), 5236 .mode = 0644, 5237 .proc_handler = proc_dointvec, 5238 }, 5239 { 5240 .procname = "gc_min_interval_ms", 5241 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5242 .maxlen = sizeof(int), 5243 .mode = 0644, 5244 .proc_handler = proc_dointvec_ms_jiffies, 5245 }, 5246 { 5247 .procname = "skip_notify_on_dev_down", 5248 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, 5249 .maxlen = sizeof(int), 5250 .mode = 0644, 5251 .proc_handler = proc_dointvec, 5252 .extra1 = &zero, 5253 .extra2 = &one, 5254 }, 5255 { } 5256 }; 5257 5258 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5259 { 5260 struct ctl_table *table; 5261 5262 table = kmemdup(ipv6_route_table_template, 5263 sizeof(ipv6_route_table_template), 5264 GFP_KERNEL); 5265 5266 if (table) { 5267 table[0].data = &net->ipv6.sysctl.flush_delay; 5268 table[0].extra1 = net; 5269 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5270 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5271 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5272 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5273 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5274 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5275 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5276 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5277 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5278 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; 5279 5280 /* Don't export sysctls to unprivileged users */ 5281 if (net->user_ns != &init_user_ns) 5282 table[0].procname = NULL; 5283 } 5284 5285 return table; 5286 } 5287 #endif 5288 5289 static int __net_init ip6_route_net_init(struct net *net) 5290 { 5291 int ret = -ENOMEM; 5292 
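/* Sketch of the per-netns setup below: copy the dst_ops template,
 * allocate this namespace's own null (and, with
 * CONFIG_IPV6_MULTIPLE_TABLES, prohibit/blackhole) template routes,
 * then set the per-netns sysctl defaults.
 */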
5293 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5294 sizeof(net->ipv6.ip6_dst_ops)); 5295 5296 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5297 goto out_ip6_dst_ops; 5298 5299 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5300 sizeof(*net->ipv6.fib6_null_entry), 5301 GFP_KERNEL); 5302 if (!net->ipv6.fib6_null_entry) 5303 goto out_ip6_dst_entries; 5304 5305 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5306 sizeof(*net->ipv6.ip6_null_entry), 5307 GFP_KERNEL); 5308 if (!net->ipv6.ip6_null_entry) 5309 goto out_fib6_null_entry; 5310 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5311 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5312 ip6_template_metrics, true); 5313 5314 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5315 net->ipv6.fib6_has_custom_rules = false; 5316 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5317 sizeof(*net->ipv6.ip6_prohibit_entry), 5318 GFP_KERNEL); 5319 if (!net->ipv6.ip6_prohibit_entry) 5320 goto out_ip6_null_entry; 5321 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5322 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5323 ip6_template_metrics, true); 5324 5325 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5326 sizeof(*net->ipv6.ip6_blk_hole_entry), 5327 GFP_KERNEL); 5328 if (!net->ipv6.ip6_blk_hole_entry) 5329 goto out_ip6_prohibit_entry; 5330 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5331 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5332 ip6_template_metrics, true); 5333 #endif 5334 5335 net->ipv6.sysctl.flush_delay = 0; 5336 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5337 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5338 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5339 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5340 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5341 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5342 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5343 net->ipv6.sysctl.skip_notify_on_dev_down = 0; 5344 5345 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5346 5347 ret = 0; 5348 out: 5349 return ret; 5350 5351 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5352 out_ip6_prohibit_entry: 5353 kfree(net->ipv6.ip6_prohibit_entry); 5354 out_ip6_null_entry: 5355 kfree(net->ipv6.ip6_null_entry); 5356 #endif 5357 out_fib6_null_entry: 5358 kfree(net->ipv6.fib6_null_entry); 5359 out_ip6_dst_entries: 5360 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5361 out_ip6_dst_ops: 5362 goto out; 5363 } 5364 5365 static void __net_exit ip6_route_net_exit(struct net *net) 5366 { 5367 kfree(net->ipv6.fib6_null_entry); 5368 kfree(net->ipv6.ip6_null_entry); 5369 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5370 kfree(net->ipv6.ip6_prohibit_entry); 5371 kfree(net->ipv6.ip6_blk_hole_entry); 5372 #endif 5373 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5374 } 5375 5376 static int __net_init ip6_route_net_init_late(struct net *net) 5377 { 5378 #ifdef CONFIG_PROC_FS 5379 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5380 sizeof(struct ipv6_route_iter)); 5381 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5382 rt6_stats_seq_show, NULL); 5383 #endif 5384 return 0; 5385 } 5386 5387 static void __net_exit ip6_route_net_exit_late(struct net *net) 5388 { 5389 #ifdef CONFIG_PROC_FS 5390 remove_proc_entry("ipv6_route", net->proc_net); 5391 remove_proc_entry("rt6_stats", net->proc_net); 5392 #endif 5393 } 5394 5395 static struct pernet_operations ip6_route_net_ops = { 5396 .init = ip6_route_net_init, 5397 .exit = 
ip6_route_net_exit, 5398 }; 5399 5400 static int __net_init ipv6_inetpeer_init(struct net *net) 5401 { 5402 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5403 5404 if (!bp) 5405 return -ENOMEM; 5406 inet_peer_base_init(bp); 5407 net->ipv6.peers = bp; 5408 return 0; 5409 } 5410 5411 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5412 { 5413 struct inet_peer_base *bp = net->ipv6.peers; 5414 5415 net->ipv6.peers = NULL; 5416 inetpeer_invalidate_tree(bp); 5417 kfree(bp); 5418 } 5419 5420 static struct pernet_operations ipv6_inetpeer_ops = { 5421 .init = ipv6_inetpeer_init, 5422 .exit = ipv6_inetpeer_exit, 5423 }; 5424 5425 static struct pernet_operations ip6_route_net_late_ops = { 5426 .init = ip6_route_net_init_late, 5427 .exit = ip6_route_net_exit_late, 5428 }; 5429 5430 static struct notifier_block ip6_route_dev_notifier = { 5431 .notifier_call = ip6_route_dev_notify, 5432 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5433 }; 5434 5435 void __init ip6_route_init_special_entries(void) 5436 { 5437 /* The loopback device is registered before this code runs, so the 5438 * loopback reference in the rt6_info templates has not been taken; 5439 * take it manually for init_net. */ 5440 init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; 5441 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5442 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5443 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5444 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5445 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5446 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5447 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5448 #endif 5449 } 5450 5451 int __init ip6_route_init(void) 5452 { 5453 int ret; 5454 int cpu; 5455 5456 ret = -ENOMEM; 5457 ip6_dst_ops_template.kmem_cachep = 5458 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5459 SLAB_HWCACHE_ALIGN, NULL); 5460 if (!ip6_dst_ops_template.kmem_cachep) 5461 goto out; 5462 5463 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5464 if (ret) 5465 goto out_kmem_cache; 5466 5467 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5468 if (ret) 5469 goto out_dst_entries; 5470 5471 ret = register_pernet_subsys(&ip6_route_net_ops); 5472 if (ret) 5473 goto out_register_inetpeer; 5474 5475 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5476 5477 ret = fib6_init(); 5478 if (ret) 5479 goto out_register_subsys; 5480 5481 ret = xfrm6_init(); 5482 if (ret) 5483 goto out_fib6_init; 5484 5485 ret = fib6_rules_init(); 5486 if (ret) 5487 goto xfrm6_init; 5488 5489 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5490 if (ret) 5491 goto fib6_rules_init; 5492 5493 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5494 inet6_rtm_newroute, NULL, 0); 5495 if (ret < 0) 5496 goto out_register_late_subsys; 5497 5498 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5499 inet6_rtm_delroute, NULL, 0); 5500 if (ret < 0) 5501 goto out_register_late_subsys; 5502 5503 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5504 inet6_rtm_getroute, NULL, 5505 RTNL_FLAG_DOIT_UNLOCKED); 5506 if (ret < 0) 5507 goto out_register_late_subsys; 5508 5509 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5510 if (ret) 5511 goto out_register_late_subsys; 5512 5513 for_each_possible_cpu(cpu) { 5514 struct uncached_list *ul =
per_cpu_ptr(&rt6_uncached_list, cpu); 5515 5516 INIT_LIST_HEAD(&ul->head); 5517 spin_lock_init(&ul->lock); 5518 } 5519 5520 out: 5521 return ret; 5522 5523 out_register_late_subsys: 5524 rtnl_unregister_all(PF_INET6); 5525 unregister_pernet_subsys(&ip6_route_net_late_ops); 5526 fib6_rules_init: 5527 fib6_rules_cleanup(); 5528 xfrm6_init: 5529 xfrm6_fini(); 5530 out_fib6_init: 5531 fib6_gc_cleanup(); 5532 out_register_subsys: 5533 unregister_pernet_subsys(&ip6_route_net_ops); 5534 out_register_inetpeer: 5535 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5536 out_dst_entries: 5537 dst_entries_destroy(&ip6_dst_blackhole_ops); 5538 out_kmem_cache: 5539 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5540 goto out; 5541 } 5542 5543 void ip6_route_cleanup(void) 5544 { 5545 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5546 unregister_pernet_subsys(&ip6_route_net_late_ops); 5547 fib6_rules_cleanup(); 5548 xfrm6_fini(); 5549 fib6_gc_cleanup(); 5550 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5551 unregister_pernet_subsys(&ip6_route_net_ops); 5552 dst_entries_destroy(&ip6_dst_blackhole_ops); 5553 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5554 } 5555