/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable. otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
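/* Illustrative sketch (not part of the original file): the neighbour key
 * chosen by choose_neigh_daddr() prefers an explicit gateway, then the
 * packet's destination, then the caller-supplied daddr.  A caller resolving
 * a next hop might therefore do:
 *
 *	struct neighbour *n;
 *
 *	n = ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, &fl6->daddr);
 *	if (n) {
 *		... use n ...
 *		neigh_release(n);
 *	}
 *
 * The locals (rt, dst, skb, fl6) are hypothetical and only serve the
 * example.
 */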
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->fib6_nsiblings || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = &match->fib6_nh;
}
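/* Illustrative sketch (not part of the original file): the selection above
 * walks the siblings until the flow hash falls under a nexthop's upper
 * bound:
 *
 *	if (fl6->mp_hash <= atomic_read(&nh->fib_nh_upper_bound))
 *		... nh is the chosen next hop ...
 *
 * With hypothetical equal-weight bounds 10922 / 21845 / 32767 across three
 * siblings, a flow hashing to 15000 skips the first sibling and settles on
 * the second, so a given flow consistently maps to one next hop.
 */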
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}

static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	if (!oif && ipv6_addr_any(saddr)) {
		nh = &f6i->fib6_nh;
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		nh = &spf6i->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
			res->f6i = spf6i;
			goto out;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
		goto out;
	}

	nh = &f6i->fib6_nh;
	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, fib6_nh->last_probe +
			      idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		fib6_nh->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif
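/* Illustrative sketch (not part of the original file): the rate limit in
 * rt6_probe() is the usual jiffies pattern; a probe is only scheduled when
 *
 *	time_after(jiffies, fib6_nh->last_probe +
 *			    idev->cnf.rtr_probe_interval)
 *
 * holds, and last_probe is refreshed just before the work is queued, so at
 * most one neighbour solicitation per interval leaves for a given next hop.
 */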
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}

static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}

static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		nh = &f6i->fib6_nh;
		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}

static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}
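/* Illustrative sketch (not part of the original file): find_rr_leaf() scans
 * one full metric group starting at the round-robin head.  With a leaf
 * chain A -> B -> C (all the same metric) and rr_head == B, the two
 * __find_rr_leaf() calls cover
 *
 *	B -> C		(from rr_head to the end of the group)
 *	A		(from the leaf, stopping at rr_head)
 *
 * while routes with a different metric are parked in 'cont' and only
 * scanned if nothing matched.  A, B and C are hypothetical entries.
 */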
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = &res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
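/* Illustrative sketch (not part of the original file): the sanity checks in
 * rt6_route_rcv() follow the RFC 4191 route information option layout,
 * where 'length' is in units of 8 octets and bounds the prefix bits that
 * can be carried:
 *
 *	length > 3 or prefix_len > 128	-> rejected
 *	prefix_len in 65..128		-> needs length >= 2
 *	prefix_len in 1..64		-> needs length >= 1
 *
 * e.g. an option advertising prefix_len 96 with length 1 is dropped.
 */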
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
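/* Illustrative sketch (not part of the original file): for a reject route
 * the pieces above combine so that an RTN_PROHIBIT entry ends up with
 *
 *	rt->dst.error  == -EACCES		(from fib6_prop[])
 *	rt->dst.input  == ip6_pkt_prohibit
 *	rt->dst.output == ip6_pkt_prohibit_out
 *
 * and a matching packet is answered with an ICMPv6 "administratively
 * prohibited" error instead of being forwarded.
 */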
/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
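/* Illustrative sketch (not part of the original file): a caller that only
 * needs the FIB answer can use the exported helper directly:
 *
 *	struct flowi6 fl6 = {
 *		.flowi6_oif = oif,
 *		.daddr = *daddr,
 *	};
 *	struct dst_entry *dst;
 *
 *	dst = ip6_route_lookup(net, &fl6, skb, 0);
 *	if (!dst->error)
 *		... use dst, then dst_release(dst) ...
 *
 * rt6_lookup() below is essentially this pattern plus saddr/strict
 * handling; the locals here are hypothetical.
 */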
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
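/* Illustrative sketch (not part of the original file): the cmpxchg() above
 * is the usual lockless publish of a freshly built object.  Because the
 * caller runs with BHs disabled on this CPU (see ip6_pol_route() below),
 * nothing can race for the same per-cpu slot, hence the BUG_ON(prev); a
 * variant that could race would instead have to tolerate losing:
 *
 *	prev = cmpxchg(p, NULL, pcpu_rt);
 *	if (prev) {
 *		... drop pcpu_rt and use prev ...
 *	}
 */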
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
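/* Illustrative sketch (not part of the original file): locating the bucket
 * for a (daddr, saddr) pair mirrors the finder helpers below:
 *
 *	struct rt6_exception_bucket *bucket;
 *	u32 hval = rt6_exception_hash(&daddr, NULL);
 *
 *	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
 *	bucket += hval;
 *
 * hash_32() folds the jhash value down to
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits, so the offset always stays inside
 * the kcalloc()'ed array.  daddr and f6i are hypothetical locals.
 */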
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
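/* Illustrative sketch (not part of the original file): fib6_mtu() prefers a
 * route-specific PMTU, falls back to the device MTU, clamps the result to
 * IP6_MAX_MTU and finally reserves lwtunnel encap headroom.  E.g. with
 * fib6_pmtu == 0, idev->cnf.mtu6 == 1500 and a hypothetical encap needing
 * 20 bytes, the usable MTU is 1500 - 20 = 1480.
 */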
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *f6i = res->f6i;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (f6i->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

	return ret;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
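/* Illustrative sketch (not part of the original file): combining
 * rt6_mtu_change_route_allowed() with the update above, dropping a link
 * MTU from 1500 to 1400 lowers a cached exception holding PMTU 1500
 * (decreases are always allowed), while raising it back to 1500 only lifts
 * the exception if its PMTU still equals the link MTU, i.e. nothing else
 * on the path was smaller.  The numbers are hypothetical.
 */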
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still hold references to them, so that on the next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
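/* Illustrative sketch (not part of the original file): the retry logic
 * above means a lookup may run twice, first insisting on reachable
 * gateways and then relaxing:
 *
 *	strict = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_REACHABLE;
 *	fib6_table_lookup(net, table, oif, &fl6, &res, strict);
 *
 * If no route with a (probably) reachable gateway exists, the second pass
 * accepts one anyway, so an unreachable default router still beats no
 * route at all.
 */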
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
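/* Illustrative sketch (not part of the original file): ip6_multipath_l3_keys()
 * makes ICMPv6 errors hash like the flow they refer to.  For a "packet too
 * big" triggered by traffic A -> B, the hash keys are taken from the inner
 * (offending) header, i.e. saddr A / daddr B, so the error follows the
 * same multipath leg as the original flow.  A and B are hypothetical
 * endpoints.
 */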
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
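/* Illustrative sketch (not part of the original file): under hash policy 0
 * the keys are L3-only, so
 *
 *	struct flowi6 fl6 = {
 *		.saddr = saddr,
 *		.daddr = daddr,
 *		.flowi6_proto = IPPROTO_TCP,
 *	};
 *	u32 h = rt6_multipath_hash(net, &fl6, NULL, NULL);
 *
 * yields one value for all TCP connections between the two addresses,
 * while policy 1 mixes in fl6_sport/fl6_dport and spreads connections
 * across next hops.  The fold to 31 bits (mhash >> 1) keeps the later
 * signed comparison against fib_nh_upper_bound well-defined.
 */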
2133 2134 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2135 2136 any_src = ipv6_addr_any(&fl6->saddr); 2137 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2138 (fl6->flowi6_oif && any_src)) 2139 flags |= RT6_LOOKUP_F_IFACE; 2140 2141 if (!any_src) 2142 flags |= RT6_LOOKUP_F_HAS_SADDR; 2143 else if (sk) 2144 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2145 2146 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2147 } 2148 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2149 2150 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2151 { 2152 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2153 struct net_device *loopback_dev = net->loopback_dev; 2154 struct dst_entry *new = NULL; 2155 2156 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2157 DST_OBSOLETE_DEAD, 0); 2158 if (rt) { 2159 rt6_info_init(rt); 2160 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2161 2162 new = &rt->dst; 2163 new->__use = 1; 2164 new->input = dst_discard; 2165 new->output = dst_discard_out; 2166 2167 dst_copy_metrics(new, &ort->dst); 2168 2169 rt->rt6i_idev = in6_dev_get(loopback_dev); 2170 rt->rt6i_gateway = ort->rt6i_gateway; 2171 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2172 2173 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2174 #ifdef CONFIG_IPV6_SUBTREES 2175 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2176 #endif 2177 } 2178 2179 dst_release(dst_orig); 2180 return new ? new : ERR_PTR(-ENOMEM); 2181 } 2182 2183 /* 2184 * Destination cache support functions 2185 */ 2186 2187 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2188 { 2189 u32 rt_cookie = 0; 2190 2191 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2192 return false; 2193 2194 if (fib6_check_expired(f6i)) 2195 return false; 2196 2197 return true; 2198 } 2199 2200 static struct dst_entry *rt6_check(struct rt6_info *rt, 2201 struct fib6_info *from, 2202 u32 cookie) 2203 { 2204 u32 rt_cookie = 0; 2205 2206 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2207 rt_cookie != cookie) 2208 return NULL; 2209 2210 if (rt6_check_expired(rt)) 2211 return NULL; 2212 2213 return &rt->dst; 2214 } 2215 2216 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2217 struct fib6_info *from, 2218 u32 cookie) 2219 { 2220 if (!__rt6_check_expired(rt) && 2221 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2222 fib6_check(from, cookie)) 2223 return &rt->dst; 2224 else 2225 return NULL; 2226 } 2227 2228 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2229 { 2230 struct dst_entry *dst_ret; 2231 struct fib6_info *from; 2232 struct rt6_info *rt; 2233 2234 rt = container_of(dst, struct rt6_info, dst); 2235 2236 rcu_read_lock(); 2237 2238 /* All IPV6 dsts are created with ->obsolete set to the value 2239 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2240 * into this function always. 
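 *
 * The cookie is the fib6 tree sernum sampled when the dst was handed
 * out (sockets stash it in inet6_sk(sk)->dst_cookie).  Route inserts
 * and similar tree changes bump the sernum, and ip6_link_failure()
 * below poisons it, so a stale cookie fails this check and forces the
 * caller into a fresh lookup.  Roughly:
 *
 *	dst = ip6_route_output(net, sk, &fl6);	// cookie := sernum
 *	... tree changes; sernum moves on ...
 *	dst->ops->check(dst, cookie);		// NULL => relookup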
2241 */ 2242 2243 from = rcu_dereference(rt->from); 2244 2245 if (from && (rt->rt6i_flags & RTF_PCPU || 2246 unlikely(!list_empty(&rt->rt6i_uncached)))) 2247 dst_ret = rt6_dst_from_check(rt, from, cookie); 2248 else 2249 dst_ret = rt6_check(rt, from, cookie); 2250 2251 rcu_read_unlock(); 2252 2253 return dst_ret; 2254 } 2255 2256 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2257 { 2258 struct rt6_info *rt = (struct rt6_info *) dst; 2259 2260 if (rt) { 2261 if (rt->rt6i_flags & RTF_CACHE) { 2262 rcu_read_lock(); 2263 if (rt6_check_expired(rt)) { 2264 rt6_remove_exception_rt(rt); 2265 dst = NULL; 2266 } 2267 rcu_read_unlock(); 2268 } else { 2269 dst_release(dst); 2270 dst = NULL; 2271 } 2272 } 2273 return dst; 2274 } 2275 2276 static void ip6_link_failure(struct sk_buff *skb) 2277 { 2278 struct rt6_info *rt; 2279 2280 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2281 2282 rt = (struct rt6_info *) skb_dst(skb); 2283 if (rt) { 2284 rcu_read_lock(); 2285 if (rt->rt6i_flags & RTF_CACHE) { 2286 rt6_remove_exception_rt(rt); 2287 } else { 2288 struct fib6_info *from; 2289 struct fib6_node *fn; 2290 2291 from = rcu_dereference(rt->from); 2292 if (from) { 2293 fn = rcu_dereference(from->fib6_node); 2294 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2295 fn->fn_sernum = -1; 2296 } 2297 } 2298 rcu_read_unlock(); 2299 } 2300 } 2301 2302 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2303 { 2304 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2305 struct fib6_info *from; 2306 2307 rcu_read_lock(); 2308 from = rcu_dereference(rt0->from); 2309 if (from) 2310 rt0->dst.expires = from->expires; 2311 rcu_read_unlock(); 2312 } 2313 2314 dst_set_expires(&rt0->dst, timeout); 2315 rt0->rt6i_flags |= RTF_EXPIRES; 2316 } 2317 2318 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2319 { 2320 struct net *net = dev_net(rt->dst.dev); 2321 2322 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2323 rt->rt6i_flags |= RTF_MODIFIED; 2324 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2325 } 2326 2327 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2328 { 2329 return !(rt->rt6i_flags & RTF_CACHE) && 2330 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2331 } 2332 2333 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2334 const struct ipv6hdr *iph, u32 mtu) 2335 { 2336 const struct in6_addr *daddr, *saddr; 2337 struct rt6_info *rt6 = (struct rt6_info *)dst; 2338 2339 if (dst_metric_locked(dst, RTAX_MTU)) 2340 return; 2341 2342 if (iph) { 2343 daddr = &iph->daddr; 2344 saddr = &iph->saddr; 2345 } else if (sk) { 2346 daddr = &sk->sk_v6_daddr; 2347 saddr = &inet6_sk(sk)->saddr; 2348 } else { 2349 daddr = NULL; 2350 saddr = NULL; 2351 } 2352 dst_confirm_neigh(dst, daddr); 2353 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2354 if (mtu >= dst_mtu(dst)) 2355 return; 2356 2357 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2358 rt6_do_update_pmtu(rt6, mtu); 2359 /* update rt6_ex->stamp for cache */ 2360 if (rt6->rt6i_flags & RTF_CACHE) 2361 rt6_update_exception_stamp_rt(rt6); 2362 } else if (daddr) { 2363 struct fib6_result res = {}; 2364 struct rt6_info *nrt6; 2365 2366 rcu_read_lock(); 2367 res.f6i = rcu_dereference(rt6->from); 2368 if (!res.f6i) { 2369 rcu_read_unlock(); 2370 return; 2371 } 2372 res.nh = &res.f6i->fib6_nh; 2373 res.fib6_flags = res.f6i->fib6_flags; 2374 res.fib6_type = res.f6i->fib6_type; 2375 2376 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); 2377 if (nrt6) { 2378 rt6_do_update_pmtu(nrt6, mtu); 2379 if 
(rt6_insert_exception(nrt6, &res)) 2380 dst_release_immediate(&nrt6->dst); 2381 } 2382 rcu_read_unlock(); 2383 } 2384 } 2385 2386 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2387 struct sk_buff *skb, u32 mtu) 2388 { 2389 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2390 } 2391 2392 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2393 int oif, u32 mark, kuid_t uid) 2394 { 2395 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2396 struct dst_entry *dst; 2397 struct flowi6 fl6 = { 2398 .flowi6_oif = oif, 2399 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2400 .daddr = iph->daddr, 2401 .saddr = iph->saddr, 2402 .flowlabel = ip6_flowinfo(iph), 2403 .flowi6_uid = uid, 2404 }; 2405 2406 dst = ip6_route_output(net, NULL, &fl6); 2407 if (!dst->error) 2408 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2409 dst_release(dst); 2410 } 2411 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2412 2413 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2414 { 2415 int oif = sk->sk_bound_dev_if; 2416 struct dst_entry *dst; 2417 2418 if (!oif && skb->dev) 2419 oif = l3mdev_master_ifindex(skb->dev); 2420 2421 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2422 2423 dst = __sk_dst_get(sk); 2424 if (!dst || !dst->obsolete || 2425 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2426 return; 2427 2428 bh_lock_sock(sk); 2429 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2430 ip6_datagram_dst_update(sk, false); 2431 bh_unlock_sock(sk); 2432 } 2433 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2434 2435 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2436 const struct flowi6 *fl6) 2437 { 2438 #ifdef CONFIG_IPV6_SUBTREES 2439 struct ipv6_pinfo *np = inet6_sk(sk); 2440 #endif 2441 2442 ip6_dst_store(sk, dst, 2443 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2444 &sk->sk_v6_daddr : NULL, 2445 #ifdef CONFIG_IPV6_SUBTREES 2446 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2447 &np->saddr : 2448 #endif 2449 NULL); 2450 } 2451 2452 static bool ip6_redirect_nh_match(const struct fib6_result *res, 2453 struct flowi6 *fl6, 2454 const struct in6_addr *gw, 2455 struct rt6_info **ret) 2456 { 2457 const struct fib6_nh *nh = res->nh; 2458 2459 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || 2460 fl6->flowi6_oif != nh->fib_nh_dev->ifindex) 2461 return false; 2462 2463 /* rt_cache's gateway might be different from its 'parent' 2464 * in the case of an ip redirect. 2465 * So we keep searching in the exception table if the gateway 2466 * is different. 
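 * (A redirect rewrites only the gateway of the cached clone in the
 * exception table, never the fib6 nexthop itself, so that clone is
 * the only place the redirecting router's address can match.)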
2467 */ 2468 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { 2469 struct rt6_info *rt_cache; 2470 2471 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); 2472 if (rt_cache && 2473 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { 2474 *ret = rt_cache; 2475 return true; 2476 } 2477 return false; 2478 } 2479 return true; 2480 } 2481 2482 /* Handle redirects */ 2483 struct ip6rd_flowi { 2484 struct flowi6 fl6; 2485 struct in6_addr gateway; 2486 }; 2487 2488 static struct rt6_info *__ip6_route_redirect(struct net *net, 2489 struct fib6_table *table, 2490 struct flowi6 *fl6, 2491 const struct sk_buff *skb, 2492 int flags) 2493 { 2494 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2495 struct rt6_info *ret = NULL; 2496 struct fib6_result res = {}; 2497 struct fib6_info *rt; 2498 struct fib6_node *fn; 2499 2500 /* Get the "current" route for this destination and 2501 * check if the redirect has come from appropriate router. 2502 * 2503 * RFC 4861 specifies that redirects should only be 2504 * accepted if they come from the nexthop to the target. 2505 * Due to the way the routes are chosen, this notion 2506 * is a bit fuzzy and one might need to check all possible 2507 * routes. 2508 */ 2509 2510 rcu_read_lock(); 2511 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2512 restart: 2513 for_each_fib6_node_rt_rcu(fn) { 2514 res.f6i = rt; 2515 res.nh = &rt->fib6_nh; 2516 2517 if (fib6_check_expired(rt)) 2518 continue; 2519 if (rt->fib6_flags & RTF_REJECT) 2520 break; 2521 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) 2522 goto out; 2523 } 2524 2525 if (!rt) 2526 rt = net->ipv6.fib6_null_entry; 2527 else if (rt->fib6_flags & RTF_REJECT) { 2528 ret = net->ipv6.ip6_null_entry; 2529 goto out; 2530 } 2531 2532 if (rt == net->ipv6.fib6_null_entry) { 2533 fn = fib6_backtrack(fn, &fl6->saddr); 2534 if (fn) 2535 goto restart; 2536 } 2537 2538 res.f6i = rt; 2539 res.nh = &rt->fib6_nh; 2540 out: 2541 if (ret) { 2542 ip6_hold_safe(net, &ret); 2543 } else { 2544 res.fib6_flags = res.f6i->fib6_flags; 2545 res.fib6_type = res.f6i->fib6_type; 2546 ret = ip6_create_rt_rcu(&res); 2547 } 2548 2549 rcu_read_unlock(); 2550 2551 trace_fib6_table_lookup(net, &res, table, fl6); 2552 return ret; 2553 }; 2554 2555 static struct dst_entry *ip6_route_redirect(struct net *net, 2556 const struct flowi6 *fl6, 2557 const struct sk_buff *skb, 2558 const struct in6_addr *gateway) 2559 { 2560 int flags = RT6_LOOKUP_F_HAS_SADDR; 2561 struct ip6rd_flowi rdfl; 2562 2563 rdfl.fl6 = *fl6; 2564 rdfl.gateway = *gateway; 2565 2566 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2567 flags, __ip6_route_redirect); 2568 } 2569 2570 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2571 kuid_t uid) 2572 { 2573 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2574 struct dst_entry *dst; 2575 struct flowi6 fl6 = { 2576 .flowi6_iif = LOOPBACK_IFINDEX, 2577 .flowi6_oif = oif, 2578 .flowi6_mark = mark, 2579 .daddr = iph->daddr, 2580 .saddr = iph->saddr, 2581 .flowlabel = ip6_flowinfo(iph), 2582 .flowi6_uid = uid, 2583 }; 2584 2585 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2586 rt6_do_redirect(dst, NULL, skb); 2587 dst_release(dst); 2588 } 2589 EXPORT_SYMBOL_GPL(ip6_redirect); 2590 2591 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 2592 { 2593 const struct ipv6hdr *iph = ipv6_hdr(skb); 2594 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2595 struct dst_entry *dst; 2596 struct flowi6 fl6 = { 2597 .flowi6_iif = 
LOOPBACK_IFINDEX, 2598 .flowi6_oif = oif, 2599 .daddr = msg->dest, 2600 .saddr = iph->daddr, 2601 .flowi6_uid = sock_net_uid(net, NULL), 2602 }; 2603 2604 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2605 rt6_do_redirect(dst, NULL, skb); 2606 dst_release(dst); 2607 } 2608 2609 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2610 { 2611 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2612 sk->sk_uid); 2613 } 2614 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2615 2616 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2617 { 2618 struct net_device *dev = dst->dev; 2619 unsigned int mtu = dst_mtu(dst); 2620 struct net *net = dev_net(dev); 2621 2622 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2623 2624 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2625 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2626 2627 /* 2628 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2629 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 2630 * IPV6_MAXPLEN is also valid and means: "any MSS, 2631 * rely only on pmtu discovery" 2632 */ 2633 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2634 mtu = IPV6_MAXPLEN; 2635 return mtu; 2636 } 2637 2638 static unsigned int ip6_mtu(const struct dst_entry *dst) 2639 { 2640 struct inet6_dev *idev; 2641 unsigned int mtu; 2642 2643 mtu = dst_metric_raw(dst, RTAX_MTU); 2644 if (mtu) 2645 goto out; 2646 2647 mtu = IPV6_MIN_MTU; 2648 2649 rcu_read_lock(); 2650 idev = __in6_dev_get(dst->dev); 2651 if (idev) 2652 mtu = idev->cnf.mtu6; 2653 rcu_read_unlock(); 2654 2655 out: 2656 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2657 2658 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2659 } 2660 2661 /* MTU selection: 2662 * 1. mtu on route is locked - use it 2663 * 2. mtu from nexthop exception 2664 * 3. 
mtu from egress device 2665 * 2666 * based on ip6_dst_mtu_forward and exception logic of 2667 * rt6_find_cached_rt; called with rcu_read_lock 2668 */ 2669 u32 ip6_mtu_from_fib6(const struct fib6_result *res, 2670 const struct in6_addr *daddr, 2671 const struct in6_addr *saddr) 2672 { 2673 struct rt6_exception_bucket *bucket; 2674 const struct fib6_nh *nh = res->nh; 2675 struct fib6_info *f6i = res->f6i; 2676 const struct in6_addr *src_key; 2677 struct rt6_exception *rt6_ex; 2678 struct inet6_dev *idev; 2679 u32 mtu = 0; 2680 2681 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2682 mtu = f6i->fib6_pmtu; 2683 if (mtu) 2684 goto out; 2685 } 2686 2687 src_key = NULL; 2688 #ifdef CONFIG_IPV6_SUBTREES 2689 if (f6i->fib6_src.plen) 2690 src_key = saddr; 2691 #endif 2692 2693 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2694 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2695 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2696 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2697 2698 if (likely(!mtu)) { 2699 struct net_device *dev = nh->fib_nh_dev; 2700 2701 mtu = IPV6_MIN_MTU; 2702 idev = __in6_dev_get(dev); 2703 if (idev && idev->cnf.mtu6 > mtu) 2704 mtu = idev->cnf.mtu6; 2705 } 2706 2707 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2708 out: 2709 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); 2710 } 2711 2712 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2713 struct flowi6 *fl6) 2714 { 2715 struct dst_entry *dst; 2716 struct rt6_info *rt; 2717 struct inet6_dev *idev = in6_dev_get(dev); 2718 struct net *net = dev_net(dev); 2719 2720 if (unlikely(!idev)) 2721 return ERR_PTR(-ENODEV); 2722 2723 rt = ip6_dst_alloc(net, dev, 0); 2724 if (unlikely(!rt)) { 2725 in6_dev_put(idev); 2726 dst = ERR_PTR(-ENOMEM); 2727 goto out; 2728 } 2729 2730 rt->dst.flags |= DST_HOST; 2731 rt->dst.input = ip6_input; 2732 rt->dst.output = ip6_output; 2733 rt->rt6i_gateway = fl6->daddr; 2734 rt->rt6i_dst.addr = fl6->daddr; 2735 rt->rt6i_dst.plen = 128; 2736 rt->rt6i_idev = idev; 2737 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2738 2739 /* Add this dst into uncached_list so that rt6_disable_ip() can 2740 * do proper release of the net_device 2741 */ 2742 rt6_uncached_list_add(rt); 2743 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2744 2745 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2746 2747 out: 2748 return dst; 2749 } 2750 2751 static int ip6_dst_gc(struct dst_ops *ops) 2752 { 2753 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2754 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2755 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2756 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2757 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2758 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2759 int entries; 2760 2761 entries = dst_entries_get_fast(ops); 2762 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2763 entries <= rt_max_size) 2764 goto out; 2765 2766 net->ipv6.ip6_rt_gc_expire++; 2767 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2768 entries = dst_entries_get_slow(ops); 2769 if (entries < ops->gc_thresh) 2770 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2771 out: 2772 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2773 return entries > rt_max_size; 2774 } 2775 2776 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2777 struct fib6_config *cfg, 2778 const struct in6_addr *gw_addr, 2779 u32 tbid, int flags) 2780 { 2781 struct 
flowi6 fl6 = { 2782 .flowi6_oif = cfg->fc_ifindex, 2783 .daddr = *gw_addr, 2784 .saddr = cfg->fc_prefsrc, 2785 }; 2786 struct fib6_table *table; 2787 struct rt6_info *rt; 2788 2789 table = fib6_get_table(net, tbid); 2790 if (!table) 2791 return NULL; 2792 2793 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2794 flags |= RT6_LOOKUP_F_HAS_SADDR; 2795 2796 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2797 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2798 2799 /* if table lookup failed, fall back to full lookup */ 2800 if (rt == net->ipv6.ip6_null_entry) { 2801 ip6_rt_put(rt); 2802 rt = NULL; 2803 } 2804 2805 return rt; 2806 } 2807 2808 static int ip6_route_check_nh_onlink(struct net *net, 2809 struct fib6_config *cfg, 2810 const struct net_device *dev, 2811 struct netlink_ext_ack *extack) 2812 { 2813 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2814 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2815 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2816 struct fib6_info *from; 2817 struct rt6_info *grt; 2818 int err; 2819 2820 err = 0; 2821 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2822 if (grt) { 2823 rcu_read_lock(); 2824 from = rcu_dereference(grt->from); 2825 if (!grt->dst.error && 2826 /* ignore match if it is the default route */ 2827 from && !ipv6_addr_any(&from->fib6_dst.addr) && 2828 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2829 NL_SET_ERR_MSG(extack, 2830 "Nexthop has invalid gateway or device mismatch"); 2831 err = -EINVAL; 2832 } 2833 rcu_read_unlock(); 2834 2835 ip6_rt_put(grt); 2836 } 2837 2838 return err; 2839 } 2840 2841 static int ip6_route_check_nh(struct net *net, 2842 struct fib6_config *cfg, 2843 struct net_device **_dev, 2844 struct inet6_dev **idev) 2845 { 2846 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2847 struct net_device *dev = _dev ? *_dev : NULL; 2848 struct rt6_info *grt = NULL; 2849 int err = -EHOSTUNREACH; 2850 2851 if (cfg->fc_table) { 2852 int flags = RT6_LOOKUP_F_IFACE; 2853 2854 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2855 cfg->fc_table, flags); 2856 if (grt) { 2857 if (grt->rt6i_flags & RTF_GATEWAY || 2858 (dev && dev != grt->dst.dev)) { 2859 ip6_rt_put(grt); 2860 grt = NULL; 2861 } 2862 } 2863 } 2864 2865 if (!grt) 2866 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2867 2868 if (!grt) 2869 goto out; 2870 2871 if (dev) { 2872 if (dev != grt->dst.dev) { 2873 ip6_rt_put(grt); 2874 goto out; 2875 } 2876 } else { 2877 *_dev = dev = grt->dst.dev; 2878 *idev = grt->rt6i_idev; 2879 dev_hold(dev); 2880 in6_dev_hold(grt->rt6i_idev); 2881 } 2882 2883 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2884 err = 0; 2885 2886 ip6_rt_put(grt); 2887 2888 out: 2889 return err; 2890 } 2891 2892 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2893 struct net_device **_dev, struct inet6_dev **idev, 2894 struct netlink_ext_ack *extack) 2895 { 2896 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2897 int gwa_type = ipv6_addr_type(gw_addr); 2898 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2899 const struct net_device *dev = *_dev; 2900 bool need_addr_check = !dev; 2901 int err = -EINVAL; 2902 2903 /* if gw_addr is local we will fail to detect this in case 2904 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2905 * will return already-added prefix route via interface that 2906 * prefix route was assigned to, which might be non-loopback. 
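 * ipv6_chk_addr_and_flags() consults the address table directly and
 * sees TENTATIVE addresses too, which is why it is used here (and
 * again once the egress device is known, see need_addr_check) rather
 * than a route lookup.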
2907 */
2908 if (dev &&
2909 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2910 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2911 goto out;
2912 }
2913 
2914 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2915 /* IPv6 strictly forbids using non-link-local
2916 * addresses as a nexthop address.
2917 * Otherwise the router will not be able to send redirects.
2918 * That is generally right, but in some (rare!) circumstances
2919 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2920 * some exceptions. --ANK
2921 * We allow IPv4-mapped nexthops to support RFC 4798-style
2922 * addressing.
2923 */
2924 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2925 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2926 goto out;
2927 }
2928 
2929 if (cfg->fc_flags & RTNH_F_ONLINK)
2930 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2931 else
2932 err = ip6_route_check_nh(net, cfg, _dev, idev);
2933 
2934 if (err)
2935 goto out;
2936 }
2937 
2938 /* reload in case device was changed */
2939 dev = *_dev;
2940 
2941 err = -EINVAL;
2942 if (!dev) {
2943 NL_SET_ERR_MSG(extack, "Egress device not specified");
2944 goto out;
2945 } else if (dev->flags & IFF_LOOPBACK) {
2946 NL_SET_ERR_MSG(extack,
2947 "Egress device can not be loopback device for this route");
2948 goto out;
2949 }
2950 
2951 /* if we did not check gw_addr above, do so now that the
2952 * egress device has been resolved.
2953 */
2954 if (need_addr_check &&
2955 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2956 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2957 goto out;
2958 }
2959 
2960 err = 0;
2961 out:
2962 return err;
2963 }
2964 
2965 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2966 {
2967 if ((flags & RTF_REJECT) ||
2968 (dev && (dev->flags & IFF_LOOPBACK) &&
2969 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2970 !(flags & RTF_LOCAL)))
2971 return true;
2972 
2973 return false;
2974 }
2975 
2976 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2977 struct fib6_config *cfg, gfp_t gfp_flags,
2978 struct netlink_ext_ack *extack)
2979 {
2980 struct net_device *dev = NULL;
2981 struct inet6_dev *idev = NULL;
2982 int addr_type;
2983 int err;
2984 
2985 fib6_nh->fib_nh_family = AF_INET6;
2986 
2987 err = -ENODEV;
2988 if (cfg->fc_ifindex) {
2989 dev = dev_get_by_index(net, cfg->fc_ifindex);
2990 if (!dev)
2991 goto out;
2992 idev = in6_dev_get(dev);
2993 if (!idev)
2994 goto out;
2995 }
2996 
2997 if (cfg->fc_flags & RTNH_F_ONLINK) {
2998 if (!dev) {
2999 NL_SET_ERR_MSG(extack,
3000 "Nexthop device required for onlink");
3001 goto out;
3002 }
3003 
3004 if (!(dev->flags & IFF_UP)) {
3005 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3006 err = -ENETDOWN;
3007 goto out;
3008 }
3009 
3010 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3011 }
3012 
3013 fib6_nh->fib_nh_weight = 1;
3014 
3015 /* We cannot add true routes via loopback here,
3016 * they would result in kernel looping; promote them to reject routes
3017 */
3018 addr_type = ipv6_addr_type(&cfg->fc_dst);
3019 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3020 /* hold loopback dev/idev if we haven't done so.
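 * The references taken on the original dev/idev, if any, are
 * dropped first, so the shared error path still releases exactly
 * one reference.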
*/ 3021 if (dev != net->loopback_dev) { 3022 if (dev) { 3023 dev_put(dev); 3024 in6_dev_put(idev); 3025 } 3026 dev = net->loopback_dev; 3027 dev_hold(dev); 3028 idev = in6_dev_get(dev); 3029 if (!idev) { 3030 err = -ENODEV; 3031 goto out; 3032 } 3033 } 3034 goto set_dev; 3035 } 3036 3037 if (cfg->fc_flags & RTF_GATEWAY) { 3038 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3039 if (err) 3040 goto out; 3041 3042 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3043 fib6_nh->fib_nh_gw_family = AF_INET6; 3044 } 3045 3046 err = -ENODEV; 3047 if (!dev) 3048 goto out; 3049 3050 if (idev->cnf.disable_ipv6) { 3051 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3052 err = -EACCES; 3053 goto out; 3054 } 3055 3056 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { 3057 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3058 err = -ENETDOWN; 3059 goto out; 3060 } 3061 3062 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3063 !netif_carrier_ok(dev)) 3064 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 3065 3066 err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap, 3067 cfg->fc_encap_type, cfg, gfp_flags, extack); 3068 if (err) 3069 goto out; 3070 set_dev: 3071 fib6_nh->fib_nh_dev = dev; 3072 fib6_nh->fib_nh_oif = dev->ifindex; 3073 err = 0; 3074 out: 3075 if (idev) 3076 in6_dev_put(idev); 3077 3078 if (err) { 3079 lwtstate_put(fib6_nh->fib_nh_lws); 3080 fib6_nh->fib_nh_lws = NULL; 3081 if (dev) 3082 dev_put(dev); 3083 } 3084 3085 return err; 3086 } 3087 3088 void fib6_nh_release(struct fib6_nh *fib6_nh) 3089 { 3090 fib_nh_common_release(&fib6_nh->nh_common); 3091 } 3092 3093 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3094 gfp_t gfp_flags, 3095 struct netlink_ext_ack *extack) 3096 { 3097 struct net *net = cfg->fc_nlinfo.nl_net; 3098 struct fib6_info *rt = NULL; 3099 struct fib6_table *table; 3100 int err = -EINVAL; 3101 int addr_type; 3102 3103 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3104 if (cfg->fc_flags & RTF_PCPU) { 3105 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3106 goto out; 3107 } 3108 3109 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3110 if (cfg->fc_flags & RTF_CACHE) { 3111 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3112 goto out; 3113 } 3114 3115 if (cfg->fc_type > RTN_MAX) { 3116 NL_SET_ERR_MSG(extack, "Invalid route type"); 3117 goto out; 3118 } 3119 3120 if (cfg->fc_dst_len > 128) { 3121 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3122 goto out; 3123 } 3124 if (cfg->fc_src_len > 128) { 3125 NL_SET_ERR_MSG(extack, "Invalid source address length"); 3126 goto out; 3127 } 3128 #ifndef CONFIG_IPV6_SUBTREES 3129 if (cfg->fc_src_len) { 3130 NL_SET_ERR_MSG(extack, 3131 "Specifying source address requires IPV6_SUBTREES to be enabled"); 3132 goto out; 3133 } 3134 #endif 3135 3136 err = -ENOBUFS; 3137 if (cfg->fc_nlinfo.nlh && 3138 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3139 table = fib6_get_table(net, cfg->fc_table); 3140 if (!table) { 3141 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3142 table = fib6_new_table(net, cfg->fc_table); 3143 } 3144 } else { 3145 table = fib6_new_table(net, cfg->fc_table); 3146 } 3147 3148 if (!table) 3149 goto out; 3150 3151 err = -ENOMEM; 3152 rt = fib6_info_alloc(gfp_flags); 3153 if (!rt) 3154 goto out; 3155 3156 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 3157 extack); 3158 if (IS_ERR(rt->fib6_metrics)) { 3159 err = PTR_ERR(rt->fib6_metrics); 3160 /* Do not leave 
garbage there. */ 3161 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3162 goto out; 3163 } 3164 3165 if (cfg->fc_flags & RTF_ADDRCONF) 3166 rt->dst_nocount = true; 3167 3168 if (cfg->fc_flags & RTF_EXPIRES) 3169 fib6_set_expires(rt, jiffies + 3170 clock_t_to_jiffies(cfg->fc_expires)); 3171 else 3172 fib6_clean_expires(rt); 3173 3174 if (cfg->fc_protocol == RTPROT_UNSPEC) 3175 cfg->fc_protocol = RTPROT_BOOT; 3176 rt->fib6_protocol = cfg->fc_protocol; 3177 3178 rt->fib6_table = table; 3179 rt->fib6_metric = cfg->fc_metric; 3180 rt->fib6_type = cfg->fc_type; 3181 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3182 3183 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3184 rt->fib6_dst.plen = cfg->fc_dst_len; 3185 if (rt->fib6_dst.plen == 128) 3186 rt->dst_host = true; 3187 3188 #ifdef CONFIG_IPV6_SUBTREES 3189 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3190 rt->fib6_src.plen = cfg->fc_src_len; 3191 #endif 3192 err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); 3193 if (err) 3194 goto out; 3195 3196 /* We cannot add true routes via loopback here, 3197 * they would result in kernel looping; promote them to reject routes 3198 */ 3199 addr_type = ipv6_addr_type(&cfg->fc_dst); 3200 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) 3201 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3202 3203 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3204 struct net_device *dev = fib6_info_nh_dev(rt); 3205 3206 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3207 NL_SET_ERR_MSG(extack, "Invalid source address"); 3208 err = -EINVAL; 3209 goto out; 3210 } 3211 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3212 rt->fib6_prefsrc.plen = 128; 3213 } else 3214 rt->fib6_prefsrc.plen = 0; 3215 3216 return rt; 3217 out: 3218 fib6_info_release(rt); 3219 return ERR_PTR(err); 3220 } 3221 3222 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3223 struct netlink_ext_ack *extack) 3224 { 3225 struct fib6_info *rt; 3226 int err; 3227 3228 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3229 if (IS_ERR(rt)) 3230 return PTR_ERR(rt); 3231 3232 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3233 fib6_info_release(rt); 3234 3235 return err; 3236 } 3237 3238 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3239 { 3240 struct net *net = info->nl_net; 3241 struct fib6_table *table; 3242 int err; 3243 3244 if (rt == net->ipv6.fib6_null_entry) { 3245 err = -ENOENT; 3246 goto out; 3247 } 3248 3249 table = rt->fib6_table; 3250 spin_lock_bh(&table->tb6_lock); 3251 err = fib6_del(rt, info); 3252 spin_unlock_bh(&table->tb6_lock); 3253 3254 out: 3255 fib6_info_release(rt); 3256 return err; 3257 } 3258 3259 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3260 { 3261 struct nl_info info = { .nl_net = net }; 3262 3263 return __ip6_del_rt(rt, &info); 3264 } 3265 3266 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3267 { 3268 struct nl_info *info = &cfg->fc_nlinfo; 3269 struct net *net = info->nl_net; 3270 struct sk_buff *skb = NULL; 3271 struct fib6_table *table; 3272 int err = -ENOENT; 3273 3274 if (rt == net->ipv6.fib6_null_entry) 3275 goto out_put; 3276 table = rt->fib6_table; 3277 spin_lock_bh(&table->tb6_lock); 3278 3279 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3280 struct fib6_info *sibling, *next_sibling; 3281 3282 /* prefer to send a single notification with all hops */ 3283 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3284 if (skb) { 3285 u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; 3286 3287 if (rt6_fill_node(net, skb, rt, NULL, 3288 NULL, NULL, 0, RTM_DELROUTE, 3289 info->portid, seq, 0) < 0) { 3290 kfree_skb(skb); 3291 skb = NULL; 3292 } else 3293 info->skip_notify = 1; 3294 } 3295 3296 list_for_each_entry_safe(sibling, next_sibling, 3297 &rt->fib6_siblings, 3298 fib6_siblings) { 3299 err = fib6_del(sibling, info); 3300 if (err) 3301 goto out_unlock; 3302 } 3303 } 3304 3305 err = fib6_del(rt, info); 3306 out_unlock: 3307 spin_unlock_bh(&table->tb6_lock); 3308 out_put: 3309 fib6_info_release(rt); 3310 3311 if (skb) { 3312 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3313 info->nlh, gfp_any()); 3314 } 3315 return err; 3316 } 3317 3318 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3319 { 3320 int rc = -ESRCH; 3321 3322 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3323 goto out; 3324 3325 if (cfg->fc_flags & RTF_GATEWAY && 3326 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3327 goto out; 3328 3329 rc = rt6_remove_exception_rt(rt); 3330 out: 3331 return rc; 3332 } 3333 3334 static int ip6_route_del(struct fib6_config *cfg, 3335 struct netlink_ext_ack *extack) 3336 { 3337 struct rt6_info *rt_cache; 3338 struct fib6_table *table; 3339 struct fib6_info *rt; 3340 struct fib6_node *fn; 3341 int err = -ESRCH; 3342 3343 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3344 if (!table) { 3345 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3346 return err; 3347 } 3348 3349 rcu_read_lock(); 3350 3351 fn = fib6_locate(&table->tb6_root, 3352 &cfg->fc_dst, cfg->fc_dst_len, 3353 &cfg->fc_src, cfg->fc_src_len, 3354 !(cfg->fc_flags & RTF_CACHE)); 3355 3356 if (fn) { 3357 for_each_fib6_node_rt_rcu(fn) { 3358 struct fib6_nh *nh; 3359 3360 if (cfg->fc_flags & RTF_CACHE) { 3361 struct fib6_result res = { 3362 .f6i = rt, 3363 }; 3364 int rc; 3365 3366 rt_cache = rt6_find_cached_rt(&res, 3367 &cfg->fc_dst, 3368 &cfg->fc_src); 3369 if (rt_cache) { 3370 rc = ip6_del_cached_rt(rt_cache, cfg); 3371 if (rc != -ESRCH) { 3372 rcu_read_unlock(); 3373 return rc; 3374 } 3375 } 3376 continue; 3377 } 3378 3379 nh = &rt->fib6_nh; 3380 if (cfg->fc_ifindex && 3381 (!nh->fib_nh_dev || 3382 nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) 3383 continue; 3384 if (cfg->fc_flags & RTF_GATEWAY && 3385 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) 3386 continue; 3387 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3388 continue; 3389 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3390 continue; 3391 if (!fib6_info_hold_safe(rt)) 3392 continue; 3393 rcu_read_unlock(); 3394 3395 /* if gateway was specified only delete the one hop */ 3396 if (cfg->fc_flags & RTF_GATEWAY) 3397 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3398 3399 return __ip6_del_rt_siblings(rt, cfg); 3400 } 3401 } 3402 rcu_read_unlock(); 3403 3404 return err; 3405 } 3406 3407 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3408 { 3409 struct netevent_redirect netevent; 3410 struct rt6_info *rt, *nrt = NULL; 3411 struct fib6_result res = {}; 3412 struct ndisc_options ndopts; 3413 struct inet6_dev *in6_dev; 3414 struct neighbour *neigh; 3415 struct rd_msg *msg; 3416 int optlen, on_link; 3417 u8 *lladdr; 3418 3419 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3420 optlen -= sizeof(*msg); 3421 3422 if (optlen < 0) { 3423 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3424 return; 3425 } 3426 3427 msg = (struct rd_msg *)icmp6_hdr(skb); 3428 3429 if 
(ipv6_addr_is_multicast(&msg->dest)) {
3430 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3431 return;
3432 }
3433 
3434 on_link = 0;
3435 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3436 on_link = 1;
3437 } else if (ipv6_addr_type(&msg->target) !=
3438 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3439 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3440 return;
3441 }
3442 
3443 in6_dev = __in6_dev_get(skb->dev);
3444 if (!in6_dev)
3445 return;
3446 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3447 return;
3448 
3449 /* RFC 2461, 8.1:
3450 * The IP source address of the Redirect MUST be the same as the current
3451 * first-hop router for the specified ICMP Destination Address.
3452 */
3453 
3454 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3455 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3456 return;
3457 }
3458 
3459 lladdr = NULL;
3460 if (ndopts.nd_opts_tgt_lladdr) {
3461 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3462 skb->dev);
3463 if (!lladdr) {
3464 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3465 return;
3466 }
3467 }
3468 
3469 rt = (struct rt6_info *) dst;
3470 if (rt->rt6i_flags & RTF_REJECT) {
3471 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3472 return;
3473 }
3474 
3475 /* Redirect received -> path was valid.
3476 * Look, redirects are sent only in response to data packets,
3477 * so this nexthop apparently is reachable. --ANK
3478 */
3479 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3480 
3481 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3482 if (!neigh)
3483 return;
3484 
3485 /*
3486 * We have finally decided to accept it.
3487 */
3488 
3489 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3490 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3491 NEIGH_UPDATE_F_OVERRIDE|
3492 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3493 NEIGH_UPDATE_F_ISROUTER)),
3494 NDISC_REDIRECT, &ndopts);
3495 
3496 rcu_read_lock();
3497 res.f6i = rcu_dereference(rt->from);
3498 /* This fib6_info_hold() is safe here because we hold a reference to rt
3499 * and rt already holds a reference to the fib6_info.
3500 */
3501 fib6_info_hold(res.f6i);
3502 rcu_read_unlock();
3503 
3504 res.nh = &res.f6i->fib6_nh;
3505 res.fib6_flags = res.f6i->fib6_flags;
3506 res.fib6_type = res.f6i->fib6_type;
3507 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
3508 if (!nrt)
3509 goto out;
3510 
3511 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3512 if (on_link)
3513 nrt->rt6i_flags &= ~RTF_GATEWAY;
3514 
3515 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3516 
3517 /* No need to remove rt from the exception table if rt is
3518 * a cached route because rt6_insert_exception()
3519 * takes care of it
3520 */
3521 if (rt6_insert_exception(nrt, &res)) {
3522 dst_release_immediate(&nrt->dst);
3523 goto out;
3524 }
3525 
3526 netevent.old = &rt->dst;
3527 netevent.new = &nrt->dst;
3528 netevent.daddr = &msg->dest;
3529 netevent.neigh = neigh;
3530 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3531 
3532 out:
3533 fib6_info_release(res.f6i);
3534 neigh_release(neigh);
3535 }
3536 
3537 #ifdef CONFIG_IPV6_ROUTE_INFO
3538 static struct fib6_info *rt6_get_route_info(struct net *net,
3539 const struct in6_addr *prefix, int prefixlen,
3540 const struct in6_addr *gwaddr,
3541 struct net_device *dev)
3542 {
3543 u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_INFO; 3544 int ifindex = dev->ifindex; 3545 struct fib6_node *fn; 3546 struct fib6_info *rt = NULL; 3547 struct fib6_table *table; 3548 3549 table = fib6_get_table(net, tb_id); 3550 if (!table) 3551 return NULL; 3552 3553 rcu_read_lock(); 3554 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3555 if (!fn) 3556 goto out; 3557 3558 for_each_fib6_node_rt_rcu(fn) { 3559 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) 3560 continue; 3561 if (!(rt->fib6_flags & RTF_ROUTEINFO) || 3562 !rt->fib6_nh.fib_nh_gw_family) 3563 continue; 3564 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) 3565 continue; 3566 if (!fib6_info_hold_safe(rt)) 3567 continue; 3568 break; 3569 } 3570 out: 3571 rcu_read_unlock(); 3572 return rt; 3573 } 3574 3575 static struct fib6_info *rt6_add_route_info(struct net *net, 3576 const struct in6_addr *prefix, int prefixlen, 3577 const struct in6_addr *gwaddr, 3578 struct net_device *dev, 3579 unsigned int pref) 3580 { 3581 struct fib6_config cfg = { 3582 .fc_metric = IP6_RT_PRIO_USER, 3583 .fc_ifindex = dev->ifindex, 3584 .fc_dst_len = prefixlen, 3585 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3586 RTF_UP | RTF_PREF(pref), 3587 .fc_protocol = RTPROT_RA, 3588 .fc_type = RTN_UNICAST, 3589 .fc_nlinfo.portid = 0, 3590 .fc_nlinfo.nlh = NULL, 3591 .fc_nlinfo.nl_net = net, 3592 }; 3593 3594 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3595 cfg.fc_dst = *prefix; 3596 cfg.fc_gateway = *gwaddr; 3597 3598 /* We should treat it as a default route if prefix length is 0. */ 3599 if (!prefixlen) 3600 cfg.fc_flags |= RTF_DEFAULT; 3601 3602 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3603 3604 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3605 } 3606 #endif 3607 3608 struct fib6_info *rt6_get_dflt_router(struct net *net, 3609 const struct in6_addr *addr, 3610 struct net_device *dev) 3611 { 3612 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3613 struct fib6_info *rt; 3614 struct fib6_table *table; 3615 3616 table = fib6_get_table(net, tb_id); 3617 if (!table) 3618 return NULL; 3619 3620 rcu_read_lock(); 3621 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3622 struct fib6_nh *nh = &rt->fib6_nh; 3623 3624 if (dev == nh->fib_nh_dev && 3625 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3626 ipv6_addr_equal(&nh->fib_nh_gw6, addr)) 3627 break; 3628 } 3629 if (rt && !fib6_info_hold_safe(rt)) 3630 rt = NULL; 3631 rcu_read_unlock(); 3632 return rt; 3633 } 3634 3635 struct fib6_info *rt6_add_dflt_router(struct net *net, 3636 const struct in6_addr *gwaddr, 3637 struct net_device *dev, 3638 unsigned int pref) 3639 { 3640 struct fib6_config cfg = { 3641 .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT, 3642 .fc_metric = IP6_RT_PRIO_USER, 3643 .fc_ifindex = dev->ifindex, 3644 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3645 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3646 .fc_protocol = RTPROT_RA, 3647 .fc_type = RTN_UNICAST, 3648 .fc_nlinfo.portid = 0, 3649 .fc_nlinfo.nlh = NULL, 3650 .fc_nlinfo.nl_net = net, 3651 }; 3652 3653 cfg.fc_gateway = *gwaddr; 3654 3655 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3656 struct fib6_table *table; 3657 3658 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3659 if (table) 3660 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3661 } 3662 3663 return rt6_get_dflt_router(net, gwaddr, dev); 3664 } 3665 3666 static void __rt6_purge_dflt_routers(struct net *net, 3667 struct fib6_table *table) 3668 { 3669 struct fib6_info *rt; 3670 3671 restart: 3672 rcu_read_lock(); 3673 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3674 struct net_device *dev = fib6_info_nh_dev(rt); 3675 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3676 3677 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3678 (!idev || idev->cnf.accept_ra != 2) && 3679 fib6_info_hold_safe(rt)) { 3680 rcu_read_unlock(); 3681 ip6_del_rt(net, rt); 3682 goto restart; 3683 } 3684 } 3685 rcu_read_unlock(); 3686 3687 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3688 } 3689 3690 void rt6_purge_dflt_routers(struct net *net) 3691 { 3692 struct fib6_table *table; 3693 struct hlist_head *head; 3694 unsigned int h; 3695 3696 rcu_read_lock(); 3697 3698 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3699 head = &net->ipv6.fib_table_hash[h]; 3700 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3701 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3702 __rt6_purge_dflt_routers(net, table); 3703 } 3704 } 3705 3706 rcu_read_unlock(); 3707 } 3708 3709 static void rtmsg_to_fib6_config(struct net *net, 3710 struct in6_rtmsg *rtmsg, 3711 struct fib6_config *cfg) 3712 { 3713 *cfg = (struct fib6_config){ 3714 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3715 : RT6_TABLE_MAIN, 3716 .fc_ifindex = rtmsg->rtmsg_ifindex, 3717 .fc_metric = rtmsg->rtmsg_metric ? 
: IP6_RT_PRIO_USER, 3718 .fc_expires = rtmsg->rtmsg_info, 3719 .fc_dst_len = rtmsg->rtmsg_dst_len, 3720 .fc_src_len = rtmsg->rtmsg_src_len, 3721 .fc_flags = rtmsg->rtmsg_flags, 3722 .fc_type = rtmsg->rtmsg_type, 3723 3724 .fc_nlinfo.nl_net = net, 3725 3726 .fc_dst = rtmsg->rtmsg_dst, 3727 .fc_src = rtmsg->rtmsg_src, 3728 .fc_gateway = rtmsg->rtmsg_gateway, 3729 }; 3730 } 3731 3732 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3733 { 3734 struct fib6_config cfg; 3735 struct in6_rtmsg rtmsg; 3736 int err; 3737 3738 switch (cmd) { 3739 case SIOCADDRT: /* Add a route */ 3740 case SIOCDELRT: /* Delete a route */ 3741 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3742 return -EPERM; 3743 err = copy_from_user(&rtmsg, arg, 3744 sizeof(struct in6_rtmsg)); 3745 if (err) 3746 return -EFAULT; 3747 3748 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3749 3750 rtnl_lock(); 3751 switch (cmd) { 3752 case SIOCADDRT: 3753 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3754 break; 3755 case SIOCDELRT: 3756 err = ip6_route_del(&cfg, NULL); 3757 break; 3758 default: 3759 err = -EINVAL; 3760 } 3761 rtnl_unlock(); 3762 3763 return err; 3764 } 3765 3766 return -EINVAL; 3767 } 3768 3769 /* 3770 * Drop the packet on the floor 3771 */ 3772 3773 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3774 { 3775 int type; 3776 struct dst_entry *dst = skb_dst(skb); 3777 switch (ipstats_mib_noroutes) { 3778 case IPSTATS_MIB_INNOROUTES: 3779 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3780 if (type == IPV6_ADDR_ANY) { 3781 IP6_INC_STATS(dev_net(dst->dev), 3782 __in6_dev_get_safely(skb->dev), 3783 IPSTATS_MIB_INADDRERRORS); 3784 break; 3785 } 3786 /* FALLTHROUGH */ 3787 case IPSTATS_MIB_OUTNOROUTES: 3788 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3789 ipstats_mib_noroutes); 3790 break; 3791 } 3792 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3793 kfree_skb(skb); 3794 return 0; 3795 } 3796 3797 static int ip6_pkt_discard(struct sk_buff *skb) 3798 { 3799 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3800 } 3801 3802 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3803 { 3804 skb->dev = skb_dst(skb)->dev; 3805 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3806 } 3807 3808 static int ip6_pkt_prohibit(struct sk_buff *skb) 3809 { 3810 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3811 } 3812 3813 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3814 { 3815 skb->dev = skb_dst(skb)->dev; 3816 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3817 } 3818 3819 /* 3820 * Allocate a dst for local (unicast / anycast) address. 3821 */ 3822 3823 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3824 struct inet6_dev *idev, 3825 const struct in6_addr *addr, 3826 bool anycast, gfp_t gfp_flags) 3827 { 3828 struct fib6_config cfg = { 3829 .fc_table = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL,
3830 .fc_ifindex = idev->dev->ifindex,
3831 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3832 .fc_dst = *addr,
3833 .fc_dst_len = 128,
3834 .fc_protocol = RTPROT_KERNEL,
3835 .fc_nlinfo.nl_net = net,
3836 .fc_ignore_dev_down = true,
3837 };
3838 
3839 if (anycast) {
3840 cfg.fc_type = RTN_ANYCAST;
3841 cfg.fc_flags |= RTF_ANYCAST;
3842 } else {
3843 cfg.fc_type = RTN_LOCAL;
3844 cfg.fc_flags |= RTF_LOCAL;
3845 }
3846 
3847 return ip6_route_info_create(&cfg, gfp_flags, NULL);
3848 }
3849 
3850 /* remove a deleted IP from prefsrc entries */
3851 struct arg_dev_net_ip {
3852 struct net_device *dev;
3853 struct net *net;
3854 struct in6_addr *addr;
3855 };
3856 
3857 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3858 {
3859 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3860 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3861 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3862 
3863 if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3864 rt != net->ipv6.fib6_null_entry &&
3865 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3866 spin_lock_bh(&rt6_exception_lock);
3867 /* remove prefsrc entry */
3868 rt->fib6_prefsrc.plen = 0;
3869 spin_unlock_bh(&rt6_exception_lock);
3870 }
3871 return 0;
3872 }
3873 
3874 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3875 {
3876 struct net *net = dev_net(ifp->idev->dev);
3877 struct arg_dev_net_ip adni = {
3878 .dev = ifp->idev->dev,
3879 .net = net,
3880 .addr = &ifp->addr,
3881 };
3882 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3883 }
3884 
3885 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT)
3886 
3887 /* Remove routers and update dst entries when a gateway turns into a host. */
3888 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3889 {
3890 struct in6_addr *gateway = (struct in6_addr *)arg;
3891 
3892 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3893 rt->fib6_nh.fib_nh_gw_family &&
3894 ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3895 return -1;
3896 }
3897 
3898 /* Further clean up cached routes in exception table.
3899 * This is needed because a cached route may have a different
3900 * gateway than its 'parent' in the case of an ip redirect.
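 * Note the exception table is scanned even for routes that stay
 * in the tree; rt6_exceptions_clean_tohost() removes only the
 * cached entries whose gateway matches the address that stopped
 * being a router.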
3901 */ 3902 rt6_exceptions_clean_tohost(rt, gateway); 3903 3904 return 0; 3905 } 3906 3907 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3908 { 3909 fib6_clean_all(net, fib6_clean_tohost, gateway); 3910 } 3911 3912 struct arg_netdev_event { 3913 const struct net_device *dev; 3914 union { 3915 unsigned char nh_flags; 3916 unsigned long event; 3917 }; 3918 }; 3919 3920 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3921 { 3922 struct fib6_info *iter; 3923 struct fib6_node *fn; 3924 3925 fn = rcu_dereference_protected(rt->fib6_node, 3926 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3927 iter = rcu_dereference_protected(fn->leaf, 3928 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3929 while (iter) { 3930 if (iter->fib6_metric == rt->fib6_metric && 3931 rt6_qualify_for_ecmp(iter)) 3932 return iter; 3933 iter = rcu_dereference_protected(iter->fib6_next, 3934 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3935 } 3936 3937 return NULL; 3938 } 3939 3940 static bool rt6_is_dead(const struct fib6_info *rt) 3941 { 3942 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || 3943 (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && 3944 ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) 3945 return true; 3946 3947 return false; 3948 } 3949 3950 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3951 { 3952 struct fib6_info *iter; 3953 int total = 0; 3954 3955 if (!rt6_is_dead(rt)) 3956 total += rt->fib6_nh.fib_nh_weight; 3957 3958 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3959 if (!rt6_is_dead(iter)) 3960 total += iter->fib6_nh.fib_nh_weight; 3961 } 3962 3963 return total; 3964 } 3965 3966 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3967 { 3968 int upper_bound = -1; 3969 3970 if (!rt6_is_dead(rt)) { 3971 *weight += rt->fib6_nh.fib_nh_weight; 3972 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3973 total) - 1; 3974 } 3975 atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); 3976 } 3977 3978 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3979 { 3980 struct fib6_info *iter; 3981 int weight = 0; 3982 3983 rt6_upper_bound_set(rt, &weight, total); 3984 3985 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3986 rt6_upper_bound_set(iter, &weight, total); 3987 } 3988 3989 void rt6_multipath_rebalance(struct fib6_info *rt) 3990 { 3991 struct fib6_info *first; 3992 int total; 3993 3994 /* In case the entire multipath route was marked for flushing, 3995 * then there is no need to rebalance upon the removal of every 3996 * sibling route. 3997 */ 3998 if (!rt->fib6_nsiblings || rt->should_flush) 3999 return; 4000 4001 /* During lookup routes are evaluated in order, so we need to 4002 * make sure upper bounds are assigned from the first sibling 4003 * onwards. 
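 *
 * Worked example: sibling weights 2, 1 and 1 give total = 4, and
 * rt6_upper_bound_set() assigns cumulative bounds over the 31-bit
 * hash space:
 *
 *	nh0: (2 << 31) / 4 - 1 = 0x3fffffff
 *	nh1: (3 << 31) / 4 - 1 = 0x5fffffff
 *	nh2: (4 << 31) / 4 - 1 = 0x7fffffff
 *
 * A flow is steered to the first nexthop whose upper bound is >= its
 * 31-bit multipath hash, so each nexthop carries a share of flows
 * proportional to its weight.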
4004 */ 4005 first = rt6_multipath_first_sibling(rt); 4006 if (WARN_ON_ONCE(!first)) 4007 return; 4008 4009 total = rt6_multipath_total_weight(first); 4010 rt6_multipath_upper_bound_set(first, total); 4011 } 4012 4013 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 4014 { 4015 const struct arg_netdev_event *arg = p_arg; 4016 struct net *net = dev_net(arg->dev); 4017 4018 if (rt != net->ipv6.fib6_null_entry && 4019 rt->fib6_nh.fib_nh_dev == arg->dev) { 4020 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; 4021 fib6_update_sernum_upto_root(net, rt); 4022 rt6_multipath_rebalance(rt); 4023 } 4024 4025 return 0; 4026 } 4027 4028 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) 4029 { 4030 struct arg_netdev_event arg = { 4031 .dev = dev, 4032 { 4033 .nh_flags = nh_flags, 4034 }, 4035 }; 4036 4037 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 4038 arg.nh_flags |= RTNH_F_LINKDOWN; 4039 4040 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 4041 } 4042 4043 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 4044 const struct net_device *dev) 4045 { 4046 struct fib6_info *iter; 4047 4048 if (rt->fib6_nh.fib_nh_dev == dev) 4049 return true; 4050 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4051 if (iter->fib6_nh.fib_nh_dev == dev) 4052 return true; 4053 4054 return false; 4055 } 4056 4057 static void rt6_multipath_flush(struct fib6_info *rt) 4058 { 4059 struct fib6_info *iter; 4060 4061 rt->should_flush = 1; 4062 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4063 iter->should_flush = 1; 4064 } 4065 4066 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 4067 const struct net_device *down_dev) 4068 { 4069 struct fib6_info *iter; 4070 unsigned int dead = 0; 4071 4072 if (rt->fib6_nh.fib_nh_dev == down_dev || 4073 rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 4074 dead++; 4075 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4076 if (iter->fib6_nh.fib_nh_dev == down_dev || 4077 iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 4078 dead++; 4079 4080 return dead; 4081 } 4082 4083 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4084 const struct net_device *dev, 4085 unsigned char nh_flags) 4086 { 4087 struct fib6_info *iter; 4088 4089 if (rt->fib6_nh.fib_nh_dev == dev) 4090 rt->fib6_nh.fib_nh_flags |= nh_flags; 4091 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4092 if (iter->fib6_nh.fib_nh_dev == dev) 4093 iter->fib6_nh.fib_nh_flags |= nh_flags; 4094 } 4095 4096 /* called with write lock held for table with rt */ 4097 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4098 { 4099 const struct arg_netdev_event *arg = p_arg; 4100 const struct net_device *dev = arg->dev; 4101 struct net *net = dev_net(dev); 4102 4103 if (rt == net->ipv6.fib6_null_entry) 4104 return 0; 4105 4106 switch (arg->event) { 4107 case NETDEV_UNREGISTER: 4108 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; 4109 case NETDEV_DOWN: 4110 if (rt->should_flush) 4111 return -1; 4112 if (!rt->fib6_nsiblings) 4113 return rt->fib6_nh.fib_nh_dev == dev ? 
-1 : 0;
4114 if (rt6_multipath_uses_dev(rt, dev)) {
4115 unsigned int count;
4116 
4117 count = rt6_multipath_dead_count(rt, dev);
4118 if (rt->fib6_nsiblings + 1 == count) {
4119 rt6_multipath_flush(rt);
4120 return -1;
4121 }
4122 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4123 RTNH_F_LINKDOWN);
4124 fib6_update_sernum(net, rt);
4125 rt6_multipath_rebalance(rt);
4126 }
4127 return -2;
4128 case NETDEV_CHANGE:
4129 if (rt->fib6_nh.fib_nh_dev != dev ||
4130 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4131 break;
4132 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4133 rt6_multipath_rebalance(rt);
4134 break;
4135 }
4136 
4137 return 0;
4138 }
4139 
4140 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4141 {
4142 struct arg_netdev_event arg = {
4143 .dev = dev,
4144 {
4145 .event = event,
4146 },
4147 };
4148 struct net *net = dev_net(dev);
4149 
4150 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4151 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4152 else
4153 fib6_clean_all(net, fib6_ifdown, &arg);
4154 }
4155 
4156 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4157 {
4158 rt6_sync_down_dev(dev, event);
4159 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4160 neigh_ifdown(&nd_tbl, dev);
4161 }
4162 
4163 struct rt6_mtu_change_arg {
4164 struct net_device *dev;
4165 unsigned int mtu;
4166 };
4167 
4168 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4169 {
4170 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4171 struct inet6_dev *idev;
4172 
4173 /* In IPv6, PMTU discovery is not optional,
4174 so an RTAX_MTU lock cannot disable it.
4175 We still use the lock to block changes
4176 caused by addrconf/ndisc.
4177 */
4178 
4179 idev = __in6_dev_get(arg->dev);
4180 if (!idev)
4181 return 0;
4182 
4183 /* For an administrative MTU increase there is no way to discover
4184 an IPv6 PMTU increase, so the PMTU increase should be applied here.
4185 Since RFC 1981 doesn't cover administrative MTU increases,
4186 updating the PMTU on an increase is a MUST (e.g. jumbo frames).
4187 */
4188 if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4189 !fib6_metric_locked(rt, RTAX_MTU)) {
4190 u32 mtu = rt->fib6_pmtu;
4191 
4192 if (mtu >= arg->mtu ||
4193 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4194 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4195 
4196 spin_lock_bh(&rt6_exception_lock);
4197 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4198 spin_unlock_bh(&rt6_exception_lock);
4199 }
4200 return 0;
4201 }
4202 
4203 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4204 {
4205 struct rt6_mtu_change_arg arg = {
4206 .dev = dev,
4207 .mtu = mtu,
4208 };
4209 
4210 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4211 }
4212 
4213 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4214 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4215 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4216 [RTA_OIF] = { .type = NLA_U32 },
4217 [RTA_IIF] = { .type = NLA_U32 },
4218 [RTA_PRIORITY] = { .type = NLA_U32 },
4219 [RTA_METRICS] = { .type = NLA_NESTED },
4220 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4221 [RTA_PREF] = { .type = NLA_U8 },
4222 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4223 [RTA_ENCAP] = { .type = NLA_NESTED },
4224 [RTA_EXPIRES] = { .type = NLA_U32 },
4225 [RTA_UID] = { .type = NLA_U32 },
4226 [RTA_MARK] = { .type = NLA_U32 },
4227 [RTA_TABLE] = { .type = NLA_U32 },
4228 [RTA_IP_PROTO] = { .type = NLA_U8 },
4229 [RTA_SPORT] = { .type = NLA_U16 },
4230 [RTA_DPORT] = { .type = NLA_U16 },
4231 };
4232 
4233 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4234 struct fib6_config *cfg,
4235 struct netlink_ext_ack *extack)
4236 {
4237 struct rtmsg *rtm;
4238 struct nlattr *tb[RTA_MAX+1];
4239 unsigned int pref;
4240 int err;
4241 
4242 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4243 rtm_ipv6_policy, extack);
4244 if (err < 0)
4245 goto errout;
4246 
4247 err = -EINVAL;
4248 rtm = nlmsg_data(nlh);
4249 
4250 *cfg = (struct fib6_config){
4251 .fc_table = rtm->rtm_table,
4252 .fc_dst_len = rtm->rtm_dst_len,
4253 .fc_src_len = rtm->rtm_src_len,
4254 .fc_flags = RTF_UP,
4255 .fc_protocol = rtm->rtm_protocol,
4256 .fc_type = rtm->rtm_type,
4257 
4258 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4259 .fc_nlinfo.nlh = nlh,
4260 .fc_nlinfo.nl_net = sock_net(skb->sk),
4261 };
4262 
4263 if (rtm->rtm_type == RTN_UNREACHABLE ||
4264 rtm->rtm_type == RTN_BLACKHOLE ||
4265 rtm->rtm_type == RTN_PROHIBIT ||
4266 rtm->rtm_type == RTN_THROW)
4267 cfg->fc_flags |= RTF_REJECT;
4268 
4269 if (rtm->rtm_type == RTN_LOCAL)
4270 cfg->fc_flags |= RTF_LOCAL;
4271 
4272 if (rtm->rtm_flags & RTM_F_CLONED)
4273 cfg->fc_flags |= RTF_CACHE;
4274 
4275 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4276 
4277 if (tb[RTA_GATEWAY]) {
4278 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4279 cfg->fc_flags |= RTF_GATEWAY;
4280 }
4281 if (tb[RTA_VIA]) {
4282 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4283 goto errout;
4284 }
4285 
4286 if (tb[RTA_DST]) {
4287 int plen = (rtm->rtm_dst_len + 7) >> 3;
4288 
4289 if (nla_len(tb[RTA_DST]) < plen)
4290 goto errout;
4291 
4292 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4293 }
4294 
4295 if (tb[RTA_SRC]) {
4296 int plen = (rtm->rtm_src_len + 7) >> 3;
4297 
4298 if (nla_len(tb[RTA_SRC]) < plen)
4299 goto errout;
4300 
4301 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4302 }
4303 
4304 if (tb[RTA_PREFSRC])
4305 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4306 
4307 if (tb[RTA_OIF])
4308 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4309 
4310 if

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
				     rtm_ipv6_policy, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}

struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};

static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
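
/* Added sketch (an assumption for illustration, not from the original
 * source): the RTA_MULTIPATH payload parsed below is a stream of
 * rtnexthop entries, each optionally followed by per-nexthop
 * attributes:
 *
 *	struct rtnexthop { rtnh_len, rtnh_flags, rtnh_hops, rtnh_ifindex }
 *	[ RTA_GATEWAY / RTA_ENCAP / RTA_ENCAP_TYPE ... ]
 *	struct rtnexthop { ... }
 *	...
 *
 * which is what iproute2 emits for e.g.
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 \
 *		nexthop via fe80::2 dev eth1
 */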

static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
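
/* Added note (not in the original source): deletion mirrors the add
 * path above but is simpler, each nexthop is deleted as an individual
 * route and the last failure, if any, is what gets reported back.
 */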

static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_metric == 0)
		cfg.fc_metric = IP6_RT_PRIO_USER;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
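
/* Added note (not in the original source): this is a worst-case size
 * estimate used when allocating the notification skb; overestimating
 * only wastes a little memory, while underestimating would make
 * rt6_fill_node() fail with -EMSGSIZE (see the WARN_ON in
 * inet6_rt_notify() further down).
 */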

static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
	       + nexthop_len;
}
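
/* Added note (not in the original source): rt6_fill_node() serializes
 * either a FIB entry (rt) or a cached dst (rt6, when dst is non-NULL)
 * into one rtnetlink message. It is shared by route dumps,
 * RTM_GETROUTE replies and add/delete notifications, which is why
 * most fields are taken from whichever of the two objects is present.
 */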

static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
				    rt->fib6_nh.fib_nh_weight) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
					    sibling->fib6_nh.fib_nh_weight) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		unsigned char nh_flags = 0;

		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
				     &nh_flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static bool fib6_info_uses_dev(const struct fib6_info *f6i,
			       const struct net_device *dev)
{
	if (f6i->fib6_nh.fib_nh_dev == dev)
		return true;

	if (f6i->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;

		list_for_each_entry_safe(sibling, next_sibling,
					 &f6i->fib6_siblings, fib6_siblings) {
			if (sibling->fib6_nh.fib_nh_dev == dev)
				return true;
		}
	}

	return false;
}
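
/* Added note (an assumption about the fib walker contract, not from
 * the original source): a positive return below tells the walker to
 * continue without emitting this route, while a negative return
 * (e.g. -EMSGSIZE from rt6_fill_node()) suspends the dump so it can
 * resume in the next skb.
 */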

int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return 1;
	}
	if (filter->filter_set) {
		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
			return 1;
		}
		flags |= NLM_F_DUMP_FILTERED;
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, flags);
}
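
/* Added note (not in the original source): the strict header checks
 * below only apply to sockets that opted in via NETLINK_GET_STRICT_CHK;
 * legacy requests take the early nlmsg_parse_deprecated() return and
 * skip the header validation entirely.
 */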

static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv6_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
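
/* Hypothetical usage (not in the original source): this handler backs
 * route queries such as
 *
 *	ip -6 route get 2001:db8::1
 *	ip -6 route get fibmatch 2001:db8::1
 *
 * where "fibmatch" sets RTM_F_FIB_MATCH so the reply describes the
 * matching FIB entry rather than the resolved dst entry.
 */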

static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
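
/* Added note (not in the original source): inet6_rt_notify() is the
 * single point through which route add/replace/delete events reach
 * the RTNLGRP_IPV6_ROUTE multicast group, i.e. what a listener such
 * as "ip -6 monitor route" receives.
 */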

void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL
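
/* Hypothetical usage (not in the original source): the write-only
 * "flush" entry below is what
 *
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 *
 * ends up in; the value written is used as the delay for the
 * garbage-collection pass that drops cached routes.
 */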

static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void __user *buffer, size_t *lenp,
				     loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

static int zero;
static int one = 1;

static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "skip_notify_on_dev_down",
		.data		= &init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
		.extra1		= &zero,
		.extra2		= &one,
	},
	{ }
};

struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif
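
/* Added note (not in the original source): everything from here on is
 * per-network-namespace setup; each netns gets its own copies of the
 * null/prohibit/blackhole template routes and its own sysctl values,
 * and the error labels unwind the allocations in reverse order.
 */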

static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
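
/* Added note (not in the original source): ip6_route_init() registers
 * its dependencies in order (dst cache, pernet ops, FIB, xfrm, policy
 * rules, rtnetlink handlers, netdev notifier); the labels at the
 * bottom unwind them in exactly the reverse order, and
 * ip6_route_cleanup() repeats that teardown for module unload.
 */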

int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}