1 /* 2 * Linux INET6 implementation 3 * FIB front-end. 4 * 5 * Authors: 6 * Pedro Roque <roque@di.fc.ul.pt> 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public License 10 * as published by the Free Software Foundation; either version 11 * 2 of the License, or (at your option) any later version. 12 */ 13 14 /* Changes: 15 * 16 * YOSHIFUJI Hideaki @USAGI 17 * reworked default router selection. 18 * - respect outgoing interface 19 * - select from (probably) reachable routers (i.e. 20 * routers in REACHABLE, STALE, DELAY or PROBE states). 21 * - always select the same router if it is (probably) 22 * reachable. otherwise, round-robin the list. 23 * Ville Nuorvala 24 * Fixed routing subtrees. 25 */ 26 27 #define pr_fmt(fmt) "IPv6: " fmt 28 29 #include <linux/capability.h> 30 #include <linux/errno.h> 31 #include <linux/export.h> 32 #include <linux/types.h> 33 #include <linux/times.h> 34 #include <linux/socket.h> 35 #include <linux/sockios.h> 36 #include <linux/net.h> 37 #include <linux/route.h> 38 #include <linux/netdevice.h> 39 #include <linux/in6.h> 40 #include <linux/mroute6.h> 41 #include <linux/init.h> 42 #include <linux/if_arp.h> 43 #include <linux/proc_fs.h> 44 #include <linux/seq_file.h> 45 #include <linux/nsproxy.h> 46 #include <linux/slab.h> 47 #include <linux/jhash.h> 48 #include <net/net_namespace.h> 49 #include <net/snmp.h> 50 #include <net/ipv6.h> 51 #include <net/ip6_fib.h> 52 #include <net/ip6_route.h> 53 #include <net/ndisc.h> 54 #include <net/addrconf.h> 55 #include <net/tcp.h> 56 #include <linux/rtnetlink.h> 57 #include <net/dst.h> 58 #include <net/dst_metadata.h> 59 #include <net/xfrm.h> 60 #include <net/netevent.h> 61 #include <net/netlink.h> 62 #include <net/nexthop.h> 63 #include <net/lwtunnel.h> 64 #include <net/ip_tunnels.h> 65 #include <net/l3mdev.h> 66 #include <net/ip.h> 67 #include <linux/uaccess.h> 68 69 #ifdef CONFIG_SYSCTL 70 #include <linux/sysctl.h> 71 #endif 72 73 static int ip6_rt_type_to_error(u8 fib6_type); 74 75 #define CREATE_TRACE_POINTS 76 #include <trace/events/fib6.h> 77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); 78 #undef CREATE_TRACE_POINTS 79 80 enum rt6_nud_state { 81 RT6_NUD_FAIL_HARD = -3, 82 RT6_NUD_FAIL_PROBE = -2, 83 RT6_NUD_FAIL_DO_RR = -1, 84 RT6_NUD_SUCCEED = 1 85 }; 86 87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); 88 static unsigned int ip6_default_advmss(const struct dst_entry *dst); 89 static unsigned int ip6_mtu(const struct dst_entry *dst); 90 static struct dst_entry *ip6_negative_advice(struct dst_entry *); 91 static void ip6_dst_destroy(struct dst_entry *); 92 static void ip6_dst_ifdown(struct dst_entry *, 93 struct net_device *dev, int how); 94 static int ip6_dst_gc(struct dst_ops *ops); 95 96 static int ip6_pkt_discard(struct sk_buff *skb); 97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); 98 static int ip6_pkt_prohibit(struct sk_buff *skb); 99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); 100 static void ip6_link_failure(struct sk_buff *skb); 101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 102 struct sk_buff *skb, u32 mtu); 103 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, 104 struct sk_buff *skb); 105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict); 106 static size_t rt6_nlmsg_size(struct fib6_info *rt); 107 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 108 struct fib6_info *rt, struct dst_entry *dst, 109 struct in6_addr *dest, struct in6_addr *src, 110 int iif, int type, u32 portid, u32 seq, 111 unsigned int flags); 112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 113 struct in6_addr *daddr, 114 struct in6_addr *saddr); 115 116 #ifdef CONFIG_IPV6_ROUTE_INFO 117 static struct fib6_info *rt6_add_route_info(struct net *net, 118 const struct in6_addr *prefix, int prefixlen, 119 const struct in6_addr *gwaddr, 120 struct net_device *dev, 121 unsigned int pref); 122 static struct fib6_info *rt6_get_route_info(struct net *net, 123 const struct in6_addr *prefix, int prefixlen, 124 const struct in6_addr *gwaddr, 125 struct net_device *dev); 126 #endif 127 128 struct uncached_list { 129 spinlock_t lock; 130 struct list_head head; 131 }; 132 133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); 134 135 void rt6_uncached_list_add(struct rt6_info *rt) 136 { 137 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); 138 139 rt->rt6i_uncached_list = ul; 140 141 spin_lock_bh(&ul->lock); 142 list_add_tail(&rt->rt6i_uncached, &ul->head); 143 spin_unlock_bh(&ul->lock); 144 } 145 146 void rt6_uncached_list_del(struct rt6_info *rt) 147 { 148 if (!list_empty(&rt->rt6i_uncached)) { 149 struct uncached_list *ul = rt->rt6i_uncached_list; 150 struct net *net = dev_net(rt->dst.dev); 151 152 spin_lock_bh(&ul->lock); 153 list_del(&rt->rt6i_uncached); 154 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); 155 spin_unlock_bh(&ul->lock); 156 } 157 } 158 159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) 160 { 161 struct net_device *loopback_dev = net->loopback_dev; 162 int cpu; 163 164 if (dev == loopback_dev) 165 return; 166 167 for_each_possible_cpu(cpu) { 168 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 169 struct rt6_info *rt; 170 171 spin_lock_bh(&ul->lock); 172 list_for_each_entry(rt, &ul->head, rt6i_uncached) { 173 struct inet6_dev *rt_idev = rt->rt6i_idev; 174 struct net_device *rt_dev = rt->dst.dev; 175 176 if (rt_idev->dev == dev) { 177 rt->rt6i_idev = in6_dev_get(loopback_dev); 178 in6_dev_put(rt_idev); 179 } 180 181 if (rt_dev == dev) { 182 rt->dst.dev = loopback_dev; 183 dev_hold(rt->dst.dev); 184 dev_put(rt_dev); 185 } 186 } 187 spin_unlock_bh(&ul->lock); 188 } 189 } 190 191 static inline const void *choose_neigh_daddr(const struct in6_addr *p, 192 struct sk_buff *skb, 193 const void *daddr) 194 { 195 if (!ipv6_addr_any(p)) 196 return (const void *) p; 197 else if (skb) 198 return &ipv6_hdr(skb)->daddr; 199 return daddr; 200 } 201 202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, 203 struct net_device *dev, 204 struct sk_buff *skb, 205 const void *daddr) 206 { 207 struct neighbour *n; 208 209 daddr = choose_neigh_daddr(gw, skb, daddr); 210 n = __ipv6_neigh_lookup(dev, daddr); 211 if (n) 212 return n; 213 214 n = neigh_create(&nd_tbl, daddr, dev); 215 return IS_ERR(n) ? NULL : n; 216 } 217 218 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, 219 struct sk_buff *skb, 220 const void *daddr) 221 { 222 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); 223 224 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr); 225 } 226 227 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) 228 { 229 struct net_device *dev = dst->dev; 230 struct rt6_info *rt = (struct rt6_info *)dst; 231 232 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); 233 if (!daddr) 234 return; 235 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) 236 return; 237 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) 238 return; 239 __ipv6_confirm_neigh(dev, daddr); 240 } 241 242 static struct dst_ops ip6_dst_ops_template = { 243 .family = AF_INET6, 244 .gc = ip6_dst_gc, 245 .gc_thresh = 1024, 246 .check = ip6_dst_check, 247 .default_advmss = ip6_default_advmss, 248 .mtu = ip6_mtu, 249 .cow_metrics = dst_cow_metrics_generic, 250 .destroy = ip6_dst_destroy, 251 .ifdown = ip6_dst_ifdown, 252 .negative_advice = ip6_negative_advice, 253 .link_failure = ip6_link_failure, 254 .update_pmtu = ip6_rt_update_pmtu, 255 .redirect = rt6_do_redirect, 256 .local_out = __ip6_local_out, 257 .neigh_lookup = ip6_dst_neigh_lookup, 258 .confirm_neigh = ip6_confirm_neigh, 259 }; 260 261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) 262 { 263 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 264 265 return mtu ? : dst->dev->mtu; 266 } 267 268 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 269 struct sk_buff *skb, u32 mtu) 270 { 271 } 272 273 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 274 struct sk_buff *skb) 275 { 276 } 277 278 static struct dst_ops ip6_dst_blackhole_ops = { 279 .family = AF_INET6, 280 .destroy = ip6_dst_destroy, 281 .check = ip6_dst_check, 282 .mtu = ip6_blackhole_mtu, 283 .default_advmss = ip6_default_advmss, 284 .update_pmtu = ip6_rt_blackhole_update_pmtu, 285 .redirect = ip6_rt_blackhole_redirect, 286 .cow_metrics = dst_cow_metrics_generic, 287 .neigh_lookup = ip6_dst_neigh_lookup, 288 }; 289 290 static const u32 ip6_template_metrics[RTAX_MAX] = { 291 [RTAX_HOPLIMIT - 1] = 0, 292 }; 293 294 static const struct fib6_info fib6_null_entry_template = { 295 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), 296 .fib6_protocol = RTPROT_KERNEL, 297 .fib6_metric = ~(u32)0, 298 .fib6_ref = ATOMIC_INIT(1), 299 .fib6_type = RTN_UNREACHABLE, 300 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, 301 }; 302 303 static const struct rt6_info ip6_null_entry_template = { 304 .dst = { 305 .__refcnt = ATOMIC_INIT(1), 306 .__use = 1, 307 .obsolete = DST_OBSOLETE_FORCE_CHK, 308 .error = -ENETUNREACH, 309 .input = ip6_pkt_discard, 310 .output = ip6_pkt_discard_out, 311 }, 312 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 313 }; 314 315 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 316 317 static const struct rt6_info ip6_prohibit_entry_template = { 318 .dst = { 319 .__refcnt = ATOMIC_INIT(1), 320 .__use = 1, 321 .obsolete = DST_OBSOLETE_FORCE_CHK, 322 .error = -EACCES, 323 .input = ip6_pkt_prohibit, 324 .output = ip6_pkt_prohibit_out, 325 }, 326 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 327 }; 328 329 static const struct rt6_info ip6_blk_hole_entry_template = { 330 .dst = { 331 .__refcnt = ATOMIC_INIT(1), 332 .__use = 1, 333 .obsolete = DST_OBSOLETE_FORCE_CHK, 334 .error = -EINVAL, 335 .input = dst_discard, 336 .output = dst_discard_out, 337 }, 338 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 339 }; 340 341 #endif 342 343 static void rt6_info_init(struct rt6_info *rt) 344 { 345 struct dst_entry *dst = &rt->dst; 346 347 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); 348 INIT_LIST_HEAD(&rt->rt6i_uncached); 349 } 350 351 /* allocate dst with ip6_dst_ops */ 352 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, 353 int flags) 354 { 355 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 356 1, DST_OBSOLETE_FORCE_CHK, flags); 357 358 if (rt) { 359 rt6_info_init(rt); 360 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 361 } 362 363 return rt; 364 } 365 EXPORT_SYMBOL(ip6_dst_alloc); 366 367 static void ip6_dst_destroy(struct dst_entry *dst) 368 { 369 struct rt6_info *rt = (struct rt6_info *)dst; 370 struct fib6_info *from; 371 struct inet6_dev *idev; 372 373 ip_dst_metrics_put(dst); 374 rt6_uncached_list_del(rt); 375 376 idev = rt->rt6i_idev; 377 if (idev) { 378 rt->rt6i_idev = NULL; 379 in6_dev_put(idev); 380 } 381 382 rcu_read_lock(); 383 from = rcu_dereference(rt->from); 384 rcu_assign_pointer(rt->from, NULL); 385 fib6_info_release(from); 386 rcu_read_unlock(); 387 } 388 389 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 390 int how) 391 { 392 struct rt6_info *rt = (struct rt6_info *)dst; 393 struct inet6_dev *idev = rt->rt6i_idev; 394 struct net_device *loopback_dev = 395 dev_net(dev)->loopback_dev; 396 397 if (idev && idev->dev != loopback_dev) { 398 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); 399 if (loopback_idev) { 400 rt->rt6i_idev = loopback_idev; 401 in6_dev_put(idev); 402 } 403 } 404 } 405 406 static bool __rt6_check_expired(const struct rt6_info *rt) 407 { 408 if (rt->rt6i_flags & RTF_EXPIRES) 409 return time_after(jiffies, rt->dst.expires); 410 else 411 return false; 412 } 413 414 static bool rt6_check_expired(const struct rt6_info *rt) 415 { 416 struct fib6_info *from; 417 418 from = rcu_dereference(rt->from); 419 420 if (rt->rt6i_flags & RTF_EXPIRES) { 421 if (time_after(jiffies, rt->dst.expires)) 422 return true; 423 } else if (from) { 424 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || 425 fib6_check_expired(from); 426 } 427 return false; 428 } 429 430 struct fib6_info *fib6_multipath_select(const struct net *net, 431 struct fib6_info *match, 432 struct flowi6 *fl6, int oif, 433 const struct sk_buff *skb, 434 int strict) 435 { 436 struct fib6_info *sibling, *next_sibling; 437 438 /* We might have already computed the hash for ICMPv6 errors. In such 439 * case it will always be non-zero. Otherwise now is the time to do it. 440 */ 441 if (!fl6->mp_hash) 442 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); 443 444 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound)) 445 return match; 446 447 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, 448 fib6_siblings) { 449 int nh_upper_bound; 450 451 nh_upper_bound = atomic_read(&sibling->fib6_nh.fib_nh_upper_bound); 452 if (fl6->mp_hash > nh_upper_bound) 453 continue; 454 if (rt6_score_route(sibling, oif, strict) < 0) 455 break; 456 match = sibling; 457 break; 458 } 459 460 return match; 461 } 462 463 /* 464 * Route lookup. rcu_read_lock() should be held. 465 */ 466 467 static inline struct fib6_info *rt6_device_match(struct net *net, 468 struct fib6_info *rt, 469 const struct in6_addr *saddr, 470 int oif, 471 int flags) 472 { 473 struct fib6_info *sprt; 474 475 if (!oif && ipv6_addr_any(saddr) && 476 !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)) 477 return rt; 478 479 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) { 480 const struct net_device *dev = sprt->fib6_nh.fib_nh_dev; 481 482 if (sprt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 483 continue; 484 485 if (oif) { 486 if (dev->ifindex == oif) 487 return sprt; 488 } else { 489 if (ipv6_chk_addr(net, saddr, dev, 490 flags & RT6_LOOKUP_F_IFACE)) 491 return sprt; 492 } 493 } 494 495 if (oif && flags & RT6_LOOKUP_F_IFACE) 496 return net->ipv6.fib6_null_entry; 497 498 return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt; 499 } 500 501 #ifdef CONFIG_IPV6_ROUTER_PREF 502 struct __rt6_probe_work { 503 struct work_struct work; 504 struct in6_addr target; 505 struct net_device *dev; 506 }; 507 508 static void rt6_probe_deferred(struct work_struct *w) 509 { 510 struct in6_addr mcaddr; 511 struct __rt6_probe_work *work = 512 container_of(w, struct __rt6_probe_work, work); 513 514 addrconf_addr_solict_mult(&work->target, &mcaddr); 515 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); 516 dev_put(work->dev); 517 kfree(work); 518 } 519 520 static void rt6_probe(struct fib6_info *rt) 521 { 522 struct __rt6_probe_work *work = NULL; 523 const struct in6_addr *nh_gw; 524 struct neighbour *neigh; 525 struct net_device *dev; 526 struct inet6_dev *idev; 527 528 /* 529 * Okay, this does not seem to be appropriate 530 * for now, however, we need to check if it 531 * is really so; aka Router Reachability Probing. 532 * 533 * Router Reachability Probe MUST be rate-limited 534 * to no more than one per minute. 535 */ 536 if (!rt || !rt->fib6_nh.fib_nh_has_gw) 537 return; 538 539 nh_gw = &rt->fib6_nh.fib_nh_gw6; 540 dev = rt->fib6_nh.fib_nh_dev; 541 rcu_read_lock_bh(); 542 idev = __in6_dev_get(dev); 543 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); 544 if (neigh) { 545 if (neigh->nud_state & NUD_VALID) 546 goto out; 547 548 write_lock(&neigh->lock); 549 if (!(neigh->nud_state & NUD_VALID) && 550 time_after(jiffies, 551 neigh->updated + idev->cnf.rtr_probe_interval)) { 552 work = kmalloc(sizeof(*work), GFP_ATOMIC); 553 if (work) 554 __neigh_set_probe_once(neigh); 555 } 556 write_unlock(&neigh->lock); 557 } else if (time_after(jiffies, rt->last_probe + 558 idev->cnf.rtr_probe_interval)) { 559 work = kmalloc(sizeof(*work), GFP_ATOMIC); 560 } 561 562 if (work) { 563 rt->last_probe = jiffies; 564 INIT_WORK(&work->work, rt6_probe_deferred); 565 work->target = *nh_gw; 566 dev_hold(dev); 567 work->dev = dev; 568 schedule_work(&work->work); 569 } 570 571 out: 572 rcu_read_unlock_bh(); 573 } 574 #else 575 static inline void rt6_probe(struct fib6_info *rt) 576 { 577 } 578 #endif 579 580 /* 581 * Default Router Selection (RFC 2461 6.3.6) 582 */ 583 static inline int rt6_check_dev(struct fib6_info *rt, int oif) 584 { 585 const struct net_device *dev = rt->fib6_nh.fib_nh_dev; 586 587 if (!oif || dev->ifindex == oif) 588 return 2; 589 return 0; 590 } 591 592 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) 593 { 594 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; 595 struct neighbour *neigh; 596 597 if (rt->fib6_flags & RTF_NONEXTHOP || 598 !rt->fib6_nh.fib_nh_has_gw) 599 return RT6_NUD_SUCCEED; 600 601 rcu_read_lock_bh(); 602 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.fib_nh_dev, 603 &rt->fib6_nh.fib_nh_gw6); 604 if (neigh) { 605 read_lock(&neigh->lock); 606 if (neigh->nud_state & NUD_VALID) 607 ret = RT6_NUD_SUCCEED; 608 #ifdef CONFIG_IPV6_ROUTER_PREF 609 else if (!(neigh->nud_state & NUD_FAILED)) 610 ret = RT6_NUD_SUCCEED; 611 else 612 ret = RT6_NUD_FAIL_PROBE; 613 #endif 614 read_unlock(&neigh->lock); 615 } else { 616 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? 617 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; 618 } 619 rcu_read_unlock_bh(); 620 621 return ret; 622 } 623 624 static int rt6_score_route(struct fib6_info *rt, int oif, int strict) 625 { 626 int m; 627 628 m = rt6_check_dev(rt, oif); 629 if (!m && (strict & RT6_LOOKUP_F_IFACE)) 630 return RT6_NUD_FAIL_HARD; 631 #ifdef CONFIG_IPV6_ROUTER_PREF 632 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; 633 #endif 634 if (strict & RT6_LOOKUP_F_REACHABLE) { 635 int n = rt6_check_neigh(rt); 636 if (n < 0) 637 return n; 638 } 639 return m; 640 } 641 642 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, 643 int *mpri, struct fib6_info *match, 644 bool *do_rr) 645 { 646 int m; 647 bool match_do_rr = false; 648 649 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 650 goto out; 651 652 if (ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev) && 653 rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && 654 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) 655 goto out; 656 657 if (fib6_check_expired(rt)) 658 goto out; 659 660 m = rt6_score_route(rt, oif, strict); 661 if (m == RT6_NUD_FAIL_DO_RR) { 662 match_do_rr = true; 663 m = 0; /* lowest valid score */ 664 } else if (m == RT6_NUD_FAIL_HARD) { 665 goto out; 666 } 667 668 if (strict & RT6_LOOKUP_F_REACHABLE) 669 rt6_probe(rt); 670 671 /* note that m can be RT6_NUD_FAIL_PROBE at this point */ 672 if (m > *mpri) { 673 *do_rr = match_do_rr; 674 *mpri = m; 675 match = rt; 676 } 677 out: 678 return match; 679 } 680 681 static struct fib6_info *find_rr_leaf(struct fib6_node *fn, 682 struct fib6_info *leaf, 683 struct fib6_info *rr_head, 684 u32 metric, int oif, int strict, 685 bool *do_rr) 686 { 687 struct fib6_info *rt, *match, *cont; 688 int mpri = -1; 689 690 match = NULL; 691 cont = NULL; 692 for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) { 693 if (rt->fib6_metric != metric) { 694 cont = rt; 695 break; 696 } 697 698 match = find_match(rt, oif, strict, &mpri, match, do_rr); 699 } 700 701 for (rt = leaf; rt && rt != rr_head; 702 rt = rcu_dereference(rt->fib6_next)) { 703 if (rt->fib6_metric != metric) { 704 cont = rt; 705 break; 706 } 707 708 match = find_match(rt, oif, strict, &mpri, match, do_rr); 709 } 710 711 if (match || !cont) 712 return match; 713 714 for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next)) 715 match = find_match(rt, oif, strict, &mpri, match, do_rr); 716 717 return match; 718 } 719 720 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, 721 int oif, int strict) 722 { 723 struct fib6_info *leaf = rcu_dereference(fn->leaf); 724 struct fib6_info *match, *rt0; 725 bool do_rr = false; 726 int key_plen; 727 728 if (!leaf || leaf == net->ipv6.fib6_null_entry) 729 return net->ipv6.fib6_null_entry; 730 731 rt0 = rcu_dereference(fn->rr_ptr); 732 if (!rt0) 733 rt0 = leaf; 734 735 /* Double check to make sure fn is not an intermediate node 736 * and fn->leaf does not points to its child's leaf 737 * (This might happen if all routes under fn are deleted from 738 * the tree and fib6_repair_tree() is called on the node.) 739 */ 740 key_plen = rt0->fib6_dst.plen; 741 #ifdef CONFIG_IPV6_SUBTREES 742 if (rt0->fib6_src.plen) 743 key_plen = rt0->fib6_src.plen; 744 #endif 745 if (fn->fn_bit != key_plen) 746 return net->ipv6.fib6_null_entry; 747 748 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, 749 &do_rr); 750 751 if (do_rr) { 752 struct fib6_info *next = rcu_dereference(rt0->fib6_next); 753 754 /* no entries matched; do round-robin */ 755 if (!next || next->fib6_metric != rt0->fib6_metric) 756 next = leaf; 757 758 if (next != rt0) { 759 spin_lock_bh(&leaf->fib6_table->tb6_lock); 760 /* make sure next is not being deleted from the tree */ 761 if (next->fib6_node) 762 rcu_assign_pointer(fn->rr_ptr, next); 763 spin_unlock_bh(&leaf->fib6_table->tb6_lock); 764 } 765 } 766 767 return match ? match : net->ipv6.fib6_null_entry; 768 } 769 770 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) 771 { 772 return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_has_gw; 773 } 774 775 #ifdef CONFIG_IPV6_ROUTE_INFO 776 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, 777 const struct in6_addr *gwaddr) 778 { 779 struct net *net = dev_net(dev); 780 struct route_info *rinfo = (struct route_info *) opt; 781 struct in6_addr prefix_buf, *prefix; 782 unsigned int pref; 783 unsigned long lifetime; 784 struct fib6_info *rt; 785 786 if (len < sizeof(struct route_info)) { 787 return -EINVAL; 788 } 789 790 /* Sanity check for prefix_len and length */ 791 if (rinfo->length > 3) { 792 return -EINVAL; 793 } else if (rinfo->prefix_len > 128) { 794 return -EINVAL; 795 } else if (rinfo->prefix_len > 64) { 796 if (rinfo->length < 2) { 797 return -EINVAL; 798 } 799 } else if (rinfo->prefix_len > 0) { 800 if (rinfo->length < 1) { 801 return -EINVAL; 802 } 803 } 804 805 pref = rinfo->route_pref; 806 if (pref == ICMPV6_ROUTER_PREF_INVALID) 807 return -EINVAL; 808 809 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); 810 811 if (rinfo->length == 3) 812 prefix = (struct in6_addr *)rinfo->prefix; 813 else { 814 /* this function is safe */ 815 ipv6_addr_prefix(&prefix_buf, 816 (struct in6_addr *)rinfo->prefix, 817 rinfo->prefix_len); 818 prefix = &prefix_buf; 819 } 820 821 if (rinfo->prefix_len == 0) 822 rt = rt6_get_dflt_router(net, gwaddr, dev); 823 else 824 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, 825 gwaddr, dev); 826 827 if (rt && !lifetime) { 828 ip6_del_rt(net, rt); 829 rt = NULL; 830 } 831 832 if (!rt && lifetime) 833 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, 834 dev, pref); 835 else if (rt) 836 rt->fib6_flags = RTF_ROUTEINFO | 837 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 838 839 if (rt) { 840 if (!addrconf_finite_timeout(lifetime)) 841 fib6_clean_expires(rt); 842 else 843 fib6_set_expires(rt, jiffies + HZ * lifetime); 844 845 fib6_info_release(rt); 846 } 847 return 0; 848 } 849 #endif 850 851 /* 852 * Misc support functions 853 */ 854 855 /* called with rcu_lock held */ 856 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) 857 { 858 struct net_device *dev = rt->fib6_nh.fib_nh_dev; 859 860 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { 861 /* for copies of local routes, dst->dev needs to be the 862 * device if it is a master device, the master device if 863 * device is enslaved, and the loopback as the default 864 */ 865 if (netif_is_l3_slave(dev) && 866 !rt6_need_strict(&rt->fib6_dst.addr)) 867 dev = l3mdev_master_dev_rcu(dev); 868 else if (!netif_is_l3_master(dev)) 869 dev = dev_net(dev)->loopback_dev; 870 /* last case is netif_is_l3_master(dev) is true in which 871 * case we want dev returned to be dev 872 */ 873 } 874 875 return dev; 876 } 877 878 static const int fib6_prop[RTN_MAX + 1] = { 879 [RTN_UNSPEC] = 0, 880 [RTN_UNICAST] = 0, 881 [RTN_LOCAL] = 0, 882 [RTN_BROADCAST] = 0, 883 [RTN_ANYCAST] = 0, 884 [RTN_MULTICAST] = 0, 885 [RTN_BLACKHOLE] = -EINVAL, 886 [RTN_UNREACHABLE] = -EHOSTUNREACH, 887 [RTN_PROHIBIT] = -EACCES, 888 [RTN_THROW] = -EAGAIN, 889 [RTN_NAT] = -EINVAL, 890 [RTN_XRESOLVE] = -EINVAL, 891 }; 892 893 static int ip6_rt_type_to_error(u8 fib6_type) 894 { 895 return fib6_prop[fib6_type]; 896 } 897 898 static unsigned short fib6_info_dst_flags(struct fib6_info *rt) 899 { 900 unsigned short flags = 0; 901 902 if (rt->dst_nocount) 903 flags |= DST_NOCOUNT; 904 if (rt->dst_nopolicy) 905 flags |= DST_NOPOLICY; 906 if (rt->dst_host) 907 flags |= DST_HOST; 908 909 return flags; 910 } 911 912 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) 913 { 914 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); 915 916 switch (ort->fib6_type) { 917 case RTN_BLACKHOLE: 918 rt->dst.output = dst_discard_out; 919 rt->dst.input = dst_discard; 920 break; 921 case RTN_PROHIBIT: 922 rt->dst.output = ip6_pkt_prohibit_out; 923 rt->dst.input = ip6_pkt_prohibit; 924 break; 925 case RTN_THROW: 926 case RTN_UNREACHABLE: 927 default: 928 rt->dst.output = ip6_pkt_discard_out; 929 rt->dst.input = ip6_pkt_discard; 930 break; 931 } 932 } 933 934 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) 935 { 936 if (ort->fib6_flags & RTF_REJECT) { 937 ip6_rt_init_dst_reject(rt, ort); 938 return; 939 } 940 941 rt->dst.error = 0; 942 rt->dst.output = ip6_output; 943 944 if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) { 945 rt->dst.input = ip6_input; 946 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { 947 rt->dst.input = ip6_mc_input; 948 } else { 949 rt->dst.input = ip6_forward; 950 } 951 952 if (ort->fib6_nh.fib_nh_lws) { 953 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws); 954 lwtunnel_set_redirect(&rt->dst); 955 } 956 957 rt->dst.lastuse = jiffies; 958 } 959 960 /* Caller must already hold reference to @from */ 961 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) 962 { 963 rt->rt6i_flags &= ~RTF_EXPIRES; 964 rcu_assign_pointer(rt->from, from); 965 ip_dst_init_metrics(&rt->dst, from->fib6_metrics); 966 } 967 968 /* Caller must already hold reference to @ort */ 969 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) 970 { 971 struct net_device *dev = fib6_info_nh_dev(ort); 972 973 ip6_rt_init_dst(rt, ort); 974 975 rt->rt6i_dst = ort->fib6_dst; 976 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; 977 rt->rt6i_flags = ort->fib6_flags; 978 if (ort->fib6_nh.fib_nh_has_gw) { 979 rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6; 980 rt->rt6i_flags |= RTF_GATEWAY; 981 } 982 rt6_set_from(rt, ort); 983 #ifdef CONFIG_IPV6_SUBTREES 984 rt->rt6i_src = ort->fib6_src; 985 #endif 986 } 987 988 static struct fib6_node* fib6_backtrack(struct fib6_node *fn, 989 struct in6_addr *saddr) 990 { 991 struct fib6_node *pn, *sn; 992 while (1) { 993 if (fn->fn_flags & RTN_TL_ROOT) 994 return NULL; 995 pn = rcu_dereference(fn->parent); 996 sn = FIB6_SUBTREE(pn); 997 if (sn && sn != fn) 998 fn = fib6_node_lookup(sn, NULL, saddr); 999 else 1000 fn = pn; 1001 if (fn->fn_flags & RTN_RTINFO) 1002 return fn; 1003 } 1004 } 1005 1006 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt) 1007 { 1008 struct rt6_info *rt = *prt; 1009 1010 if (dst_hold_safe(&rt->dst)) 1011 return true; 1012 if (net) { 1013 rt = net->ipv6.ip6_null_entry; 1014 dst_hold(&rt->dst); 1015 } else { 1016 rt = NULL; 1017 } 1018 *prt = rt; 1019 return false; 1020 } 1021 1022 /* called with rcu_lock held */ 1023 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) 1024 { 1025 unsigned short flags = fib6_info_dst_flags(rt); 1026 struct net_device *dev = rt->fib6_nh.fib_nh_dev; 1027 struct rt6_info *nrt; 1028 1029 if (!fib6_info_hold_safe(rt)) 1030 goto fallback; 1031 1032 nrt = ip6_dst_alloc(dev_net(dev), dev, flags); 1033 if (!nrt) { 1034 fib6_info_release(rt); 1035 goto fallback; 1036 } 1037 1038 ip6_rt_copy_init(nrt, rt); 1039 return nrt; 1040 1041 fallback: 1042 nrt = dev_net(dev)->ipv6.ip6_null_entry; 1043 dst_hold(&nrt->dst); 1044 return nrt; 1045 } 1046 1047 static struct rt6_info *ip6_pol_route_lookup(struct net *net, 1048 struct fib6_table *table, 1049 struct flowi6 *fl6, 1050 const struct sk_buff *skb, 1051 int flags) 1052 { 1053 struct fib6_info *f6i; 1054 struct fib6_node *fn; 1055 struct rt6_info *rt; 1056 1057 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1058 flags &= ~RT6_LOOKUP_F_IFACE; 1059 1060 rcu_read_lock(); 1061 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1062 restart: 1063 f6i = rcu_dereference(fn->leaf); 1064 if (!f6i) { 1065 f6i = net->ipv6.fib6_null_entry; 1066 } else { 1067 f6i = rt6_device_match(net, f6i, &fl6->saddr, 1068 fl6->flowi6_oif, flags); 1069 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) 1070 f6i = fib6_multipath_select(net, f6i, fl6, 1071 fl6->flowi6_oif, skb, 1072 flags); 1073 } 1074 if (f6i == net->ipv6.fib6_null_entry) { 1075 fn = fib6_backtrack(fn, &fl6->saddr); 1076 if (fn) 1077 goto restart; 1078 } 1079 1080 trace_fib6_table_lookup(net, f6i, table, fl6); 1081 1082 /* Search through exception table */ 1083 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1084 if (rt) { 1085 if (ip6_hold_safe(net, &rt)) 1086 dst_use_noref(&rt->dst, jiffies); 1087 } else if (f6i == net->ipv6.fib6_null_entry) { 1088 rt = net->ipv6.ip6_null_entry; 1089 dst_hold(&rt->dst); 1090 } else { 1091 rt = ip6_create_rt_rcu(f6i); 1092 } 1093 1094 rcu_read_unlock(); 1095 1096 return rt; 1097 } 1098 1099 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, 1100 const struct sk_buff *skb, int flags) 1101 { 1102 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); 1103 } 1104 EXPORT_SYMBOL_GPL(ip6_route_lookup); 1105 1106 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, 1107 const struct in6_addr *saddr, int oif, 1108 const struct sk_buff *skb, int strict) 1109 { 1110 struct flowi6 fl6 = { 1111 .flowi6_oif = oif, 1112 .daddr = *daddr, 1113 }; 1114 struct dst_entry *dst; 1115 int flags = strict ? RT6_LOOKUP_F_IFACE : 0; 1116 1117 if (saddr) { 1118 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 1119 flags |= RT6_LOOKUP_F_HAS_SADDR; 1120 } 1121 1122 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); 1123 if (dst->error == 0) 1124 return (struct rt6_info *) dst; 1125 1126 dst_release(dst); 1127 1128 return NULL; 1129 } 1130 EXPORT_SYMBOL(rt6_lookup); 1131 1132 /* ip6_ins_rt is called with FREE table->tb6_lock. 1133 * It takes new route entry, the addition fails by any reason the 1134 * route is released. 1135 * Caller must hold dst before calling it. 1136 */ 1137 1138 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, 1139 struct netlink_ext_ack *extack) 1140 { 1141 int err; 1142 struct fib6_table *table; 1143 1144 table = rt->fib6_table; 1145 spin_lock_bh(&table->tb6_lock); 1146 err = fib6_add(&table->tb6_root, rt, info, extack); 1147 spin_unlock_bh(&table->tb6_lock); 1148 1149 return err; 1150 } 1151 1152 int ip6_ins_rt(struct net *net, struct fib6_info *rt) 1153 { 1154 struct nl_info info = { .nl_net = net, }; 1155 1156 return __ip6_ins_rt(rt, &info, NULL); 1157 } 1158 1159 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, 1160 const struct in6_addr *daddr, 1161 const struct in6_addr *saddr) 1162 { 1163 struct net_device *dev; 1164 struct rt6_info *rt; 1165 1166 /* 1167 * Clone the route. 1168 */ 1169 1170 if (!fib6_info_hold_safe(ort)) 1171 return NULL; 1172 1173 dev = ip6_rt_get_dev_rcu(ort); 1174 rt = ip6_dst_alloc(dev_net(dev), dev, 0); 1175 if (!rt) { 1176 fib6_info_release(ort); 1177 return NULL; 1178 } 1179 1180 ip6_rt_copy_init(rt, ort); 1181 rt->rt6i_flags |= RTF_CACHE; 1182 rt->dst.flags |= DST_HOST; 1183 rt->rt6i_dst.addr = *daddr; 1184 rt->rt6i_dst.plen = 128; 1185 1186 if (!rt6_is_gw_or_nonexthop(ort)) { 1187 if (ort->fib6_dst.plen != 128 && 1188 ipv6_addr_equal(&ort->fib6_dst.addr, daddr)) 1189 rt->rt6i_flags |= RTF_ANYCAST; 1190 #ifdef CONFIG_IPV6_SUBTREES 1191 if (rt->rt6i_src.plen && saddr) { 1192 rt->rt6i_src.addr = *saddr; 1193 rt->rt6i_src.plen = 128; 1194 } 1195 #endif 1196 } 1197 1198 return rt; 1199 } 1200 1201 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) 1202 { 1203 unsigned short flags = fib6_info_dst_flags(rt); 1204 struct net_device *dev; 1205 struct rt6_info *pcpu_rt; 1206 1207 if (!fib6_info_hold_safe(rt)) 1208 return NULL; 1209 1210 rcu_read_lock(); 1211 dev = ip6_rt_get_dev_rcu(rt); 1212 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); 1213 rcu_read_unlock(); 1214 if (!pcpu_rt) { 1215 fib6_info_release(rt); 1216 return NULL; 1217 } 1218 ip6_rt_copy_init(pcpu_rt, rt); 1219 pcpu_rt->rt6i_flags |= RTF_PCPU; 1220 return pcpu_rt; 1221 } 1222 1223 /* It should be called with rcu_read_lock() acquired */ 1224 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) 1225 { 1226 struct rt6_info *pcpu_rt, **p; 1227 1228 p = this_cpu_ptr(rt->rt6i_pcpu); 1229 pcpu_rt = *p; 1230 1231 if (pcpu_rt) 1232 ip6_hold_safe(NULL, &pcpu_rt); 1233 1234 return pcpu_rt; 1235 } 1236 1237 static struct rt6_info *rt6_make_pcpu_route(struct net *net, 1238 struct fib6_info *rt) 1239 { 1240 struct rt6_info *pcpu_rt, *prev, **p; 1241 1242 pcpu_rt = ip6_rt_pcpu_alloc(rt); 1243 if (!pcpu_rt) { 1244 dst_hold(&net->ipv6.ip6_null_entry->dst); 1245 return net->ipv6.ip6_null_entry; 1246 } 1247 1248 dst_hold(&pcpu_rt->dst); 1249 p = this_cpu_ptr(rt->rt6i_pcpu); 1250 prev = cmpxchg(p, NULL, pcpu_rt); 1251 BUG_ON(prev); 1252 1253 return pcpu_rt; 1254 } 1255 1256 /* exception hash table implementation 1257 */ 1258 static DEFINE_SPINLOCK(rt6_exception_lock); 1259 1260 /* Remove rt6_ex from hash table and free the memory 1261 * Caller must hold rt6_exception_lock 1262 */ 1263 static void rt6_remove_exception(struct rt6_exception_bucket *bucket, 1264 struct rt6_exception *rt6_ex) 1265 { 1266 struct fib6_info *from; 1267 struct net *net; 1268 1269 if (!bucket || !rt6_ex) 1270 return; 1271 1272 net = dev_net(rt6_ex->rt6i->dst.dev); 1273 net->ipv6.rt6_stats->fib_rt_cache--; 1274 1275 /* purge completely the exception to allow releasing the held resources: 1276 * some [sk] cache may keep the dst around for unlimited time 1277 */ 1278 from = rcu_dereference_protected(rt6_ex->rt6i->from, 1279 lockdep_is_held(&rt6_exception_lock)); 1280 rcu_assign_pointer(rt6_ex->rt6i->from, NULL); 1281 fib6_info_release(from); 1282 dst_dev_put(&rt6_ex->rt6i->dst); 1283 1284 hlist_del_rcu(&rt6_ex->hlist); 1285 dst_release(&rt6_ex->rt6i->dst); 1286 kfree_rcu(rt6_ex, rcu); 1287 WARN_ON_ONCE(!bucket->depth); 1288 bucket->depth--; 1289 } 1290 1291 /* Remove oldest rt6_ex in bucket and free the memory 1292 * Caller must hold rt6_exception_lock 1293 */ 1294 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) 1295 { 1296 struct rt6_exception *rt6_ex, *oldest = NULL; 1297 1298 if (!bucket) 1299 return; 1300 1301 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1302 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) 1303 oldest = rt6_ex; 1304 } 1305 rt6_remove_exception(bucket, oldest); 1306 } 1307 1308 static u32 rt6_exception_hash(const struct in6_addr *dst, 1309 const struct in6_addr *src) 1310 { 1311 static u32 seed __read_mostly; 1312 u32 val; 1313 1314 net_get_random_once(&seed, sizeof(seed)); 1315 val = jhash(dst, sizeof(*dst), seed); 1316 1317 #ifdef CONFIG_IPV6_SUBTREES 1318 if (src) 1319 val = jhash(src, sizeof(*src), val); 1320 #endif 1321 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); 1322 } 1323 1324 /* Helper function to find the cached rt in the hash table 1325 * and update bucket pointer to point to the bucket for this 1326 * (daddr, saddr) pair 1327 * Caller must hold rt6_exception_lock 1328 */ 1329 static struct rt6_exception * 1330 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, 1331 const struct in6_addr *daddr, 1332 const struct in6_addr *saddr) 1333 { 1334 struct rt6_exception *rt6_ex; 1335 u32 hval; 1336 1337 if (!(*bucket) || !daddr) 1338 return NULL; 1339 1340 hval = rt6_exception_hash(daddr, saddr); 1341 *bucket += hval; 1342 1343 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { 1344 struct rt6_info *rt6 = rt6_ex->rt6i; 1345 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1346 1347 #ifdef CONFIG_IPV6_SUBTREES 1348 if (matched && saddr) 1349 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1350 #endif 1351 if (matched) 1352 return rt6_ex; 1353 } 1354 return NULL; 1355 } 1356 1357 /* Helper function to find the cached rt in the hash table 1358 * and update bucket pointer to point to the bucket for this 1359 * (daddr, saddr) pair 1360 * Caller must hold rcu_read_lock() 1361 */ 1362 static struct rt6_exception * 1363 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, 1364 const struct in6_addr *daddr, 1365 const struct in6_addr *saddr) 1366 { 1367 struct rt6_exception *rt6_ex; 1368 u32 hval; 1369 1370 WARN_ON_ONCE(!rcu_read_lock_held()); 1371 1372 if (!(*bucket) || !daddr) 1373 return NULL; 1374 1375 hval = rt6_exception_hash(daddr, saddr); 1376 *bucket += hval; 1377 1378 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { 1379 struct rt6_info *rt6 = rt6_ex->rt6i; 1380 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1381 1382 #ifdef CONFIG_IPV6_SUBTREES 1383 if (matched && saddr) 1384 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1385 #endif 1386 if (matched) 1387 return rt6_ex; 1388 } 1389 return NULL; 1390 } 1391 1392 static unsigned int fib6_mtu(const struct fib6_info *rt) 1393 { 1394 unsigned int mtu; 1395 1396 if (rt->fib6_pmtu) { 1397 mtu = rt->fib6_pmtu; 1398 } else { 1399 struct net_device *dev = fib6_info_nh_dev(rt); 1400 struct inet6_dev *idev; 1401 1402 rcu_read_lock(); 1403 idev = __in6_dev_get(dev); 1404 mtu = idev->cnf.mtu6; 1405 rcu_read_unlock(); 1406 } 1407 1408 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 1409 1410 return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu); 1411 } 1412 1413 static int rt6_insert_exception(struct rt6_info *nrt, 1414 struct fib6_info *ort) 1415 { 1416 struct net *net = dev_net(nrt->dst.dev); 1417 struct rt6_exception_bucket *bucket; 1418 struct in6_addr *src_key = NULL; 1419 struct rt6_exception *rt6_ex; 1420 int err = 0; 1421 1422 spin_lock_bh(&rt6_exception_lock); 1423 1424 if (ort->exception_bucket_flushed) { 1425 err = -EINVAL; 1426 goto out; 1427 } 1428 1429 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, 1430 lockdep_is_held(&rt6_exception_lock)); 1431 if (!bucket) { 1432 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), 1433 GFP_ATOMIC); 1434 if (!bucket) { 1435 err = -ENOMEM; 1436 goto out; 1437 } 1438 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); 1439 } 1440 1441 #ifdef CONFIG_IPV6_SUBTREES 1442 /* rt6i_src.plen != 0 indicates ort is in subtree 1443 * and exception table is indexed by a hash of 1444 * both rt6i_dst and rt6i_src. 1445 * Otherwise, the exception table is indexed by 1446 * a hash of only rt6i_dst. 1447 */ 1448 if (ort->fib6_src.plen) 1449 src_key = &nrt->rt6i_src.addr; 1450 #endif 1451 /* rt6_mtu_change() might lower mtu on ort. 1452 * Only insert this exception route if its mtu 1453 * is less than ort's mtu value. 1454 */ 1455 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { 1456 err = -EINVAL; 1457 goto out; 1458 } 1459 1460 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, 1461 src_key); 1462 if (rt6_ex) 1463 rt6_remove_exception(bucket, rt6_ex); 1464 1465 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); 1466 if (!rt6_ex) { 1467 err = -ENOMEM; 1468 goto out; 1469 } 1470 rt6_ex->rt6i = nrt; 1471 rt6_ex->stamp = jiffies; 1472 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); 1473 bucket->depth++; 1474 net->ipv6.rt6_stats->fib_rt_cache++; 1475 1476 if (bucket->depth > FIB6_MAX_DEPTH) 1477 rt6_exception_remove_oldest(bucket); 1478 1479 out: 1480 spin_unlock_bh(&rt6_exception_lock); 1481 1482 /* Update fn->fn_sernum to invalidate all cached dst */ 1483 if (!err) { 1484 spin_lock_bh(&ort->fib6_table->tb6_lock); 1485 fib6_update_sernum(net, ort); 1486 spin_unlock_bh(&ort->fib6_table->tb6_lock); 1487 fib6_force_start_gc(net); 1488 } 1489 1490 return err; 1491 } 1492 1493 void rt6_flush_exceptions(struct fib6_info *rt) 1494 { 1495 struct rt6_exception_bucket *bucket; 1496 struct rt6_exception *rt6_ex; 1497 struct hlist_node *tmp; 1498 int i; 1499 1500 spin_lock_bh(&rt6_exception_lock); 1501 /* Prevent rt6_insert_exception() to recreate the bucket list */ 1502 rt->exception_bucket_flushed = 1; 1503 1504 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1505 lockdep_is_held(&rt6_exception_lock)); 1506 if (!bucket) 1507 goto out; 1508 1509 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1510 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) 1511 rt6_remove_exception(bucket, rt6_ex); 1512 WARN_ON_ONCE(bucket->depth); 1513 bucket++; 1514 } 1515 1516 out: 1517 spin_unlock_bh(&rt6_exception_lock); 1518 } 1519 1520 /* Find cached rt in the hash table inside passed in rt 1521 * Caller has to hold rcu_read_lock() 1522 */ 1523 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 1524 struct in6_addr *daddr, 1525 struct in6_addr *saddr) 1526 { 1527 struct rt6_exception_bucket *bucket; 1528 struct in6_addr *src_key = NULL; 1529 struct rt6_exception *rt6_ex; 1530 struct rt6_info *res = NULL; 1531 1532 bucket = rcu_dereference(rt->rt6i_exception_bucket); 1533 1534 #ifdef CONFIG_IPV6_SUBTREES 1535 /* rt6i_src.plen != 0 indicates rt is in subtree 1536 * and exception table is indexed by a hash of 1537 * both rt6i_dst and rt6i_src. 1538 * Otherwise, the exception table is indexed by 1539 * a hash of only rt6i_dst. 1540 */ 1541 if (rt->fib6_src.plen) 1542 src_key = saddr; 1543 #endif 1544 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 1545 1546 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 1547 res = rt6_ex->rt6i; 1548 1549 return res; 1550 } 1551 1552 /* Remove the passed in cached rt from the hash table that contains it */ 1553 static int rt6_remove_exception_rt(struct rt6_info *rt) 1554 { 1555 struct rt6_exception_bucket *bucket; 1556 struct in6_addr *src_key = NULL; 1557 struct rt6_exception *rt6_ex; 1558 struct fib6_info *from; 1559 int err; 1560 1561 from = rcu_dereference(rt->from); 1562 if (!from || 1563 !(rt->rt6i_flags & RTF_CACHE)) 1564 return -EINVAL; 1565 1566 if (!rcu_access_pointer(from->rt6i_exception_bucket)) 1567 return -ENOENT; 1568 1569 spin_lock_bh(&rt6_exception_lock); 1570 bucket = rcu_dereference_protected(from->rt6i_exception_bucket, 1571 lockdep_is_held(&rt6_exception_lock)); 1572 #ifdef CONFIG_IPV6_SUBTREES 1573 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1574 * and exception table is indexed by a hash of 1575 * both rt6i_dst and rt6i_src. 1576 * Otherwise, the exception table is indexed by 1577 * a hash of only rt6i_dst. 1578 */ 1579 if (from->fib6_src.plen) 1580 src_key = &rt->rt6i_src.addr; 1581 #endif 1582 rt6_ex = __rt6_find_exception_spinlock(&bucket, 1583 &rt->rt6i_dst.addr, 1584 src_key); 1585 if (rt6_ex) { 1586 rt6_remove_exception(bucket, rt6_ex); 1587 err = 0; 1588 } else { 1589 err = -ENOENT; 1590 } 1591 1592 spin_unlock_bh(&rt6_exception_lock); 1593 return err; 1594 } 1595 1596 /* Find rt6_ex which contains the passed in rt cache and 1597 * refresh its stamp 1598 */ 1599 static void rt6_update_exception_stamp_rt(struct rt6_info *rt) 1600 { 1601 struct rt6_exception_bucket *bucket; 1602 struct in6_addr *src_key = NULL; 1603 struct rt6_exception *rt6_ex; 1604 struct fib6_info *from; 1605 1606 rcu_read_lock(); 1607 from = rcu_dereference(rt->from); 1608 if (!from || !(rt->rt6i_flags & RTF_CACHE)) 1609 goto unlock; 1610 1611 bucket = rcu_dereference(from->rt6i_exception_bucket); 1612 1613 #ifdef CONFIG_IPV6_SUBTREES 1614 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1615 * and exception table is indexed by a hash of 1616 * both rt6i_dst and rt6i_src. 1617 * Otherwise, the exception table is indexed by 1618 * a hash of only rt6i_dst. 1619 */ 1620 if (from->fib6_src.plen) 1621 src_key = &rt->rt6i_src.addr; 1622 #endif 1623 rt6_ex = __rt6_find_exception_rcu(&bucket, 1624 &rt->rt6i_dst.addr, 1625 src_key); 1626 if (rt6_ex) 1627 rt6_ex->stamp = jiffies; 1628 1629 unlock: 1630 rcu_read_unlock(); 1631 } 1632 1633 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, 1634 struct rt6_info *rt, int mtu) 1635 { 1636 /* If the new MTU is lower than the route PMTU, this new MTU will be the 1637 * lowest MTU in the path: always allow updating the route PMTU to 1638 * reflect PMTU decreases. 1639 * 1640 * If the new MTU is higher, and the route PMTU is equal to the local 1641 * MTU, this means the old MTU is the lowest in the path, so allow 1642 * updating it: if other nodes now have lower MTUs, PMTU discovery will 1643 * handle this. 1644 */ 1645 1646 if (dst_mtu(&rt->dst) >= mtu) 1647 return true; 1648 1649 if (dst_mtu(&rt->dst) == idev->cnf.mtu6) 1650 return true; 1651 1652 return false; 1653 } 1654 1655 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, 1656 struct fib6_info *rt, int mtu) 1657 { 1658 struct rt6_exception_bucket *bucket; 1659 struct rt6_exception *rt6_ex; 1660 int i; 1661 1662 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1663 lockdep_is_held(&rt6_exception_lock)); 1664 1665 if (!bucket) 1666 return; 1667 1668 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1669 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1670 struct rt6_info *entry = rt6_ex->rt6i; 1671 1672 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected 1673 * route), the metrics of its rt->from have already 1674 * been updated. 1675 */ 1676 if (dst_metric_raw(&entry->dst, RTAX_MTU) && 1677 rt6_mtu_change_route_allowed(idev, entry, mtu)) 1678 dst_metric_set(&entry->dst, RTAX_MTU, mtu); 1679 } 1680 bucket++; 1681 } 1682 } 1683 1684 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 1685 1686 static void rt6_exceptions_clean_tohost(struct fib6_info *rt, 1687 struct in6_addr *gateway) 1688 { 1689 struct rt6_exception_bucket *bucket; 1690 struct rt6_exception *rt6_ex; 1691 struct hlist_node *tmp; 1692 int i; 1693 1694 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1695 return; 1696 1697 spin_lock_bh(&rt6_exception_lock); 1698 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1699 lockdep_is_held(&rt6_exception_lock)); 1700 1701 if (bucket) { 1702 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1703 hlist_for_each_entry_safe(rt6_ex, tmp, 1704 &bucket->chain, hlist) { 1705 struct rt6_info *entry = rt6_ex->rt6i; 1706 1707 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == 1708 RTF_CACHE_GATEWAY && 1709 ipv6_addr_equal(gateway, 1710 &entry->rt6i_gateway)) { 1711 rt6_remove_exception(bucket, rt6_ex); 1712 } 1713 } 1714 bucket++; 1715 } 1716 } 1717 1718 spin_unlock_bh(&rt6_exception_lock); 1719 } 1720 1721 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, 1722 struct rt6_exception *rt6_ex, 1723 struct fib6_gc_args *gc_args, 1724 unsigned long now) 1725 { 1726 struct rt6_info *rt = rt6_ex->rt6i; 1727 1728 /* we are pruning and obsoleting aged-out and non gateway exceptions 1729 * even if others have still references to them, so that on next 1730 * dst_check() such references can be dropped. 1731 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when 1732 * expired, independently from their aging, as per RFC 8201 section 4 1733 */ 1734 if (!(rt->rt6i_flags & RTF_EXPIRES)) { 1735 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { 1736 RT6_TRACE("aging clone %p\n", rt); 1737 rt6_remove_exception(bucket, rt6_ex); 1738 return; 1739 } 1740 } else if (time_after(jiffies, rt->dst.expires)) { 1741 RT6_TRACE("purging expired route %p\n", rt); 1742 rt6_remove_exception(bucket, rt6_ex); 1743 return; 1744 } 1745 1746 if (rt->rt6i_flags & RTF_GATEWAY) { 1747 struct neighbour *neigh; 1748 __u8 neigh_flags = 0; 1749 1750 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); 1751 if (neigh) 1752 neigh_flags = neigh->flags; 1753 1754 if (!(neigh_flags & NTF_ROUTER)) { 1755 RT6_TRACE("purging route %p via non-router but gateway\n", 1756 rt); 1757 rt6_remove_exception(bucket, rt6_ex); 1758 return; 1759 } 1760 } 1761 1762 gc_args->more++; 1763 } 1764 1765 void rt6_age_exceptions(struct fib6_info *rt, 1766 struct fib6_gc_args *gc_args, 1767 unsigned long now) 1768 { 1769 struct rt6_exception_bucket *bucket; 1770 struct rt6_exception *rt6_ex; 1771 struct hlist_node *tmp; 1772 int i; 1773 1774 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1775 return; 1776 1777 rcu_read_lock_bh(); 1778 spin_lock(&rt6_exception_lock); 1779 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1780 lockdep_is_held(&rt6_exception_lock)); 1781 1782 if (bucket) { 1783 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1784 hlist_for_each_entry_safe(rt6_ex, tmp, 1785 &bucket->chain, hlist) { 1786 rt6_age_examine_exception(bucket, rt6_ex, 1787 gc_args, now); 1788 } 1789 bucket++; 1790 } 1791 } 1792 spin_unlock(&rt6_exception_lock); 1793 rcu_read_unlock_bh(); 1794 } 1795 1796 /* must be called with rcu lock held */ 1797 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, 1798 int oif, struct flowi6 *fl6, int strict) 1799 { 1800 struct fib6_node *fn, *saved_fn; 1801 struct fib6_info *f6i; 1802 1803 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1804 saved_fn = fn; 1805 1806 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1807 oif = 0; 1808 1809 redo_rt6_select: 1810 f6i = rt6_select(net, fn, oif, strict); 1811 if (f6i == net->ipv6.fib6_null_entry) { 1812 fn = fib6_backtrack(fn, &fl6->saddr); 1813 if (fn) 1814 goto redo_rt6_select; 1815 else if (strict & RT6_LOOKUP_F_REACHABLE) { 1816 /* also consider unreachable route */ 1817 strict &= ~RT6_LOOKUP_F_REACHABLE; 1818 fn = saved_fn; 1819 goto redo_rt6_select; 1820 } 1821 } 1822 1823 trace_fib6_table_lookup(net, f6i, table, fl6); 1824 1825 return f6i; 1826 } 1827 1828 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, 1829 int oif, struct flowi6 *fl6, 1830 const struct sk_buff *skb, int flags) 1831 { 1832 struct fib6_info *f6i; 1833 struct rt6_info *rt; 1834 int strict = 0; 1835 1836 strict |= flags & RT6_LOOKUP_F_IFACE; 1837 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; 1838 if (net->ipv6.devconf_all->forwarding == 0) 1839 strict |= RT6_LOOKUP_F_REACHABLE; 1840 1841 rcu_read_lock(); 1842 1843 f6i = fib6_table_lookup(net, table, oif, fl6, strict); 1844 if (f6i->fib6_nsiblings) 1845 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); 1846 1847 if (f6i == net->ipv6.fib6_null_entry) { 1848 rt = net->ipv6.ip6_null_entry; 1849 rcu_read_unlock(); 1850 dst_hold(&rt->dst); 1851 return rt; 1852 } 1853 1854 /*Search through exception table */ 1855 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1856 if (rt) { 1857 if (ip6_hold_safe(net, &rt)) 1858 dst_use_noref(&rt->dst, jiffies); 1859 1860 rcu_read_unlock(); 1861 return rt; 1862 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && 1863 !f6i->fib6_nh.fib_nh_has_gw)) { 1864 /* Create a RTF_CACHE clone which will not be 1865 * owned by the fib6 tree. It is for the special case where 1866 * the daddr in the skb during the neighbor look-up is different 1867 * from the fl6->daddr used to look-up route here. 1868 */ 1869 struct rt6_info *uncached_rt; 1870 1871 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); 1872 1873 rcu_read_unlock(); 1874 1875 if (uncached_rt) { 1876 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() 1877 * No need for another dst_hold() 1878 */ 1879 rt6_uncached_list_add(uncached_rt); 1880 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 1881 } else { 1882 uncached_rt = net->ipv6.ip6_null_entry; 1883 dst_hold(&uncached_rt->dst); 1884 } 1885 1886 return uncached_rt; 1887 } else { 1888 /* Get a percpu copy */ 1889 1890 struct rt6_info *pcpu_rt; 1891 1892 local_bh_disable(); 1893 pcpu_rt = rt6_get_pcpu_route(f6i); 1894 1895 if (!pcpu_rt) 1896 pcpu_rt = rt6_make_pcpu_route(net, f6i); 1897 1898 local_bh_enable(); 1899 rcu_read_unlock(); 1900 1901 return pcpu_rt; 1902 } 1903 } 1904 EXPORT_SYMBOL_GPL(ip6_pol_route); 1905 1906 static struct rt6_info *ip6_pol_route_input(struct net *net, 1907 struct fib6_table *table, 1908 struct flowi6 *fl6, 1909 const struct sk_buff *skb, 1910 int flags) 1911 { 1912 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); 1913 } 1914 1915 struct dst_entry *ip6_route_input_lookup(struct net *net, 1916 struct net_device *dev, 1917 struct flowi6 *fl6, 1918 const struct sk_buff *skb, 1919 int flags) 1920 { 1921 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) 1922 flags |= RT6_LOOKUP_F_IFACE; 1923 1924 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); 1925 } 1926 EXPORT_SYMBOL_GPL(ip6_route_input_lookup); 1927 1928 static void ip6_multipath_l3_keys(const struct sk_buff *skb, 1929 struct flow_keys *keys, 1930 struct flow_keys *flkeys) 1931 { 1932 const struct ipv6hdr *outer_iph = ipv6_hdr(skb); 1933 const struct ipv6hdr *key_iph = outer_iph; 1934 struct flow_keys *_flkeys = flkeys; 1935 const struct ipv6hdr *inner_iph; 1936 const struct icmp6hdr *icmph; 1937 struct ipv6hdr _inner_iph; 1938 struct icmp6hdr _icmph; 1939 1940 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) 1941 goto out; 1942 1943 icmph = skb_header_pointer(skb, skb_transport_offset(skb), 1944 sizeof(_icmph), &_icmph); 1945 if (!icmph) 1946 goto out; 1947 1948 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && 1949 icmph->icmp6_type != ICMPV6_PKT_TOOBIG && 1950 icmph->icmp6_type != ICMPV6_TIME_EXCEED && 1951 icmph->icmp6_type != ICMPV6_PARAMPROB) 1952 goto out; 1953 1954 inner_iph = skb_header_pointer(skb, 1955 skb_transport_offset(skb) + sizeof(*icmph), 1956 sizeof(_inner_iph), &_inner_iph); 1957 if (!inner_iph) 1958 goto out; 1959 1960 key_iph = inner_iph; 1961 _flkeys = NULL; 1962 out: 1963 if (_flkeys) { 1964 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; 1965 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; 1966 keys->tags.flow_label = _flkeys->tags.flow_label; 1967 keys->basic.ip_proto = _flkeys->basic.ip_proto; 1968 } else { 1969 keys->addrs.v6addrs.src = key_iph->saddr; 1970 keys->addrs.v6addrs.dst = key_iph->daddr; 1971 keys->tags.flow_label = ip6_flowlabel(key_iph); 1972 keys->basic.ip_proto = key_iph->nexthdr; 1973 } 1974 } 1975 1976 /* if skb is set it will be used and fl6 can be NULL */ 1977 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 1978 const struct sk_buff *skb, struct flow_keys *flkeys) 1979 { 1980 struct flow_keys hash_keys; 1981 u32 mhash; 1982 1983 switch (ip6_multipath_hash_policy(net)) { 1984 case 0: 1985 memset(&hash_keys, 0, sizeof(hash_keys)); 1986 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 1987 if (skb) { 1988 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 1989 } else { 1990 hash_keys.addrs.v6addrs.src = fl6->saddr; 1991 hash_keys.addrs.v6addrs.dst = fl6->daddr; 1992 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 1993 hash_keys.basic.ip_proto = fl6->flowi6_proto; 1994 } 1995 break; 1996 case 1: 1997 if (skb) { 1998 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 1999 struct flow_keys keys; 2000 2001 /* short-circuit if we already have L4 hash present */ 2002 if (skb->l4_hash) 2003 return skb_get_hash_raw(skb) >> 1; 2004 2005 memset(&hash_keys, 0, sizeof(hash_keys)); 2006 2007 if (!flkeys) { 2008 skb_flow_dissect_flow_keys(skb, &keys, flag); 2009 flkeys = &keys; 2010 } 2011 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2012 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2013 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2014 hash_keys.ports.src = flkeys->ports.src; 2015 hash_keys.ports.dst = flkeys->ports.dst; 2016 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2017 } else { 2018 memset(&hash_keys, 0, sizeof(hash_keys)); 2019 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2020 hash_keys.addrs.v6addrs.src = fl6->saddr; 2021 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2022 hash_keys.ports.src = fl6->fl6_sport; 2023 hash_keys.ports.dst = fl6->fl6_dport; 2024 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2025 } 2026 break; 2027 } 2028 mhash = flow_hash_from_keys(&hash_keys); 2029 2030 return mhash >> 1; 2031 } 2032 2033 void ip6_route_input(struct sk_buff *skb) 2034 { 2035 const struct ipv6hdr *iph = ipv6_hdr(skb); 2036 struct net *net = dev_net(skb->dev); 2037 int flags = RT6_LOOKUP_F_HAS_SADDR; 2038 struct ip_tunnel_info *tun_info; 2039 struct flowi6 fl6 = { 2040 .flowi6_iif = skb->dev->ifindex, 2041 .daddr = iph->daddr, 2042 .saddr = iph->saddr, 2043 .flowlabel = ip6_flowinfo(iph), 2044 .flowi6_mark = skb->mark, 2045 .flowi6_proto = iph->nexthdr, 2046 }; 2047 struct flow_keys *flkeys = NULL, _flkeys; 2048 2049 tun_info = skb_tunnel_info(skb); 2050 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2051 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2052 2053 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2054 flkeys = &_flkeys; 2055 2056 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2057 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2058 skb_dst_drop(skb); 2059 skb_dst_set(skb, 2060 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); 2061 } 2062 2063 static struct rt6_info *ip6_pol_route_output(struct net *net, 2064 struct fib6_table *table, 2065 struct flowi6 *fl6, 2066 const struct sk_buff *skb, 2067 int flags) 2068 { 2069 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2070 } 2071 2072 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, 2073 struct flowi6 *fl6, int flags) 2074 { 2075 bool any_src; 2076 2077 if (ipv6_addr_type(&fl6->daddr) & 2078 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { 2079 struct dst_entry *dst; 2080 2081 dst = l3mdev_link_scope_lookup(net, fl6); 2082 if (dst) 2083 return dst; 2084 } 2085 2086 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2087 2088 any_src = ipv6_addr_any(&fl6->saddr); 2089 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2090 (fl6->flowi6_oif && any_src)) 2091 flags |= RT6_LOOKUP_F_IFACE; 2092 2093 if (!any_src) 2094 flags |= RT6_LOOKUP_F_HAS_SADDR; 2095 else if (sk) 2096 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2097 2098 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2099 } 2100 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2101 2102 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2103 { 2104 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2105 struct net_device *loopback_dev = net->loopback_dev; 2106 struct dst_entry *new = NULL; 2107 2108 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2109 DST_OBSOLETE_DEAD, 0); 2110 if (rt) { 2111 rt6_info_init(rt); 2112 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2113 2114 new = &rt->dst; 2115 new->__use = 1; 2116 new->input = dst_discard; 2117 new->output = dst_discard_out; 2118 2119 dst_copy_metrics(new, &ort->dst); 2120 2121 rt->rt6i_idev = in6_dev_get(loopback_dev); 2122 rt->rt6i_gateway = ort->rt6i_gateway; 2123 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2124 2125 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2126 #ifdef CONFIG_IPV6_SUBTREES 2127 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2128 #endif 2129 } 2130 2131 dst_release(dst_orig); 2132 return new ? new : ERR_PTR(-ENOMEM); 2133 } 2134 2135 /* 2136 * Destination cache support functions 2137 */ 2138 2139 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2140 { 2141 u32 rt_cookie = 0; 2142 2143 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2144 return false; 2145 2146 if (fib6_check_expired(f6i)) 2147 return false; 2148 2149 return true; 2150 } 2151 2152 static struct dst_entry *rt6_check(struct rt6_info *rt, 2153 struct fib6_info *from, 2154 u32 cookie) 2155 { 2156 u32 rt_cookie = 0; 2157 2158 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2159 rt_cookie != cookie) 2160 return NULL; 2161 2162 if (rt6_check_expired(rt)) 2163 return NULL; 2164 2165 return &rt->dst; 2166 } 2167 2168 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2169 struct fib6_info *from, 2170 u32 cookie) 2171 { 2172 if (!__rt6_check_expired(rt) && 2173 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2174 fib6_check(from, cookie)) 2175 return &rt->dst; 2176 else 2177 return NULL; 2178 } 2179 2180 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2181 { 2182 struct dst_entry *dst_ret; 2183 struct fib6_info *from; 2184 struct rt6_info *rt; 2185 2186 rt = container_of(dst, struct rt6_info, dst); 2187 2188 rcu_read_lock(); 2189 2190 /* All IPV6 dsts are created with ->obsolete set to the value 2191 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2192 * into this function always. 2193 */ 2194 2195 from = rcu_dereference(rt->from); 2196 2197 if (from && (rt->rt6i_flags & RTF_PCPU || 2198 unlikely(!list_empty(&rt->rt6i_uncached)))) 2199 dst_ret = rt6_dst_from_check(rt, from, cookie); 2200 else 2201 dst_ret = rt6_check(rt, from, cookie); 2202 2203 rcu_read_unlock(); 2204 2205 return dst_ret; 2206 } 2207 2208 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2209 { 2210 struct rt6_info *rt = (struct rt6_info *) dst; 2211 2212 if (rt) { 2213 if (rt->rt6i_flags & RTF_CACHE) { 2214 rcu_read_lock(); 2215 if (rt6_check_expired(rt)) { 2216 rt6_remove_exception_rt(rt); 2217 dst = NULL; 2218 } 2219 rcu_read_unlock(); 2220 } else { 2221 dst_release(dst); 2222 dst = NULL; 2223 } 2224 } 2225 return dst; 2226 } 2227 2228 static void ip6_link_failure(struct sk_buff *skb) 2229 { 2230 struct rt6_info *rt; 2231 2232 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2233 2234 rt = (struct rt6_info *) skb_dst(skb); 2235 if (rt) { 2236 rcu_read_lock(); 2237 if (rt->rt6i_flags & RTF_CACHE) { 2238 rt6_remove_exception_rt(rt); 2239 } else { 2240 struct fib6_info *from; 2241 struct fib6_node *fn; 2242 2243 from = rcu_dereference(rt->from); 2244 if (from) { 2245 fn = rcu_dereference(from->fib6_node); 2246 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2247 fn->fn_sernum = -1; 2248 } 2249 } 2250 rcu_read_unlock(); 2251 } 2252 } 2253 2254 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2255 { 2256 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2257 struct fib6_info *from; 2258 2259 rcu_read_lock(); 2260 from = rcu_dereference(rt0->from); 2261 if (from) 2262 rt0->dst.expires = from->expires; 2263 rcu_read_unlock(); 2264 } 2265 2266 dst_set_expires(&rt0->dst, timeout); 2267 rt0->rt6i_flags |= RTF_EXPIRES; 2268 } 2269 2270 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2271 { 2272 struct net *net = dev_net(rt->dst.dev); 2273 2274 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2275 rt->rt6i_flags |= RTF_MODIFIED; 2276 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2277 } 2278 2279 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2280 { 2281 return !(rt->rt6i_flags & RTF_CACHE) && 2282 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2283 } 2284 2285 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2286 const struct ipv6hdr *iph, u32 mtu) 2287 { 2288 const struct in6_addr *daddr, *saddr; 2289 struct rt6_info *rt6 = (struct rt6_info *)dst; 2290 2291 if (dst_metric_locked(dst, RTAX_MTU)) 2292 return; 2293 2294 if (iph) { 2295 daddr = &iph->daddr; 2296 saddr = &iph->saddr; 2297 } else if (sk) { 2298 daddr = &sk->sk_v6_daddr; 2299 saddr = &inet6_sk(sk)->saddr; 2300 } else { 2301 daddr = NULL; 2302 saddr = NULL; 2303 } 2304 dst_confirm_neigh(dst, daddr); 2305 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2306 if (mtu >= dst_mtu(dst)) 2307 return; 2308 2309 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2310 rt6_do_update_pmtu(rt6, mtu); 2311 /* update rt6_ex->stamp for cache */ 2312 if (rt6->rt6i_flags & RTF_CACHE) 2313 rt6_update_exception_stamp_rt(rt6); 2314 } else if (daddr) { 2315 struct fib6_info *from; 2316 struct rt6_info *nrt6; 2317 2318 rcu_read_lock(); 2319 from = rcu_dereference(rt6->from); 2320 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2321 if (nrt6) { 2322 rt6_do_update_pmtu(nrt6, mtu); 2323 if (rt6_insert_exception(nrt6, from)) 2324 dst_release_immediate(&nrt6->dst); 2325 } 2326 rcu_read_unlock(); 2327 } 2328 } 2329 2330 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2331 struct sk_buff *skb, u32 mtu) 2332 { 2333 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2334 } 2335 2336 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2337 int oif, u32 mark, kuid_t uid) 2338 { 2339 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2340 struct dst_entry *dst; 2341 struct flowi6 fl6 = { 2342 .flowi6_oif = oif, 2343 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2344 .daddr = iph->daddr, 2345 .saddr = iph->saddr, 2346 .flowlabel = ip6_flowinfo(iph), 2347 .flowi6_uid = uid, 2348 }; 2349 2350 dst = ip6_route_output(net, NULL, &fl6); 2351 if (!dst->error) 2352 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2353 dst_release(dst); 2354 } 2355 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2356 2357 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2358 { 2359 int oif = sk->sk_bound_dev_if; 2360 struct dst_entry *dst; 2361 2362 if (!oif && skb->dev) 2363 oif = l3mdev_master_ifindex(skb->dev); 2364 2365 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2366 2367 dst = __sk_dst_get(sk); 2368 if (!dst || !dst->obsolete || 2369 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2370 return; 2371 2372 bh_lock_sock(sk); 2373 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2374 ip6_datagram_dst_update(sk, false); 2375 bh_unlock_sock(sk); 2376 } 2377 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2378 2379 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2380 const struct flowi6 *fl6) 2381 { 2382 #ifdef CONFIG_IPV6_SUBTREES 2383 struct ipv6_pinfo *np = inet6_sk(sk); 2384 #endif 2385 2386 ip6_dst_store(sk, dst, 2387 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2388 &sk->sk_v6_daddr : NULL, 2389 #ifdef CONFIG_IPV6_SUBTREES 2390 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2391 &np->saddr : 2392 #endif 2393 NULL); 2394 } 2395 2396 /* Handle redirects */ 2397 struct ip6rd_flowi { 2398 struct flowi6 fl6; 2399 struct in6_addr gateway; 2400 }; 2401 2402 static struct rt6_info *__ip6_route_redirect(struct net *net, 2403 struct fib6_table *table, 2404 struct flowi6 *fl6, 2405 const struct sk_buff *skb, 2406 int flags) 2407 { 2408 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2409 struct rt6_info *ret = NULL, *rt_cache; 2410 struct fib6_info *rt; 2411 struct fib6_node *fn; 2412 2413 /* Get the "current" route for this destination and 2414 * check if the redirect has come from appropriate router. 2415 * 2416 * RFC 4861 specifies that redirects should only be 2417 * accepted if they come from the nexthop to the target. 2418 * Due to the way the routes are chosen, this notion 2419 * is a bit fuzzy and one might need to check all possible 2420 * routes. 2421 */ 2422 2423 rcu_read_lock(); 2424 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2425 restart: 2426 for_each_fib6_node_rt_rcu(fn) { 2427 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 2428 continue; 2429 if (fib6_check_expired(rt)) 2430 continue; 2431 if (rt->fib6_flags & RTF_REJECT) 2432 break; 2433 if (!rt->fib6_nh.fib_nh_has_gw) 2434 continue; 2435 if (fl6->flowi6_oif != rt->fib6_nh.fib_nh_dev->ifindex) 2436 continue; 2437 /* rt_cache's gateway might be different from its 'parent' 2438 * in the case of an ip redirect. 2439 * So we keep searching in the exception table if the gateway 2440 * is different. 2441 */ 2442 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.fib_nh_gw6)) { 2443 rt_cache = rt6_find_cached_rt(rt, 2444 &fl6->daddr, 2445 &fl6->saddr); 2446 if (rt_cache && 2447 ipv6_addr_equal(&rdfl->gateway, 2448 &rt_cache->rt6i_gateway)) { 2449 ret = rt_cache; 2450 break; 2451 } 2452 continue; 2453 } 2454 break; 2455 } 2456 2457 if (!rt) 2458 rt = net->ipv6.fib6_null_entry; 2459 else if (rt->fib6_flags & RTF_REJECT) { 2460 ret = net->ipv6.ip6_null_entry; 2461 goto out; 2462 } 2463 2464 if (rt == net->ipv6.fib6_null_entry) { 2465 fn = fib6_backtrack(fn, &fl6->saddr); 2466 if (fn) 2467 goto restart; 2468 } 2469 2470 out: 2471 if (ret) 2472 ip6_hold_safe(net, &ret); 2473 else 2474 ret = ip6_create_rt_rcu(rt); 2475 2476 rcu_read_unlock(); 2477 2478 trace_fib6_table_lookup(net, rt, table, fl6); 2479 return ret; 2480 }; 2481 2482 static struct dst_entry *ip6_route_redirect(struct net *net, 2483 const struct flowi6 *fl6, 2484 const struct sk_buff *skb, 2485 const struct in6_addr *gateway) 2486 { 2487 int flags = RT6_LOOKUP_F_HAS_SADDR; 2488 struct ip6rd_flowi rdfl; 2489 2490 rdfl.fl6 = *fl6; 2491 rdfl.gateway = *gateway; 2492 2493 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2494 flags, __ip6_route_redirect); 2495 } 2496 2497 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2498 kuid_t uid) 2499 { 2500 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2501 struct dst_entry *dst; 2502 struct flowi6 fl6 = { 2503 .flowi6_iif = LOOPBACK_IFINDEX, 2504 .flowi6_oif = oif, 2505 .flowi6_mark = mark, 2506 .daddr = iph->daddr, 2507 .saddr = iph->saddr, 2508 .flowlabel = ip6_flowinfo(iph), 2509 .flowi6_uid = uid, 2510 }; 2511 2512 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2513 rt6_do_redirect(dst, NULL, skb); 2514 dst_release(dst); 2515 } 2516 EXPORT_SYMBOL_GPL(ip6_redirect); 2517 2518 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 2519 { 2520 const struct ipv6hdr *iph = ipv6_hdr(skb); 2521 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2522 struct dst_entry *dst; 2523 struct flowi6 fl6 = { 2524 .flowi6_iif = LOOPBACK_IFINDEX, 2525 .flowi6_oif = oif, 2526 .daddr = msg->dest, 2527 .saddr = iph->daddr, 2528 .flowi6_uid = sock_net_uid(net, NULL), 2529 }; 2530 2531 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2532 rt6_do_redirect(dst, NULL, skb); 2533 dst_release(dst); 2534 } 2535 2536 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2537 { 2538 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2539 sk->sk_uid); 2540 } 2541 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2542 2543 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2544 { 2545 struct net_device *dev = dst->dev; 2546 unsigned int mtu = dst_mtu(dst); 2547 struct net *net = dev_net(dev); 2548 2549 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2550 2551 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2552 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2553 2554 /* 2555 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2556 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 2557 * IPV6_MAXPLEN is also valid and means: "any MSS, 2558 * rely only on pmtu discovery" 2559 */ 2560 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2561 mtu = IPV6_MAXPLEN; 2562 return mtu; 2563 } 2564 2565 static unsigned int ip6_mtu(const struct dst_entry *dst) 2566 { 2567 struct inet6_dev *idev; 2568 unsigned int mtu; 2569 2570 mtu = dst_metric_raw(dst, RTAX_MTU); 2571 if (mtu) 2572 goto out; 2573 2574 mtu = IPV6_MIN_MTU; 2575 2576 rcu_read_lock(); 2577 idev = __in6_dev_get(dst->dev); 2578 if (idev) 2579 mtu = idev->cnf.mtu6; 2580 rcu_read_unlock(); 2581 2582 out: 2583 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2584 2585 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2586 } 2587 2588 /* MTU selection: 2589 * 1. mtu on route is locked - use it 2590 * 2. mtu from nexthop exception 2591 * 3. mtu from egress device 2592 * 2593 * based on ip6_dst_mtu_forward and exception logic of 2594 * rt6_find_cached_rt; called with rcu_read_lock 2595 */ 2596 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2597 struct in6_addr *saddr) 2598 { 2599 struct rt6_exception_bucket *bucket; 2600 struct rt6_exception *rt6_ex; 2601 struct in6_addr *src_key; 2602 struct inet6_dev *idev; 2603 u32 mtu = 0; 2604 2605 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2606 mtu = f6i->fib6_pmtu; 2607 if (mtu) 2608 goto out; 2609 } 2610 2611 src_key = NULL; 2612 #ifdef CONFIG_IPV6_SUBTREES 2613 if (f6i->fib6_src.plen) 2614 src_key = saddr; 2615 #endif 2616 2617 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2618 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2619 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2620 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2621 2622 if (likely(!mtu)) { 2623 struct net_device *dev = fib6_info_nh_dev(f6i); 2624 2625 mtu = IPV6_MIN_MTU; 2626 idev = __in6_dev_get(dev); 2627 if (idev && idev->cnf.mtu6 > mtu) 2628 mtu = idev->cnf.mtu6; 2629 } 2630 2631 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2632 out: 2633 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2634 } 2635 2636 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2637 struct flowi6 *fl6) 2638 { 2639 struct dst_entry *dst; 2640 struct rt6_info *rt; 2641 struct inet6_dev *idev = in6_dev_get(dev); 2642 struct net *net = dev_net(dev); 2643 2644 if (unlikely(!idev)) 2645 return ERR_PTR(-ENODEV); 2646 2647 rt = ip6_dst_alloc(net, dev, 0); 2648 if (unlikely(!rt)) { 2649 in6_dev_put(idev); 2650 dst = ERR_PTR(-ENOMEM); 2651 goto out; 2652 } 2653 2654 rt->dst.flags |= DST_HOST; 2655 rt->dst.input = ip6_input; 2656 rt->dst.output = ip6_output; 2657 rt->rt6i_gateway = fl6->daddr; 2658 rt->rt6i_dst.addr = fl6->daddr; 2659 rt->rt6i_dst.plen = 128; 2660 rt->rt6i_idev = idev; 2661 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2662 2663 /* Add this dst into uncached_list so that rt6_disable_ip() can 2664 * do proper release of the net_device 2665 */ 2666 rt6_uncached_list_add(rt); 2667 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2668 2669 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2670 2671 out: 2672 return dst; 2673 } 2674 2675 static int ip6_dst_gc(struct dst_ops *ops) 2676 { 2677 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2678 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2679 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2680 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2681 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2682 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2683 int entries; 2684 2685 entries = dst_entries_get_fast(ops); 2686 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2687 entries <= rt_max_size) 2688 goto out; 2689 2690 net->ipv6.ip6_rt_gc_expire++; 2691 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2692 entries = dst_entries_get_slow(ops); 2693 if (entries < ops->gc_thresh) 2694 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2695 out: 2696 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2697 return entries > rt_max_size; 2698 } 2699 2700 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2701 struct fib6_config *cfg, 2702 const struct in6_addr *gw_addr, 2703 u32 tbid, int flags) 2704 { 2705 struct flowi6 fl6 = { 2706 .flowi6_oif = cfg->fc_ifindex, 2707 .daddr = *gw_addr, 2708 .saddr = cfg->fc_prefsrc, 2709 }; 2710 struct fib6_table *table; 2711 struct rt6_info *rt; 2712 2713 table = fib6_get_table(net, tbid); 2714 if (!table) 2715 return NULL; 2716 2717 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2718 flags |= RT6_LOOKUP_F_HAS_SADDR; 2719 2720 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2721 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2722 2723 /* if table lookup failed, fall back to full lookup */ 2724 if (rt == net->ipv6.ip6_null_entry) { 2725 ip6_rt_put(rt); 2726 rt = NULL; 2727 } 2728 2729 return rt; 2730 } 2731 2732 static int ip6_route_check_nh_onlink(struct net *net, 2733 struct fib6_config *cfg, 2734 const struct net_device *dev, 2735 struct netlink_ext_ack *extack) 2736 { 2737 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2738 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2739 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2740 struct fib6_info *from; 2741 struct rt6_info *grt; 2742 int err; 2743 2744 err = 0; 2745 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2746 if (grt) { 2747 rcu_read_lock(); 2748 from = rcu_dereference(grt->from); 2749 if (!grt->dst.error && 2750 /* ignore match if it is the default route */ 2751 from && !ipv6_addr_any(&from->fib6_dst.addr) && 2752 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2753 NL_SET_ERR_MSG(extack, 2754 "Nexthop has invalid gateway or device mismatch"); 2755 err = -EINVAL; 2756 } 2757 rcu_read_unlock(); 2758 2759 ip6_rt_put(grt); 2760 } 2761 2762 return err; 2763 } 2764 2765 static int ip6_route_check_nh(struct net *net, 2766 struct fib6_config *cfg, 2767 struct net_device **_dev, 2768 struct inet6_dev **idev) 2769 { 2770 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2771 struct net_device *dev = _dev ? *_dev : NULL; 2772 struct rt6_info *grt = NULL; 2773 int err = -EHOSTUNREACH; 2774 2775 if (cfg->fc_table) { 2776 int flags = RT6_LOOKUP_F_IFACE; 2777 2778 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2779 cfg->fc_table, flags); 2780 if (grt) { 2781 if (grt->rt6i_flags & RTF_GATEWAY || 2782 (dev && dev != grt->dst.dev)) { 2783 ip6_rt_put(grt); 2784 grt = NULL; 2785 } 2786 } 2787 } 2788 2789 if (!grt) 2790 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2791 2792 if (!grt) 2793 goto out; 2794 2795 if (dev) { 2796 if (dev != grt->dst.dev) { 2797 ip6_rt_put(grt); 2798 goto out; 2799 } 2800 } else { 2801 *_dev = dev = grt->dst.dev; 2802 *idev = grt->rt6i_idev; 2803 dev_hold(dev); 2804 in6_dev_hold(grt->rt6i_idev); 2805 } 2806 2807 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2808 err = 0; 2809 2810 ip6_rt_put(grt); 2811 2812 out: 2813 return err; 2814 } 2815 2816 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2817 struct net_device **_dev, struct inet6_dev **idev, 2818 struct netlink_ext_ack *extack) 2819 { 2820 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2821 int gwa_type = ipv6_addr_type(gw_addr); 2822 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2823 const struct net_device *dev = *_dev; 2824 bool need_addr_check = !dev; 2825 int err = -EINVAL; 2826 2827 /* if gw_addr is local we will fail to detect this in case 2828 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2829 * will return already-added prefix route via interface that 2830 * prefix route was assigned to, which might be non-loopback. 2831 */ 2832 if (dev && 2833 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2834 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2835 goto out; 2836 } 2837 2838 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2839 /* IPv6 strictly inhibits using not link-local 2840 * addresses as nexthop address. 2841 * Otherwise, router will not able to send redirects. 2842 * It is very good, but in some (rare!) circumstances 2843 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2844 * some exceptions. --ANK 2845 * We allow IPv4-mapped nexthops to support RFC4798-type 2846 * addressing 2847 */ 2848 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2849 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2850 goto out; 2851 } 2852 2853 if (cfg->fc_flags & RTNH_F_ONLINK) 2854 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2855 else 2856 err = ip6_route_check_nh(net, cfg, _dev, idev); 2857 2858 if (err) 2859 goto out; 2860 } 2861 2862 /* reload in case device was changed */ 2863 dev = *_dev; 2864 2865 err = -EINVAL; 2866 if (!dev) { 2867 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2868 goto out; 2869 } else if (dev->flags & IFF_LOOPBACK) { 2870 NL_SET_ERR_MSG(extack, 2871 "Egress device can not be loopback device for this route"); 2872 goto out; 2873 } 2874 2875 /* if we did not check gw_addr above, do so now that the 2876 * egress device has been resolved. 2877 */ 2878 if (need_addr_check && 2879 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2880 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2881 goto out; 2882 } 2883 2884 err = 0; 2885 out: 2886 return err; 2887 } 2888 2889 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) 2890 { 2891 if ((flags & RTF_REJECT) || 2892 (dev && (dev->flags & IFF_LOOPBACK) && 2893 !(addr_type & IPV6_ADDR_LOOPBACK) && 2894 !(flags & RTF_LOCAL))) 2895 return true; 2896 2897 return false; 2898 } 2899 2900 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, 2901 struct fib6_config *cfg, gfp_t gfp_flags, 2902 struct netlink_ext_ack *extack) 2903 { 2904 struct net_device *dev = NULL; 2905 struct inet6_dev *idev = NULL; 2906 int addr_type; 2907 int err; 2908 2909 fib6_nh->fib_nh_family = AF_INET6; 2910 2911 err = -ENODEV; 2912 if (cfg->fc_ifindex) { 2913 dev = dev_get_by_index(net, cfg->fc_ifindex); 2914 if (!dev) 2915 goto out; 2916 idev = in6_dev_get(dev); 2917 if (!idev) 2918 goto out; 2919 } 2920 2921 if (cfg->fc_flags & RTNH_F_ONLINK) { 2922 if (!dev) { 2923 NL_SET_ERR_MSG(extack, 2924 "Nexthop device required for onlink"); 2925 goto out; 2926 } 2927 2928 if (!(dev->flags & IFF_UP)) { 2929 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2930 err = -ENETDOWN; 2931 goto out; 2932 } 2933 2934 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; 2935 } 2936 2937 if (cfg->fc_encap) { 2938 struct lwtunnel_state *lwtstate; 2939 2940 err = lwtunnel_build_state(cfg->fc_encap_type, 2941 cfg->fc_encap, AF_INET6, cfg, 2942 &lwtstate, extack); 2943 if (err) 2944 goto out; 2945 2946 fib6_nh->fib_nh_lws = lwtstate_get(lwtstate); 2947 } 2948 2949 fib6_nh->fib_nh_weight = 1; 2950 2951 /* We cannot add true routes via loopback here, 2952 * they would result in kernel looping; promote them to reject routes 2953 */ 2954 addr_type = ipv6_addr_type(&cfg->fc_dst); 2955 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { 2956 /* hold loopback dev/idev if we haven't done so. */ 2957 if (dev != net->loopback_dev) { 2958 if (dev) { 2959 dev_put(dev); 2960 in6_dev_put(idev); 2961 } 2962 dev = net->loopback_dev; 2963 dev_hold(dev); 2964 idev = in6_dev_get(dev); 2965 if (!idev) { 2966 err = -ENODEV; 2967 goto out; 2968 } 2969 } 2970 goto set_dev; 2971 } 2972 2973 if (cfg->fc_flags & RTF_GATEWAY) { 2974 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 2975 if (err) 2976 goto out; 2977 2978 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 2979 fib6_nh->fib_nh_has_gw = 1; 2980 } 2981 2982 err = -ENODEV; 2983 if (!dev) 2984 goto out; 2985 2986 if (idev->cnf.disable_ipv6) { 2987 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 2988 err = -EACCES; 2989 goto out; 2990 } 2991 2992 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { 2993 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2994 err = -ENETDOWN; 2995 goto out; 2996 } 2997 2998 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && 2999 !netif_carrier_ok(dev)) 3000 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 3001 3002 set_dev: 3003 fib6_nh->fib_nh_dev = dev; 3004 fib6_nh->fib_nh_oif = dev->ifindex; 3005 err = 0; 3006 out: 3007 if (idev) 3008 in6_dev_put(idev); 3009 3010 if (err) { 3011 lwtstate_put(fib6_nh->fib_nh_lws); 3012 fib6_nh->fib_nh_lws = NULL; 3013 if (dev) 3014 dev_put(dev); 3015 } 3016 3017 return err; 3018 } 3019 3020 void fib6_nh_release(struct fib6_nh *fib6_nh) 3021 { 3022 lwtstate_put(fib6_nh->fib_nh_lws); 3023 3024 if (fib6_nh->fib_nh_dev) 3025 dev_put(fib6_nh->fib_nh_dev); 3026 } 3027 3028 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3029 gfp_t gfp_flags, 3030 struct netlink_ext_ack *extack) 3031 { 3032 struct net *net = cfg->fc_nlinfo.nl_net; 3033 struct fib6_info *rt = NULL; 3034 struct fib6_table *table; 3035 int err = -EINVAL; 3036 int addr_type; 3037 3038 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3039 if (cfg->fc_flags & RTF_PCPU) { 3040 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3041 goto out; 3042 } 3043 3044 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3045 if (cfg->fc_flags & RTF_CACHE) { 3046 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3047 goto out; 3048 } 3049 3050 if (cfg->fc_type > RTN_MAX) { 3051 NL_SET_ERR_MSG(extack, "Invalid route type"); 3052 goto out; 3053 } 3054 3055 if (cfg->fc_dst_len > 128) { 3056 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3057 goto out; 3058 } 3059 if (cfg->fc_src_len > 128) { 3060 NL_SET_ERR_MSG(extack, "Invalid source address length"); 3061 goto out; 3062 } 3063 #ifndef CONFIG_IPV6_SUBTREES 3064 if (cfg->fc_src_len) { 3065 NL_SET_ERR_MSG(extack, 3066 "Specifying source address requires IPV6_SUBTREES to be enabled"); 3067 goto out; 3068 } 3069 #endif 3070 3071 err = -ENOBUFS; 3072 if (cfg->fc_nlinfo.nlh && 3073 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3074 table = fib6_get_table(net, cfg->fc_table); 3075 if (!table) { 3076 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3077 table = fib6_new_table(net, cfg->fc_table); 3078 } 3079 } else { 3080 table = fib6_new_table(net, cfg->fc_table); 3081 } 3082 3083 if (!table) 3084 goto out; 3085 3086 err = -ENOMEM; 3087 rt = fib6_info_alloc(gfp_flags); 3088 if (!rt) 3089 goto out; 3090 3091 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 3092 extack); 3093 if (IS_ERR(rt->fib6_metrics)) { 3094 err = PTR_ERR(rt->fib6_metrics); 3095 /* Do not leave garbage there. */ 3096 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3097 goto out; 3098 } 3099 3100 if (cfg->fc_flags & RTF_ADDRCONF) 3101 rt->dst_nocount = true; 3102 3103 if (cfg->fc_flags & RTF_EXPIRES) 3104 fib6_set_expires(rt, jiffies + 3105 clock_t_to_jiffies(cfg->fc_expires)); 3106 else 3107 fib6_clean_expires(rt); 3108 3109 if (cfg->fc_protocol == RTPROT_UNSPEC) 3110 cfg->fc_protocol = RTPROT_BOOT; 3111 rt->fib6_protocol = cfg->fc_protocol; 3112 3113 rt->fib6_table = table; 3114 rt->fib6_metric = cfg->fc_metric; 3115 rt->fib6_type = cfg->fc_type; 3116 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3117 3118 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3119 rt->fib6_dst.plen = cfg->fc_dst_len; 3120 if (rt->fib6_dst.plen == 128) 3121 rt->dst_host = true; 3122 3123 #ifdef CONFIG_IPV6_SUBTREES 3124 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3125 rt->fib6_src.plen = cfg->fc_src_len; 3126 #endif 3127 err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); 3128 if (err) 3129 goto out; 3130 3131 /* We cannot add true routes via loopback here, 3132 * they would result in kernel looping; promote them to reject routes 3133 */ 3134 addr_type = ipv6_addr_type(&cfg->fc_dst); 3135 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) 3136 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3137 3138 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3139 struct net_device *dev = fib6_info_nh_dev(rt); 3140 3141 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3142 NL_SET_ERR_MSG(extack, "Invalid source address"); 3143 err = -EINVAL; 3144 goto out; 3145 } 3146 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3147 rt->fib6_prefsrc.plen = 128; 3148 } else 3149 rt->fib6_prefsrc.plen = 0; 3150 3151 return rt; 3152 out: 3153 fib6_info_release(rt); 3154 return ERR_PTR(err); 3155 } 3156 3157 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3158 struct netlink_ext_ack *extack) 3159 { 3160 struct fib6_info *rt; 3161 int err; 3162 3163 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3164 if (IS_ERR(rt)) 3165 return PTR_ERR(rt); 3166 3167 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3168 fib6_info_release(rt); 3169 3170 return err; 3171 } 3172 3173 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3174 { 3175 struct net *net = info->nl_net; 3176 struct fib6_table *table; 3177 int err; 3178 3179 if (rt == net->ipv6.fib6_null_entry) { 3180 err = -ENOENT; 3181 goto out; 3182 } 3183 3184 table = rt->fib6_table; 3185 spin_lock_bh(&table->tb6_lock); 3186 err = fib6_del(rt, info); 3187 spin_unlock_bh(&table->tb6_lock); 3188 3189 out: 3190 fib6_info_release(rt); 3191 return err; 3192 } 3193 3194 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3195 { 3196 struct nl_info info = { .nl_net = net }; 3197 3198 return __ip6_del_rt(rt, &info); 3199 } 3200 3201 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3202 { 3203 struct nl_info *info = &cfg->fc_nlinfo; 3204 struct net *net = info->nl_net; 3205 struct sk_buff *skb = NULL; 3206 struct fib6_table *table; 3207 int err = -ENOENT; 3208 3209 if (rt == net->ipv6.fib6_null_entry) 3210 goto out_put; 3211 table = rt->fib6_table; 3212 spin_lock_bh(&table->tb6_lock); 3213 3214 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3215 struct fib6_info *sibling, *next_sibling; 3216 3217 /* prefer to send a single notification with all hops */ 3218 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3219 if (skb) { 3220 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3221 3222 if (rt6_fill_node(net, skb, rt, NULL, 3223 NULL, NULL, 0, RTM_DELROUTE, 3224 info->portid, seq, 0) < 0) { 3225 kfree_skb(skb); 3226 skb = NULL; 3227 } else 3228 info->skip_notify = 1; 3229 } 3230 3231 list_for_each_entry_safe(sibling, next_sibling, 3232 &rt->fib6_siblings, 3233 fib6_siblings) { 3234 err = fib6_del(sibling, info); 3235 if (err) 3236 goto out_unlock; 3237 } 3238 } 3239 3240 err = fib6_del(rt, info); 3241 out_unlock: 3242 spin_unlock_bh(&table->tb6_lock); 3243 out_put: 3244 fib6_info_release(rt); 3245 3246 if (skb) { 3247 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3248 info->nlh, gfp_any()); 3249 } 3250 return err; 3251 } 3252 3253 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3254 { 3255 int rc = -ESRCH; 3256 3257 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3258 goto out; 3259 3260 if (cfg->fc_flags & RTF_GATEWAY && 3261 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3262 goto out; 3263 3264 rc = rt6_remove_exception_rt(rt); 3265 out: 3266 return rc; 3267 } 3268 3269 static int ip6_route_del(struct fib6_config *cfg, 3270 struct netlink_ext_ack *extack) 3271 { 3272 struct rt6_info *rt_cache; 3273 struct fib6_table *table; 3274 struct fib6_info *rt; 3275 struct fib6_node *fn; 3276 int err = -ESRCH; 3277 3278 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3279 if (!table) { 3280 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3281 return err; 3282 } 3283 3284 rcu_read_lock(); 3285 3286 fn = fib6_locate(&table->tb6_root, 3287 &cfg->fc_dst, cfg->fc_dst_len, 3288 &cfg->fc_src, cfg->fc_src_len, 3289 !(cfg->fc_flags & RTF_CACHE)); 3290 3291 if (fn) { 3292 for_each_fib6_node_rt_rcu(fn) { 3293 struct fib6_nh *nh; 3294 3295 if (cfg->fc_flags & RTF_CACHE) { 3296 int rc; 3297 3298 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3299 &cfg->fc_src); 3300 if (rt_cache) { 3301 rc = ip6_del_cached_rt(rt_cache, cfg); 3302 if (rc != -ESRCH) { 3303 rcu_read_unlock(); 3304 return rc; 3305 } 3306 } 3307 continue; 3308 } 3309 3310 nh = &rt->fib6_nh; 3311 if (cfg->fc_ifindex && 3312 (!nh->fib_nh_dev || 3313 nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) 3314 continue; 3315 if (cfg->fc_flags & RTF_GATEWAY && 3316 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) 3317 continue; 3318 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3319 continue; 3320 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3321 continue; 3322 if (!fib6_info_hold_safe(rt)) 3323 continue; 3324 rcu_read_unlock(); 3325 3326 /* if gateway was specified only delete the one hop */ 3327 if (cfg->fc_flags & RTF_GATEWAY) 3328 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3329 3330 return __ip6_del_rt_siblings(rt, cfg); 3331 } 3332 } 3333 rcu_read_unlock(); 3334 3335 return err; 3336 } 3337 3338 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3339 { 3340 struct netevent_redirect netevent; 3341 struct rt6_info *rt, *nrt = NULL; 3342 struct ndisc_options ndopts; 3343 struct inet6_dev *in6_dev; 3344 struct neighbour *neigh; 3345 struct fib6_info *from; 3346 struct rd_msg *msg; 3347 int optlen, on_link; 3348 u8 *lladdr; 3349 3350 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3351 optlen -= sizeof(*msg); 3352 3353 if (optlen < 0) { 3354 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3355 return; 3356 } 3357 3358 msg = (struct rd_msg *)icmp6_hdr(skb); 3359 3360 if (ipv6_addr_is_multicast(&msg->dest)) { 3361 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3362 return; 3363 } 3364 3365 on_link = 0; 3366 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3367 on_link = 1; 3368 } else if (ipv6_addr_type(&msg->target) != 3369 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3370 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3371 return; 3372 } 3373 3374 in6_dev = __in6_dev_get(skb->dev); 3375 if (!in6_dev) 3376 return; 3377 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3378 return; 3379 3380 /* RFC2461 8.1: 3381 * The IP source address of the Redirect MUST be the same as the current 3382 * first-hop router for the specified ICMP Destination Address. 3383 */ 3384 3385 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3386 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3387 return; 3388 } 3389 3390 lladdr = NULL; 3391 if (ndopts.nd_opts_tgt_lladdr) { 3392 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3393 skb->dev); 3394 if (!lladdr) { 3395 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3396 return; 3397 } 3398 } 3399 3400 rt = (struct rt6_info *) dst; 3401 if (rt->rt6i_flags & RTF_REJECT) { 3402 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3403 return; 3404 } 3405 3406 /* Redirect received -> path was valid. 3407 * Look, redirects are sent only in response to data packets, 3408 * so that this nexthop apparently is reachable. --ANK 3409 */ 3410 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3411 3412 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3413 if (!neigh) 3414 return; 3415 3416 /* 3417 * We have finally decided to accept it. 3418 */ 3419 3420 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3421 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3422 NEIGH_UPDATE_F_OVERRIDE| 3423 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3424 NEIGH_UPDATE_F_ISROUTER)), 3425 NDISC_REDIRECT, &ndopts); 3426 3427 rcu_read_lock(); 3428 from = rcu_dereference(rt->from); 3429 /* This fib6_info_hold() is safe here because we hold reference to rt 3430 * and rt already holds reference to fib6_info. 3431 */ 3432 fib6_info_hold(from); 3433 rcu_read_unlock(); 3434 3435 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3436 if (!nrt) 3437 goto out; 3438 3439 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3440 if (on_link) 3441 nrt->rt6i_flags &= ~RTF_GATEWAY; 3442 3443 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3444 3445 /* No need to remove rt from the exception table if rt is 3446 * a cached route because rt6_insert_exception() will 3447 * takes care of it 3448 */ 3449 if (rt6_insert_exception(nrt, from)) { 3450 dst_release_immediate(&nrt->dst); 3451 goto out; 3452 } 3453 3454 netevent.old = &rt->dst; 3455 netevent.new = &nrt->dst; 3456 netevent.daddr = &msg->dest; 3457 netevent.neigh = neigh; 3458 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3459 3460 out: 3461 fib6_info_release(from); 3462 neigh_release(neigh); 3463 } 3464 3465 #ifdef CONFIG_IPV6_ROUTE_INFO 3466 static struct fib6_info *rt6_get_route_info(struct net *net, 3467 const struct in6_addr *prefix, int prefixlen, 3468 const struct in6_addr *gwaddr, 3469 struct net_device *dev) 3470 { 3471 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3472 int ifindex = dev->ifindex; 3473 struct fib6_node *fn; 3474 struct fib6_info *rt = NULL; 3475 struct fib6_table *table; 3476 3477 table = fib6_get_table(net, tb_id); 3478 if (!table) 3479 return NULL; 3480 3481 rcu_read_lock(); 3482 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3483 if (!fn) 3484 goto out; 3485 3486 for_each_fib6_node_rt_rcu(fn) { 3487 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) 3488 continue; 3489 if (!(rt->fib6_flags & RTF_ROUTEINFO) || 3490 !rt->fib6_nh.fib_nh_has_gw) 3491 continue; 3492 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) 3493 continue; 3494 if (!fib6_info_hold_safe(rt)) 3495 continue; 3496 break; 3497 } 3498 out: 3499 rcu_read_unlock(); 3500 return rt; 3501 } 3502 3503 static struct fib6_info *rt6_add_route_info(struct net *net, 3504 const struct in6_addr *prefix, int prefixlen, 3505 const struct in6_addr *gwaddr, 3506 struct net_device *dev, 3507 unsigned int pref) 3508 { 3509 struct fib6_config cfg = { 3510 .fc_metric = IP6_RT_PRIO_USER, 3511 .fc_ifindex = dev->ifindex, 3512 .fc_dst_len = prefixlen, 3513 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3514 RTF_UP | RTF_PREF(pref), 3515 .fc_protocol = RTPROT_RA, 3516 .fc_type = RTN_UNICAST, 3517 .fc_nlinfo.portid = 0, 3518 .fc_nlinfo.nlh = NULL, 3519 .fc_nlinfo.nl_net = net, 3520 }; 3521 3522 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3523 cfg.fc_dst = *prefix; 3524 cfg.fc_gateway = *gwaddr; 3525 3526 /* We should treat it as a default route if prefix length is 0. */ 3527 if (!prefixlen) 3528 cfg.fc_flags |= RTF_DEFAULT; 3529 3530 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3531 3532 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3533 } 3534 #endif 3535 3536 struct fib6_info *rt6_get_dflt_router(struct net *net, 3537 const struct in6_addr *addr, 3538 struct net_device *dev) 3539 { 3540 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3541 struct fib6_info *rt; 3542 struct fib6_table *table; 3543 3544 table = fib6_get_table(net, tb_id); 3545 if (!table) 3546 return NULL; 3547 3548 rcu_read_lock(); 3549 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3550 struct fib6_nh *nh = &rt->fib6_nh; 3551 3552 if (dev == nh->fib_nh_dev && 3553 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3554 ipv6_addr_equal(&nh->fib_nh_gw6, addr)) 3555 break; 3556 } 3557 if (rt && !fib6_info_hold_safe(rt)) 3558 rt = NULL; 3559 rcu_read_unlock(); 3560 return rt; 3561 } 3562 3563 struct fib6_info *rt6_add_dflt_router(struct net *net, 3564 const struct in6_addr *gwaddr, 3565 struct net_device *dev, 3566 unsigned int pref) 3567 { 3568 struct fib6_config cfg = { 3569 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3570 .fc_metric = IP6_RT_PRIO_USER, 3571 .fc_ifindex = dev->ifindex, 3572 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3573 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3574 .fc_protocol = RTPROT_RA, 3575 .fc_type = RTN_UNICAST, 3576 .fc_nlinfo.portid = 0, 3577 .fc_nlinfo.nlh = NULL, 3578 .fc_nlinfo.nl_net = net, 3579 }; 3580 3581 cfg.fc_gateway = *gwaddr; 3582 3583 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3584 struct fib6_table *table; 3585 3586 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3587 if (table) 3588 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3589 } 3590 3591 return rt6_get_dflt_router(net, gwaddr, dev); 3592 } 3593 3594 static void __rt6_purge_dflt_routers(struct net *net, 3595 struct fib6_table *table) 3596 { 3597 struct fib6_info *rt; 3598 3599 restart: 3600 rcu_read_lock(); 3601 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3602 struct net_device *dev = fib6_info_nh_dev(rt); 3603 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3604 3605 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3606 (!idev || idev->cnf.accept_ra != 2) && 3607 fib6_info_hold_safe(rt)) { 3608 rcu_read_unlock(); 3609 ip6_del_rt(net, rt); 3610 goto restart; 3611 } 3612 } 3613 rcu_read_unlock(); 3614 3615 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3616 } 3617 3618 void rt6_purge_dflt_routers(struct net *net) 3619 { 3620 struct fib6_table *table; 3621 struct hlist_head *head; 3622 unsigned int h; 3623 3624 rcu_read_lock(); 3625 3626 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3627 head = &net->ipv6.fib_table_hash[h]; 3628 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3629 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3630 __rt6_purge_dflt_routers(net, table); 3631 } 3632 } 3633 3634 rcu_read_unlock(); 3635 } 3636 3637 static void rtmsg_to_fib6_config(struct net *net, 3638 struct in6_rtmsg *rtmsg, 3639 struct fib6_config *cfg) 3640 { 3641 *cfg = (struct fib6_config){ 3642 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3643 : RT6_TABLE_MAIN, 3644 .fc_ifindex = rtmsg->rtmsg_ifindex, 3645 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER, 3646 .fc_expires = rtmsg->rtmsg_info, 3647 .fc_dst_len = rtmsg->rtmsg_dst_len, 3648 .fc_src_len = rtmsg->rtmsg_src_len, 3649 .fc_flags = rtmsg->rtmsg_flags, 3650 .fc_type = rtmsg->rtmsg_type, 3651 3652 .fc_nlinfo.nl_net = net, 3653 3654 .fc_dst = rtmsg->rtmsg_dst, 3655 .fc_src = rtmsg->rtmsg_src, 3656 .fc_gateway = rtmsg->rtmsg_gateway, 3657 }; 3658 } 3659 3660 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3661 { 3662 struct fib6_config cfg; 3663 struct in6_rtmsg rtmsg; 3664 int err; 3665 3666 switch (cmd) { 3667 case SIOCADDRT: /* Add a route */ 3668 case SIOCDELRT: /* Delete a route */ 3669 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3670 return -EPERM; 3671 err = copy_from_user(&rtmsg, arg, 3672 sizeof(struct in6_rtmsg)); 3673 if (err) 3674 return -EFAULT; 3675 3676 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3677 3678 rtnl_lock(); 3679 switch (cmd) { 3680 case SIOCADDRT: 3681 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3682 break; 3683 case SIOCDELRT: 3684 err = ip6_route_del(&cfg, NULL); 3685 break; 3686 default: 3687 err = -EINVAL; 3688 } 3689 rtnl_unlock(); 3690 3691 return err; 3692 } 3693 3694 return -EINVAL; 3695 } 3696 3697 /* 3698 * Drop the packet on the floor 3699 */ 3700 3701 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3702 { 3703 int type; 3704 struct dst_entry *dst = skb_dst(skb); 3705 switch (ipstats_mib_noroutes) { 3706 case IPSTATS_MIB_INNOROUTES: 3707 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3708 if (type == IPV6_ADDR_ANY) { 3709 IP6_INC_STATS(dev_net(dst->dev), 3710 __in6_dev_get_safely(skb->dev), 3711 IPSTATS_MIB_INADDRERRORS); 3712 break; 3713 } 3714 /* FALLTHROUGH */ 3715 case IPSTATS_MIB_OUTNOROUTES: 3716 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3717 ipstats_mib_noroutes); 3718 break; 3719 } 3720 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3721 kfree_skb(skb); 3722 return 0; 3723 } 3724 3725 static int ip6_pkt_discard(struct sk_buff *skb) 3726 { 3727 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3728 } 3729 3730 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3731 { 3732 skb->dev = skb_dst(skb)->dev; 3733 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3734 } 3735 3736 static int ip6_pkt_prohibit(struct sk_buff *skb) 3737 { 3738 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3739 } 3740 3741 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3742 { 3743 skb->dev = skb_dst(skb)->dev; 3744 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3745 } 3746 3747 /* 3748 * Allocate a dst for local (unicast / anycast) address. 3749 */ 3750 3751 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3752 struct inet6_dev *idev, 3753 const struct in6_addr *addr, 3754 bool anycast, gfp_t gfp_flags) 3755 { 3756 struct fib6_config cfg = { 3757 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL, 3758 .fc_ifindex = idev->dev->ifindex, 3759 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP, 3760 .fc_dst = *addr, 3761 .fc_dst_len = 128, 3762 .fc_protocol = RTPROT_KERNEL, 3763 .fc_nlinfo.nl_net = net, 3764 .fc_ignore_dev_down = true, 3765 }; 3766 3767 if (anycast) { 3768 cfg.fc_type = RTN_ANYCAST; 3769 cfg.fc_flags |= RTF_ANYCAST; 3770 } else { 3771 cfg.fc_type = RTN_LOCAL; 3772 cfg.fc_flags |= RTF_LOCAL; 3773 } 3774 3775 return ip6_route_info_create(&cfg, gfp_flags, NULL); 3776 } 3777 3778 /* remove deleted ip from prefsrc entries */ 3779 struct arg_dev_net_ip { 3780 struct net_device *dev; 3781 struct net *net; 3782 struct in6_addr *addr; 3783 }; 3784 3785 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3786 { 3787 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3788 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3789 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3790 3791 if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && 3792 rt != net->ipv6.fib6_null_entry && 3793 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3794 spin_lock_bh(&rt6_exception_lock); 3795 /* remove prefsrc entry */ 3796 rt->fib6_prefsrc.plen = 0; 3797 spin_unlock_bh(&rt6_exception_lock); 3798 } 3799 return 0; 3800 } 3801 3802 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3803 { 3804 struct net *net = dev_net(ifp->idev->dev); 3805 struct arg_dev_net_ip adni = { 3806 .dev = ifp->idev->dev, 3807 .net = net, 3808 .addr = &ifp->addr, 3809 }; 3810 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3811 } 3812 3813 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) 3814 3815 /* Remove routers and update dst entries when gateway turn into host. */ 3816 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3817 { 3818 struct in6_addr *gateway = (struct in6_addr *)arg; 3819 3820 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3821 rt->fib6_nh.fib_nh_has_gw && 3822 ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { 3823 return -1; 3824 } 3825 3826 /* Further clean up cached routes in exception table. 3827 * This is needed because cached route may have a different 3828 * gateway than its 'parent' in the case of an ip redirect. 3829 */ 3830 rt6_exceptions_clean_tohost(rt, gateway); 3831 3832 return 0; 3833 } 3834 3835 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3836 { 3837 fib6_clean_all(net, fib6_clean_tohost, gateway); 3838 } 3839 3840 struct arg_netdev_event { 3841 const struct net_device *dev; 3842 union { 3843 unsigned int nh_flags; 3844 unsigned long event; 3845 }; 3846 }; 3847 3848 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3849 { 3850 struct fib6_info *iter; 3851 struct fib6_node *fn; 3852 3853 fn = rcu_dereference_protected(rt->fib6_node, 3854 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3855 iter = rcu_dereference_protected(fn->leaf, 3856 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3857 while (iter) { 3858 if (iter->fib6_metric == rt->fib6_metric && 3859 rt6_qualify_for_ecmp(iter)) 3860 return iter; 3861 iter = rcu_dereference_protected(iter->fib6_next, 3862 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3863 } 3864 3865 return NULL; 3866 } 3867 3868 static bool rt6_is_dead(const struct fib6_info *rt) 3869 { 3870 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || 3871 (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && 3872 ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) 3873 return true; 3874 3875 return false; 3876 } 3877 3878 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3879 { 3880 struct fib6_info *iter; 3881 int total = 0; 3882 3883 if (!rt6_is_dead(rt)) 3884 total += rt->fib6_nh.fib_nh_weight; 3885 3886 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3887 if (!rt6_is_dead(iter)) 3888 total += iter->fib6_nh.fib_nh_weight; 3889 } 3890 3891 return total; 3892 } 3893 3894 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3895 { 3896 int upper_bound = -1; 3897 3898 if (!rt6_is_dead(rt)) { 3899 *weight += rt->fib6_nh.fib_nh_weight; 3900 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3901 total) - 1; 3902 } 3903 atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); 3904 } 3905 3906 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3907 { 3908 struct fib6_info *iter; 3909 int weight = 0; 3910 3911 rt6_upper_bound_set(rt, &weight, total); 3912 3913 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3914 rt6_upper_bound_set(iter, &weight, total); 3915 } 3916 3917 void rt6_multipath_rebalance(struct fib6_info *rt) 3918 { 3919 struct fib6_info *first; 3920 int total; 3921 3922 /* In case the entire multipath route was marked for flushing, 3923 * then there is no need to rebalance upon the removal of every 3924 * sibling route. 3925 */ 3926 if (!rt->fib6_nsiblings || rt->should_flush) 3927 return; 3928 3929 /* During lookup routes are evaluated in order, so we need to 3930 * make sure upper bounds are assigned from the first sibling 3931 * onwards. 3932 */ 3933 first = rt6_multipath_first_sibling(rt); 3934 if (WARN_ON_ONCE(!first)) 3935 return; 3936 3937 total = rt6_multipath_total_weight(first); 3938 rt6_multipath_upper_bound_set(first, total); 3939 } 3940 3941 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3942 { 3943 const struct arg_netdev_event *arg = p_arg; 3944 struct net *net = dev_net(arg->dev); 3945 3946 if (rt != net->ipv6.fib6_null_entry && 3947 rt->fib6_nh.fib_nh_dev == arg->dev) { 3948 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; 3949 fib6_update_sernum_upto_root(net, rt); 3950 rt6_multipath_rebalance(rt); 3951 } 3952 3953 return 0; 3954 } 3955 3956 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3957 { 3958 struct arg_netdev_event arg = { 3959 .dev = dev, 3960 { 3961 .nh_flags = nh_flags, 3962 }, 3963 }; 3964 3965 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3966 arg.nh_flags |= RTNH_F_LINKDOWN; 3967 3968 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3969 } 3970 3971 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3972 const struct net_device *dev) 3973 { 3974 struct fib6_info *iter; 3975 3976 if (rt->fib6_nh.fib_nh_dev == dev) 3977 return true; 3978 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3979 if (iter->fib6_nh.fib_nh_dev == dev) 3980 return true; 3981 3982 return false; 3983 } 3984 3985 static void rt6_multipath_flush(struct fib6_info *rt) 3986 { 3987 struct fib6_info *iter; 3988 3989 rt->should_flush = 1; 3990 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3991 iter->should_flush = 1; 3992 } 3993 3994 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3995 const struct net_device *down_dev) 3996 { 3997 struct fib6_info *iter; 3998 unsigned int dead = 0; 3999 4000 if (rt->fib6_nh.fib_nh_dev == down_dev || 4001 rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 4002 dead++; 4003 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4004 if (iter->fib6_nh.fib_nh_dev == down_dev || 4005 iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 4006 dead++; 4007 4008 return dead; 4009 } 4010 4011 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4012 const struct net_device *dev, 4013 unsigned int nh_flags) 4014 { 4015 struct fib6_info *iter; 4016 4017 if (rt->fib6_nh.fib_nh_dev == dev) 4018 rt->fib6_nh.fib_nh_flags |= nh_flags; 4019 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4020 if (iter->fib6_nh.fib_nh_dev == dev) 4021 iter->fib6_nh.fib_nh_flags |= nh_flags; 4022 } 4023 4024 /* called with write lock held for table with rt */ 4025 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4026 { 4027 const struct arg_netdev_event *arg = p_arg; 4028 const struct net_device *dev = arg->dev; 4029 struct net *net = dev_net(dev); 4030 4031 if (rt == net->ipv6.fib6_null_entry) 4032 return 0; 4033 4034 switch (arg->event) { 4035 case NETDEV_UNREGISTER: 4036 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; 4037 case NETDEV_DOWN: 4038 if (rt->should_flush) 4039 return -1; 4040 if (!rt->fib6_nsiblings) 4041 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; 4042 if (rt6_multipath_uses_dev(rt, dev)) { 4043 unsigned int count; 4044 4045 count = rt6_multipath_dead_count(rt, dev); 4046 if (rt->fib6_nsiblings + 1 == count) { 4047 rt6_multipath_flush(rt); 4048 return -1; 4049 } 4050 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4051 RTNH_F_LINKDOWN); 4052 fib6_update_sernum(net, rt); 4053 rt6_multipath_rebalance(rt); 4054 } 4055 return -2; 4056 case NETDEV_CHANGE: 4057 if (rt->fib6_nh.fib_nh_dev != dev || 4058 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4059 break; 4060 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; 4061 rt6_multipath_rebalance(rt); 4062 break; 4063 } 4064 4065 return 0; 4066 } 4067 4068 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4069 { 4070 struct arg_netdev_event arg = { 4071 .dev = dev, 4072 { 4073 .event = event, 4074 }, 4075 }; 4076 struct net *net = dev_net(dev); 4077 4078 if (net->ipv6.sysctl.skip_notify_on_dev_down) 4079 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); 4080 else 4081 fib6_clean_all(net, fib6_ifdown, &arg); 4082 } 4083 4084 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4085 { 4086 rt6_sync_down_dev(dev, event); 4087 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4088 neigh_ifdown(&nd_tbl, dev); 4089 } 4090 4091 struct rt6_mtu_change_arg { 4092 struct net_device *dev; 4093 unsigned int mtu; 4094 }; 4095 4096 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4097 { 4098 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4099 struct inet6_dev *idev; 4100 4101 /* In IPv6 pmtu discovery is not optional, 4102 so that RTAX_MTU lock cannot disable it. 4103 We still use this lock to block changes 4104 caused by addrconf/ndisc. 4105 */ 4106 4107 idev = __in6_dev_get(arg->dev); 4108 if (!idev) 4109 return 0; 4110 4111 /* For administrative MTU increase, there is no way to discover 4112 IPv6 PMTU increase, so PMTU increase should be updated here. 4113 Since RFC 1981 doesn't include administrative MTU increase 4114 update PMTU increase is a MUST. (i.e. jumbo frame) 4115 */ 4116 if (rt->fib6_nh.fib_nh_dev == arg->dev && 4117 !fib6_metric_locked(rt, RTAX_MTU)) { 4118 u32 mtu = rt->fib6_pmtu; 4119 4120 if (mtu >= arg->mtu || 4121 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4122 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4123 4124 spin_lock_bh(&rt6_exception_lock); 4125 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4126 spin_unlock_bh(&rt6_exception_lock); 4127 } 4128 return 0; 4129 } 4130 4131 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4132 { 4133 struct rt6_mtu_change_arg arg = { 4134 .dev = dev, 4135 .mtu = mtu, 4136 }; 4137 4138 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4139 } 4140 4141 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4142 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4143 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4144 [RTA_OIF] = { .type = NLA_U32 }, 4145 [RTA_IIF] = { .type = NLA_U32 }, 4146 [RTA_PRIORITY] = { .type = NLA_U32 }, 4147 [RTA_METRICS] = { .type = NLA_NESTED }, 4148 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4149 [RTA_PREF] = { .type = NLA_U8 }, 4150 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4151 [RTA_ENCAP] = { .type = NLA_NESTED }, 4152 [RTA_EXPIRES] = { .type = NLA_U32 }, 4153 [RTA_UID] = { .type = NLA_U32 }, 4154 [RTA_MARK] = { .type = NLA_U32 }, 4155 [RTA_TABLE] = { .type = NLA_U32 }, 4156 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4157 [RTA_SPORT] = { .type = NLA_U16 }, 4158 [RTA_DPORT] = { .type = NLA_U16 }, 4159 }; 4160 4161 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4162 struct fib6_config *cfg, 4163 struct netlink_ext_ack *extack) 4164 { 4165 struct rtmsg *rtm; 4166 struct nlattr *tb[RTA_MAX+1]; 4167 unsigned int pref; 4168 int err; 4169 4170 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4171 extack); 4172 if (err < 0) 4173 goto errout; 4174 4175 err = -EINVAL; 4176 rtm = nlmsg_data(nlh); 4177 4178 *cfg = (struct fib6_config){ 4179 .fc_table = rtm->rtm_table, 4180 .fc_dst_len = rtm->rtm_dst_len, 4181 .fc_src_len = rtm->rtm_src_len, 4182 .fc_flags = RTF_UP, 4183 .fc_protocol = rtm->rtm_protocol, 4184 .fc_type = rtm->rtm_type, 4185 4186 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 4187 .fc_nlinfo.nlh = nlh, 4188 .fc_nlinfo.nl_net = sock_net(skb->sk), 4189 }; 4190 4191 if (rtm->rtm_type == RTN_UNREACHABLE || 4192 rtm->rtm_type == RTN_BLACKHOLE || 4193 rtm->rtm_type == RTN_PROHIBIT || 4194 rtm->rtm_type == RTN_THROW) 4195 cfg->fc_flags |= RTF_REJECT; 4196 4197 if (rtm->rtm_type == RTN_LOCAL) 4198 cfg->fc_flags |= RTF_LOCAL; 4199 4200 if (rtm->rtm_flags & RTM_F_CLONED) 4201 cfg->fc_flags |= RTF_CACHE; 4202 4203 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4204 4205 if (tb[RTA_GATEWAY]) { 4206 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4207 cfg->fc_flags |= RTF_GATEWAY; 4208 } 4209 if (tb[RTA_VIA]) { 4210 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 4211 goto errout; 4212 } 4213 4214 if (tb[RTA_DST]) { 4215 int plen = (rtm->rtm_dst_len + 7) >> 3; 4216 4217 if (nla_len(tb[RTA_DST]) < plen) 4218 goto errout; 4219 4220 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4221 } 4222 4223 if (tb[RTA_SRC]) { 4224 int plen = (rtm->rtm_src_len + 7) >> 3; 4225 4226 if (nla_len(tb[RTA_SRC]) < plen) 4227 goto errout; 4228 4229 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4230 } 4231 4232 if (tb[RTA_PREFSRC]) 4233 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4234 4235 if (tb[RTA_OIF]) 4236 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4237 4238 if (tb[RTA_PRIORITY]) 4239 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4240 4241 if (tb[RTA_METRICS]) { 4242 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4243 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4244 } 4245 4246 if (tb[RTA_TABLE]) 4247 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4248 4249 if (tb[RTA_MULTIPATH]) { 4250 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4251 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4252 4253 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4254 cfg->fc_mp_len, extack); 4255 if (err < 0) 4256 goto errout; 4257 } 4258 4259 if (tb[RTA_PREF]) { 4260 pref = nla_get_u8(tb[RTA_PREF]); 4261 if (pref != ICMPV6_ROUTER_PREF_LOW && 4262 pref != ICMPV6_ROUTER_PREF_HIGH) 4263 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4264 cfg->fc_flags |= RTF_PREF(pref); 4265 } 4266 4267 if (tb[RTA_ENCAP]) 4268 cfg->fc_encap = tb[RTA_ENCAP]; 4269 4270 if (tb[RTA_ENCAP_TYPE]) { 4271 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4272 4273 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4274 if (err < 0) 4275 goto errout; 4276 } 4277 4278 if (tb[RTA_EXPIRES]) { 4279 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4280 4281 if (addrconf_finite_timeout(timeout)) { 4282 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4283 cfg->fc_flags |= RTF_EXPIRES; 4284 } 4285 } 4286 4287 err = 0; 4288 errout: 4289 return err; 4290 } 4291 4292 struct rt6_nh { 4293 struct fib6_info *fib6_info; 4294 struct fib6_config r_cfg; 4295 struct list_head next; 4296 }; 4297 4298 static int ip6_route_info_append(struct net *net, 4299 struct list_head *rt6_nh_list, 4300 struct fib6_info *rt, 4301 struct fib6_config *r_cfg) 4302 { 4303 struct rt6_nh *nh; 4304 int err = -EEXIST; 4305 4306 list_for_each_entry(nh, rt6_nh_list, next) { 4307 /* check if fib6_info already exists */ 4308 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4309 return err; 4310 } 4311 4312 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4313 if (!nh) 4314 return -ENOMEM; 4315 nh->fib6_info = rt; 4316 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4317 list_add_tail(&nh->next, rt6_nh_list); 4318 4319 return 0; 4320 } 4321 4322 static void ip6_route_mpath_notify(struct fib6_info *rt, 4323 struct fib6_info *rt_last, 4324 struct nl_info *info, 4325 __u16 nlflags) 4326 { 4327 /* if this is an APPEND route, then rt points to the first route 4328 * inserted and rt_last points to last route inserted. Userspace 4329 * wants a consistent dump of the route which starts at the first 4330 * nexthop. Since sibling routes are always added at the end of 4331 * the list, find the first sibling of the last route appended 4332 */ 4333 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4334 rt = list_first_entry(&rt_last->fib6_siblings, 4335 struct fib6_info, 4336 fib6_siblings); 4337 } 4338 4339 if (rt) 4340 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4341 } 4342 4343 static int ip6_route_multipath_add(struct fib6_config *cfg, 4344 struct netlink_ext_ack *extack) 4345 { 4346 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4347 struct nl_info *info = &cfg->fc_nlinfo; 4348 struct fib6_config r_cfg; 4349 struct rtnexthop *rtnh; 4350 struct fib6_info *rt; 4351 struct rt6_nh *err_nh; 4352 struct rt6_nh *nh, *nh_safe; 4353 __u16 nlflags; 4354 int remaining; 4355 int attrlen; 4356 int err = 1; 4357 int nhn = 0; 4358 int replace = (cfg->fc_nlinfo.nlh && 4359 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4360 LIST_HEAD(rt6_nh_list); 4361 4362 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 4363 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4364 nlflags |= NLM_F_APPEND; 4365 4366 remaining = cfg->fc_mp_len; 4367 rtnh = (struct rtnexthop *)cfg->fc_mp; 4368 4369 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4370 * fib6_info structs per nexthop 4371 */ 4372 while (rtnh_ok(rtnh, remaining)) { 4373 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4374 if (rtnh->rtnh_ifindex) 4375 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4376 4377 attrlen = rtnh_attrlen(rtnh); 4378 if (attrlen > 0) { 4379 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4380 4381 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4382 if (nla) { 4383 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4384 r_cfg.fc_flags |= RTF_GATEWAY; 4385 } 4386 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4387 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4388 if (nla) 4389 r_cfg.fc_encap_type = nla_get_u16(nla); 4390 } 4391 4392 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4393 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4394 if (IS_ERR(rt)) { 4395 err = PTR_ERR(rt); 4396 rt = NULL; 4397 goto cleanup; 4398 } 4399 if (!rt6_qualify_for_ecmp(rt)) { 4400 err = -EINVAL; 4401 NL_SET_ERR_MSG(extack, 4402 "Device only routes can not be added for IPv6 using the multipath API."); 4403 fib6_info_release(rt); 4404 goto cleanup; 4405 } 4406 4407 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; 4408 4409 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4410 rt, &r_cfg); 4411 if (err) { 4412 fib6_info_release(rt); 4413 goto cleanup; 4414 } 4415 4416 rtnh = rtnh_next(rtnh, &remaining); 4417 } 4418 4419 /* for add and replace send one notification with all nexthops. 4420 * Skip the notification in fib6_add_rt2node and send one with 4421 * the full route when done 4422 */ 4423 info->skip_notify = 1; 4424 4425 err_nh = NULL; 4426 list_for_each_entry(nh, &rt6_nh_list, next) { 4427 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4428 fib6_info_release(nh->fib6_info); 4429 4430 if (!err) { 4431 /* save reference to last route successfully inserted */ 4432 rt_last = nh->fib6_info; 4433 4434 /* save reference to first route for notification */ 4435 if (!rt_notif) 4436 rt_notif = nh->fib6_info; 4437 } 4438 4439 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 4440 nh->fib6_info = NULL; 4441 if (err) { 4442 if (replace && nhn) 4443 NL_SET_ERR_MSG_MOD(extack, 4444 "multipath route replace failed (check consistency of installed routes)"); 4445 err_nh = nh; 4446 goto add_errout; 4447 } 4448 4449 /* Because each route is added like a single route we remove 4450 * these flags after the first nexthop: if there is a collision, 4451 * we have already failed to add the first nexthop: 4452 * fib6_add_rt2node() has rejected it; when replacing, old 4453 * nexthops have been replaced by first new, the rest should 4454 * be added to it. 4455 */ 4456 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4457 NLM_F_REPLACE); 4458 nhn++; 4459 } 4460 4461 /* success ... tell user about new route */ 4462 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4463 goto cleanup; 4464 4465 add_errout: 4466 /* send notification for routes that were added so that 4467 * the delete notifications sent by ip6_route_del are 4468 * coherent 4469 */ 4470 if (rt_notif) 4471 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4472 4473 /* Delete routes that were already added */ 4474 list_for_each_entry(nh, &rt6_nh_list, next) { 4475 if (err_nh == nh) 4476 break; 4477 ip6_route_del(&nh->r_cfg, extack); 4478 } 4479 4480 cleanup: 4481 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4482 if (nh->fib6_info) 4483 fib6_info_release(nh->fib6_info); 4484 list_del(&nh->next); 4485 kfree(nh); 4486 } 4487 4488 return err; 4489 } 4490 4491 static int ip6_route_multipath_del(struct fib6_config *cfg, 4492 struct netlink_ext_ack *extack) 4493 { 4494 struct fib6_config r_cfg; 4495 struct rtnexthop *rtnh; 4496 int remaining; 4497 int attrlen; 4498 int err = 1, last_err = 0; 4499 4500 remaining = cfg->fc_mp_len; 4501 rtnh = (struct rtnexthop *)cfg->fc_mp; 4502 4503 /* Parse a Multipath Entry */ 4504 while (rtnh_ok(rtnh, remaining)) { 4505 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4506 if (rtnh->rtnh_ifindex) 4507 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4508 4509 attrlen = rtnh_attrlen(rtnh); 4510 if (attrlen > 0) { 4511 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4512 4513 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4514 if (nla) { 4515 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4516 r_cfg.fc_flags |= RTF_GATEWAY; 4517 } 4518 } 4519 err = ip6_route_del(&r_cfg, extack); 4520 if (err) 4521 last_err = err; 4522 4523 rtnh = rtnh_next(rtnh, &remaining); 4524 } 4525 4526 return last_err; 4527 } 4528 4529 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4530 struct netlink_ext_ack *extack) 4531 { 4532 struct fib6_config cfg; 4533 int err; 4534 4535 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4536 if (err < 0) 4537 return err; 4538 4539 if (cfg.fc_mp) 4540 return ip6_route_multipath_del(&cfg, extack); 4541 else { 4542 cfg.fc_delete_all_nh = 1; 4543 return ip6_route_del(&cfg, extack); 4544 } 4545 } 4546 4547 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4548 struct netlink_ext_ack *extack) 4549 { 4550 struct fib6_config cfg; 4551 int err; 4552 4553 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4554 if (err < 0) 4555 return err; 4556 4557 if (cfg.fc_metric == 0) 4558 cfg.fc_metric = IP6_RT_PRIO_USER; 4559 4560 if (cfg.fc_mp) 4561 return ip6_route_multipath_add(&cfg, extack); 4562 else 4563 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4564 } 4565 4566 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4567 { 4568 int nexthop_len = 0; 4569 4570 if (rt->fib6_nsiblings) { 4571 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4572 + NLA_ALIGN(sizeof(struct rtnexthop)) 4573 + nla_total_size(16) /* RTA_GATEWAY */ 4574 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); 4575 4576 nexthop_len *= rt->fib6_nsiblings; 4577 } 4578 4579 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4580 + nla_total_size(16) /* RTA_SRC */ 4581 + nla_total_size(16) /* RTA_DST */ 4582 + nla_total_size(16) /* RTA_GATEWAY */ 4583 + nla_total_size(16) /* RTA_PREFSRC */ 4584 + nla_total_size(4) /* RTA_TABLE */ 4585 + nla_total_size(4) /* RTA_IIF */ 4586 + nla_total_size(4) /* RTA_OIF */ 4587 + nla_total_size(4) /* RTA_PRIORITY */ 4588 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4589 + nla_total_size(sizeof(struct rta_cacheinfo)) 4590 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4591 + nla_total_size(1) /* RTA_PREF */ 4592 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) 4593 + nexthop_len; 4594 } 4595 4596 static int rt6_nexthop_info(struct sk_buff *skb, const struct fib6_nh *fib6_nh, 4597 unsigned int *flags, bool skip_oif) 4598 { 4599 if (fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4600 *flags |= RTNH_F_DEAD; 4601 4602 if (fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN) { 4603 *flags |= RTNH_F_LINKDOWN; 4604 4605 rcu_read_lock(); 4606 if (ip6_ignore_linkdown(fib6_nh->fib_nh_dev)) 4607 *flags |= RTNH_F_DEAD; 4608 rcu_read_unlock(); 4609 } 4610 4611 if (fib6_nh->fib_nh_has_gw) { 4612 if (nla_put_in6_addr(skb, RTA_GATEWAY, &fib6_nh->fib_nh_gw6) < 0) 4613 goto nla_put_failure; 4614 } 4615 4616 *flags |= (fib6_nh->fib_nh_flags & RTNH_F_ONLINK); 4617 if (fib6_nh->fib_nh_flags & RTNH_F_OFFLOAD) 4618 *flags |= RTNH_F_OFFLOAD; 4619 4620 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4621 if (!skip_oif && fib6_nh->fib_nh_dev && 4622 nla_put_u32(skb, RTA_OIF, fib6_nh->fib_nh_dev->ifindex)) 4623 goto nla_put_failure; 4624 4625 if (fib6_nh->fib_nh_lws && 4626 lwtunnel_fill_encap(skb, fib6_nh->fib_nh_lws) < 0) 4627 goto nla_put_failure; 4628 4629 return 0; 4630 4631 nla_put_failure: 4632 return -EMSGSIZE; 4633 } 4634 4635 /* add multipath next hop */ 4636 static int rt6_add_nexthop(struct sk_buff *skb, const struct fib6_nh *fib6_nh) 4637 { 4638 const struct net_device *dev = fib6_nh->fib_nh_dev; 4639 struct rtnexthop *rtnh; 4640 unsigned int flags = 0; 4641 4642 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4643 if (!rtnh) 4644 goto nla_put_failure; 4645 4646 rtnh->rtnh_hops = fib6_nh->fib_nh_weight - 1; 4647 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; 4648 4649 if (rt6_nexthop_info(skb, fib6_nh, &flags, true) < 0) 4650 goto nla_put_failure; 4651 4652 rtnh->rtnh_flags = flags; 4653 4654 /* length of rtnetlink header + attributes */ 4655 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4656 4657 return 0; 4658 4659 nla_put_failure: 4660 return -EMSGSIZE; 4661 } 4662 4663 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4664 struct fib6_info *rt, struct dst_entry *dst, 4665 struct in6_addr *dest, struct in6_addr *src, 4666 int iif, int type, u32 portid, u32 seq, 4667 unsigned int flags) 4668 { 4669 struct rt6_info *rt6 = (struct rt6_info *)dst; 4670 struct rt6key *rt6_dst, *rt6_src; 4671 u32 *pmetrics, table, rt6_flags; 4672 struct nlmsghdr *nlh; 4673 struct rtmsg *rtm; 4674 long expires = 0; 4675 4676 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4677 if (!nlh) 4678 return -EMSGSIZE; 4679 4680 if (rt6) { 4681 rt6_dst = &rt6->rt6i_dst; 4682 rt6_src = &rt6->rt6i_src; 4683 rt6_flags = rt6->rt6i_flags; 4684 } else { 4685 rt6_dst = &rt->fib6_dst; 4686 rt6_src = &rt->fib6_src; 4687 rt6_flags = rt->fib6_flags; 4688 } 4689 4690 rtm = nlmsg_data(nlh); 4691 rtm->rtm_family = AF_INET6; 4692 rtm->rtm_dst_len = rt6_dst->plen; 4693 rtm->rtm_src_len = rt6_src->plen; 4694 rtm->rtm_tos = 0; 4695 if (rt->fib6_table) 4696 table = rt->fib6_table->tb6_id; 4697 else 4698 table = RT6_TABLE_UNSPEC; 4699 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; 4700 if (nla_put_u32(skb, RTA_TABLE, table)) 4701 goto nla_put_failure; 4702 4703 rtm->rtm_type = rt->fib6_type; 4704 rtm->rtm_flags = 0; 4705 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4706 rtm->rtm_protocol = rt->fib6_protocol; 4707 4708 if (rt6_flags & RTF_CACHE) 4709 rtm->rtm_flags |= RTM_F_CLONED; 4710 4711 if (dest) { 4712 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4713 goto nla_put_failure; 4714 rtm->rtm_dst_len = 128; 4715 } else if (rtm->rtm_dst_len) 4716 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 4717 goto nla_put_failure; 4718 #ifdef CONFIG_IPV6_SUBTREES 4719 if (src) { 4720 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4721 goto nla_put_failure; 4722 rtm->rtm_src_len = 128; 4723 } else if (rtm->rtm_src_len && 4724 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 4725 goto nla_put_failure; 4726 #endif 4727 if (iif) { 4728 #ifdef CONFIG_IPV6_MROUTE 4729 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 4730 int err = ip6mr_get_route(net, skb, rtm, portid); 4731 4732 if (err == 0) 4733 return 0; 4734 if (err < 0) 4735 goto nla_put_failure; 4736 } else 4737 #endif 4738 if (nla_put_u32(skb, RTA_IIF, iif)) 4739 goto nla_put_failure; 4740 } else if (dest) { 4741 struct in6_addr saddr_buf; 4742 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4743 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4744 goto nla_put_failure; 4745 } 4746 4747 if (rt->fib6_prefsrc.plen) { 4748 struct in6_addr saddr_buf; 4749 saddr_buf = rt->fib6_prefsrc.addr; 4750 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4751 goto nla_put_failure; 4752 } 4753 4754 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4755 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4756 goto nla_put_failure; 4757 4758 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4759 goto nla_put_failure; 4760 4761 /* For multipath routes, walk the siblings list and add 4762 * each as a nexthop within RTA_MULTIPATH. 4763 */ 4764 if (rt6) { 4765 if (rt6_flags & RTF_GATEWAY && 4766 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 4767 goto nla_put_failure; 4768 4769 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 4770 goto nla_put_failure; 4771 } else if (rt->fib6_nsiblings) { 4772 struct fib6_info *sibling, *next_sibling; 4773 struct nlattr *mp; 4774 4775 mp = nla_nest_start(skb, RTA_MULTIPATH); 4776 if (!mp) 4777 goto nla_put_failure; 4778 4779 if (rt6_add_nexthop(skb, &rt->fib6_nh) < 0) 4780 goto nla_put_failure; 4781 4782 list_for_each_entry_safe(sibling, next_sibling, 4783 &rt->fib6_siblings, fib6_siblings) { 4784 if (rt6_add_nexthop(skb, &sibling->fib6_nh) < 0) 4785 goto nla_put_failure; 4786 } 4787 4788 nla_nest_end(skb, mp); 4789 } else { 4790 if (rt6_nexthop_info(skb, &rt->fib6_nh, &rtm->rtm_flags, 4791 false) < 0) 4792 goto nla_put_failure; 4793 } 4794 4795 if (rt6_flags & RTF_EXPIRES) { 4796 expires = dst ? dst->expires : rt->expires; 4797 expires -= jiffies; 4798 } 4799 4800 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4801 goto nla_put_failure; 4802 4803 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 4804 goto nla_put_failure; 4805 4806 4807 nlmsg_end(skb, nlh); 4808 return 0; 4809 4810 nla_put_failure: 4811 nlmsg_cancel(skb, nlh); 4812 return -EMSGSIZE; 4813 } 4814 4815 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 4816 const struct net_device *dev) 4817 { 4818 if (f6i->fib6_nh.fib_nh_dev == dev) 4819 return true; 4820 4821 if (f6i->fib6_nsiblings) { 4822 struct fib6_info *sibling, *next_sibling; 4823 4824 list_for_each_entry_safe(sibling, next_sibling, 4825 &f6i->fib6_siblings, fib6_siblings) { 4826 if (sibling->fib6_nh.fib_nh_dev == dev) 4827 return true; 4828 } 4829 } 4830 4831 return false; 4832 } 4833 4834 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4835 { 4836 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4837 struct fib_dump_filter *filter = &arg->filter; 4838 unsigned int flags = NLM_F_MULTI; 4839 struct net *net = arg->net; 4840 4841 if (rt == net->ipv6.fib6_null_entry) 4842 return 0; 4843 4844 if ((filter->flags & RTM_F_PREFIX) && 4845 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4846 /* success since this is not a prefix route */ 4847 return 1; 4848 } 4849 if (filter->filter_set) { 4850 if ((filter->rt_type && rt->fib6_type != filter->rt_type) || 4851 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 4852 (filter->protocol && rt->fib6_protocol != filter->protocol)) { 4853 return 1; 4854 } 4855 flags |= NLM_F_DUMP_FILTERED; 4856 } 4857 4858 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4859 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4860 arg->cb->nlh->nlmsg_seq, flags); 4861 } 4862 4863 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, 4864 const struct nlmsghdr *nlh, 4865 struct nlattr **tb, 4866 struct netlink_ext_ack *extack) 4867 { 4868 struct rtmsg *rtm; 4869 int i, err; 4870 4871 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 4872 NL_SET_ERR_MSG_MOD(extack, 4873 "Invalid header for get route request"); 4874 return -EINVAL; 4875 } 4876 4877 if (!netlink_strict_get_check(skb)) 4878 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, 4879 rtm_ipv6_policy, extack); 4880 4881 rtm = nlmsg_data(nlh); 4882 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || 4883 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || 4884 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || 4885 rtm->rtm_type) { 4886 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); 4887 return -EINVAL; 4888 } 4889 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { 4890 NL_SET_ERR_MSG_MOD(extack, 4891 "Invalid flags for get route request"); 4892 return -EINVAL; 4893 } 4894 4895 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 4896 rtm_ipv6_policy, extack); 4897 if (err) 4898 return err; 4899 4900 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 4901 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 4902 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); 4903 return -EINVAL; 4904 } 4905 4906 for (i = 0; i <= RTA_MAX; i++) { 4907 if (!tb[i]) 4908 continue; 4909 4910 switch (i) { 4911 case RTA_SRC: 4912 case RTA_DST: 4913 case RTA_IIF: 4914 case RTA_OIF: 4915 case RTA_MARK: 4916 case RTA_UID: 4917 case RTA_SPORT: 4918 case RTA_DPORT: 4919 case RTA_IP_PROTO: 4920 break; 4921 default: 4922 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); 4923 return -EINVAL; 4924 } 4925 } 4926 4927 return 0; 4928 } 4929 4930 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4931 struct netlink_ext_ack *extack) 4932 { 4933 struct net *net = sock_net(in_skb->sk); 4934 struct nlattr *tb[RTA_MAX+1]; 4935 int err, iif = 0, oif = 0; 4936 struct fib6_info *from; 4937 struct dst_entry *dst; 4938 struct rt6_info *rt; 4939 struct sk_buff *skb; 4940 struct rtmsg *rtm; 4941 struct flowi6 fl6 = {}; 4942 bool fibmatch; 4943 4944 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 4945 if (err < 0) 4946 goto errout; 4947 4948 err = -EINVAL; 4949 rtm = nlmsg_data(nlh); 4950 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4951 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4952 4953 if (tb[RTA_SRC]) { 4954 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4955 goto errout; 4956 4957 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4958 } 4959 4960 if (tb[RTA_DST]) { 4961 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4962 goto errout; 4963 4964 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4965 } 4966 4967 if (tb[RTA_IIF]) 4968 iif = nla_get_u32(tb[RTA_IIF]); 4969 4970 if (tb[RTA_OIF]) 4971 oif = nla_get_u32(tb[RTA_OIF]); 4972 4973 if (tb[RTA_MARK]) 4974 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4975 4976 if (tb[RTA_UID]) 4977 fl6.flowi6_uid = make_kuid(current_user_ns(), 4978 nla_get_u32(tb[RTA_UID])); 4979 else 4980 fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); 4981 4982 if (tb[RTA_SPORT]) 4983 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 4984 4985 if (tb[RTA_DPORT]) 4986 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 4987 4988 if (tb[RTA_IP_PROTO]) { 4989 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 4990 &fl6.flowi6_proto, AF_INET6, 4991 extack); 4992 if (err) 4993 goto errout; 4994 } 4995 4996 if (iif) { 4997 struct net_device *dev; 4998 int flags = 0; 4999 5000 rcu_read_lock(); 5001 5002 dev = dev_get_by_index_rcu(net, iif); 5003 if (!dev) { 5004 rcu_read_unlock(); 5005 err = -ENODEV; 5006 goto errout; 5007 } 5008 5009 fl6.flowi6_iif = iif; 5010 5011 if (!ipv6_addr_any(&fl6.saddr)) 5012 flags |= RT6_LOOKUP_F_HAS_SADDR; 5013 5014 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 5015 5016 rcu_read_unlock(); 5017 } else { 5018 fl6.flowi6_oif = oif; 5019 5020 dst = ip6_route_output(net, NULL, &fl6); 5021 } 5022 5023 5024 rt = container_of(dst, struct rt6_info, dst); 5025 if (rt->dst.error) { 5026 err = rt->dst.error; 5027 ip6_rt_put(rt); 5028 goto errout; 5029 } 5030 5031 if (rt == net->ipv6.ip6_null_entry) { 5032 err = rt->dst.error; 5033 ip6_rt_put(rt); 5034 goto errout; 5035 } 5036 5037 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 5038 if (!skb) { 5039 ip6_rt_put(rt); 5040 err = -ENOBUFS; 5041 goto errout; 5042 } 5043 5044 skb_dst_set(skb, &rt->dst); 5045 5046 rcu_read_lock(); 5047 from = rcu_dereference(rt->from); 5048 5049 if (fibmatch) 5050 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 5051 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 5052 nlh->nlmsg_seq, 0); 5053 else 5054 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 5055 &fl6.saddr, iif, RTM_NEWROUTE, 5056 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 5057 0); 5058 rcu_read_unlock(); 5059 5060 if (err < 0) { 5061 kfree_skb(skb); 5062 goto errout; 5063 } 5064 5065 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 5066 errout: 5067 return err; 5068 } 5069 5070 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 5071 unsigned int nlm_flags) 5072 { 5073 struct sk_buff *skb; 5074 struct net *net = info->nl_net; 5075 u32 seq; 5076 int err; 5077 5078 err = -ENOBUFS; 5079 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 5080 5081 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 5082 if (!skb) 5083 goto errout; 5084 5085 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 5086 event, info->portid, seq, nlm_flags); 5087 if (err < 0) { 5088 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 5089 WARN_ON(err == -EMSGSIZE); 5090 kfree_skb(skb); 5091 goto errout; 5092 } 5093 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 5094 info->nlh, gfp_any()); 5095 return; 5096 errout: 5097 if (err < 0) 5098 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 5099 } 5100 5101 static int ip6_route_dev_notify(struct notifier_block *this, 5102 unsigned long event, void *ptr) 5103 { 5104 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 5105 struct net *net = dev_net(dev); 5106 5107 if (!(dev->flags & IFF_LOOPBACK)) 5108 return NOTIFY_OK; 5109 5110 if (event == NETDEV_REGISTER) { 5111 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; 5112 net->ipv6.ip6_null_entry->dst.dev = dev; 5113 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 5114 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5115 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 5116 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 5117 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 5118 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 5119 #endif 5120 } else if (event == NETDEV_UNREGISTER && 5121 dev->reg_state != NETREG_UNREGISTERED) { 5122 /* NETDEV_UNREGISTER could be fired for multiple times by 5123 * netdev_wait_allrefs(). Make sure we only call this once. 5124 */ 5125 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 5126 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5127 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 5128 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 5129 #endif 5130 } 5131 5132 return NOTIFY_OK; 5133 } 5134 5135 /* 5136 * /proc 5137 */ 5138 5139 #ifdef CONFIG_PROC_FS 5140 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 5141 { 5142 struct net *net = (struct net *)seq->private; 5143 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 5144 net->ipv6.rt6_stats->fib_nodes, 5145 net->ipv6.rt6_stats->fib_route_nodes, 5146 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 5147 net->ipv6.rt6_stats->fib_rt_entries, 5148 net->ipv6.rt6_stats->fib_rt_cache, 5149 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 5150 net->ipv6.rt6_stats->fib_discarded_routes); 5151 5152 return 0; 5153 } 5154 #endif /* CONFIG_PROC_FS */ 5155 5156 #ifdef CONFIG_SYSCTL 5157 5158 static 5159 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 5160 void __user *buffer, size_t *lenp, loff_t *ppos) 5161 { 5162 struct net *net; 5163 int delay; 5164 int ret; 5165 if (!write) 5166 return -EINVAL; 5167 5168 net = (struct net *)ctl->extra1; 5169 delay = net->ipv6.sysctl.flush_delay; 5170 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 5171 if (ret) 5172 return ret; 5173 5174 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); 5175 return 0; 5176 } 5177 5178 static int zero; 5179 static int one = 1; 5180 5181 static struct ctl_table ipv6_route_table_template[] = { 5182 { 5183 .procname = "flush", 5184 .data = &init_net.ipv6.sysctl.flush_delay, 5185 .maxlen = sizeof(int), 5186 .mode = 0200, 5187 .proc_handler = ipv6_sysctl_rtcache_flush 5188 }, 5189 { 5190 .procname = "gc_thresh", 5191 .data = &ip6_dst_ops_template.gc_thresh, 5192 .maxlen = sizeof(int), 5193 .mode = 0644, 5194 .proc_handler = proc_dointvec, 5195 }, 5196 { 5197 .procname = "max_size", 5198 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5199 .maxlen = sizeof(int), 5200 .mode = 0644, 5201 .proc_handler = proc_dointvec, 5202 }, 5203 { 5204 .procname = "gc_min_interval", 5205 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5206 .maxlen = sizeof(int), 5207 .mode = 0644, 5208 .proc_handler = proc_dointvec_jiffies, 5209 }, 5210 { 5211 .procname = "gc_timeout", 5212 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5213 .maxlen = sizeof(int), 5214 .mode = 0644, 5215 .proc_handler = proc_dointvec_jiffies, 5216 }, 5217 { 5218 .procname = "gc_interval", 5219 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5220 .maxlen = sizeof(int), 5221 .mode = 0644, 5222 .proc_handler = proc_dointvec_jiffies, 5223 }, 5224 { 5225 .procname = "gc_elasticity", 5226 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5227 .maxlen = sizeof(int), 5228 .mode = 0644, 5229 .proc_handler = proc_dointvec, 5230 }, 5231 { 5232 .procname = "mtu_expires", 5233 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5234 .maxlen = sizeof(int), 5235 .mode = 0644, 5236 .proc_handler = proc_dointvec_jiffies, 5237 }, 5238 { 5239 .procname = "min_adv_mss", 5240 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5241 .maxlen = sizeof(int), 5242 .mode = 0644, 5243 .proc_handler = proc_dointvec, 5244 }, 5245 { 5246 .procname = "gc_min_interval_ms", 5247 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5248 .maxlen = sizeof(int), 5249 .mode = 0644, 5250 .proc_handler = proc_dointvec_ms_jiffies, 5251 }, 5252 { 5253 .procname = "skip_notify_on_dev_down", 5254 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, 5255 .maxlen = sizeof(int), 5256 .mode = 0644, 5257 .proc_handler = proc_dointvec, 5258 .extra1 = &zero, 5259 .extra2 = &one, 5260 }, 5261 { } 5262 }; 5263 5264 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5265 { 5266 struct ctl_table *table; 5267 5268 table = kmemdup(ipv6_route_table_template, 5269 sizeof(ipv6_route_table_template), 5270 GFP_KERNEL); 5271 5272 if (table) { 5273 table[0].data = &net->ipv6.sysctl.flush_delay; 5274 table[0].extra1 = net; 5275 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5276 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5277 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5278 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5279 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5280 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5281 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5282 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5283 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5284 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; 5285 5286 /* Don't export sysctls to unprivileged users */ 5287 if (net->user_ns != &init_user_ns) 5288 table[0].procname = NULL; 5289 } 5290 5291 return table; 5292 } 5293 #endif 5294 5295 static int __net_init ip6_route_net_init(struct net *net) 5296 { 5297 int ret = -ENOMEM; 5298 5299 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5300 sizeof(net->ipv6.ip6_dst_ops)); 5301 5302 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5303 goto out_ip6_dst_ops; 5304 5305 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5306 sizeof(*net->ipv6.fib6_null_entry), 5307 GFP_KERNEL); 5308 if (!net->ipv6.fib6_null_entry) 5309 goto out_ip6_dst_entries; 5310 5311 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5312 sizeof(*net->ipv6.ip6_null_entry), 5313 GFP_KERNEL); 5314 if (!net->ipv6.ip6_null_entry) 5315 goto out_fib6_null_entry; 5316 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5317 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5318 ip6_template_metrics, true); 5319 5320 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5321 net->ipv6.fib6_has_custom_rules = false; 5322 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5323 sizeof(*net->ipv6.ip6_prohibit_entry), 5324 GFP_KERNEL); 5325 if (!net->ipv6.ip6_prohibit_entry) 5326 goto out_ip6_null_entry; 5327 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5328 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5329 ip6_template_metrics, true); 5330 5331 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5332 sizeof(*net->ipv6.ip6_blk_hole_entry), 5333 GFP_KERNEL); 5334 if (!net->ipv6.ip6_blk_hole_entry) 5335 goto out_ip6_prohibit_entry; 5336 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5337 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5338 ip6_template_metrics, true); 5339 #endif 5340 5341 net->ipv6.sysctl.flush_delay = 0; 5342 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5343 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5344 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5345 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5346 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5347 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5348 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5349 net->ipv6.sysctl.skip_notify_on_dev_down = 0; 5350 5351 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5352 5353 ret = 0; 5354 out: 5355 return ret; 5356 5357 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5358 out_ip6_prohibit_entry: 5359 kfree(net->ipv6.ip6_prohibit_entry); 5360 out_ip6_null_entry: 5361 kfree(net->ipv6.ip6_null_entry); 5362 #endif 5363 out_fib6_null_entry: 5364 kfree(net->ipv6.fib6_null_entry); 5365 out_ip6_dst_entries: 5366 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5367 out_ip6_dst_ops: 5368 goto out; 5369 } 5370 5371 static void __net_exit ip6_route_net_exit(struct net *net) 5372 { 5373 kfree(net->ipv6.fib6_null_entry); 5374 kfree(net->ipv6.ip6_null_entry); 5375 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5376 kfree(net->ipv6.ip6_prohibit_entry); 5377 kfree(net->ipv6.ip6_blk_hole_entry); 5378 #endif 5379 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5380 } 5381 5382 static int __net_init ip6_route_net_init_late(struct net *net) 5383 { 5384 #ifdef CONFIG_PROC_FS 5385 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5386 sizeof(struct ipv6_route_iter)); 5387 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5388 rt6_stats_seq_show, NULL); 5389 #endif 5390 return 0; 5391 } 5392 5393 static void __net_exit ip6_route_net_exit_late(struct net *net) 5394 { 5395 #ifdef CONFIG_PROC_FS 5396 remove_proc_entry("ipv6_route", net->proc_net); 5397 remove_proc_entry("rt6_stats", net->proc_net); 5398 #endif 5399 } 5400 5401 static struct pernet_operations ip6_route_net_ops = { 5402 .init = ip6_route_net_init, 5403 .exit = ip6_route_net_exit, 5404 }; 5405 5406 static int __net_init ipv6_inetpeer_init(struct net *net) 5407 { 5408 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5409 5410 if (!bp) 5411 return -ENOMEM; 5412 inet_peer_base_init(bp); 5413 net->ipv6.peers = bp; 5414 return 0; 5415 } 5416 5417 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5418 { 5419 struct inet_peer_base *bp = net->ipv6.peers; 5420 5421 net->ipv6.peers = NULL; 5422 inetpeer_invalidate_tree(bp); 5423 kfree(bp); 5424 } 5425 5426 static struct pernet_operations ipv6_inetpeer_ops = { 5427 .init = ipv6_inetpeer_init, 5428 .exit = ipv6_inetpeer_exit, 5429 }; 5430 5431 static struct pernet_operations ip6_route_net_late_ops = { 5432 .init = ip6_route_net_init_late, 5433 .exit = ip6_route_net_exit_late, 5434 }; 5435 5436 static struct notifier_block ip6_route_dev_notifier = { 5437 .notifier_call = ip6_route_dev_notify, 5438 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5439 }; 5440 5441 void __init ip6_route_init_special_entries(void) 5442 { 5443 /* Registering of the loopback is done before this portion of code, 5444 * the loopback reference in rt6_info will not be taken, do it 5445 * manually for init_net */ 5446 init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; 5447 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5448 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5449 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5450 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5451 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5452 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5453 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5454 #endif 5455 } 5456 5457 int __init ip6_route_init(void) 5458 { 5459 int ret; 5460 int cpu; 5461 5462 ret = -ENOMEM; 5463 ip6_dst_ops_template.kmem_cachep = 5464 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5465 SLAB_HWCACHE_ALIGN, NULL); 5466 if (!ip6_dst_ops_template.kmem_cachep) 5467 goto out; 5468 5469 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5470 if (ret) 5471 goto out_kmem_cache; 5472 5473 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5474 if (ret) 5475 goto out_dst_entries; 5476 5477 ret = register_pernet_subsys(&ip6_route_net_ops); 5478 if (ret) 5479 goto out_register_inetpeer; 5480 5481 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5482 5483 ret = fib6_init(); 5484 if (ret) 5485 goto out_register_subsys; 5486 5487 ret = xfrm6_init(); 5488 if (ret) 5489 goto out_fib6_init; 5490 5491 ret = fib6_rules_init(); 5492 if (ret) 5493 goto xfrm6_init; 5494 5495 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5496 if (ret) 5497 goto fib6_rules_init; 5498 5499 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5500 inet6_rtm_newroute, NULL, 0); 5501 if (ret < 0) 5502 goto out_register_late_subsys; 5503 5504 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5505 inet6_rtm_delroute, NULL, 0); 5506 if (ret < 0) 5507 goto out_register_late_subsys; 5508 5509 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5510 inet6_rtm_getroute, NULL, 5511 RTNL_FLAG_DOIT_UNLOCKED); 5512 if (ret < 0) 5513 goto out_register_late_subsys; 5514 5515 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5516 if (ret) 5517 goto out_register_late_subsys; 5518 5519 for_each_possible_cpu(cpu) { 5520 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 5521 5522 INIT_LIST_HEAD(&ul->head); 5523 spin_lock_init(&ul->lock); 5524 } 5525 5526 out: 5527 return ret; 5528 5529 out_register_late_subsys: 5530 rtnl_unregister_all(PF_INET6); 5531 unregister_pernet_subsys(&ip6_route_net_late_ops); 5532 fib6_rules_init: 5533 fib6_rules_cleanup(); 5534 xfrm6_init: 5535 xfrm6_fini(); 5536 out_fib6_init: 5537 fib6_gc_cleanup(); 5538 out_register_subsys: 5539 unregister_pernet_subsys(&ip6_route_net_ops); 5540 out_register_inetpeer: 5541 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5542 out_dst_entries: 5543 dst_entries_destroy(&ip6_dst_blackhole_ops); 5544 out_kmem_cache: 5545 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5546 goto out; 5547 } 5548 5549 void ip6_route_cleanup(void) 5550 { 5551 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5552 unregister_pernet_subsys(&ip6_route_net_late_ops); 5553 fib6_rules_cleanup(); 5554 xfrm6_fini(); 5555 fib6_gc_cleanup(); 5556 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5557 unregister_pernet_subsys(&ip6_route_net_ops); 5558 dst_entries_destroy(&ip6_dst_blackhole_ops); 5559 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5560 } 5561