/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
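
/* Side note (illustration): the flush above is what allows a netdevice to
 * unregister while sockets still hold uncached RTF_CACHE dsts pointing at
 * it. Each such dst is re-homed onto the namespace's loopback device, so
 * the dying device's reference count can eventually reach zero.
 */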

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
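
/* Worked example (hypothetical numbers): with three equal-weight nexthops
 * whose nh_upper_bound values split the hash space at, say, N, 2N and 3N,
 * a flow whose mp_hash falls in (N, 2N] fails the first bound check and is
 * served by the second sibling, provided rt6_score_route() does not veto
 * it; flows therefore stick to one path as long as the sibling set is
 * stable.
 */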

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
			      idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
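
/* Timing sketch (illustrative): a gateway whose neighbour entry is not in
 * a NUD_VALID state is probed at most once per rtr_probe_interval (60
 * seconds by default), tracked via neigh->updated when a neighbour entry
 * exists and via rt->last_probe when it does not. The NS itself is sent
 * from a workqueue, so the transmission happens outside this RCU/BH
 * protected lookup path.
 */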

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
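
/* Scoring sketch (illustrative): an interface match contributes 2, and
 * with CONFIG_IPV6_ROUTER_PREF the decoded router preference is shifted
 * left by 2, so preference dominates the interface bit when scores are
 * compared in find_match(). Negative values returned here are not scores
 * but the RT6_NUD_FAIL_* verdicts from rt6_check_neigh().
 */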

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
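
/* For reference (RFC 4191 Route Information option, descriptive): the
 * option length is in units of 8 octets; length 1 carries no prefix bytes,
 * length 2 carries 8 bytes of prefix and length 3 the full 16. The sanity
 * checks at the top of rt6_route_rcv() therefore demand length >= 1 for
 * any non-zero prefix_len and length >= 2 once prefix_len exceeds 64, and
 * only the length == 3 case may use rinfo->prefix in place.
 */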

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* the last case is netif_is_l3_master(dev) being true, in
		 * which case dev is returned unchanged
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		goto fallback;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(rt);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, rt);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
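
/* Design note (illustration): the percpu cache is populated without the
 * table lock. rt6_make_pcpu_route() publishes the clone with
 * cmpxchg(p, NULL, pcpu_rt); each CPU writes only its own slot, and it
 * does so with BHs disabled (see ip6_pol_route()), so the slot must still
 * be NULL here; that is what the BUG_ON(prev) asserts.
 */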

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
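
/* Keying example (illustrative): for an ordinary route, exceptions are
 * keyed on the destination alone, so a PMTU exception for 2001:db8::1
 * always lands in the bucket selected by the jhash of that daddr. Only
 * with CONFIG_IPV6_SUBTREES, and only for source-specific routes, is the
 * saddr folded in as a second jhash round before hash_32() picks the
 * bucket.
 */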

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
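
/* Worked example (hypothetical numbers): with no fib6_pmtu override, a
 * device mtu6 of 1500 and an lwtunnel encap that needs 20 bytes of
 * headroom, fib6_mtu() returns min(1500, IP6_MAX_MTU) - 20 = 1480, i.e.
 * the MTU left for payload after the tunnel header.
 */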

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
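
/* Example of the allowed-update rule above (hypothetical numbers): an
 * exception with PMTU 1400 may always be lowered to a new link MTU of
 * 1280 (a decrease). Raising it toward a new link MTU of 1500 is only
 * allowed when its current PMTU equals the local mtu6, i.e. when this
 * hop itself was the path bottleneck; otherwise the update is skipped.
 */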

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others still hold references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
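
/* Fallback sketch (illustrative): when the deepest matching node only
 * yields fib6_null_entry, fib6_backtrack() climbs toward the root
 * (descending into source subtrees along the way) and the select is
 * redone. Once backtracking is exhausted, RT6_LOOKUP_F_REACHABLE is
 * dropped and the search restarts from the saved node, so a router with
 * unknown reachability still beats no route at all.
 */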

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
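
/* Summary of the three exits above (descriptive): a hit in the exception
 * table returns the cached RTF_CACHE clone; FLOWI_FLAG_KNOWN_NH without a
 * gateway returns a freshly built clone tracked only on rt6_uncached_list;
 * every other case returns (creating on first use) the percpu copy of the
 * fib entry.
 */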

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
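
/* Policy example (illustrative): with net.ipv6.fib_multipath_hash_policy
 * set to 0 the hash covers only the L3 fields, so all flows between two
 * hosts share one sibling; with policy 1 the L4 ports are folded in and
 * individual TCP/UDP flows can spread across siblings. For ICMPv6 errors,
 * ip6_multipath_l3_keys() hashes the inner (offending) header so the
 * error follows the same path as the flow that triggered it.
 */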
l3mdev_link_scope_lookup(net, fl6); 2090 if (dst) 2091 return dst; 2092 } 2093 2094 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2095 2096 any_src = ipv6_addr_any(&fl6->saddr); 2097 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2098 (fl6->flowi6_oif && any_src)) 2099 flags |= RT6_LOOKUP_F_IFACE; 2100 2101 if (!any_src) 2102 flags |= RT6_LOOKUP_F_HAS_SADDR; 2103 else if (sk) 2104 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2105 2106 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2107 } 2108 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2109 2110 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2111 { 2112 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2113 struct net_device *loopback_dev = net->loopback_dev; 2114 struct dst_entry *new = NULL; 2115 2116 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2117 DST_OBSOLETE_DEAD, 0); 2118 if (rt) { 2119 rt6_info_init(rt); 2120 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2121 2122 new = &rt->dst; 2123 new->__use = 1; 2124 new->input = dst_discard; 2125 new->output = dst_discard_out; 2126 2127 dst_copy_metrics(new, &ort->dst); 2128 2129 rt->rt6i_idev = in6_dev_get(loopback_dev); 2130 rt->rt6i_gateway = ort->rt6i_gateway; 2131 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2132 2133 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2134 #ifdef CONFIG_IPV6_SUBTREES 2135 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2136 #endif 2137 } 2138 2139 dst_release(dst_orig); 2140 return new ? new : ERR_PTR(-ENOMEM); 2141 } 2142 2143 /* 2144 * Destination cache support functions 2145 */ 2146 2147 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2148 { 2149 u32 rt_cookie = 0; 2150 2151 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2152 return false; 2153 2154 if (fib6_check_expired(f6i)) 2155 return false; 2156 2157 return true; 2158 } 2159 2160 static struct dst_entry *rt6_check(struct rt6_info *rt, 2161 struct fib6_info *from, 2162 u32 cookie) 2163 { 2164 u32 rt_cookie = 0; 2165 2166 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2167 rt_cookie != cookie) 2168 return NULL; 2169 2170 if (rt6_check_expired(rt)) 2171 return NULL; 2172 2173 return &rt->dst; 2174 } 2175 2176 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2177 struct fib6_info *from, 2178 u32 cookie) 2179 { 2180 if (!__rt6_check_expired(rt) && 2181 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2182 fib6_check(from, cookie)) 2183 return &rt->dst; 2184 else 2185 return NULL; 2186 } 2187 2188 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2189 { 2190 struct dst_entry *dst_ret; 2191 struct fib6_info *from; 2192 struct rt6_info *rt; 2193 2194 rt = container_of(dst, struct rt6_info, dst); 2195 2196 rcu_read_lock(); 2197 2198 /* All IPV6 dsts are created with ->obsolete set to the value 2199 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2200 * into this function always. 
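 *
 * A minimal consumer-side sketch (illustrative, not part of this
 * file): a socket that cached a dst revalidates it against the
 * cookie saved when the dst was stored, e.g.
 *
 *	dst = __sk_dst_get(sk);
 *	if (dst && dst->obsolete &&
 *	    !dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
 *		sk_dst_reset(sk);
 *
 * Any fib6 tree change bumps the serial number behind the cookie,
 * so the check fails and the caller redoes the route lookup (see
 * ip6_sk_update_pmtu() below for the in-tree variant of this
 * pattern).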
2201 */ 2202 2203 from = rcu_dereference(rt->from); 2204 2205 if (from && (rt->rt6i_flags & RTF_PCPU || 2206 unlikely(!list_empty(&rt->rt6i_uncached)))) 2207 dst_ret = rt6_dst_from_check(rt, from, cookie); 2208 else 2209 dst_ret = rt6_check(rt, from, cookie); 2210 2211 rcu_read_unlock(); 2212 2213 return dst_ret; 2214 } 2215 2216 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2217 { 2218 struct rt6_info *rt = (struct rt6_info *) dst; 2219 2220 if (rt) { 2221 if (rt->rt6i_flags & RTF_CACHE) { 2222 rcu_read_lock(); 2223 if (rt6_check_expired(rt)) { 2224 rt6_remove_exception_rt(rt); 2225 dst = NULL; 2226 } 2227 rcu_read_unlock(); 2228 } else { 2229 dst_release(dst); 2230 dst = NULL; 2231 } 2232 } 2233 return dst; 2234 } 2235 2236 static void ip6_link_failure(struct sk_buff *skb) 2237 { 2238 struct rt6_info *rt; 2239 2240 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2241 2242 rt = (struct rt6_info *) skb_dst(skb); 2243 if (rt) { 2244 rcu_read_lock(); 2245 if (rt->rt6i_flags & RTF_CACHE) { 2246 rt6_remove_exception_rt(rt); 2247 } else { 2248 struct fib6_info *from; 2249 struct fib6_node *fn; 2250 2251 from = rcu_dereference(rt->from); 2252 if (from) { 2253 fn = rcu_dereference(from->fib6_node); 2254 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2255 fn->fn_sernum = -1; 2256 } 2257 } 2258 rcu_read_unlock(); 2259 } 2260 } 2261 2262 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2263 { 2264 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2265 struct fib6_info *from; 2266 2267 rcu_read_lock(); 2268 from = rcu_dereference(rt0->from); 2269 if (from) 2270 rt0->dst.expires = from->expires; 2271 rcu_read_unlock(); 2272 } 2273 2274 dst_set_expires(&rt0->dst, timeout); 2275 rt0->rt6i_flags |= RTF_EXPIRES; 2276 } 2277 2278 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2279 { 2280 struct net *net = dev_net(rt->dst.dev); 2281 2282 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2283 rt->rt6i_flags |= RTF_MODIFIED; 2284 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2285 } 2286 2287 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2288 { 2289 return !(rt->rt6i_flags & RTF_CACHE) && 2290 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2291 } 2292 2293 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2294 const struct ipv6hdr *iph, u32 mtu) 2295 { 2296 const struct in6_addr *daddr, *saddr; 2297 struct rt6_info *rt6 = (struct rt6_info *)dst; 2298 2299 if (dst_metric_locked(dst, RTAX_MTU)) 2300 return; 2301 2302 if (iph) { 2303 daddr = &iph->daddr; 2304 saddr = &iph->saddr; 2305 } else if (sk) { 2306 daddr = &sk->sk_v6_daddr; 2307 saddr = &inet6_sk(sk)->saddr; 2308 } else { 2309 daddr = NULL; 2310 saddr = NULL; 2311 } 2312 dst_confirm_neigh(dst, daddr); 2313 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2314 if (mtu >= dst_mtu(dst)) 2315 return; 2316 2317 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2318 rt6_do_update_pmtu(rt6, mtu); 2319 /* update rt6_ex->stamp for cache */ 2320 if (rt6->rt6i_flags & RTF_CACHE) 2321 rt6_update_exception_stamp_rt(rt6); 2322 } else if (daddr) { 2323 struct fib6_info *from; 2324 struct rt6_info *nrt6; 2325 2326 rcu_read_lock(); 2327 from = rcu_dereference(rt6->from); 2328 if (!from) { 2329 rcu_read_unlock(); 2330 return; 2331 } 2332 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2333 if (nrt6) { 2334 rt6_do_update_pmtu(nrt6, mtu); 2335 if (rt6_insert_exception(nrt6, from)) 2336 dst_release_immediate(&nrt6->dst); 2337 } 2338 rcu_read_unlock(); 2339 } 2340 } 2341 2342 
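/* ip6_rt_update_pmtu() below is installed as the ->update_pmtu dst_ops
 * hook for IPv6 dsts. A sketch of the assumed-typical trigger, e.g. an
 * ICMPv6 "Packet Too Big" handler holding a dst:
 *
 *	dst->ops->update_pmtu(dst, sk, skb, mtu);
 *
 * __ip6_rt_update_pmtu() above then either lowers the MTU of an
 * already-cached clone in place or allocates a new RTF_CACHE clone and
 * inserts it as an exception route for this destination.
 */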
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2343 struct sk_buff *skb, u32 mtu) 2344 { 2345 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2346 } 2347 2348 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2349 int oif, u32 mark, kuid_t uid) 2350 { 2351 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2352 struct dst_entry *dst; 2353 struct flowi6 fl6 = { 2354 .flowi6_oif = oif, 2355 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2356 .daddr = iph->daddr, 2357 .saddr = iph->saddr, 2358 .flowlabel = ip6_flowinfo(iph), 2359 .flowi6_uid = uid, 2360 }; 2361 2362 dst = ip6_route_output(net, NULL, &fl6); 2363 if (!dst->error) 2364 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2365 dst_release(dst); 2366 } 2367 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2368 2369 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2370 { 2371 int oif = sk->sk_bound_dev_if; 2372 struct dst_entry *dst; 2373 2374 if (!oif && skb->dev) 2375 oif = l3mdev_master_ifindex(skb->dev); 2376 2377 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2378 2379 dst = __sk_dst_get(sk); 2380 if (!dst || !dst->obsolete || 2381 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2382 return; 2383 2384 bh_lock_sock(sk); 2385 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2386 ip6_datagram_dst_update(sk, false); 2387 bh_unlock_sock(sk); 2388 } 2389 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2390 2391 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2392 const struct flowi6 *fl6) 2393 { 2394 #ifdef CONFIG_IPV6_SUBTREES 2395 struct ipv6_pinfo *np = inet6_sk(sk); 2396 #endif 2397 2398 ip6_dst_store(sk, dst, 2399 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2400 &sk->sk_v6_daddr : NULL, 2401 #ifdef CONFIG_IPV6_SUBTREES 2402 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2403 &np->saddr : 2404 #endif 2405 NULL); 2406 } 2407 2408 /* Handle redirects */ 2409 struct ip6rd_flowi { 2410 struct flowi6 fl6; 2411 struct in6_addr gateway; 2412 }; 2413 2414 static struct rt6_info *__ip6_route_redirect(struct net *net, 2415 struct fib6_table *table, 2416 struct flowi6 *fl6, 2417 const struct sk_buff *skb, 2418 int flags) 2419 { 2420 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2421 struct rt6_info *ret = NULL, *rt_cache; 2422 struct fib6_info *rt; 2423 struct fib6_node *fn; 2424 2425 /* Get the "current" route for this destination and 2426 * check if the redirect has come from appropriate router. 2427 * 2428 * RFC 4861 specifies that redirects should only be 2429 * accepted if they come from the nexthop to the target. 2430 * Due to the way the routes are chosen, this notion 2431 * is a bit fuzzy and one might need to check all possible 2432 * routes. 2433 */ 2434 2435 rcu_read_lock(); 2436 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2437 restart: 2438 for_each_fib6_node_rt_rcu(fn) { 2439 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2440 continue; 2441 if (fib6_check_expired(rt)) 2442 continue; 2443 if (rt->fib6_flags & RTF_REJECT) 2444 break; 2445 if (!(rt->fib6_flags & RTF_GATEWAY)) 2446 continue; 2447 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2448 continue; 2449 /* rt_cache's gateway might be different from its 'parent' 2450 * in the case of an ip redirect. 2451 * So we keep searching in the exception table if the gateway 2452 * is different. 
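 *
 * That is: a received redirect may have installed an exception route
 * whose rt6i_gateway differs from the parent fib6_info's nexthop, so
 * a gateway mismatch on the parent alone is not enough to rule this
 * destination out.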
2453 */ 2454 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2455 rt_cache = rt6_find_cached_rt(rt, 2456 &fl6->daddr, 2457 &fl6->saddr); 2458 if (rt_cache && 2459 ipv6_addr_equal(&rdfl->gateway, 2460 &rt_cache->rt6i_gateway)) { 2461 ret = rt_cache; 2462 break; 2463 } 2464 continue; 2465 } 2466 break; 2467 } 2468 2469 if (!rt) 2470 rt = net->ipv6.fib6_null_entry; 2471 else if (rt->fib6_flags & RTF_REJECT) { 2472 ret = net->ipv6.ip6_null_entry; 2473 goto out; 2474 } 2475 2476 if (rt == net->ipv6.fib6_null_entry) { 2477 fn = fib6_backtrack(fn, &fl6->saddr); 2478 if (fn) 2479 goto restart; 2480 } 2481 2482 out: 2483 if (ret) 2484 ip6_hold_safe(net, &ret, true); 2485 else 2486 ret = ip6_create_rt_rcu(rt); 2487 2488 rcu_read_unlock(); 2489 2490 trace_fib6_table_lookup(net, rt, table, fl6); 2491 return ret; 2492 }; 2493 2494 static struct dst_entry *ip6_route_redirect(struct net *net, 2495 const struct flowi6 *fl6, 2496 const struct sk_buff *skb, 2497 const struct in6_addr *gateway) 2498 { 2499 int flags = RT6_LOOKUP_F_HAS_SADDR; 2500 struct ip6rd_flowi rdfl; 2501 2502 rdfl.fl6 = *fl6; 2503 rdfl.gateway = *gateway; 2504 2505 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2506 flags, __ip6_route_redirect); 2507 } 2508 2509 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2510 kuid_t uid) 2511 { 2512 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2513 struct dst_entry *dst; 2514 struct flowi6 fl6 = { 2515 .flowi6_iif = LOOPBACK_IFINDEX, 2516 .flowi6_oif = oif, 2517 .flowi6_mark = mark, 2518 .daddr = iph->daddr, 2519 .saddr = iph->saddr, 2520 .flowlabel = ip6_flowinfo(iph), 2521 .flowi6_uid = uid, 2522 }; 2523 2524 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2525 rt6_do_redirect(dst, NULL, skb); 2526 dst_release(dst); 2527 } 2528 EXPORT_SYMBOL_GPL(ip6_redirect); 2529 2530 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 2531 { 2532 const struct ipv6hdr *iph = ipv6_hdr(skb); 2533 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2534 struct dst_entry *dst; 2535 struct flowi6 fl6 = { 2536 .flowi6_iif = LOOPBACK_IFINDEX, 2537 .flowi6_oif = oif, 2538 .daddr = msg->dest, 2539 .saddr = iph->daddr, 2540 .flowi6_uid = sock_net_uid(net, NULL), 2541 }; 2542 2543 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2544 rt6_do_redirect(dst, NULL, skb); 2545 dst_release(dst); 2546 } 2547 2548 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2549 { 2550 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2551 sk->sk_uid); 2552 } 2553 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2554 2555 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2556 { 2557 struct net_device *dev = dst->dev; 2558 unsigned int mtu = dst_mtu(dst); 2559 struct net *net = dev_net(dev); 2560 2561 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2562 2563 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2564 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2565 2566 /* 2567 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2568 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
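 * (Worked out: IPV6_MAXPLEN is 65535 and sizeof(struct tcphdr) is 20,
 * so any advmss above 65515 is replaced by the marker value below.)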
2569 * IPV6_MAXPLEN is also valid and means: "any MSS, 2570 * rely only on pmtu discovery" 2571 */ 2572 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2573 mtu = IPV6_MAXPLEN; 2574 return mtu; 2575 } 2576 2577 static unsigned int ip6_mtu(const struct dst_entry *dst) 2578 { 2579 struct inet6_dev *idev; 2580 unsigned int mtu; 2581 2582 mtu = dst_metric_raw(dst, RTAX_MTU); 2583 if (mtu) 2584 goto out; 2585 2586 mtu = IPV6_MIN_MTU; 2587 2588 rcu_read_lock(); 2589 idev = __in6_dev_get(dst->dev); 2590 if (idev) 2591 mtu = idev->cnf.mtu6; 2592 rcu_read_unlock(); 2593 2594 out: 2595 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2596 2597 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2598 } 2599 2600 /* MTU selection: 2601 * 1. mtu on route is locked - use it 2602 * 2. mtu from nexthop exception 2603 * 3. mtu from egress device 2604 * 2605 * based on ip6_dst_mtu_forward and exception logic of 2606 * rt6_find_cached_rt; called with rcu_read_lock 2607 */ 2608 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2609 struct in6_addr *saddr) 2610 { 2611 struct rt6_exception_bucket *bucket; 2612 struct rt6_exception *rt6_ex; 2613 struct in6_addr *src_key; 2614 struct inet6_dev *idev; 2615 u32 mtu = 0; 2616 2617 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2618 mtu = f6i->fib6_pmtu; 2619 if (mtu) 2620 goto out; 2621 } 2622 2623 src_key = NULL; 2624 #ifdef CONFIG_IPV6_SUBTREES 2625 if (f6i->fib6_src.plen) 2626 src_key = saddr; 2627 #endif 2628 2629 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2630 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2631 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2632 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2633 2634 if (likely(!mtu)) { 2635 struct net_device *dev = fib6_info_nh_dev(f6i); 2636 2637 mtu = IPV6_MIN_MTU; 2638 idev = __in6_dev_get(dev); 2639 if (idev && idev->cnf.mtu6 > mtu) 2640 mtu = idev->cnf.mtu6; 2641 } 2642 2643 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2644 out: 2645 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2646 } 2647 2648 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2649 struct flowi6 *fl6) 2650 { 2651 struct dst_entry *dst; 2652 struct rt6_info *rt; 2653 struct inet6_dev *idev = in6_dev_get(dev); 2654 struct net *net = dev_net(dev); 2655 2656 if (unlikely(!idev)) 2657 return ERR_PTR(-ENODEV); 2658 2659 rt = ip6_dst_alloc(net, dev, 0); 2660 if (unlikely(!rt)) { 2661 in6_dev_put(idev); 2662 dst = ERR_PTR(-ENOMEM); 2663 goto out; 2664 } 2665 2666 rt->dst.flags |= DST_HOST; 2667 rt->dst.input = ip6_input; 2668 rt->dst.output = ip6_output; 2669 rt->rt6i_gateway = fl6->daddr; 2670 rt->rt6i_dst.addr = fl6->daddr; 2671 rt->rt6i_dst.plen = 128; 2672 rt->rt6i_idev = idev; 2673 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2674 2675 /* Add this dst into uncached_list so that rt6_disable_ip() can 2676 * do proper release of the net_device 2677 */ 2678 rt6_uncached_list_add(rt); 2679 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2680 2681 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2682 2683 out: 2684 return dst; 2685 } 2686 2687 static int ip6_dst_gc(struct dst_ops *ops) 2688 { 2689 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2690 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2691 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2692 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2693 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2694 unsigned long rt_last_gc = 
net->ipv6.ip6_rt_last_gc; 2695 int entries; 2696 2697 entries = dst_entries_get_fast(ops); 2698 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2699 entries <= rt_max_size) 2700 goto out; 2701 2702 net->ipv6.ip6_rt_gc_expire++; 2703 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2704 entries = dst_entries_get_slow(ops); 2705 if (entries < ops->gc_thresh) 2706 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2707 out: 2708 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2709 return entries > rt_max_size; 2710 } 2711 2712 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2713 struct fib6_config *cfg, 2714 const struct in6_addr *gw_addr, 2715 u32 tbid, int flags) 2716 { 2717 struct flowi6 fl6 = { 2718 .flowi6_oif = cfg->fc_ifindex, 2719 .daddr = *gw_addr, 2720 .saddr = cfg->fc_prefsrc, 2721 }; 2722 struct fib6_table *table; 2723 struct rt6_info *rt; 2724 2725 table = fib6_get_table(net, tbid); 2726 if (!table) 2727 return NULL; 2728 2729 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2730 flags |= RT6_LOOKUP_F_HAS_SADDR; 2731 2732 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2733 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2734 2735 /* if table lookup failed, fall back to full lookup */ 2736 if (rt == net->ipv6.ip6_null_entry) { 2737 ip6_rt_put(rt); 2738 rt = NULL; 2739 } 2740 2741 return rt; 2742 } 2743 2744 static int ip6_route_check_nh_onlink(struct net *net, 2745 struct fib6_config *cfg, 2746 const struct net_device *dev, 2747 struct netlink_ext_ack *extack) 2748 { 2749 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2750 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2751 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2752 struct fib6_info *from; 2753 struct rt6_info *grt; 2754 int err; 2755 2756 err = 0; 2757 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2758 if (grt) { 2759 rcu_read_lock(); 2760 from = rcu_dereference(grt->from); 2761 if (!grt->dst.error && 2762 /* ignore match if it is the default route */ 2763 from && !ipv6_addr_any(&from->fib6_dst.addr) && 2764 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2765 NL_SET_ERR_MSG(extack, 2766 "Nexthop has invalid gateway or device mismatch"); 2767 err = -EINVAL; 2768 } 2769 rcu_read_unlock(); 2770 2771 ip6_rt_put(grt); 2772 } 2773 2774 return err; 2775 } 2776 2777 static int ip6_route_check_nh(struct net *net, 2778 struct fib6_config *cfg, 2779 struct net_device **_dev, 2780 struct inet6_dev **idev) 2781 { 2782 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2783 struct net_device *dev = _dev ? 
*_dev : NULL; 2784 struct rt6_info *grt = NULL; 2785 int err = -EHOSTUNREACH; 2786 2787 if (cfg->fc_table) { 2788 int flags = RT6_LOOKUP_F_IFACE; 2789 2790 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2791 cfg->fc_table, flags); 2792 if (grt) { 2793 if (grt->rt6i_flags & RTF_GATEWAY || 2794 (dev && dev != grt->dst.dev)) { 2795 ip6_rt_put(grt); 2796 grt = NULL; 2797 } 2798 } 2799 } 2800 2801 if (!grt) 2802 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2803 2804 if (!grt) 2805 goto out; 2806 2807 if (dev) { 2808 if (dev != grt->dst.dev) { 2809 ip6_rt_put(grt); 2810 goto out; 2811 } 2812 } else { 2813 *_dev = dev = grt->dst.dev; 2814 *idev = grt->rt6i_idev; 2815 dev_hold(dev); 2816 in6_dev_hold(grt->rt6i_idev); 2817 } 2818 2819 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2820 err = 0; 2821 2822 ip6_rt_put(grt); 2823 2824 out: 2825 return err; 2826 } 2827 2828 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2829 struct net_device **_dev, struct inet6_dev **idev, 2830 struct netlink_ext_ack *extack) 2831 { 2832 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2833 int gwa_type = ipv6_addr_type(gw_addr); 2834 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2835 const struct net_device *dev = *_dev; 2836 bool need_addr_check = !dev; 2837 int err = -EINVAL; 2838 2839 /* if gw_addr is local we will fail to detect this in case the 2840 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2841 * will return the already-added prefix route via the interface 2842 * that the prefix route was assigned to, which might be non-loopback. 2843 */ 2844 if (dev && 2845 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2846 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2847 goto out; 2848 } 2849 2850 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2851 /* IPv6 strictly inhibits using non-link-local 2852 * addresses as a nexthop address. 2853 * Otherwise, a router will not be able to send redirects. 2854 * That is mostly a good thing, but in some (rare!) circumstances 2855 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2856 * some exceptions. --ANK 2857 * We allow IPv4-mapped nexthops to support RFC4798-type 2858 * addressing. 2859 */ 2860 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2861 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2862 goto out; 2863 } 2864 2865 if (cfg->fc_flags & RTNH_F_ONLINK) 2866 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2867 else 2868 err = ip6_route_check_nh(net, cfg, _dev, idev); 2869 2870 if (err) 2871 goto out; 2872 } 2873 2874 /* reload in case the device was changed */ 2875 dev = *_dev; 2876 2877 err = -EINVAL; 2878 if (!dev) { 2879 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2880 goto out; 2881 } else if (dev->flags & IFF_LOOPBACK) { 2882 NL_SET_ERR_MSG(extack, 2883 "Egress device can not be loopback device for this route"); 2884 goto out; 2885 } 2886 2887 /* if we did not check gw_addr above, do so now that the 2888 * egress device has been resolved. 
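 * need_addr_check is only set when the caller passed no device; in
 * that case ip6_route_check_nh() may have just resolved *_dev, so this
 * is the first point where the local-address check can be run against
 * the correct interface.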
2889 */ 2890 if (need_addr_check && 2891 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2892 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2893 goto out; 2894 } 2895 2896 err = 0; 2897 out: 2898 return err; 2899 } 2900 2901 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2902 gfp_t gfp_flags, 2903 struct netlink_ext_ack *extack) 2904 { 2905 struct net *net = cfg->fc_nlinfo.nl_net; 2906 struct fib6_info *rt = NULL; 2907 struct net_device *dev = NULL; 2908 struct inet6_dev *idev = NULL; 2909 struct fib6_table *table; 2910 int addr_type; 2911 int err = -EINVAL; 2912 2913 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2914 if (cfg->fc_flags & RTF_PCPU) { 2915 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2916 goto out; 2917 } 2918 2919 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2920 if (cfg->fc_flags & RTF_CACHE) { 2921 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2922 goto out; 2923 } 2924 2925 if (cfg->fc_type > RTN_MAX) { 2926 NL_SET_ERR_MSG(extack, "Invalid route type"); 2927 goto out; 2928 } 2929 2930 if (cfg->fc_dst_len > 128) { 2931 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2932 goto out; 2933 } 2934 if (cfg->fc_src_len > 128) { 2935 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2936 goto out; 2937 } 2938 #ifndef CONFIG_IPV6_SUBTREES 2939 if (cfg->fc_src_len) { 2940 NL_SET_ERR_MSG(extack, 2941 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2942 goto out; 2943 } 2944 #endif 2945 if (cfg->fc_ifindex) { 2946 err = -ENODEV; 2947 dev = dev_get_by_index(net, cfg->fc_ifindex); 2948 if (!dev) 2949 goto out; 2950 idev = in6_dev_get(dev); 2951 if (!idev) 2952 goto out; 2953 } 2954 2955 if (cfg->fc_metric == 0) 2956 cfg->fc_metric = IP6_RT_PRIO_USER; 2957 2958 if (cfg->fc_flags & RTNH_F_ONLINK) { 2959 if (!dev) { 2960 NL_SET_ERR_MSG(extack, 2961 "Nexthop device required for onlink"); 2962 err = -ENODEV; 2963 goto out; 2964 } 2965 2966 if (!(dev->flags & IFF_UP)) { 2967 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2968 err = -ENETDOWN; 2969 goto out; 2970 } 2971 } 2972 2973 err = -ENOBUFS; 2974 if (cfg->fc_nlinfo.nlh && 2975 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2976 table = fib6_get_table(net, cfg->fc_table); 2977 if (!table) { 2978 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2979 table = fib6_new_table(net, cfg->fc_table); 2980 } 2981 } else { 2982 table = fib6_new_table(net, cfg->fc_table); 2983 } 2984 2985 if (!table) 2986 goto out; 2987 2988 err = -ENOMEM; 2989 rt = fib6_info_alloc(gfp_flags); 2990 if (!rt) 2991 goto out; 2992 2993 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 2994 extack); 2995 if (IS_ERR(rt->fib6_metrics)) { 2996 err = PTR_ERR(rt->fib6_metrics); 2997 /* Do not leave garbage there. 
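 * fib6_metrics currently holds an ERR_PTR(); the error path below ends
 * in fib6_info_release(), which would otherwise try to drop a metrics
 * reference through that bogus pointer. dst_default_metrics is static
 * and is never freed.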
*/ 2998 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 2999 goto out; 3000 } 3001 3002 if (cfg->fc_flags & RTF_ADDRCONF) 3003 rt->dst_nocount = true; 3004 3005 if (cfg->fc_flags & RTF_EXPIRES) 3006 fib6_set_expires(rt, jiffies + 3007 clock_t_to_jiffies(cfg->fc_expires)); 3008 else 3009 fib6_clean_expires(rt); 3010 3011 if (cfg->fc_protocol == RTPROT_UNSPEC) 3012 cfg->fc_protocol = RTPROT_BOOT; 3013 rt->fib6_protocol = cfg->fc_protocol; 3014 3015 addr_type = ipv6_addr_type(&cfg->fc_dst); 3016 3017 if (cfg->fc_encap) { 3018 struct lwtunnel_state *lwtstate; 3019 3020 err = lwtunnel_build_state(cfg->fc_encap_type, 3021 cfg->fc_encap, AF_INET6, cfg, 3022 &lwtstate, extack); 3023 if (err) 3024 goto out; 3025 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 3026 } 3027 3028 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3029 rt->fib6_dst.plen = cfg->fc_dst_len; 3030 if (rt->fib6_dst.plen == 128) 3031 rt->dst_host = true; 3032 3033 #ifdef CONFIG_IPV6_SUBTREES 3034 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3035 rt->fib6_src.plen = cfg->fc_src_len; 3036 #endif 3037 3038 rt->fib6_metric = cfg->fc_metric; 3039 rt->fib6_nh.nh_weight = 1; 3040 3041 rt->fib6_type = cfg->fc_type; 3042 3043 /* We cannot add true routes via loopback here, 3044 they would result in kernel looping; promote them to reject routes 3045 */ 3046 if ((cfg->fc_flags & RTF_REJECT) || 3047 (dev && (dev->flags & IFF_LOOPBACK) && 3048 !(addr_type & IPV6_ADDR_LOOPBACK) && 3049 !(cfg->fc_flags & RTF_LOCAL))) { 3050 /* hold loopback dev/idev if we haven't done so. */ 3051 if (dev != net->loopback_dev) { 3052 if (dev) { 3053 dev_put(dev); 3054 in6_dev_put(idev); 3055 } 3056 dev = net->loopback_dev; 3057 dev_hold(dev); 3058 idev = in6_dev_get(dev); 3059 if (!idev) { 3060 err = -ENODEV; 3061 goto out; 3062 } 3063 } 3064 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3065 goto install_route; 3066 } 3067 3068 if (cfg->fc_flags & RTF_GATEWAY) { 3069 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3070 if (err) 3071 goto out; 3072 3073 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3074 } 3075 3076 err = -ENODEV; 3077 if (!dev) 3078 goto out; 3079 3080 if (idev->cnf.disable_ipv6) { 3081 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3082 err = -EACCES; 3083 goto out; 3084 } 3085 3086 if (!(dev->flags & IFF_UP)) { 3087 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3088 err = -ENETDOWN; 3089 goto out; 3090 } 3091 3092 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3093 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3094 NL_SET_ERR_MSG(extack, "Invalid source address"); 3095 err = -EINVAL; 3096 goto out; 3097 } 3098 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3099 rt->fib6_prefsrc.plen = 128; 3100 } else 3101 rt->fib6_prefsrc.plen = 0; 3102 3103 rt->fib6_flags = cfg->fc_flags; 3104 3105 install_route: 3106 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3107 !netif_carrier_ok(dev)) 3108 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3109 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3110 rt->fib6_nh.nh_dev = dev; 3111 rt->fib6_table = table; 3112 3113 if (idev) 3114 in6_dev_put(idev); 3115 3116 return rt; 3117 out: 3118 if (dev) 3119 dev_put(dev); 3120 if (idev) 3121 in6_dev_put(idev); 3122 3123 fib6_info_release(rt); 3124 return ERR_PTR(err); 3125 } 3126 3127 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3128 struct netlink_ext_ack *extack) 3129 { 3130 struct fib6_info *rt; 3131 int err; 3132 3133 rt = ip6_route_info_create(cfg, gfp_flags, extack); 
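	/* ip6_route_info_create() hands back a fib6_info with one
	 * reference held; __ip6_ins_rt() takes the references it needs
	 * when linking the entry into the table, so the local reference
	 * is dropped unconditionally once insertion has been attempted.
	 */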
3134 if (IS_ERR(rt)) 3135 return PTR_ERR(rt); 3136 3137 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3138 fib6_info_release(rt); 3139 3140 return err; 3141 } 3142 3143 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3144 { 3145 struct net *net = info->nl_net; 3146 struct fib6_table *table; 3147 int err; 3148 3149 if (rt == net->ipv6.fib6_null_entry) { 3150 err = -ENOENT; 3151 goto out; 3152 } 3153 3154 table = rt->fib6_table; 3155 spin_lock_bh(&table->tb6_lock); 3156 err = fib6_del(rt, info); 3157 spin_unlock_bh(&table->tb6_lock); 3158 3159 out: 3160 fib6_info_release(rt); 3161 return err; 3162 } 3163 3164 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3165 { 3166 struct nl_info info = { .nl_net = net }; 3167 3168 return __ip6_del_rt(rt, &info); 3169 } 3170 3171 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3172 { 3173 struct nl_info *info = &cfg->fc_nlinfo; 3174 struct net *net = info->nl_net; 3175 struct sk_buff *skb = NULL; 3176 struct fib6_table *table; 3177 int err = -ENOENT; 3178 3179 if (rt == net->ipv6.fib6_null_entry) 3180 goto out_put; 3181 table = rt->fib6_table; 3182 spin_lock_bh(&table->tb6_lock); 3183 3184 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3185 struct fib6_info *sibling, *next_sibling; 3186 3187 /* prefer to send a single notification with all hops */ 3188 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3189 if (skb) { 3190 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3191 3192 if (rt6_fill_node(net, skb, rt, NULL, 3193 NULL, NULL, 0, RTM_DELROUTE, 3194 info->portid, seq, 0) < 0) { 3195 kfree_skb(skb); 3196 skb = NULL; 3197 } else 3198 info->skip_notify = 1; 3199 } 3200 3201 list_for_each_entry_safe(sibling, next_sibling, 3202 &rt->fib6_siblings, 3203 fib6_siblings) { 3204 err = fib6_del(sibling, info); 3205 if (err) 3206 goto out_unlock; 3207 } 3208 } 3209 3210 err = fib6_del(rt, info); 3211 out_unlock: 3212 spin_unlock_bh(&table->tb6_lock); 3213 out_put: 3214 fib6_info_release(rt); 3215 3216 if (skb) { 3217 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3218 info->nlh, gfp_any()); 3219 } 3220 return err; 3221 } 3222 3223 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3224 { 3225 int rc = -ESRCH; 3226 3227 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3228 goto out; 3229 3230 if (cfg->fc_flags & RTF_GATEWAY && 3231 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3232 goto out; 3233 3234 rc = rt6_remove_exception_rt(rt); 3235 out: 3236 return rc; 3237 } 3238 3239 static int ip6_route_del(struct fib6_config *cfg, 3240 struct netlink_ext_ack *extack) 3241 { 3242 struct rt6_info *rt_cache; 3243 struct fib6_table *table; 3244 struct fib6_info *rt; 3245 struct fib6_node *fn; 3246 int err = -ESRCH; 3247 3248 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3249 if (!table) { 3250 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3251 return err; 3252 } 3253 3254 rcu_read_lock(); 3255 3256 fn = fib6_locate(&table->tb6_root, 3257 &cfg->fc_dst, cfg->fc_dst_len, 3258 &cfg->fc_src, cfg->fc_src_len, 3259 !(cfg->fc_flags & RTF_CACHE)); 3260 3261 if (fn) { 3262 for_each_fib6_node_rt_rcu(fn) { 3263 if (cfg->fc_flags & RTF_CACHE) { 3264 int rc; 3265 3266 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3267 &cfg->fc_src); 3268 if (rt_cache) { 3269 rc = ip6_del_cached_rt(rt_cache, cfg); 3270 if (rc != -ESRCH) { 3271 rcu_read_unlock(); 3272 return rc; 3273 } 3274 } 3275 continue; 3276 } 3277 if (cfg->fc_ifindex && 3278 
(!rt->fib6_nh.nh_dev || 3279 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3280 continue; 3281 if (cfg->fc_flags & RTF_GATEWAY && 3282 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3283 continue; 3284 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3285 continue; 3286 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3287 continue; 3288 if (!fib6_info_hold_safe(rt)) 3289 continue; 3290 rcu_read_unlock(); 3291 3292 /* if gateway was specified only delete the one hop */ 3293 if (cfg->fc_flags & RTF_GATEWAY) 3294 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3295 3296 return __ip6_del_rt_siblings(rt, cfg); 3297 } 3298 } 3299 rcu_read_unlock(); 3300 3301 return err; 3302 } 3303 3304 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3305 { 3306 struct netevent_redirect netevent; 3307 struct rt6_info *rt, *nrt = NULL; 3308 struct ndisc_options ndopts; 3309 struct inet6_dev *in6_dev; 3310 struct neighbour *neigh; 3311 struct fib6_info *from; 3312 struct rd_msg *msg; 3313 int optlen, on_link; 3314 u8 *lladdr; 3315 3316 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3317 optlen -= sizeof(*msg); 3318 3319 if (optlen < 0) { 3320 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3321 return; 3322 } 3323 3324 msg = (struct rd_msg *)icmp6_hdr(skb); 3325 3326 if (ipv6_addr_is_multicast(&msg->dest)) { 3327 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3328 return; 3329 } 3330 3331 on_link = 0; 3332 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3333 on_link = 1; 3334 } else if (ipv6_addr_type(&msg->target) != 3335 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3336 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3337 return; 3338 } 3339 3340 in6_dev = __in6_dev_get(skb->dev); 3341 if (!in6_dev) 3342 return; 3343 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3344 return; 3345 3346 /* RFC2461 8.1: 3347 * The IP source address of the Redirect MUST be the same as the current 3348 * first-hop router for the specified ICMP Destination Address. 3349 */ 3350 3351 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3352 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3353 return; 3354 } 3355 3356 lladdr = NULL; 3357 if (ndopts.nd_opts_tgt_lladdr) { 3358 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3359 skb->dev); 3360 if (!lladdr) { 3361 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3362 return; 3363 } 3364 } 3365 3366 rt = (struct rt6_info *) dst; 3367 if (rt->rt6i_flags & RTF_REJECT) { 3368 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3369 return; 3370 } 3371 3372 /* Redirect received -> path was valid. 3373 * Look, redirects are sent only in response to data packets, 3374 * so that this nexthop apparently is reachable. --ANK 3375 */ 3376 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3377 3378 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3379 if (!neigh) 3380 return; 3381 3382 /* 3383 * We have finally decided to accept it. 3384 */ 3385 3386 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3387 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3388 NEIGH_UPDATE_F_OVERRIDE| 3389 (on_link ? 
0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3390 NEIGH_UPDATE_F_ISROUTER)), 3391 NDISC_REDIRECT, &ndopts); 3392 3393 rcu_read_lock(); 3394 from = rcu_dereference(rt->from); 3395 if (!from) 3396 goto out; 3397 3398 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3399 if (!nrt) 3400 goto out; 3401 3402 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3403 if (on_link) 3404 nrt->rt6i_flags &= ~RTF_GATEWAY; 3405 3406 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3407 3408 /* rt6_insert_exception() will take care of duplicated exceptions */ 3409 if (rt6_insert_exception(nrt, from)) { 3410 dst_release_immediate(&nrt->dst); 3411 goto out; 3412 } 3413 3414 netevent.old = &rt->dst; 3415 netevent.new = &nrt->dst; 3416 netevent.daddr = &msg->dest; 3417 netevent.neigh = neigh; 3418 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3419 3420 out: 3421 rcu_read_unlock(); 3422 neigh_release(neigh); 3423 } 3424 3425 #ifdef CONFIG_IPV6_ROUTE_INFO 3426 static struct fib6_info *rt6_get_route_info(struct net *net, 3427 const struct in6_addr *prefix, int prefixlen, 3428 const struct in6_addr *gwaddr, 3429 struct net_device *dev) 3430 { 3431 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3432 int ifindex = dev->ifindex; 3433 struct fib6_node *fn; 3434 struct fib6_info *rt = NULL; 3435 struct fib6_table *table; 3436 3437 table = fib6_get_table(net, tb_id); 3438 if (!table) 3439 return NULL; 3440 3441 rcu_read_lock(); 3442 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3443 if (!fn) 3444 goto out; 3445 3446 for_each_fib6_node_rt_rcu(fn) { 3447 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3448 continue; 3449 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3450 continue; 3451 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3452 continue; 3453 if (!fib6_info_hold_safe(rt)) 3454 continue; 3455 break; 3456 } 3457 out: 3458 rcu_read_unlock(); 3459 return rt; 3460 } 3461 3462 static struct fib6_info *rt6_add_route_info(struct net *net, 3463 const struct in6_addr *prefix, int prefixlen, 3464 const struct in6_addr *gwaddr, 3465 struct net_device *dev, 3466 unsigned int pref) 3467 { 3468 struct fib6_config cfg = { 3469 .fc_metric = IP6_RT_PRIO_USER, 3470 .fc_ifindex = dev->ifindex, 3471 .fc_dst_len = prefixlen, 3472 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3473 RTF_UP | RTF_PREF(pref), 3474 .fc_protocol = RTPROT_RA, 3475 .fc_type = RTN_UNICAST, 3476 .fc_nlinfo.portid = 0, 3477 .fc_nlinfo.nlh = NULL, 3478 .fc_nlinfo.nl_net = net, 3479 }; 3480 3481 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3482 cfg.fc_dst = *prefix; 3483 cfg.fc_gateway = *gwaddr; 3484 3485 /* We should treat it as a default route if prefix length is 0. */ 3486 if (!prefixlen) 3487 cfg.fc_flags |= RTF_DEFAULT; 3488 3489 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3490 3491 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3492 } 3493 #endif 3494 3495 struct fib6_info *rt6_get_dflt_router(struct net *net, 3496 const struct in6_addr *addr, 3497 struct net_device *dev) 3498 { 3499 u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT; 3500 struct fib6_info *rt; 3501 struct fib6_table *table; 3502 3503 table = fib6_get_table(net, tb_id); 3504 if (!table) 3505 return NULL; 3506 3507 rcu_read_lock(); 3508 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3509 if (dev == rt->fib6_nh.nh_dev && 3510 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3511 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3512 break; 3513 } 3514 if (rt && !fib6_info_hold_safe(rt)) 3515 rt = NULL; 3516 rcu_read_unlock(); 3517 return rt; 3518 } 3519 3520 struct fib6_info *rt6_add_dflt_router(struct net *net, 3521 const struct in6_addr *gwaddr, 3522 struct net_device *dev, 3523 unsigned int pref) 3524 { 3525 struct fib6_config cfg = { 3526 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3527 .fc_metric = IP6_RT_PRIO_USER, 3528 .fc_ifindex = dev->ifindex, 3529 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3530 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3531 .fc_protocol = RTPROT_RA, 3532 .fc_type = RTN_UNICAST, 3533 .fc_nlinfo.portid = 0, 3534 .fc_nlinfo.nlh = NULL, 3535 .fc_nlinfo.nl_net = net, 3536 }; 3537 3538 cfg.fc_gateway = *gwaddr; 3539 3540 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3541 struct fib6_table *table; 3542 3543 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3544 if (table) 3545 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3546 } 3547 3548 return rt6_get_dflt_router(net, gwaddr, dev); 3549 } 3550 3551 static void __rt6_purge_dflt_routers(struct net *net, 3552 struct fib6_table *table) 3553 { 3554 struct fib6_info *rt; 3555 3556 restart: 3557 rcu_read_lock(); 3558 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3559 struct net_device *dev = fib6_info_nh_dev(rt); 3560 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3561 3562 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3563 (!idev || idev->cnf.accept_ra != 2) && 3564 fib6_info_hold_safe(rt)) { 3565 rcu_read_unlock(); 3566 ip6_del_rt(net, rt); 3567 goto restart; 3568 } 3569 } 3570 rcu_read_unlock(); 3571 3572 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3573 } 3574 3575 void rt6_purge_dflt_routers(struct net *net) 3576 { 3577 struct fib6_table *table; 3578 struct hlist_head *head; 3579 unsigned int h; 3580 3581 rcu_read_lock(); 3582 3583 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3584 head = &net->ipv6.fib_table_hash[h]; 3585 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3586 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3587 __rt6_purge_dflt_routers(net, table); 3588 } 3589 } 3590 3591 rcu_read_unlock(); 3592 } 3593 3594 static void rtmsg_to_fib6_config(struct net *net, 3595 struct in6_rtmsg *rtmsg, 3596 struct fib6_config *cfg) 3597 { 3598 *cfg = (struct fib6_config){ 3599 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3600 : RT6_TABLE_MAIN, 3601 .fc_ifindex = rtmsg->rtmsg_ifindex, 3602 .fc_metric = rtmsg->rtmsg_metric, 3603 .fc_expires = rtmsg->rtmsg_info, 3604 .fc_dst_len = rtmsg->rtmsg_dst_len, 3605 .fc_src_len = rtmsg->rtmsg_src_len, 3606 .fc_flags = rtmsg->rtmsg_flags, 3607 .fc_type = rtmsg->rtmsg_type, 3608 3609 .fc_nlinfo.nl_net = net, 3610 3611 .fc_dst = rtmsg->rtmsg_dst, 3612 .fc_src = rtmsg->rtmsg_src, 3613 .fc_gateway = rtmsg->rtmsg_gateway, 3614 }; 3615 } 3616 3617 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3618 { 3619 struct fib6_config cfg; 3620 struct in6_rtmsg rtmsg; 3621 int err; 3622 3623 switch (cmd) { 3624 case SIOCADDRT: /* Add a route */ 3625 case SIOCDELRT: /* Delete a route */ 3626 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3627 return -EPERM; 3628 err = copy_from_user(&rtmsg, arg, 3629 sizeof(struct in6_rtmsg)); 3630 if (err) 3631 return -EFAULT; 3632 3633 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3634 3635 rtnl_lock(); 3636 switch (cmd) { 3637 case SIOCADDRT: 3638 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3639 break; 3640 case SIOCDELRT: 3641 err = ip6_route_del(&cfg, NULL); 3642 break; 3643 default: 3644 err = -EINVAL; 3645 } 3646 rtnl_unlock(); 3647 3648 return err; 3649 } 3650 3651 return -EINVAL; 3652 } 3653 3654 /* 3655 * Drop the packet on the floor 3656 */ 3657 3658 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3659 { 3660 struct dst_entry *dst = skb_dst(skb); 3661 struct net *net = dev_net(dst->dev); 3662 struct inet6_dev *idev; 3663 int type; 3664 3665 if (netif_is_l3_master(skb->dev) && 3666 dst->dev == net->loopback_dev) 3667 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); 3668 else 3669 idev = ip6_dst_idev(dst); 3670 3671 switch (ipstats_mib_noroutes) { 3672 case IPSTATS_MIB_INNOROUTES: 3673 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3674 if (type == IPV6_ADDR_ANY) { 3675 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 3676 break; 3677 } 3678 /* FALLTHROUGH */ 3679 case IPSTATS_MIB_OUTNOROUTES: 3680 IP6_INC_STATS(net, idev, ipstats_mib_noroutes); 3681 break; 3682 } 3683 3684 /* Start over by dropping the dst for l3mdev case */ 3685 if (netif_is_l3_master(skb->dev)) 3686 skb_dst_drop(skb); 3687 3688 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3689 kfree_skb(skb); 3690 return 0; 3691 } 3692 3693 static int ip6_pkt_discard(struct sk_buff *skb) 3694 { 3695 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3696 } 3697 3698 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3699 { 3700 skb->dev = skb_dst(skb)->dev; 3701 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3702 } 3703 3704 static int ip6_pkt_prohibit(struct sk_buff *skb) 3705 { 3706 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3707 } 3708 3709 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3710 { 3711 skb->dev = skb_dst(skb)->dev; 3712 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3713 } 3714 3715 /* 3716 * Allocate a dst for local (unicast / anycast) address. 
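 * Assumed callers (based on the name and in line with addrconf usage):
 * the address configuration code backs each address added to an
 * interface with an RTN_LOCAL (or RTN_ANYCAST) entry in the local
 * table via addrconf_f6i_alloc() below.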
3717 */ 3718 3719 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3720 struct inet6_dev *idev, 3721 const struct in6_addr *addr, 3722 bool anycast, gfp_t gfp_flags) 3723 { 3724 u32 tb_id; 3725 struct net_device *dev = idev->dev; 3726 struct fib6_info *f6i; 3727 3728 f6i = fib6_info_alloc(gfp_flags); 3729 if (!f6i) 3730 return ERR_PTR(-ENOMEM); 3731 3732 f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL); 3733 f6i->dst_nocount = true; 3734 f6i->dst_host = true; 3735 f6i->fib6_protocol = RTPROT_KERNEL; 3736 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3737 if (anycast) { 3738 f6i->fib6_type = RTN_ANYCAST; 3739 f6i->fib6_flags |= RTF_ANYCAST; 3740 } else { 3741 f6i->fib6_type = RTN_LOCAL; 3742 f6i->fib6_flags |= RTF_LOCAL; 3743 } 3744 3745 f6i->fib6_nh.nh_gw = *addr; 3746 dev_hold(dev); 3747 f6i->fib6_nh.nh_dev = dev; 3748 f6i->fib6_dst.addr = *addr; 3749 f6i->fib6_dst.plen = 128; 3750 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3751 f6i->fib6_table = fib6_get_table(net, tb_id); 3752 3753 return f6i; 3754 } 3755 3756 /* remove a deleted IP from prefsrc entries */ 3757 struct arg_dev_net_ip { 3758 struct net_device *dev; 3759 struct net *net; 3760 struct in6_addr *addr; 3761 }; 3762 3763 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3764 { 3765 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3766 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3767 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3768 3769 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3770 rt != net->ipv6.fib6_null_entry && 3771 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3772 spin_lock_bh(&rt6_exception_lock); 3773 /* remove prefsrc entry */ 3774 rt->fib6_prefsrc.plen = 0; 3775 spin_unlock_bh(&rt6_exception_lock); 3776 } 3777 return 0; 3778 } 3779 3780 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3781 { 3782 struct net *net = dev_net(ifp->idev->dev); 3783 struct arg_dev_net_ip adni = { 3784 .dev = ifp->idev->dev, 3785 .net = net, 3786 .addr = &ifp->addr, 3787 }; 3788 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3789 } 3790 3791 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3792 3793 /* Remove routers and update dst entries when a gateway turns into a host. */ 3794 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3795 { 3796 struct in6_addr *gateway = (struct in6_addr *)arg; 3797 3798 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3799 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3800 return -1; 3801 } 3802 3803 /* Further clean up cached routes in the exception table. 3804 * This is needed because a cached route may have a different 3805 * gateway than its 'parent' in the case of an ip redirect. 
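 * rt6_exceptions_clean_tohost() is expected to walk this route's
 * exception bucket and drop every cached entry whose gateway matches
 * the address that just stopped being a router.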
3806 */ 3807 rt6_exceptions_clean_tohost(rt, gateway); 3808 3809 return 0; 3810 } 3811 3812 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3813 { 3814 fib6_clean_all(net, fib6_clean_tohost, gateway); 3815 } 3816 3817 struct arg_netdev_event { 3818 const struct net_device *dev; 3819 union { 3820 unsigned int nh_flags; 3821 unsigned long event; 3822 }; 3823 }; 3824 3825 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3826 { 3827 struct fib6_info *iter; 3828 struct fib6_node *fn; 3829 3830 fn = rcu_dereference_protected(rt->fib6_node, 3831 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3832 iter = rcu_dereference_protected(fn->leaf, 3833 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3834 while (iter) { 3835 if (iter->fib6_metric == rt->fib6_metric && 3836 rt6_qualify_for_ecmp(iter)) 3837 return iter; 3838 iter = rcu_dereference_protected(iter->fib6_next, 3839 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3840 } 3841 3842 return NULL; 3843 } 3844 3845 static bool rt6_is_dead(const struct fib6_info *rt) 3846 { 3847 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3848 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3849 fib6_ignore_linkdown(rt))) 3850 return true; 3851 3852 return false; 3853 } 3854 3855 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3856 { 3857 struct fib6_info *iter; 3858 int total = 0; 3859 3860 if (!rt6_is_dead(rt)) 3861 total += rt->fib6_nh.nh_weight; 3862 3863 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3864 if (!rt6_is_dead(iter)) 3865 total += iter->fib6_nh.nh_weight; 3866 } 3867 3868 return total; 3869 } 3870 3871 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3872 { 3873 int upper_bound = -1; 3874 3875 if (!rt6_is_dead(rt)) { 3876 *weight += rt->fib6_nh.nh_weight; 3877 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3878 total) - 1; 3879 } 3880 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3881 } 3882 3883 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3884 { 3885 struct fib6_info *iter; 3886 int weight = 0; 3887 3888 rt6_upper_bound_set(rt, &weight, total); 3889 3890 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3891 rt6_upper_bound_set(iter, &weight, total); 3892 } 3893 3894 void rt6_multipath_rebalance(struct fib6_info *rt) 3895 { 3896 struct fib6_info *first; 3897 int total; 3898 3899 /* In case the entire multipath route was marked for flushing, 3900 * then there is no need to rebalance upon the removal of every 3901 * sibling route. 3902 */ 3903 if (!rt->fib6_nsiblings || rt->should_flush) 3904 return; 3905 3906 /* During lookup routes are evaluated in order, so we need to 3907 * make sure upper bounds are assigned from the first sibling 3908 * onwards. 
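 *
 * The bounds act as thresholds on a 31-bit flow hash: with cumulative
 * weight W out of total T, a sibling gets the upper bound
 * (W << 31) / T - 1 (see rt6_upper_bound_set() above). E.g. for two
 * nexthops weighted 1 and 3, the thresholds are 0x1fffffff and
 * 0x7fffffff, splitting flows roughly 25%/75%.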
3909 */ 3910 first = rt6_multipath_first_sibling(rt); 3911 if (WARN_ON_ONCE(!first)) 3912 return; 3913 3914 total = rt6_multipath_total_weight(first); 3915 rt6_multipath_upper_bound_set(first, total); 3916 } 3917 3918 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3919 { 3920 const struct arg_netdev_event *arg = p_arg; 3921 struct net *net = dev_net(arg->dev); 3922 3923 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3924 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3925 fib6_update_sernum_upto_root(net, rt); 3926 rt6_multipath_rebalance(rt); 3927 } 3928 3929 return 0; 3930 } 3931 3932 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3933 { 3934 struct arg_netdev_event arg = { 3935 .dev = dev, 3936 { 3937 .nh_flags = nh_flags, 3938 }, 3939 }; 3940 3941 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3942 arg.nh_flags |= RTNH_F_LINKDOWN; 3943 3944 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3945 } 3946 3947 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3948 const struct net_device *dev) 3949 { 3950 struct fib6_info *iter; 3951 3952 if (rt->fib6_nh.nh_dev == dev) 3953 return true; 3954 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3955 if (iter->fib6_nh.nh_dev == dev) 3956 return true; 3957 3958 return false; 3959 } 3960 3961 static void rt6_multipath_flush(struct fib6_info *rt) 3962 { 3963 struct fib6_info *iter; 3964 3965 rt->should_flush = 1; 3966 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3967 iter->should_flush = 1; 3968 } 3969 3970 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3971 const struct net_device *down_dev) 3972 { 3973 struct fib6_info *iter; 3974 unsigned int dead = 0; 3975 3976 if (rt->fib6_nh.nh_dev == down_dev || 3977 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3978 dead++; 3979 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3980 if (iter->fib6_nh.nh_dev == down_dev || 3981 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3982 dead++; 3983 3984 return dead; 3985 } 3986 3987 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 3988 const struct net_device *dev, 3989 unsigned int nh_flags) 3990 { 3991 struct fib6_info *iter; 3992 3993 if (rt->fib6_nh.nh_dev == dev) 3994 rt->fib6_nh.nh_flags |= nh_flags; 3995 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3996 if (iter->fib6_nh.nh_dev == dev) 3997 iter->fib6_nh.nh_flags |= nh_flags; 3998 } 3999 4000 /* called with write lock held for table with rt */ 4001 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4002 { 4003 const struct arg_netdev_event *arg = p_arg; 4004 const struct net_device *dev = arg->dev; 4005 struct net *net = dev_net(dev); 4006 4007 if (rt == net->ipv6.fib6_null_entry) 4008 return 0; 4009 4010 switch (arg->event) { 4011 case NETDEV_UNREGISTER: 4012 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4013 case NETDEV_DOWN: 4014 if (rt->should_flush) 4015 return -1; 4016 if (!rt->fib6_nsiblings) 4017 return rt->fib6_nh.nh_dev == dev ? 
-1 : 0; 4018 if (rt6_multipath_uses_dev(rt, dev)) { 4019 unsigned int count; 4020 4021 count = rt6_multipath_dead_count(rt, dev); 4022 if (rt->fib6_nsiblings + 1 == count) { 4023 rt6_multipath_flush(rt); 4024 return -1; 4025 } 4026 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4027 RTNH_F_LINKDOWN); 4028 fib6_update_sernum(net, rt); 4029 rt6_multipath_rebalance(rt); 4030 } 4031 return -2; 4032 case NETDEV_CHANGE: 4033 if (rt->fib6_nh.nh_dev != dev || 4034 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4035 break; 4036 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 4037 rt6_multipath_rebalance(rt); 4038 break; 4039 } 4040 4041 return 0; 4042 } 4043 4044 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4045 { 4046 struct arg_netdev_event arg = { 4047 .dev = dev, 4048 { 4049 .event = event, 4050 }, 4051 }; 4052 struct net *net = dev_net(dev); 4053 4054 if (net->ipv6.sysctl.skip_notify_on_dev_down) 4055 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); 4056 else 4057 fib6_clean_all(net, fib6_ifdown, &arg); 4058 } 4059 4060 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4061 { 4062 rt6_sync_down_dev(dev, event); 4063 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4064 neigh_ifdown(&nd_tbl, dev); 4065 } 4066 4067 struct rt6_mtu_change_arg { 4068 struct net_device *dev; 4069 unsigned int mtu; 4070 }; 4071 4072 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4073 { 4074 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4075 struct inet6_dev *idev; 4076 4077 /* In IPv6, PMTU discovery is not optional, 4078 so an RTAX_MTU lock cannot disable it. 4079 We still use this lock to block changes 4080 caused by addrconf/ndisc. 4081 */ 4082 4083 idev = __in6_dev_get(arg->dev); 4084 if (!idev) 4085 return 0; 4086 4087 /* For an administrative MTU increase, there is no way to discover 4088 the IPv6 PMTU increase, so the PMTU should be updated here. 4089 Since RFC 1981 doesn't cover administrative MTU increases, 4090 updating the PMTU on such an increase is a MUST (e.g. 
jumbo frame) 4091 */ 4092 if (rt->fib6_nh.nh_dev == arg->dev && 4093 !fib6_metric_locked(rt, RTAX_MTU)) { 4094 u32 mtu = rt->fib6_pmtu; 4095 4096 if (mtu >= arg->mtu || 4097 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4098 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4099 4100 spin_lock_bh(&rt6_exception_lock); 4101 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4102 spin_unlock_bh(&rt6_exception_lock); 4103 } 4104 return 0; 4105 } 4106 4107 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4108 { 4109 struct rt6_mtu_change_arg arg = { 4110 .dev = dev, 4111 .mtu = mtu, 4112 }; 4113 4114 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4115 } 4116 4117 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4118 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4119 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4120 [RTA_OIF] = { .type = NLA_U32 }, 4121 [RTA_IIF] = { .type = NLA_U32 }, 4122 [RTA_PRIORITY] = { .type = NLA_U32 }, 4123 [RTA_METRICS] = { .type = NLA_NESTED }, 4124 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4125 [RTA_PREF] = { .type = NLA_U8 }, 4126 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4127 [RTA_ENCAP] = { .type = NLA_NESTED }, 4128 [RTA_EXPIRES] = { .type = NLA_U32 }, 4129 [RTA_UID] = { .type = NLA_U32 }, 4130 [RTA_MARK] = { .type = NLA_U32 }, 4131 [RTA_TABLE] = { .type = NLA_U32 }, 4132 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4133 [RTA_SPORT] = { .type = NLA_U16 }, 4134 [RTA_DPORT] = { .type = NLA_U16 }, 4135 }; 4136 4137 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4138 struct fib6_config *cfg, 4139 struct netlink_ext_ack *extack) 4140 { 4141 struct rtmsg *rtm; 4142 struct nlattr *tb[RTA_MAX+1]; 4143 unsigned int pref; 4144 int err; 4145 4146 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4147 extack); 4148 if (err < 0) 4149 goto errout; 4150 4151 err = -EINVAL; 4152 rtm = nlmsg_data(nlh); 4153 4154 *cfg = (struct fib6_config){ 4155 .fc_table = rtm->rtm_table, 4156 .fc_dst_len = rtm->rtm_dst_len, 4157 .fc_src_len = rtm->rtm_src_len, 4158 .fc_flags = RTF_UP, 4159 .fc_protocol = rtm->rtm_protocol, 4160 .fc_type = rtm->rtm_type, 4161 4162 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 4163 .fc_nlinfo.nlh = nlh, 4164 .fc_nlinfo.nl_net = sock_net(skb->sk), 4165 }; 4166 4167 if (rtm->rtm_type == RTN_UNREACHABLE || 4168 rtm->rtm_type == RTN_BLACKHOLE || 4169 rtm->rtm_type == RTN_PROHIBIT || 4170 rtm->rtm_type == RTN_THROW) 4171 cfg->fc_flags |= RTF_REJECT; 4172 4173 if (rtm->rtm_type == RTN_LOCAL) 4174 cfg->fc_flags |= RTF_LOCAL; 4175 4176 if (rtm->rtm_flags & RTM_F_CLONED) 4177 cfg->fc_flags |= RTF_CACHE; 4178 4179 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4180 4181 if (tb[RTA_GATEWAY]) { 4182 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4183 cfg->fc_flags |= RTF_GATEWAY; 4184 } 4185 if (tb[RTA_VIA]) { 4186 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 4187 goto errout; 4188 } 4189 4190 if (tb[RTA_DST]) { 4191 int plen = (rtm->rtm_dst_len + 7) >> 3; 4192 4193 if (nla_len(tb[RTA_DST]) < plen) 4194 goto errout; 4195 4196 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4197 } 4198 4199 if (tb[RTA_SRC]) { 4200 int plen = (rtm->rtm_src_len + 7) >> 3; 4201 4202 if (nla_len(tb[RTA_SRC]) < plen) 4203 goto errout; 4204 4205 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4206 } 4207 4208 if (tb[RTA_PREFSRC]) 4209 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4210 4211 if (tb[RTA_OIF]) 4212 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4213 4214 if (tb[RTA_PRIORITY]) 
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}

struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};

static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to the last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended.
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
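/* Example (illustrative): the nlflags passed to ip6_route_mpath_notify()
 * mirror the request that created the routes, e.g. with iproute2:
 *
 *	ip -6 route append 2001:db8::/64 via fe80::1 dev eth0
 *	ip -6 route replace 2001:db8::/64 via fe80::2 dev eth0
 *
 * so listeners on RTNLGRP_IPV6_ROUTE can distinguish an appended
 * nexthop set from a replacement when the notification is sent.
 */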
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device-only routes cannot be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace, send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done.
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL */
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route, we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, the old
		 * nexthops have been replaced by the first new one, and the
		 * rest should be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}
	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}

static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
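/* Example (illustrative): RTM_NEWROUTE requests carrying RTA_MULTIPATH
 * are dispatched to ip6_route_multipath_add() above. With iproute2:
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 2
 *
 * should install one fib6_info per nexthop, linked as siblings, with
 * fib6_nh.nh_weight derived from rtnh_hops as in the parsing loop.
 */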
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}

static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* add multipath next hop */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
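/* Illustrative sketch of the RTA_MULTIPATH wire layout built by
 * rt6_add_nexthop(): a nested attribute holding one rtnexthop header
 * per path, each followed by that path's own attributes:
 *
 *	RTA_MULTIPATH
 *	  struct rtnexthop { rtnh_len, rtnh_flags, rtnh_hops, rtnh_ifindex }
 *	    RTA_GATEWAY <in6_addr>
 *	  struct rtnexthop { ... }
 *	    RTA_GATEWAY <in6_addr>
 *
 * The header is reserved first and rtnh_len is fixed up only after the
 * nested attributes have been appended.
 */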
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
		if (nla_put_u32(skb, RTA_IIF, iif))
			goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;

		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;

		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}
	if (rtnl_put_cacheinfo(skb, dst, 0, expires,
			       dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static bool fib6_info_uses_dev(const struct fib6_info *f6i,
			       const struct net_device *dev)
{
	if (f6i->fib6_nh.nh_dev == dev)
		return true;

	if (f6i->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;

		list_for_each_entry_safe(sibling, next_sibling,
					 &f6i->fib6_siblings, fib6_siblings) {
			if (sibling->fib6_nh.nh_dev == dev)
				return true;
		}
	}

	return false;
}

int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return 1;
	}
	if (filter->filter_set) {
		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
			return 1;
		}
		flags |= NLM_F_DUMP_FILTERED;
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, flags);
}

static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
				   rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
				 rtm_ipv6_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
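/* Example (illustrative): with strict checking enabled on the
 * requesting socket (the NETLINK_GET_STRICT_CHK socket option), an
 * RTM_GETROUTE request that sets, say, rtm_table, or carries an
 * attribute outside the switch above (e.g. RTA_PRIORITY), is rejected
 * with -EINVAL and an extack message instead of being silently ignored
 * as in the non-strict nlmsg_parse() path.
 */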
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (from) {
		if (fibmatch)
			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
					    iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
		else
			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
					    &fl6.saddr, iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
	} else {
		err = -ENETUNREACH;
	}
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
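/* Example (illustrative): this handler backs queries such as
 *
 *	ip -6 route get 2001:db8::1
 *	ip -6 route get fibmatch 2001:db8::1
 *
 * The plain form reports the resolved dst entry; with "fibmatch"
 * (RTM_F_FIB_MATCH) the matching FIB entry itself is returned via the
 * rt6_fill_node(..., from, NULL, ...) branch above.
 */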
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER can be fired multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
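/* Example (illustrative, values hypothetical): the seven hex fields
 * written above appear in /proc/net/rt6_stats in the order printed:
 *
 *	$ cat /proc/net/rt6_stats
 *	0043 006a 0000 0041 0000 0000 0016
 *
 * i.e. fib nodes, route nodes, allocated routes, route entries,
 * cached routes, dst entries and discarded routes.
 */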
#ifdef CONFIG_SYSCTL

static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

static int zero;
static int one = 1;

static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "skip_notify_on_dev_down",
		.data		= &init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
		.extra1		= &zero,
		.extra2		= &one,
	},
	{ }
};

struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif
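/* Example (illustrative): the template above is registered per network
 * namespace under /proc/sys/net/ipv6/route/, so e.g.
 *
 *	sysctl -w net.ipv6.route.flush=1
 *	sysctl net.ipv6.route.gc_thresh
 *
 * write the flush delay (triggering fib6_run_gc() through
 * ipv6_sysctl_rtcache_flush()) and read the dst garbage-collection
 * threshold, respectively.
 */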
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
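/* Example (illustrative): the pernet_operations registered below run
 * once per network namespace, so creating one, e.g.
 *
 *	ip netns add blue
 *
 * is expected to invoke ip6_route_net_init() (and later
 * ip6_route_net_init_late() for the /proc entries) for the new netns,
 * with the matching exit hooks running on namespace teardown.
 */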
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * so the loopback reference in rt6_info will not be taken; do it
	 * manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}