/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
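/* How these scores are consumed (see rt6_score_route() and find_match()
 * below): RT6_NUD_FAIL_HARD excludes a route outright,
 * RT6_NUD_FAIL_DO_RR makes the caller fall back to round-robin among
 * equal-metric routes, and RT6_NUD_FAIL_PROBE keeps the route usable
 * but with the lowest possible score.
 */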
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
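/* rt6_uncached_list_flush_dev() runs when a device goes away: every dst
 * that is not owned by the fib6 tree is re-parented onto the loopback
 * device of its namespace, so the references pinning the departing
 * device can be dropped even while sockets still hold the dst.
 */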
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
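/* This is hash-threshold selection: each sibling owns a contiguous
 * slice of the hash space bounded above by its nh_upper_bound, and a
 * flow takes the first sibling whose bound is >= its hash. With three
 * equal-weight nexthops, for example, the bounds sit near 1/3, 2/3 and
 * the maximum, so each nexthop attracts roughly a third of the flows
 * (illustrative numbers, not taken from the code).
 */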
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
			      idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
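/* Two probe paths above: when a neighbour entry already exists, probing
 * is delegated to the neighbour state machine via
 * __neigh_set_probe_once() and rate-limited against neigh->updated;
 * when none exists yet, the probe is rate-limited against
 * rt->last_probe and a neighbour solicitation is sent from a workqueue.
 */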
/*
 *	Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}
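	/* The length checks above follow RFC 4191: rinfo->length counts
	 * units of 8 octets, and the option carries 0, 8 or 16 prefix
	 * octets for lengths 1, 2 and 3 respectively, so only a length
	 * of 3 delivers a full 128-bit prefix; shorter prefixes are
	 * zero-extended via ipv6_addr_prefix() below.
	 */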
	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		goto fallback;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(rt);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, rt);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
	}

	rcu_read_unlock();

	return rt;
}
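/* Lookup order in ip6_pol_route_lookup() above: longest-prefix node
 * lookup, then rt6_device_match()/fib6_multipath_select() on the leaf
 * list, backtracking toward the root while only fib6_null_entry
 * matches, and finally a check of the exception (RTF_CACHE) table
 * before a dst is created or reused.
 */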
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
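/* The value returned above indexes a flat array of
 * FIB6_EXCEPTION_BUCKET_SIZE buckets hanging off the fib6_info; the
 * finder helpers below advance the caller's bucket pointer by this hash
 * ("*bucket += hval") before walking the chain, which is why they take
 * a struct rt6_exception_bucket ** rather than a plain pointer.
 */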
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
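/* Once exception_bucket_flushed is set under rt6_exception_lock, a
 * concurrent rt6_insert_exception() fails with -EINVAL, so the bucket
 * array cannot be repopulated while the owning fib6_info is being torn
 * down.
 */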
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
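/* Example: lowering a cached PMTU from 1500 to 1280 (the IPv6 minimum
 * MTU) is always accepted, while raising it is only accepted when the
 * cached value equals the local device MTU, i.e. when the old limit
 * cannot have come from a narrower remote hop (illustrative numbers).
 */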
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others still hold references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
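/* ip6_pol_route() hands back one of three dst flavours: a cached
 * exception route when one matches, an uncached RTF_CACHE clone for the
 * FLOWI_FLAG_KNOWN_NH case (tracked on rt6_uncached_list rather than in
 * the fib6 tree), or a per-cpu copy of the fib entry for the common
 * path.
 */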
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
l3mdev_link_scope_lookup(net, fl6); 2095 if (dst) 2096 return dst; 2097 } 2098 2099 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2100 2101 any_src = ipv6_addr_any(&fl6->saddr); 2102 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2103 (fl6->flowi6_oif && any_src)) 2104 flags |= RT6_LOOKUP_F_IFACE; 2105 2106 if (!any_src) 2107 flags |= RT6_LOOKUP_F_HAS_SADDR; 2108 else if (sk) 2109 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2110 2111 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2112 } 2113 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2114 2115 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2116 { 2117 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2118 struct net_device *loopback_dev = net->loopback_dev; 2119 struct dst_entry *new = NULL; 2120 2121 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2122 DST_OBSOLETE_DEAD, 0); 2123 if (rt) { 2124 rt6_info_init(rt); 2125 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2126 2127 new = &rt->dst; 2128 new->__use = 1; 2129 new->input = dst_discard; 2130 new->output = dst_discard_out; 2131 2132 dst_copy_metrics(new, &ort->dst); 2133 2134 rt->rt6i_idev = in6_dev_get(loopback_dev); 2135 rt->rt6i_gateway = ort->rt6i_gateway; 2136 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2137 2138 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2139 #ifdef CONFIG_IPV6_SUBTREES 2140 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2141 #endif 2142 } 2143 2144 dst_release(dst_orig); 2145 return new ? new : ERR_PTR(-ENOMEM); 2146 } 2147 2148 /* 2149 * Destination cache support functions 2150 */ 2151 2152 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2153 { 2154 u32 rt_cookie = 0; 2155 2156 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2157 return false; 2158 2159 if (fib6_check_expired(f6i)) 2160 return false; 2161 2162 return true; 2163 } 2164 2165 static struct dst_entry *rt6_check(struct rt6_info *rt, 2166 struct fib6_info *from, 2167 u32 cookie) 2168 { 2169 u32 rt_cookie = 0; 2170 2171 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2172 rt_cookie != cookie) 2173 return NULL; 2174 2175 if (rt6_check_expired(rt)) 2176 return NULL; 2177 2178 return &rt->dst; 2179 } 2180 2181 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2182 struct fib6_info *from, 2183 u32 cookie) 2184 { 2185 if (!__rt6_check_expired(rt) && 2186 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2187 fib6_check(from, cookie)) 2188 return &rt->dst; 2189 else 2190 return NULL; 2191 } 2192 2193 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2194 { 2195 struct dst_entry *dst_ret; 2196 struct fib6_info *from; 2197 struct rt6_info *rt; 2198 2199 rt = container_of(dst, struct rt6_info, dst); 2200 2201 rcu_read_lock(); 2202 2203 /* All IPV6 dsts are created with ->obsolete set to the value 2204 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2205 * into this function always. 
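 *
 * The cookie is, in effect, a snapshot of the fib6 tree sernum taken
 * when the dst was handed out; any later change to the tree bumps the
 * sernum, so the checks below fail and the caller must do a fresh
 * lookup.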
2206 */ 2207 2208 from = rcu_dereference(rt->from); 2209 2210 if (from && (rt->rt6i_flags & RTF_PCPU || 2211 unlikely(!list_empty(&rt->rt6i_uncached)))) 2212 dst_ret = rt6_dst_from_check(rt, from, cookie); 2213 else 2214 dst_ret = rt6_check(rt, from, cookie); 2215 2216 rcu_read_unlock(); 2217 2218 return dst_ret; 2219 } 2220 2221 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2222 { 2223 struct rt6_info *rt = (struct rt6_info *) dst; 2224 2225 if (rt) { 2226 if (rt->rt6i_flags & RTF_CACHE) { 2227 rcu_read_lock(); 2228 if (rt6_check_expired(rt)) { 2229 rt6_remove_exception_rt(rt); 2230 dst = NULL; 2231 } 2232 rcu_read_unlock(); 2233 } else { 2234 dst_release(dst); 2235 dst = NULL; 2236 } 2237 } 2238 return dst; 2239 } 2240 2241 static void ip6_link_failure(struct sk_buff *skb) 2242 { 2243 struct rt6_info *rt; 2244 2245 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2246 2247 rt = (struct rt6_info *) skb_dst(skb); 2248 if (rt) { 2249 rcu_read_lock(); 2250 if (rt->rt6i_flags & RTF_CACHE) { 2251 rt6_remove_exception_rt(rt); 2252 } else { 2253 struct fib6_info *from; 2254 struct fib6_node *fn; 2255 2256 from = rcu_dereference(rt->from); 2257 if (from) { 2258 fn = rcu_dereference(from->fib6_node); 2259 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2260 fn->fn_sernum = -1; 2261 } 2262 } 2263 rcu_read_unlock(); 2264 } 2265 } 2266 2267 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2268 { 2269 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2270 struct fib6_info *from; 2271 2272 rcu_read_lock(); 2273 from = rcu_dereference(rt0->from); 2274 if (from) 2275 rt0->dst.expires = from->expires; 2276 rcu_read_unlock(); 2277 } 2278 2279 dst_set_expires(&rt0->dst, timeout); 2280 rt0->rt6i_flags |= RTF_EXPIRES; 2281 } 2282 2283 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2284 { 2285 struct net *net = dev_net(rt->dst.dev); 2286 2287 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2288 rt->rt6i_flags |= RTF_MODIFIED; 2289 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2290 } 2291 2292 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2293 { 2294 return !(rt->rt6i_flags & RTF_CACHE) && 2295 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2296 } 2297 2298 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2299 const struct ipv6hdr *iph, u32 mtu) 2300 { 2301 const struct in6_addr *daddr, *saddr; 2302 struct rt6_info *rt6 = (struct rt6_info *)dst; 2303 2304 if (dst_metric_locked(dst, RTAX_MTU)) 2305 return; 2306 2307 if (iph) { 2308 daddr = &iph->daddr; 2309 saddr = &iph->saddr; 2310 } else if (sk) { 2311 daddr = &sk->sk_v6_daddr; 2312 saddr = &inet6_sk(sk)->saddr; 2313 } else { 2314 daddr = NULL; 2315 saddr = NULL; 2316 } 2317 dst_confirm_neigh(dst, daddr); 2318 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2319 if (mtu >= dst_mtu(dst)) 2320 return; 2321 2322 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2323 rt6_do_update_pmtu(rt6, mtu); 2324 /* update rt6_ex->stamp for cache */ 2325 if (rt6->rt6i_flags & RTF_CACHE) 2326 rt6_update_exception_stamp_rt(rt6); 2327 } else if (daddr) { 2328 struct fib6_info *from; 2329 struct rt6_info *nrt6; 2330 2331 rcu_read_lock(); 2332 from = rcu_dereference(rt6->from); 2333 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2334 if (nrt6) { 2335 rt6_do_update_pmtu(nrt6, mtu); 2336 if (rt6_insert_exception(nrt6, from)) 2337 dst_release_immediate(&nrt6->dst); 2338 } 2339 rcu_read_unlock(); 2340 } 2341 } 2342 2343 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock 
*sk, 2344 struct sk_buff *skb, u32 mtu) 2345 { 2346 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2347 } 2348 2349 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2350 int oif, u32 mark, kuid_t uid) 2351 { 2352 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2353 struct dst_entry *dst; 2354 struct flowi6 fl6 = { 2355 .flowi6_oif = oif, 2356 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2357 .daddr = iph->daddr, 2358 .saddr = iph->saddr, 2359 .flowlabel = ip6_flowinfo(iph), 2360 .flowi6_uid = uid, 2361 }; 2362 2363 dst = ip6_route_output(net, NULL, &fl6); 2364 if (!dst->error) 2365 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2366 dst_release(dst); 2367 } 2368 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2369 2370 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2371 { 2372 int oif = sk->sk_bound_dev_if; 2373 struct dst_entry *dst; 2374 2375 if (!oif && skb->dev) 2376 oif = l3mdev_master_ifindex(skb->dev); 2377 2378 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2379 2380 dst = __sk_dst_get(sk); 2381 if (!dst || !dst->obsolete || 2382 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2383 return; 2384 2385 bh_lock_sock(sk); 2386 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2387 ip6_datagram_dst_update(sk, false); 2388 bh_unlock_sock(sk); 2389 } 2390 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2391 2392 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2393 const struct flowi6 *fl6) 2394 { 2395 #ifdef CONFIG_IPV6_SUBTREES 2396 struct ipv6_pinfo *np = inet6_sk(sk); 2397 #endif 2398 2399 ip6_dst_store(sk, dst, 2400 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2401 &sk->sk_v6_daddr : NULL, 2402 #ifdef CONFIG_IPV6_SUBTREES 2403 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2404 &np->saddr : 2405 #endif 2406 NULL); 2407 } 2408 2409 /* Handle redirects */ 2410 struct ip6rd_flowi { 2411 struct flowi6 fl6; 2412 struct in6_addr gateway; 2413 }; 2414 2415 static struct rt6_info *__ip6_route_redirect(struct net *net, 2416 struct fib6_table *table, 2417 struct flowi6 *fl6, 2418 const struct sk_buff *skb, 2419 int flags) 2420 { 2421 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2422 struct rt6_info *ret = NULL, *rt_cache; 2423 struct fib6_info *rt; 2424 struct fib6_node *fn; 2425 2426 /* Get the "current" route for this destination and 2427 * check if the redirect has come from appropriate router. 2428 * 2429 * RFC 4861 specifies that redirects should only be 2430 * accepted if they come from the nexthop to the target. 2431 * Due to the way the routes are chosen, this notion 2432 * is a bit fuzzy and one might need to check all possible 2433 * routes. 2434 */ 2435 2436 rcu_read_lock(); 2437 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2438 restart: 2439 for_each_fib6_node_rt_rcu(fn) { 2440 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2441 continue; 2442 if (fib6_check_expired(rt)) 2443 continue; 2444 if (rt->fib6_flags & RTF_REJECT) 2445 break; 2446 if (!(rt->fib6_flags & RTF_GATEWAY)) 2447 continue; 2448 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2449 continue; 2450 /* rt_cache's gateway might be different from its 'parent' 2451 * in the case of an ip redirect. 2452 * So we keep searching in the exception table if the gateway 2453 * is different. 
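 *
 * In other words, an earlier redirect may already have installed an
 * exception entry whose gateway matches the router that sent this
 * one, even though the parent route still points elsewhere.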
2454 */ 2455 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2456 rt_cache = rt6_find_cached_rt(rt, 2457 &fl6->daddr, 2458 &fl6->saddr); 2459 if (rt_cache && 2460 ipv6_addr_equal(&rdfl->gateway, 2461 &rt_cache->rt6i_gateway)) { 2462 ret = rt_cache; 2463 break; 2464 } 2465 continue; 2466 } 2467 break; 2468 } 2469 2470 if (!rt) 2471 rt = net->ipv6.fib6_null_entry; 2472 else if (rt->fib6_flags & RTF_REJECT) { 2473 ret = net->ipv6.ip6_null_entry; 2474 goto out; 2475 } 2476 2477 if (rt == net->ipv6.fib6_null_entry) { 2478 fn = fib6_backtrack(fn, &fl6->saddr); 2479 if (fn) 2480 goto restart; 2481 } 2482 2483 out: 2484 if (ret) 2485 ip6_hold_safe(net, &ret, true); 2486 else 2487 ret = ip6_create_rt_rcu(rt); 2488 2489 rcu_read_unlock(); 2490 2491 trace_fib6_table_lookup(net, rt, table, fl6); 2492 return ret; 2493 }; 2494 2495 static struct dst_entry *ip6_route_redirect(struct net *net, 2496 const struct flowi6 *fl6, 2497 const struct sk_buff *skb, 2498 const struct in6_addr *gateway) 2499 { 2500 int flags = RT6_LOOKUP_F_HAS_SADDR; 2501 struct ip6rd_flowi rdfl; 2502 2503 rdfl.fl6 = *fl6; 2504 rdfl.gateway = *gateway; 2505 2506 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2507 flags, __ip6_route_redirect); 2508 } 2509 2510 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2511 kuid_t uid) 2512 { 2513 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2514 struct dst_entry *dst; 2515 struct flowi6 fl6 = { 2516 .flowi6_iif = LOOPBACK_IFINDEX, 2517 .flowi6_oif = oif, 2518 .flowi6_mark = mark, 2519 .daddr = iph->daddr, 2520 .saddr = iph->saddr, 2521 .flowlabel = ip6_flowinfo(iph), 2522 .flowi6_uid = uid, 2523 }; 2524 2525 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2526 rt6_do_redirect(dst, NULL, skb); 2527 dst_release(dst); 2528 } 2529 EXPORT_SYMBOL_GPL(ip6_redirect); 2530 2531 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 2532 { 2533 const struct ipv6hdr *iph = ipv6_hdr(skb); 2534 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2535 struct dst_entry *dst; 2536 struct flowi6 fl6 = { 2537 .flowi6_iif = LOOPBACK_IFINDEX, 2538 .flowi6_oif = oif, 2539 .daddr = msg->dest, 2540 .saddr = iph->daddr, 2541 .flowi6_uid = sock_net_uid(net, NULL), 2542 }; 2543 2544 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2545 rt6_do_redirect(dst, NULL, skb); 2546 dst_release(dst); 2547 } 2548 2549 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2550 { 2551 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2552 sk->sk_uid); 2553 } 2554 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2555 2556 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2557 { 2558 struct net_device *dev = dst->dev; 2559 unsigned int mtu = dst_mtu(dst); 2560 struct net *net = dev_net(dev); 2561 2562 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2563 2564 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2565 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2566 2567 /* 2568 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2569 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
2570 * IPV6_MAXPLEN is also valid and means: "any MSS, 2571 * rely only on pmtu discovery" 2572 */ 2573 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2574 mtu = IPV6_MAXPLEN; 2575 return mtu; 2576 } 2577 2578 static unsigned int ip6_mtu(const struct dst_entry *dst) 2579 { 2580 struct inet6_dev *idev; 2581 unsigned int mtu; 2582 2583 mtu = dst_metric_raw(dst, RTAX_MTU); 2584 if (mtu) 2585 goto out; 2586 2587 mtu = IPV6_MIN_MTU; 2588 2589 rcu_read_lock(); 2590 idev = __in6_dev_get(dst->dev); 2591 if (idev) 2592 mtu = idev->cnf.mtu6; 2593 rcu_read_unlock(); 2594 2595 out: 2596 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2597 2598 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2599 } 2600 2601 /* MTU selection: 2602 * 1. mtu on route is locked - use it 2603 * 2. mtu from nexthop exception 2604 * 3. mtu from egress device 2605 * 2606 * based on ip6_dst_mtu_forward and exception logic of 2607 * rt6_find_cached_rt; called with rcu_read_lock 2608 */ 2609 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2610 struct in6_addr *saddr) 2611 { 2612 struct rt6_exception_bucket *bucket; 2613 struct rt6_exception *rt6_ex; 2614 struct in6_addr *src_key; 2615 struct inet6_dev *idev; 2616 u32 mtu = 0; 2617 2618 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2619 mtu = f6i->fib6_pmtu; 2620 if (mtu) 2621 goto out; 2622 } 2623 2624 src_key = NULL; 2625 #ifdef CONFIG_IPV6_SUBTREES 2626 if (f6i->fib6_src.plen) 2627 src_key = saddr; 2628 #endif 2629 2630 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2631 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2632 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2633 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2634 2635 if (likely(!mtu)) { 2636 struct net_device *dev = fib6_info_nh_dev(f6i); 2637 2638 mtu = IPV6_MIN_MTU; 2639 idev = __in6_dev_get(dev); 2640 if (idev && idev->cnf.mtu6 > mtu) 2641 mtu = idev->cnf.mtu6; 2642 } 2643 2644 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2645 out: 2646 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2647 } 2648 2649 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2650 struct flowi6 *fl6) 2651 { 2652 struct dst_entry *dst; 2653 struct rt6_info *rt; 2654 struct inet6_dev *idev = in6_dev_get(dev); 2655 struct net *net = dev_net(dev); 2656 2657 if (unlikely(!idev)) 2658 return ERR_PTR(-ENODEV); 2659 2660 rt = ip6_dst_alloc(net, dev, 0); 2661 if (unlikely(!rt)) { 2662 in6_dev_put(idev); 2663 dst = ERR_PTR(-ENOMEM); 2664 goto out; 2665 } 2666 2667 rt->dst.flags |= DST_HOST; 2668 rt->dst.input = ip6_input; 2669 rt->dst.output = ip6_output; 2670 rt->rt6i_gateway = fl6->daddr; 2671 rt->rt6i_dst.addr = fl6->daddr; 2672 rt->rt6i_dst.plen = 128; 2673 rt->rt6i_idev = idev; 2674 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2675 2676 /* Add this dst into uncached_list so that rt6_disable_ip() can 2677 * do proper release of the net_device 2678 */ 2679 rt6_uncached_list_add(rt); 2680 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2681 2682 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2683 2684 out: 2685 return dst; 2686 } 2687 2688 static int ip6_dst_gc(struct dst_ops *ops) 2689 { 2690 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2691 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2692 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2693 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2694 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2695 unsigned long rt_last_gc = 
net->ipv6.ip6_rt_last_gc; 2696 int entries; 2697 2698 entries = dst_entries_get_fast(ops); 2699 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2700 entries <= rt_max_size) 2701 goto out; 2702 2703 net->ipv6.ip6_rt_gc_expire++; 2704 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2705 entries = dst_entries_get_slow(ops); 2706 if (entries < ops->gc_thresh) 2707 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2708 out: 2709 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2710 return entries > rt_max_size; 2711 } 2712 2713 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2714 struct fib6_config *cfg, 2715 const struct in6_addr *gw_addr, 2716 u32 tbid, int flags) 2717 { 2718 struct flowi6 fl6 = { 2719 .flowi6_oif = cfg->fc_ifindex, 2720 .daddr = *gw_addr, 2721 .saddr = cfg->fc_prefsrc, 2722 }; 2723 struct fib6_table *table; 2724 struct rt6_info *rt; 2725 2726 table = fib6_get_table(net, tbid); 2727 if (!table) 2728 return NULL; 2729 2730 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2731 flags |= RT6_LOOKUP_F_HAS_SADDR; 2732 2733 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2734 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2735 2736 /* if table lookup failed, fall back to full lookup */ 2737 if (rt == net->ipv6.ip6_null_entry) { 2738 ip6_rt_put(rt); 2739 rt = NULL; 2740 } 2741 2742 return rt; 2743 } 2744 2745 static int ip6_route_check_nh_onlink(struct net *net, 2746 struct fib6_config *cfg, 2747 const struct net_device *dev, 2748 struct netlink_ext_ack *extack) 2749 { 2750 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2751 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2752 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2753 struct fib6_info *from; 2754 struct rt6_info *grt; 2755 int err; 2756 2757 err = 0; 2758 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2759 if (grt) { 2760 rcu_read_lock(); 2761 from = rcu_dereference(grt->from); 2762 if (!grt->dst.error && 2763 /* ignore match if it is the default route */ 2764 from && !ipv6_addr_any(&from->fib6_dst.addr) && 2765 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2766 NL_SET_ERR_MSG(extack, 2767 "Nexthop has invalid gateway or device mismatch"); 2768 err = -EINVAL; 2769 } 2770 rcu_read_unlock(); 2771 2772 ip6_rt_put(grt); 2773 } 2774 2775 return err; 2776 } 2777 2778 static int ip6_route_check_nh(struct net *net, 2779 struct fib6_config *cfg, 2780 struct net_device **_dev, 2781 struct inet6_dev **idev) 2782 { 2783 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2784 struct net_device *dev = _dev ? 
*_dev : NULL;
2785 	struct rt6_info *grt = NULL;
2786 	int err = -EHOSTUNREACH;
2787 
2788 	if (cfg->fc_table) {
2789 		int flags = RT6_LOOKUP_F_IFACE;
2790 
2791 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2792 					  cfg->fc_table, flags);
2793 		if (grt) {
2794 			if (grt->rt6i_flags & RTF_GATEWAY ||
2795 			    (dev && dev != grt->dst.dev)) {
2796 				ip6_rt_put(grt);
2797 				grt = NULL;
2798 			}
2799 		}
2800 	}
2801 
2802 	if (!grt)
2803 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2804 
2805 	if (!grt)
2806 		goto out;
2807 
2808 	if (dev) {
2809 		if (dev != grt->dst.dev) {
2810 			ip6_rt_put(grt);
2811 			goto out;
2812 		}
2813 	} else {
2814 		*_dev = dev = grt->dst.dev;
2815 		*idev = grt->rt6i_idev;
2816 		dev_hold(dev);
2817 		in6_dev_hold(grt->rt6i_idev);
2818 	}
2819 
2820 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2821 		err = 0;
2822 
2823 	ip6_rt_put(grt);
2824 
2825 out:
2826 	return err;
2827 }
2828 
2829 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2830 			   struct net_device **_dev, struct inet6_dev **idev,
2831 			   struct netlink_ext_ack *extack)
2832 {
2833 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2834 	int gwa_type = ipv6_addr_type(gw_addr);
2835 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2836 	const struct net_device *dev = *_dev;
2837 	bool need_addr_check = !dev;
2838 	int err = -EINVAL;
2839 
2840 	/* if gw_addr is local we will fail to detect this in case
2841 	 * the address is still TENTATIVE (DAD in progress). rt6_lookup()
2842 	 * will return the already-added prefix route via the interface that
2843 	 * the prefix route was assigned to, which might be non-loopback.
2844 	 */
2845 	if (dev &&
2846 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2847 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2848 		goto out;
2849 	}
2850 
2851 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2852 		/* IPv6 strictly inhibits using non-link-local
2853 		 * addresses as the nexthop address.
2854 		 * Otherwise, the router will not be able to send redirects.
2855 		 * That is usually right, but in some (rare!) circumstances
2856 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2857 		 * some exceptions. --ANK
2858 		 * We allow IPv4-mapped nexthops to support RFC4798-type
2859 		 * addressing.
2860 		 */
2861 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2862 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2863 			goto out;
2864 		}
2865 
2866 		if (cfg->fc_flags & RTNH_F_ONLINK)
2867 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2868 		else
2869 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2870 
2871 		if (err)
2872 			goto out;
2873 	}
2874 
2875 	/* reload in case the device was changed */
2876 	dev = *_dev;
2877 
2878 	err = -EINVAL;
2879 	if (!dev) {
2880 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2881 		goto out;
2882 	} else if (dev->flags & IFF_LOOPBACK) {
2883 		NL_SET_ERR_MSG(extack,
2884 			       "Egress device can not be loopback device for this route");
2885 		goto out;
2886 	}
2887 
2888 	/* if we did not check gw_addr above, do so now that the
2889 	 * egress device has been resolved.
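	 * (need_addr_check is only set when the caller did not supply a
	 * device, i.e. dev may have been picked up by ip6_route_check_nh()
	 * above.)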
2890 */ 2891 if (need_addr_check && 2892 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2893 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2894 goto out; 2895 } 2896 2897 err = 0; 2898 out: 2899 return err; 2900 } 2901 2902 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2903 gfp_t gfp_flags, 2904 struct netlink_ext_ack *extack) 2905 { 2906 struct net *net = cfg->fc_nlinfo.nl_net; 2907 struct fib6_info *rt = NULL; 2908 struct net_device *dev = NULL; 2909 struct inet6_dev *idev = NULL; 2910 struct fib6_table *table; 2911 int addr_type; 2912 int err = -EINVAL; 2913 2914 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2915 if (cfg->fc_flags & RTF_PCPU) { 2916 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2917 goto out; 2918 } 2919 2920 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2921 if (cfg->fc_flags & RTF_CACHE) { 2922 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2923 goto out; 2924 } 2925 2926 if (cfg->fc_type > RTN_MAX) { 2927 NL_SET_ERR_MSG(extack, "Invalid route type"); 2928 goto out; 2929 } 2930 2931 if (cfg->fc_dst_len > 128) { 2932 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2933 goto out; 2934 } 2935 if (cfg->fc_src_len > 128) { 2936 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2937 goto out; 2938 } 2939 #ifndef CONFIG_IPV6_SUBTREES 2940 if (cfg->fc_src_len) { 2941 NL_SET_ERR_MSG(extack, 2942 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2943 goto out; 2944 } 2945 #endif 2946 if (cfg->fc_ifindex) { 2947 err = -ENODEV; 2948 dev = dev_get_by_index(net, cfg->fc_ifindex); 2949 if (!dev) 2950 goto out; 2951 idev = in6_dev_get(dev); 2952 if (!idev) 2953 goto out; 2954 } 2955 2956 if (cfg->fc_metric == 0) 2957 cfg->fc_metric = IP6_RT_PRIO_USER; 2958 2959 if (cfg->fc_flags & RTNH_F_ONLINK) { 2960 if (!dev) { 2961 NL_SET_ERR_MSG(extack, 2962 "Nexthop device required for onlink"); 2963 err = -ENODEV; 2964 goto out; 2965 } 2966 2967 if (!(dev->flags & IFF_UP)) { 2968 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2969 err = -ENETDOWN; 2970 goto out; 2971 } 2972 } 2973 2974 err = -ENOBUFS; 2975 if (cfg->fc_nlinfo.nlh && 2976 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2977 table = fib6_get_table(net, cfg->fc_table); 2978 if (!table) { 2979 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2980 table = fib6_new_table(net, cfg->fc_table); 2981 } 2982 } else { 2983 table = fib6_new_table(net, cfg->fc_table); 2984 } 2985 2986 if (!table) 2987 goto out; 2988 2989 err = -ENOMEM; 2990 rt = fib6_info_alloc(gfp_flags); 2991 if (!rt) 2992 goto out; 2993 2994 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 2995 extack); 2996 if (IS_ERR(rt->fib6_metrics)) { 2997 err = PTR_ERR(rt->fib6_metrics); 2998 /* Do not leave garbage there. 
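		 * Point fib6_metrics at the shared, read-only default
		 * metrics so that fib6_info_release() below does not try
		 * to free a bogus pointer.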
*/ 2999 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3000 goto out; 3001 } 3002 3003 if (cfg->fc_flags & RTF_ADDRCONF) 3004 rt->dst_nocount = true; 3005 3006 if (cfg->fc_flags & RTF_EXPIRES) 3007 fib6_set_expires(rt, jiffies + 3008 clock_t_to_jiffies(cfg->fc_expires)); 3009 else 3010 fib6_clean_expires(rt); 3011 3012 if (cfg->fc_protocol == RTPROT_UNSPEC) 3013 cfg->fc_protocol = RTPROT_BOOT; 3014 rt->fib6_protocol = cfg->fc_protocol; 3015 3016 addr_type = ipv6_addr_type(&cfg->fc_dst); 3017 3018 if (cfg->fc_encap) { 3019 struct lwtunnel_state *lwtstate; 3020 3021 err = lwtunnel_build_state(cfg->fc_encap_type, 3022 cfg->fc_encap, AF_INET6, cfg, 3023 &lwtstate, extack); 3024 if (err) 3025 goto out; 3026 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 3027 } 3028 3029 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3030 rt->fib6_dst.plen = cfg->fc_dst_len; 3031 if (rt->fib6_dst.plen == 128) 3032 rt->dst_host = true; 3033 3034 #ifdef CONFIG_IPV6_SUBTREES 3035 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3036 rt->fib6_src.plen = cfg->fc_src_len; 3037 #endif 3038 3039 rt->fib6_metric = cfg->fc_metric; 3040 rt->fib6_nh.nh_weight = 1; 3041 3042 rt->fib6_type = cfg->fc_type; 3043 3044 /* We cannot add true routes via loopback here, 3045 they would result in kernel looping; promote them to reject routes 3046 */ 3047 if ((cfg->fc_flags & RTF_REJECT) || 3048 (dev && (dev->flags & IFF_LOOPBACK) && 3049 !(addr_type & IPV6_ADDR_LOOPBACK) && 3050 !(cfg->fc_flags & RTF_LOCAL))) { 3051 /* hold loopback dev/idev if we haven't done so. */ 3052 if (dev != net->loopback_dev) { 3053 if (dev) { 3054 dev_put(dev); 3055 in6_dev_put(idev); 3056 } 3057 dev = net->loopback_dev; 3058 dev_hold(dev); 3059 idev = in6_dev_get(dev); 3060 if (!idev) { 3061 err = -ENODEV; 3062 goto out; 3063 } 3064 } 3065 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3066 goto install_route; 3067 } 3068 3069 if (cfg->fc_flags & RTF_GATEWAY) { 3070 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3071 if (err) 3072 goto out; 3073 3074 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3075 } 3076 3077 err = -ENODEV; 3078 if (!dev) 3079 goto out; 3080 3081 if (idev->cnf.disable_ipv6) { 3082 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3083 err = -EACCES; 3084 goto out; 3085 } 3086 3087 if (!(dev->flags & IFF_UP)) { 3088 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3089 err = -ENETDOWN; 3090 goto out; 3091 } 3092 3093 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3094 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3095 NL_SET_ERR_MSG(extack, "Invalid source address"); 3096 err = -EINVAL; 3097 goto out; 3098 } 3099 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3100 rt->fib6_prefsrc.plen = 128; 3101 } else 3102 rt->fib6_prefsrc.plen = 0; 3103 3104 rt->fib6_flags = cfg->fc_flags; 3105 3106 install_route: 3107 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3108 !netif_carrier_ok(dev)) 3109 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3110 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3111 rt->fib6_nh.nh_dev = dev; 3112 rt->fib6_table = table; 3113 3114 if (idev) 3115 in6_dev_put(idev); 3116 3117 return rt; 3118 out: 3119 if (dev) 3120 dev_put(dev); 3121 if (idev) 3122 in6_dev_put(idev); 3123 3124 fib6_info_release(rt); 3125 return ERR_PTR(err); 3126 } 3127 3128 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3129 struct netlink_ext_ack *extack) 3130 { 3131 struct fib6_info *rt; 3132 int err; 3133 3134 rt = ip6_route_info_create(cfg, gfp_flags, extack); 
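	/* The new fib6_info is not linked into any table yet;
	 * __ip6_ins_rt() below inserts it and takes its own reference,
	 * after which the creation reference can be dropped.
	 */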
3135 if (IS_ERR(rt)) 3136 return PTR_ERR(rt); 3137 3138 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3139 fib6_info_release(rt); 3140 3141 return err; 3142 } 3143 3144 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3145 { 3146 struct net *net = info->nl_net; 3147 struct fib6_table *table; 3148 int err; 3149 3150 if (rt == net->ipv6.fib6_null_entry) { 3151 err = -ENOENT; 3152 goto out; 3153 } 3154 3155 table = rt->fib6_table; 3156 spin_lock_bh(&table->tb6_lock); 3157 err = fib6_del(rt, info); 3158 spin_unlock_bh(&table->tb6_lock); 3159 3160 out: 3161 fib6_info_release(rt); 3162 return err; 3163 } 3164 3165 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3166 { 3167 struct nl_info info = { .nl_net = net }; 3168 3169 return __ip6_del_rt(rt, &info); 3170 } 3171 3172 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3173 { 3174 struct nl_info *info = &cfg->fc_nlinfo; 3175 struct net *net = info->nl_net; 3176 struct sk_buff *skb = NULL; 3177 struct fib6_table *table; 3178 int err = -ENOENT; 3179 3180 if (rt == net->ipv6.fib6_null_entry) 3181 goto out_put; 3182 table = rt->fib6_table; 3183 spin_lock_bh(&table->tb6_lock); 3184 3185 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3186 struct fib6_info *sibling, *next_sibling; 3187 3188 /* prefer to send a single notification with all hops */ 3189 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3190 if (skb) { 3191 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3192 3193 if (rt6_fill_node(net, skb, rt, NULL, 3194 NULL, NULL, 0, RTM_DELROUTE, 3195 info->portid, seq, 0) < 0) { 3196 kfree_skb(skb); 3197 skb = NULL; 3198 } else 3199 info->skip_notify = 1; 3200 } 3201 3202 list_for_each_entry_safe(sibling, next_sibling, 3203 &rt->fib6_siblings, 3204 fib6_siblings) { 3205 err = fib6_del(sibling, info); 3206 if (err) 3207 goto out_unlock; 3208 } 3209 } 3210 3211 err = fib6_del(rt, info); 3212 out_unlock: 3213 spin_unlock_bh(&table->tb6_lock); 3214 out_put: 3215 fib6_info_release(rt); 3216 3217 if (skb) { 3218 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3219 info->nlh, gfp_any()); 3220 } 3221 return err; 3222 } 3223 3224 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3225 { 3226 int rc = -ESRCH; 3227 3228 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3229 goto out; 3230 3231 if (cfg->fc_flags & RTF_GATEWAY && 3232 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3233 goto out; 3234 3235 rc = rt6_remove_exception_rt(rt); 3236 out: 3237 return rc; 3238 } 3239 3240 static int ip6_route_del(struct fib6_config *cfg, 3241 struct netlink_ext_ack *extack) 3242 { 3243 struct rt6_info *rt_cache; 3244 struct fib6_table *table; 3245 struct fib6_info *rt; 3246 struct fib6_node *fn; 3247 int err = -ESRCH; 3248 3249 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3250 if (!table) { 3251 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3252 return err; 3253 } 3254 3255 rcu_read_lock(); 3256 3257 fn = fib6_locate(&table->tb6_root, 3258 &cfg->fc_dst, cfg->fc_dst_len, 3259 &cfg->fc_src, cfg->fc_src_len, 3260 !(cfg->fc_flags & RTF_CACHE)); 3261 3262 if (fn) { 3263 for_each_fib6_node_rt_rcu(fn) { 3264 if (cfg->fc_flags & RTF_CACHE) { 3265 int rc; 3266 3267 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3268 &cfg->fc_src); 3269 if (rt_cache) { 3270 rc = ip6_del_cached_rt(rt_cache, cfg); 3271 if (rc != -ESRCH) { 3272 rcu_read_unlock(); 3273 return rc; 3274 } 3275 } 3276 continue; 3277 } 3278 if (cfg->fc_ifindex && 3279 
(!rt->fib6_nh.nh_dev || 3280 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3281 continue; 3282 if (cfg->fc_flags & RTF_GATEWAY && 3283 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3284 continue; 3285 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3286 continue; 3287 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3288 continue; 3289 if (!fib6_info_hold_safe(rt)) 3290 continue; 3291 rcu_read_unlock(); 3292 3293 /* if gateway was specified only delete the one hop */ 3294 if (cfg->fc_flags & RTF_GATEWAY) 3295 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3296 3297 return __ip6_del_rt_siblings(rt, cfg); 3298 } 3299 } 3300 rcu_read_unlock(); 3301 3302 return err; 3303 } 3304 3305 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3306 { 3307 struct netevent_redirect netevent; 3308 struct rt6_info *rt, *nrt = NULL; 3309 struct ndisc_options ndopts; 3310 struct inet6_dev *in6_dev; 3311 struct neighbour *neigh; 3312 struct fib6_info *from; 3313 struct rd_msg *msg; 3314 int optlen, on_link; 3315 u8 *lladdr; 3316 3317 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3318 optlen -= sizeof(*msg); 3319 3320 if (optlen < 0) { 3321 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3322 return; 3323 } 3324 3325 msg = (struct rd_msg *)icmp6_hdr(skb); 3326 3327 if (ipv6_addr_is_multicast(&msg->dest)) { 3328 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3329 return; 3330 } 3331 3332 on_link = 0; 3333 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3334 on_link = 1; 3335 } else if (ipv6_addr_type(&msg->target) != 3336 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3337 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3338 return; 3339 } 3340 3341 in6_dev = __in6_dev_get(skb->dev); 3342 if (!in6_dev) 3343 return; 3344 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3345 return; 3346 3347 /* RFC2461 8.1: 3348 * The IP source address of the Redirect MUST be the same as the current 3349 * first-hop router for the specified ICMP Destination Address. 3350 */ 3351 3352 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3353 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3354 return; 3355 } 3356 3357 lladdr = NULL; 3358 if (ndopts.nd_opts_tgt_lladdr) { 3359 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3360 skb->dev); 3361 if (!lladdr) { 3362 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3363 return; 3364 } 3365 } 3366 3367 rt = (struct rt6_info *) dst; 3368 if (rt->rt6i_flags & RTF_REJECT) { 3369 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3370 return; 3371 } 3372 3373 /* Redirect received -> path was valid. 3374 * Look, redirects are sent only in response to data packets, 3375 * so that this nexthop apparently is reachable. --ANK 3376 */ 3377 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3378 3379 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3380 if (!neigh) 3381 return; 3382 3383 /* 3384 * We have finally decided to accept it. 3385 */ 3386 3387 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3388 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3389 NEIGH_UPDATE_F_OVERRIDE| 3390 (on_link ? 
0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3391 				 NEIGH_UPDATE_F_ISROUTER)),
3392 		     NDISC_REDIRECT, &ndopts);
3393 
3394 	rcu_read_lock();
3395 	from = rcu_dereference(rt->from);
3396 	/* This fib6_info_hold() is safe here because we hold a reference
3397 	 * to rt and rt already holds a reference to the fib6_info.
3398 	 */
3399 	fib6_info_hold(from);
3400 	rcu_read_unlock();
3401 
3402 	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3403 	if (!nrt)
3404 		goto out;
3405 
3406 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3407 	if (on_link)
3408 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3409 
3410 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3411 
3412 	/* No need to remove rt from the exception table if rt is
3413 	 * a cached route because rt6_insert_exception() will take
3414 	 * care of it
3415 	 */
3416 	if (rt6_insert_exception(nrt, from)) {
3417 		dst_release_immediate(&nrt->dst);
3418 		goto out;
3419 	}
3420 
3421 	netevent.old = &rt->dst;
3422 	netevent.new = &nrt->dst;
3423 	netevent.daddr = &msg->dest;
3424 	netevent.neigh = neigh;
3425 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3426 
3427 out:
3428 	fib6_info_release(from);
3429 	neigh_release(neigh);
3430 }
3431 
3432 #ifdef CONFIG_IPV6_ROUTE_INFO
3433 static struct fib6_info *rt6_get_route_info(struct net *net,
3434 					   const struct in6_addr *prefix, int prefixlen,
3435 					   const struct in6_addr *gwaddr,
3436 					   struct net_device *dev)
3437 {
3438 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3439 	int ifindex = dev->ifindex;
3440 	struct fib6_node *fn;
3441 	struct fib6_info *rt = NULL;
3442 	struct fib6_table *table;
3443 
3444 	table = fib6_get_table(net, tb_id);
3445 	if (!table)
3446 		return NULL;
3447 
3448 	rcu_read_lock();
3449 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3450 	if (!fn)
3451 		goto out;
3452 
3453 	for_each_fib6_node_rt_rcu(fn) {
3454 		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3455 			continue;
3456 		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3457 			continue;
3458 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3459 			continue;
3460 		if (!fib6_info_hold_safe(rt))
3461 			continue;
3462 		break;
3463 	}
3464 out:
3465 	rcu_read_unlock();
3466 	return rt;
3467 }
3468 
3469 static struct fib6_info *rt6_add_route_info(struct net *net,
3470 					   const struct in6_addr *prefix, int prefixlen,
3471 					   const struct in6_addr *gwaddr,
3472 					   struct net_device *dev,
3473 					   unsigned int pref)
3474 {
3475 	struct fib6_config cfg = {
3476 		.fc_metric	= IP6_RT_PRIO_USER,
3477 		.fc_ifindex	= dev->ifindex,
3478 		.fc_dst_len	= prefixlen,
3479 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3480 				  RTF_UP | RTF_PREF(pref),
3481 		.fc_protocol = RTPROT_RA,
3482 		.fc_type = RTN_UNICAST,
3483 		.fc_nlinfo.portid = 0,
3484 		.fc_nlinfo.nlh = NULL,
3485 		.fc_nlinfo.nl_net = net,
3486 	};
3487 
3488 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3489 	cfg.fc_dst = *prefix;
3490 	cfg.fc_gateway = *gwaddr;
3491 
3492 	/* We should treat it as a default route if prefix length is 0. */
3493 	if (!prefixlen)
3494 		cfg.fc_flags |= RTF_DEFAULT;
3495 
3496 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3497 
3498 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3499 }
3500 #endif
3501 
3502 struct fib6_info *rt6_get_dflt_router(struct net *net,
3503 				     const struct in6_addr *addr,
3504 				     struct net_device *dev)
3505 {
3506 	u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_DFLT; 3507 struct fib6_info *rt; 3508 struct fib6_table *table; 3509 3510 table = fib6_get_table(net, tb_id); 3511 if (!table) 3512 return NULL; 3513 3514 rcu_read_lock(); 3515 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3516 if (dev == rt->fib6_nh.nh_dev && 3517 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3518 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3519 break; 3520 } 3521 if (rt && !fib6_info_hold_safe(rt)) 3522 rt = NULL; 3523 rcu_read_unlock(); 3524 return rt; 3525 } 3526 3527 struct fib6_info *rt6_add_dflt_router(struct net *net, 3528 const struct in6_addr *gwaddr, 3529 struct net_device *dev, 3530 unsigned int pref) 3531 { 3532 struct fib6_config cfg = { 3533 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3534 .fc_metric = IP6_RT_PRIO_USER, 3535 .fc_ifindex = dev->ifindex, 3536 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3537 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3538 .fc_protocol = RTPROT_RA, 3539 .fc_type = RTN_UNICAST, 3540 .fc_nlinfo.portid = 0, 3541 .fc_nlinfo.nlh = NULL, 3542 .fc_nlinfo.nl_net = net, 3543 }; 3544 3545 cfg.fc_gateway = *gwaddr; 3546 3547 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3548 struct fib6_table *table; 3549 3550 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3551 if (table) 3552 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3553 } 3554 3555 return rt6_get_dflt_router(net, gwaddr, dev); 3556 } 3557 3558 static void __rt6_purge_dflt_routers(struct net *net, 3559 struct fib6_table *table) 3560 { 3561 struct fib6_info *rt; 3562 3563 restart: 3564 rcu_read_lock(); 3565 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3566 struct net_device *dev = fib6_info_nh_dev(rt); 3567 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3568 3569 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3570 (!idev || idev->cnf.accept_ra != 2) && 3571 fib6_info_hold_safe(rt)) { 3572 rcu_read_unlock(); 3573 ip6_del_rt(net, rt); 3574 goto restart; 3575 } 3576 } 3577 rcu_read_unlock(); 3578 3579 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3580 } 3581 3582 void rt6_purge_dflt_routers(struct net *net) 3583 { 3584 struct fib6_table *table; 3585 struct hlist_head *head; 3586 unsigned int h; 3587 3588 rcu_read_lock(); 3589 3590 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3591 head = &net->ipv6.fib_table_hash[h]; 3592 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3593 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3594 __rt6_purge_dflt_routers(net, table); 3595 } 3596 } 3597 3598 rcu_read_unlock(); 3599 } 3600 3601 static void rtmsg_to_fib6_config(struct net *net, 3602 struct in6_rtmsg *rtmsg, 3603 struct fib6_config *cfg) 3604 { 3605 *cfg = (struct fib6_config){ 3606 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3607 : RT6_TABLE_MAIN, 3608 .fc_ifindex = rtmsg->rtmsg_ifindex, 3609 .fc_metric = rtmsg->rtmsg_metric, 3610 .fc_expires = rtmsg->rtmsg_info, 3611 .fc_dst_len = rtmsg->rtmsg_dst_len, 3612 .fc_src_len = rtmsg->rtmsg_src_len, 3613 .fc_flags = rtmsg->rtmsg_flags, 3614 .fc_type = rtmsg->rtmsg_type, 3615 3616 .fc_nlinfo.nl_net = net, 3617 3618 .fc_dst = rtmsg->rtmsg_dst, 3619 .fc_src = rtmsg->rtmsg_src, 3620 .fc_gateway = rtmsg->rtmsg_gateway, 3621 }; 3622 } 3623 3624 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3625 { 3626 struct fib6_config cfg; 3627 struct in6_rtmsg rtmsg; 3628 int err; 3629 3630 switch (cmd) { 3631 case SIOCADDRT: /* Add a route */ 3632 case SIOCDELRT: /* Delete a route */ 3633 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3634 return -EPERM; 3635 err = copy_from_user(&rtmsg, arg, 3636 sizeof(struct in6_rtmsg)); 3637 if (err) 3638 return -EFAULT; 3639 3640 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3641 3642 rtnl_lock(); 3643 switch (cmd) { 3644 case SIOCADDRT: 3645 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3646 break; 3647 case SIOCDELRT: 3648 err = ip6_route_del(&cfg, NULL); 3649 break; 3650 default: 3651 err = -EINVAL; 3652 } 3653 rtnl_unlock(); 3654 3655 return err; 3656 } 3657 3658 return -EINVAL; 3659 } 3660 3661 /* 3662 * Drop the packet on the floor 3663 */ 3664 3665 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3666 { 3667 int type; 3668 struct dst_entry *dst = skb_dst(skb); 3669 switch (ipstats_mib_noroutes) { 3670 case IPSTATS_MIB_INNOROUTES: 3671 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3672 if (type == IPV6_ADDR_ANY) { 3673 IP6_INC_STATS(dev_net(dst->dev), 3674 __in6_dev_get_safely(skb->dev), 3675 IPSTATS_MIB_INADDRERRORS); 3676 break; 3677 } 3678 /* FALLTHROUGH */ 3679 case IPSTATS_MIB_OUTNOROUTES: 3680 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3681 ipstats_mib_noroutes); 3682 break; 3683 } 3684 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3685 kfree_skb(skb); 3686 return 0; 3687 } 3688 3689 static int ip6_pkt_discard(struct sk_buff *skb) 3690 { 3691 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3692 } 3693 3694 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3695 { 3696 skb->dev = skb_dst(skb)->dev; 3697 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3698 } 3699 3700 static int ip6_pkt_prohibit(struct sk_buff *skb) 3701 { 3702 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3703 } 3704 3705 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3706 { 3707 skb->dev = skb_dst(skb)->dev; 3708 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3709 } 3710 3711 /* 3712 * Allocate a dst for local (unicast / anycast) address. 
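 * These are the host routes that addrconf inserts into the local
 * table when an address is added to an interface.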
3713  */
3714 
3715 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3716 				     struct inet6_dev *idev,
3717 				     const struct in6_addr *addr,
3718 				     bool anycast, gfp_t gfp_flags)
3719 {
3720 	u32 tb_id;
3721 	struct net_device *dev = idev->dev;
3722 	struct fib6_info *f6i;
3723 
3724 	f6i = fib6_info_alloc(gfp_flags);
3725 	if (!f6i)
3726 		return ERR_PTR(-ENOMEM);
3727 
3728 	f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
3729 	f6i->dst_nocount = true;
3730 	f6i->dst_host = true;
3731 	f6i->fib6_protocol = RTPROT_KERNEL;
3732 	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3733 	if (anycast) {
3734 		f6i->fib6_type = RTN_ANYCAST;
3735 		f6i->fib6_flags |= RTF_ANYCAST;
3736 	} else {
3737 		f6i->fib6_type = RTN_LOCAL;
3738 		f6i->fib6_flags |= RTF_LOCAL;
3739 	}
3740 
3741 	f6i->fib6_nh.nh_gw = *addr;
3742 	dev_hold(dev);
3743 	f6i->fib6_nh.nh_dev = dev;
3744 	f6i->fib6_dst.addr = *addr;
3745 	f6i->fib6_dst.plen = 128;
3746 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3747 	f6i->fib6_table = fib6_get_table(net, tb_id);
3748 
3749 	return f6i;
3750 }
3751 
3752 /* remove a deleted address from prefsrc entries */
3753 struct arg_dev_net_ip {
3754 	struct net_device *dev;
3755 	struct net *net;
3756 	struct in6_addr *addr;
3757 };
3758 
3759 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3760 {
3761 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3762 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3763 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3764 
3765 	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3766 	    rt != net->ipv6.fib6_null_entry &&
3767 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3768 		spin_lock_bh(&rt6_exception_lock);
3769 		/* remove prefsrc entry */
3770 		rt->fib6_prefsrc.plen = 0;
3771 		spin_unlock_bh(&rt6_exception_lock);
3772 	}
3773 	return 0;
3774 }
3775 
3776 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3777 {
3778 	struct net *net = dev_net(ifp->idev->dev);
3779 	struct arg_dev_net_ip adni = {
3780 		.dev = ifp->idev->dev,
3781 		.net = net,
3782 		.addr = &ifp->addr,
3783 	};
3784 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3785 }
3786 
3787 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3788 
3789 /* Remove routers and update dst entries when a gateway turns into a host. */
3790 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3791 {
3792 	struct in6_addr *gateway = (struct in6_addr *)arg;
3793 
3794 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3795 	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3796 		return -1;
3797 	}
3798 
3799 	/* Further clean up cached routes in exception table.
3800 	 * This is needed because a cached route may have a different
3801 	 * gateway than its 'parent' in the case of an ip redirect.
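	 * rt6_exceptions_clean_tohost() below walks the exception
	 * bucket and drops the cached entries whose rt6i_gateway
	 * matches the demoted router.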
3802 */ 3803 rt6_exceptions_clean_tohost(rt, gateway); 3804 3805 return 0; 3806 } 3807 3808 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3809 { 3810 fib6_clean_all(net, fib6_clean_tohost, gateway); 3811 } 3812 3813 struct arg_netdev_event { 3814 const struct net_device *dev; 3815 union { 3816 unsigned int nh_flags; 3817 unsigned long event; 3818 }; 3819 }; 3820 3821 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3822 { 3823 struct fib6_info *iter; 3824 struct fib6_node *fn; 3825 3826 fn = rcu_dereference_protected(rt->fib6_node, 3827 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3828 iter = rcu_dereference_protected(fn->leaf, 3829 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3830 while (iter) { 3831 if (iter->fib6_metric == rt->fib6_metric && 3832 rt6_qualify_for_ecmp(iter)) 3833 return iter; 3834 iter = rcu_dereference_protected(iter->fib6_next, 3835 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3836 } 3837 3838 return NULL; 3839 } 3840 3841 static bool rt6_is_dead(const struct fib6_info *rt) 3842 { 3843 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3844 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3845 fib6_ignore_linkdown(rt))) 3846 return true; 3847 3848 return false; 3849 } 3850 3851 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3852 { 3853 struct fib6_info *iter; 3854 int total = 0; 3855 3856 if (!rt6_is_dead(rt)) 3857 total += rt->fib6_nh.nh_weight; 3858 3859 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3860 if (!rt6_is_dead(iter)) 3861 total += iter->fib6_nh.nh_weight; 3862 } 3863 3864 return total; 3865 } 3866 3867 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3868 { 3869 int upper_bound = -1; 3870 3871 if (!rt6_is_dead(rt)) { 3872 *weight += rt->fib6_nh.nh_weight; 3873 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3874 total) - 1; 3875 } 3876 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3877 } 3878 3879 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3880 { 3881 struct fib6_info *iter; 3882 int weight = 0; 3883 3884 rt6_upper_bound_set(rt, &weight, total); 3885 3886 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3887 rt6_upper_bound_set(iter, &weight, total); 3888 } 3889 3890 void rt6_multipath_rebalance(struct fib6_info *rt) 3891 { 3892 struct fib6_info *first; 3893 int total; 3894 3895 /* In case the entire multipath route was marked for flushing, 3896 * then there is no need to rebalance upon the removal of every 3897 * sibling route. 3898 */ 3899 if (!rt->fib6_nsiblings || rt->should_flush) 3900 return; 3901 3902 /* During lookup routes are evaluated in order, so we need to 3903 * make sure upper bounds are assigned from the first sibling 3904 * onwards. 
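 *
 * Each live nexthop gets upper_bound = (w_cum << 31) / w_total - 1
 * (rounded), where w_cum is the weight accumulated so far. E.g. for
 * sibling weights 1 and 2 the bounds come out near 2^31 / 3 - 1 and
 * 2^31 - 1, and the 31-bit flow hash is compared against them in
 * order, giving roughly a 1/3 : 2/3 traffic split.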
3905 */ 3906 first = rt6_multipath_first_sibling(rt); 3907 if (WARN_ON_ONCE(!first)) 3908 return; 3909 3910 total = rt6_multipath_total_weight(first); 3911 rt6_multipath_upper_bound_set(first, total); 3912 } 3913 3914 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3915 { 3916 const struct arg_netdev_event *arg = p_arg; 3917 struct net *net = dev_net(arg->dev); 3918 3919 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3920 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3921 fib6_update_sernum_upto_root(net, rt); 3922 rt6_multipath_rebalance(rt); 3923 } 3924 3925 return 0; 3926 } 3927 3928 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3929 { 3930 struct arg_netdev_event arg = { 3931 .dev = dev, 3932 { 3933 .nh_flags = nh_flags, 3934 }, 3935 }; 3936 3937 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3938 arg.nh_flags |= RTNH_F_LINKDOWN; 3939 3940 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3941 } 3942 3943 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3944 const struct net_device *dev) 3945 { 3946 struct fib6_info *iter; 3947 3948 if (rt->fib6_nh.nh_dev == dev) 3949 return true; 3950 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3951 if (iter->fib6_nh.nh_dev == dev) 3952 return true; 3953 3954 return false; 3955 } 3956 3957 static void rt6_multipath_flush(struct fib6_info *rt) 3958 { 3959 struct fib6_info *iter; 3960 3961 rt->should_flush = 1; 3962 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3963 iter->should_flush = 1; 3964 } 3965 3966 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3967 const struct net_device *down_dev) 3968 { 3969 struct fib6_info *iter; 3970 unsigned int dead = 0; 3971 3972 if (rt->fib6_nh.nh_dev == down_dev || 3973 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3974 dead++; 3975 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3976 if (iter->fib6_nh.nh_dev == down_dev || 3977 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3978 dead++; 3979 3980 return dead; 3981 } 3982 3983 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 3984 const struct net_device *dev, 3985 unsigned int nh_flags) 3986 { 3987 struct fib6_info *iter; 3988 3989 if (rt->fib6_nh.nh_dev == dev) 3990 rt->fib6_nh.nh_flags |= nh_flags; 3991 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3992 if (iter->fib6_nh.nh_dev == dev) 3993 iter->fib6_nh.nh_flags |= nh_flags; 3994 } 3995 3996 /* called with write lock held for table with rt */ 3997 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 3998 { 3999 const struct arg_netdev_event *arg = p_arg; 4000 const struct net_device *dev = arg->dev; 4001 struct net *net = dev_net(dev); 4002 4003 if (rt == net->ipv6.fib6_null_entry) 4004 return 0; 4005 4006 switch (arg->event) { 4007 case NETDEV_UNREGISTER: 4008 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4009 case NETDEV_DOWN: 4010 if (rt->should_flush) 4011 return -1; 4012 if (!rt->fib6_nsiblings) 4013 return rt->fib6_nh.nh_dev == dev ? 
-1 : 0;
4014 		if (rt6_multipath_uses_dev(rt, dev)) {
4015 			unsigned int count;
4016 
4017 			count = rt6_multipath_dead_count(rt, dev);
4018 			if (rt->fib6_nsiblings + 1 == count) {
4019 				rt6_multipath_flush(rt);
4020 				return -1;
4021 			}
4022 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4023 						   RTNH_F_LINKDOWN);
4024 			fib6_update_sernum(net, rt);
4025 			rt6_multipath_rebalance(rt);
4026 		}
4027 		return -2;
4028 	case NETDEV_CHANGE:
4029 		if (rt->fib6_nh.nh_dev != dev ||
4030 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4031 			break;
4032 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4033 		rt6_multipath_rebalance(rt);
4034 		break;
4035 	}
4036 
4037 	return 0;
4038 }
4039 
4040 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4041 {
4042 	struct arg_netdev_event arg = {
4043 		.dev = dev,
4044 		{
4045 			.event = event,
4046 		},
4047 	};
4048 	struct net *net = dev_net(dev);
4049 
4050 	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4051 		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4052 	else
4053 		fib6_clean_all(net, fib6_ifdown, &arg);
4054 }
4055 
4056 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4057 {
4058 	rt6_sync_down_dev(dev, event);
4059 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4060 	neigh_ifdown(&nd_tbl, dev);
4061 }
4062 
4063 struct rt6_mtu_change_arg {
4064 	struct net_device *dev;
4065 	unsigned int mtu;
4066 };
4067 
4068 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4069 {
4070 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4071 	struct inet6_dev *idev;
4072 
4073 	/* In IPv6, PMTU discovery is not optional,
4074 	   so an RTAX_MTU lock cannot disable it.
4075 	   We still use this lock to block changes
4076 	   caused by addrconf/ndisc.
4077 	*/
4078 
4079 	idev = __in6_dev_get(arg->dev);
4080 	if (!idev)
4081 		return 0;
4082 
4083 	/* For an administrative MTU increase, there is no way to discover
4084 	   the IPv6 PMTU increase, so the PMTU increase should be updated here.
4085 	   Since RFC 1981 doesn't cover administrative MTU increases,
4086 	   updating the PMTU on increase is a MUST. (i.e.
jumbo frame) 4087 */ 4088 if (rt->fib6_nh.nh_dev == arg->dev && 4089 !fib6_metric_locked(rt, RTAX_MTU)) { 4090 u32 mtu = rt->fib6_pmtu; 4091 4092 if (mtu >= arg->mtu || 4093 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4094 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4095 4096 spin_lock_bh(&rt6_exception_lock); 4097 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4098 spin_unlock_bh(&rt6_exception_lock); 4099 } 4100 return 0; 4101 } 4102 4103 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4104 { 4105 struct rt6_mtu_change_arg arg = { 4106 .dev = dev, 4107 .mtu = mtu, 4108 }; 4109 4110 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4111 } 4112 4113 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4114 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4115 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4116 [RTA_OIF] = { .type = NLA_U32 }, 4117 [RTA_IIF] = { .type = NLA_U32 }, 4118 [RTA_PRIORITY] = { .type = NLA_U32 }, 4119 [RTA_METRICS] = { .type = NLA_NESTED }, 4120 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4121 [RTA_PREF] = { .type = NLA_U8 }, 4122 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4123 [RTA_ENCAP] = { .type = NLA_NESTED }, 4124 [RTA_EXPIRES] = { .type = NLA_U32 }, 4125 [RTA_UID] = { .type = NLA_U32 }, 4126 [RTA_MARK] = { .type = NLA_U32 }, 4127 [RTA_TABLE] = { .type = NLA_U32 }, 4128 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4129 [RTA_SPORT] = { .type = NLA_U16 }, 4130 [RTA_DPORT] = { .type = NLA_U16 }, 4131 }; 4132 4133 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4134 struct fib6_config *cfg, 4135 struct netlink_ext_ack *extack) 4136 { 4137 struct rtmsg *rtm; 4138 struct nlattr *tb[RTA_MAX+1]; 4139 unsigned int pref; 4140 int err; 4141 4142 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4143 extack); 4144 if (err < 0) 4145 goto errout; 4146 4147 err = -EINVAL; 4148 rtm = nlmsg_data(nlh); 4149 4150 *cfg = (struct fib6_config){ 4151 .fc_table = rtm->rtm_table, 4152 .fc_dst_len = rtm->rtm_dst_len, 4153 .fc_src_len = rtm->rtm_src_len, 4154 .fc_flags = RTF_UP, 4155 .fc_protocol = rtm->rtm_protocol, 4156 .fc_type = rtm->rtm_type, 4157 4158 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 4159 .fc_nlinfo.nlh = nlh, 4160 .fc_nlinfo.nl_net = sock_net(skb->sk), 4161 }; 4162 4163 if (rtm->rtm_type == RTN_UNREACHABLE || 4164 rtm->rtm_type == RTN_BLACKHOLE || 4165 rtm->rtm_type == RTN_PROHIBIT || 4166 rtm->rtm_type == RTN_THROW) 4167 cfg->fc_flags |= RTF_REJECT; 4168 4169 if (rtm->rtm_type == RTN_LOCAL) 4170 cfg->fc_flags |= RTF_LOCAL; 4171 4172 if (rtm->rtm_flags & RTM_F_CLONED) 4173 cfg->fc_flags |= RTF_CACHE; 4174 4175 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4176 4177 if (tb[RTA_GATEWAY]) { 4178 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4179 cfg->fc_flags |= RTF_GATEWAY; 4180 } 4181 if (tb[RTA_VIA]) { 4182 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 4183 goto errout; 4184 } 4185 4186 if (tb[RTA_DST]) { 4187 int plen = (rtm->rtm_dst_len + 7) >> 3; 4188 4189 if (nla_len(tb[RTA_DST]) < plen) 4190 goto errout; 4191 4192 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4193 } 4194 4195 if (tb[RTA_SRC]) { 4196 int plen = (rtm->rtm_src_len + 7) >> 3; 4197 4198 if (nla_len(tb[RTA_SRC]) < plen) 4199 goto errout; 4200 4201 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4202 } 4203 4204 if (tb[RTA_PREFSRC]) 4205 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4206 4207 if (tb[RTA_OIF]) 4208 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4209 4210 if (tb[RTA_PRIORITY]) 
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}

struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};

static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
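
/* Editor's sketch (types from linux/rtnetlink.h, shown for orientation
 * only; not part of this file): the RTA_MULTIPATH payload parsed by
 * ip6_route_multipath_add() below is a packed stream of
 *
 *	struct rtnexthop {
 *		unsigned short	rtnh_len;	-- rtnexthop plus its attrs
 *		unsigned char	rtnh_flags;
 *		unsigned char	rtnh_hops;	-- weight - 1
 *		int		rtnh_ifindex;
 *	};
 *
 * entries, each optionally followed by nested attributes (RTA_GATEWAY,
 * RTA_ENCAP, RTA_ENCAP_TYPE) and walked with rtnh_ok()/rtnh_next().
 */
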
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace, send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL */
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route, we remove
		 * these flags after the first nexthop: if there is a
		 * collision, we have already failed to add the first nexthop
		 * because fib6_add_rt2node() rejected it; when replacing, the
		 * old nexthops have been replaced by the first new one, and
		 * the remaining ones should be appended to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
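
/* Editor's illustration (not from the original sources): a two-nexthop
 * route such as
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 2
 *
 * arrives here as one RTM_NEWROUTE carrying two rtnexthop entries; each
 * becomes its own fib6_info (linked as siblings), nh_weight is restored
 * from rtnh_hops + 1, and a single notification is sent once all
 * nexthops have been inserted.
 */
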
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
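
/* Editor's note (illustration): a delete that carries no nexthop list,
 * e.g.
 *
 *	ip -6 route del 2001:db8::/64
 *
 * takes the fc_delete_all_nh = 1 path above, removing every sibling
 * nexthop of the matching route; a delete carrying RTA_MULTIPATH
 * instead walks the listed nexthops one by one.
 */
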
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}

static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* add multipath next hop */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
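
/* Editor's note: rt6_nlmsg_size() above must remain an upper bound on
 * everything rt6_fill_node() below can emit; inet6_rt_notify() treats
 * -EMSGSIZE from rt6_fill_node() as a bug in this size estimate (see
 * the WARN_ON further down), so the two functions have to be kept in
 * sync when attributes are added.
 */
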
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;

		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;

		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
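
/* Editor's sketch (approximate attribute order as emitted above, for a
 * plain gateway route dumped without a cached dst):
 *
 *	rtmsg | RTA_TABLE | RTA_DST | [RTA_PREFSRC] | RTA_METRICS |
 *	RTA_PRIORITY | RTA_GATEWAY | RTA_OIF | cacheinfo | RTA_PREF
 *
 * Multipath routes replace the gateway/oif pair with a single
 * RTA_MULTIPATH nest built by one rt6_add_nexthop() call per sibling.
 */
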
static bool fib6_info_uses_dev(const struct fib6_info *f6i,
			       const struct net_device *dev)
{
	if (f6i->fib6_nh.nh_dev == dev)
		return true;

	if (f6i->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;

		list_for_each_entry_safe(sibling, next_sibling,
					 &f6i->fib6_siblings, fib6_siblings) {
			if (sibling->fib6_nh.nh_dev == dev)
				return true;
		}
	}

	return false;
}

int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return 1;
	}
	if (filter->filter_set) {
		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
			return 1;
		}
		flags |= NLM_F_DUMP_FILTERED;
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, flags);
}

static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
				   rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
				 rtm_ipv6_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
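
/* Editor's note (illustration): when the requesting socket has opted in
 * to strict checking, a RTM_GETROUTE that sets e.g. rtm_table or
 * carries an unexpected attribute is rejected above with an extack
 * message, instead of being silently ignored as in the lenient
 * nlmsg_parse() path.
 */
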
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
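
/* Editor's illustration (not from the original sources):
 *
 *	ip -6 route get 2001:db8::1 fibmatch
 *
 * sets RTM_F_FIB_MATCH, so the reply above is built from the matched
 * FIB entry (rt->from) rather than from the dst produced by the lookup;
 * without "fibmatch" the resolved destination and source addresses are
 * filled in instead.
 */
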
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER can be fired multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
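
/* Editor's note: the seven hex fields of /proc/net/rt6_stats above are,
 * in order: fib nodes, route nodes, route allocations, route entries,
 * cached routes, dst entries in use, and discarded routes; the third
 * and sixth come from an atomic counter and the slow dst-entry counter
 * rather than plain rt6_stats fields.
 */
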
#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

static int zero;
static int one = 1;

static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "skip_notify_on_dev_down",
		.data		= &init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
		.extra1		= &zero,
		.extra2		= &one,
	},
	{ }
};

struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif
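
/* Editor's illustration: the template above is duplicated per network
 * namespace by ipv6_route_sysctl_init(), so e.g.
 *
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 *
 * writes flush_delay for that netns only and triggers an immediate
 * fib6_run_gc() through ipv6_sysctl_rtcache_flush(); reads of the flush
 * file are refused (-EINVAL, and the entry is mode 0200 anyway).
 */
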
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
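
/* Editor's note: the error unwinding in ip6_route_net_init() above is
 * the usual kernel goto ladder: each label undoes the allocations made
 * before its failure point, in reverse order, and the final
 * "out_ip6_dst_ops: goto out" simply returns the -ENOMEM that ret was
 * initialized with.
 */
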
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* The loopback device is registered before this portion of code
	 * runs, so the loopback reference in rt6_info is not taken there;
	 * do it manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
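
/* Editor's note (assumption about notifier ordering): the device
 * notifier registered in ip6_route_init() below uses
 * ADDRCONF_NOTIFY_PRIORITY - 10, i.e. a lower priority than addrconf's
 * own notifier, so addrconf should have processed a loopback
 * register/unregister event before ip6_route_dev_notify() rewires the
 * per-netns null/prohibit/blackhole entries.
 */
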
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}