1 /* 2 * Linux INET6 implementation 3 * FIB front-end. 4 * 5 * Authors: 6 * Pedro Roque <roque@di.fc.ul.pt> 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public License 10 * as published by the Free Software Foundation; either version 11 * 2 of the License, or (at your option) any later version. 12 */ 13 14 /* Changes: 15 * 16 * YOSHIFUJI Hideaki @USAGI 17 * reworked default router selection. 18 * - respect outgoing interface 19 * - select from (probably) reachable routers (i.e. 20 * routers in REACHABLE, STALE, DELAY or PROBE states). 21 * - always select the same router if it is (probably) 22 * reachable. otherwise, round-robin the list. 23 * Ville Nuorvala 24 * Fixed routing subtrees. 25 */ 26 27 #define pr_fmt(fmt) "IPv6: " fmt 28 29 #include <linux/capability.h> 30 #include <linux/errno.h> 31 #include <linux/export.h> 32 #include <linux/types.h> 33 #include <linux/times.h> 34 #include <linux/socket.h> 35 #include <linux/sockios.h> 36 #include <linux/net.h> 37 #include <linux/route.h> 38 #include <linux/netdevice.h> 39 #include <linux/in6.h> 40 #include <linux/mroute6.h> 41 #include <linux/init.h> 42 #include <linux/if_arp.h> 43 #include <linux/proc_fs.h> 44 #include <linux/seq_file.h> 45 #include <linux/nsproxy.h> 46 #include <linux/slab.h> 47 #include <linux/jhash.h> 48 #include <net/net_namespace.h> 49 #include <net/snmp.h> 50 #include <net/ipv6.h> 51 #include <net/ip6_fib.h> 52 #include <net/ip6_route.h> 53 #include <net/ndisc.h> 54 #include <net/addrconf.h> 55 #include <net/tcp.h> 56 #include <linux/rtnetlink.h> 57 #include <net/dst.h> 58 #include <net/dst_metadata.h> 59 #include <net/xfrm.h> 60 #include <net/netevent.h> 61 #include <net/netlink.h> 62 #include <net/nexthop.h> 63 #include <net/lwtunnel.h> 64 #include <net/ip_tunnels.h> 65 #include <net/l3mdev.h> 66 #include <net/ip.h> 67 #include <linux/uaccess.h> 68 69 #ifdef CONFIG_SYSCTL 70 #include <linux/sysctl.h> 71 #endif 72 73 static int ip6_rt_type_to_error(u8 fib6_type); 74 75 #define CREATE_TRACE_POINTS 76 #include <trace/events/fib6.h> 77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); 78 #undef CREATE_TRACE_POINTS 79 80 enum rt6_nud_state { 81 RT6_NUD_FAIL_HARD = -3, 82 RT6_NUD_FAIL_PROBE = -2, 83 RT6_NUD_FAIL_DO_RR = -1, 84 RT6_NUD_SUCCEED = 1 85 }; 86 87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); 88 static unsigned int ip6_default_advmss(const struct dst_entry *dst); 89 static unsigned int ip6_mtu(const struct dst_entry *dst); 90 static struct dst_entry *ip6_negative_advice(struct dst_entry *); 91 static void ip6_dst_destroy(struct dst_entry *); 92 static void ip6_dst_ifdown(struct dst_entry *, 93 struct net_device *dev, int how); 94 static int ip6_dst_gc(struct dst_ops *ops); 95 96 static int ip6_pkt_discard(struct sk_buff *skb); 97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); 98 static int ip6_pkt_prohibit(struct sk_buff *skb); 99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); 100 static void ip6_link_failure(struct sk_buff *skb); 101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 102 struct sk_buff *skb, u32 mtu); 103 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, 104 struct sk_buff *skb); 105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict); 106 static size_t rt6_nlmsg_size(struct fib6_info *rt); 107 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 108 struct fib6_info *rt, struct dst_entry *dst, 109 struct in6_addr *dest, struct in6_addr *src, 110 int iif, int type, u32 portid, u32 seq, 111 unsigned int flags); 112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 113 struct in6_addr *daddr, 114 struct in6_addr *saddr); 115 116 #ifdef CONFIG_IPV6_ROUTE_INFO 117 static struct fib6_info *rt6_add_route_info(struct net *net, 118 const struct in6_addr *prefix, int prefixlen, 119 const struct in6_addr *gwaddr, 120 struct net_device *dev, 121 unsigned int pref); 122 static struct fib6_info *rt6_get_route_info(struct net *net, 123 const struct in6_addr *prefix, int prefixlen, 124 const struct in6_addr *gwaddr, 125 struct net_device *dev); 126 #endif 127 128 struct uncached_list { 129 spinlock_t lock; 130 struct list_head head; 131 }; 132 133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); 134 135 void rt6_uncached_list_add(struct rt6_info *rt) 136 { 137 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); 138 139 rt->rt6i_uncached_list = ul; 140 141 spin_lock_bh(&ul->lock); 142 list_add_tail(&rt->rt6i_uncached, &ul->head); 143 spin_unlock_bh(&ul->lock); 144 } 145 146 void rt6_uncached_list_del(struct rt6_info *rt) 147 { 148 if (!list_empty(&rt->rt6i_uncached)) { 149 struct uncached_list *ul = rt->rt6i_uncached_list; 150 struct net *net = dev_net(rt->dst.dev); 151 152 spin_lock_bh(&ul->lock); 153 list_del(&rt->rt6i_uncached); 154 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); 155 spin_unlock_bh(&ul->lock); 156 } 157 } 158 159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) 160 { 161 struct net_device *loopback_dev = net->loopback_dev; 162 int cpu; 163 164 if (dev == loopback_dev) 165 return; 166 167 for_each_possible_cpu(cpu) { 168 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 169 struct rt6_info *rt; 170 171 spin_lock_bh(&ul->lock); 172 list_for_each_entry(rt, &ul->head, rt6i_uncached) { 173 struct inet6_dev *rt_idev = rt->rt6i_idev; 174 struct net_device *rt_dev = rt->dst.dev; 175 176 if (rt_idev->dev == dev) { 177 rt->rt6i_idev = in6_dev_get(loopback_dev); 178 in6_dev_put(rt_idev); 179 } 180 181 if (rt_dev == dev) { 182 rt->dst.dev = loopback_dev; 183 dev_hold(rt->dst.dev); 184 dev_put(rt_dev); 185 } 186 } 187 spin_unlock_bh(&ul->lock); 188 } 189 } 190 191 static inline const void *choose_neigh_daddr(const struct in6_addr *p, 192 struct sk_buff *skb, 193 const void *daddr) 194 { 195 if (!ipv6_addr_any(p)) 196 return (const void *) p; 197 else if (skb) 198 return &ipv6_hdr(skb)->daddr; 199 return daddr; 200 } 201 202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, 203 struct net_device *dev, 204 struct sk_buff *skb, 205 const void *daddr) 206 { 207 struct neighbour *n; 208 209 daddr = choose_neigh_daddr(gw, skb, daddr); 210 n = __ipv6_neigh_lookup(dev, daddr); 211 if (n) 212 return n; 213 214 n = neigh_create(&nd_tbl, daddr, dev); 215 return IS_ERR(n) ? NULL : n; 216 } 217 218 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, 219 struct sk_buff *skb, 220 const void *daddr) 221 { 222 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); 223 224 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr); 225 } 226 227 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) 228 { 229 struct net_device *dev = dst->dev; 230 struct rt6_info *rt = (struct rt6_info *)dst; 231 232 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); 233 if (!daddr) 234 return; 235 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) 236 return; 237 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) 238 return; 239 __ipv6_confirm_neigh(dev, daddr); 240 } 241 242 static struct dst_ops ip6_dst_ops_template = { 243 .family = AF_INET6, 244 .gc = ip6_dst_gc, 245 .gc_thresh = 1024, 246 .check = ip6_dst_check, 247 .default_advmss = ip6_default_advmss, 248 .mtu = ip6_mtu, 249 .cow_metrics = dst_cow_metrics_generic, 250 .destroy = ip6_dst_destroy, 251 .ifdown = ip6_dst_ifdown, 252 .negative_advice = ip6_negative_advice, 253 .link_failure = ip6_link_failure, 254 .update_pmtu = ip6_rt_update_pmtu, 255 .redirect = rt6_do_redirect, 256 .local_out = __ip6_local_out, 257 .neigh_lookup = ip6_dst_neigh_lookup, 258 .confirm_neigh = ip6_confirm_neigh, 259 }; 260 261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) 262 { 263 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 264 265 return mtu ? : dst->dev->mtu; 266 } 267 268 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 269 struct sk_buff *skb, u32 mtu) 270 { 271 } 272 273 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 274 struct sk_buff *skb) 275 { 276 } 277 278 static struct dst_ops ip6_dst_blackhole_ops = { 279 .family = AF_INET6, 280 .destroy = ip6_dst_destroy, 281 .check = ip6_dst_check, 282 .mtu = ip6_blackhole_mtu, 283 .default_advmss = ip6_default_advmss, 284 .update_pmtu = ip6_rt_blackhole_update_pmtu, 285 .redirect = ip6_rt_blackhole_redirect, 286 .cow_metrics = dst_cow_metrics_generic, 287 .neigh_lookup = ip6_dst_neigh_lookup, 288 }; 289 290 static const u32 ip6_template_metrics[RTAX_MAX] = { 291 [RTAX_HOPLIMIT - 1] = 0, 292 }; 293 294 static const struct fib6_info fib6_null_entry_template = { 295 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), 296 .fib6_protocol = RTPROT_KERNEL, 297 .fib6_metric = ~(u32)0, 298 .fib6_ref = ATOMIC_INIT(1), 299 .fib6_type = RTN_UNREACHABLE, 300 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, 301 }; 302 303 static const struct rt6_info ip6_null_entry_template = { 304 .dst = { 305 .__refcnt = ATOMIC_INIT(1), 306 .__use = 1, 307 .obsolete = DST_OBSOLETE_FORCE_CHK, 308 .error = -ENETUNREACH, 309 .input = ip6_pkt_discard, 310 .output = ip6_pkt_discard_out, 311 }, 312 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 313 }; 314 315 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 316 317 static const struct rt6_info ip6_prohibit_entry_template = { 318 .dst = { 319 .__refcnt = ATOMIC_INIT(1), 320 .__use = 1, 321 .obsolete = DST_OBSOLETE_FORCE_CHK, 322 .error = -EACCES, 323 .input = ip6_pkt_prohibit, 324 .output = ip6_pkt_prohibit_out, 325 }, 326 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 327 }; 328 329 static const struct rt6_info ip6_blk_hole_entry_template = { 330 .dst = { 331 .__refcnt = ATOMIC_INIT(1), 332 .__use = 1, 333 .obsolete = DST_OBSOLETE_FORCE_CHK, 334 .error = -EINVAL, 335 .input = dst_discard, 336 .output = dst_discard_out, 337 }, 338 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 339 }; 340 341 #endif 342 343 static void rt6_info_init(struct rt6_info *rt) 344 { 345 struct dst_entry *dst = &rt->dst; 346 347 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); 348 INIT_LIST_HEAD(&rt->rt6i_uncached); 349 } 350 351 /* allocate dst with ip6_dst_ops */ 352 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, 353 int flags) 354 { 355 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 356 1, DST_OBSOLETE_FORCE_CHK, flags); 357 358 if (rt) { 359 rt6_info_init(rt); 360 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 361 } 362 363 return rt; 364 } 365 EXPORT_SYMBOL(ip6_dst_alloc); 366 367 static void ip6_dst_destroy(struct dst_entry *dst) 368 { 369 struct rt6_info *rt = (struct rt6_info *)dst; 370 struct fib6_info *from; 371 struct inet6_dev *idev; 372 373 ip_dst_metrics_put(dst); 374 rt6_uncached_list_del(rt); 375 376 idev = rt->rt6i_idev; 377 if (idev) { 378 rt->rt6i_idev = NULL; 379 in6_dev_put(idev); 380 } 381 382 rcu_read_lock(); 383 from = rcu_dereference(rt->from); 384 rcu_assign_pointer(rt->from, NULL); 385 fib6_info_release(from); 386 rcu_read_unlock(); 387 } 388 389 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 390 int how) 391 { 392 struct rt6_info *rt = (struct rt6_info *)dst; 393 struct inet6_dev *idev = rt->rt6i_idev; 394 struct net_device *loopback_dev = 395 dev_net(dev)->loopback_dev; 396 397 if (idev && idev->dev != loopback_dev) { 398 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); 399 if (loopback_idev) { 400 rt->rt6i_idev = loopback_idev; 401 in6_dev_put(idev); 402 } 403 } 404 } 405 406 static bool __rt6_check_expired(const struct rt6_info *rt) 407 { 408 if (rt->rt6i_flags & RTF_EXPIRES) 409 return time_after(jiffies, rt->dst.expires); 410 else 411 return false; 412 } 413 414 static bool rt6_check_expired(const struct rt6_info *rt) 415 { 416 struct fib6_info *from; 417 418 from = rcu_dereference(rt->from); 419 420 if (rt->rt6i_flags & RTF_EXPIRES) { 421 if (time_after(jiffies, rt->dst.expires)) 422 return true; 423 } else if (from) { 424 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || 425 fib6_check_expired(from); 426 } 427 return false; 428 } 429 430 struct fib6_info *fib6_multipath_select(const struct net *net, 431 struct fib6_info *match, 432 struct flowi6 *fl6, int oif, 433 const struct sk_buff *skb, 434 int strict) 435 { 436 struct fib6_info *sibling, *next_sibling; 437 438 /* We might have already computed the hash for ICMPv6 errors. In such 439 * case it will always be non-zero. Otherwise now is the time to do it. 440 */ 441 if (!fl6->mp_hash) 442 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); 443 444 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) 445 return match; 446 447 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, 448 fib6_siblings) { 449 int nh_upper_bound; 450 451 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); 452 if (fl6->mp_hash > nh_upper_bound) 453 continue; 454 if (rt6_score_route(sibling, oif, strict) < 0) 455 break; 456 match = sibling; 457 break; 458 } 459 460 return match; 461 } 462 463 /* 464 * Route lookup. rcu_read_lock() should be held. 465 */ 466 467 static inline struct fib6_info *rt6_device_match(struct net *net, 468 struct fib6_info *rt, 469 const struct in6_addr *saddr, 470 int oif, 471 int flags) 472 { 473 struct fib6_info *sprt; 474 475 if (!oif && ipv6_addr_any(saddr) && 476 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) 477 return rt; 478 479 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) { 480 const struct net_device *dev = sprt->fib6_nh.nh_dev; 481 482 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) 483 continue; 484 485 if (oif) { 486 if (dev->ifindex == oif) 487 return sprt; 488 } else { 489 if (ipv6_chk_addr(net, saddr, dev, 490 flags & RT6_LOOKUP_F_IFACE)) 491 return sprt; 492 } 493 } 494 495 if (oif && flags & RT6_LOOKUP_F_IFACE) 496 return net->ipv6.fib6_null_entry; 497 498 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt; 499 } 500 501 #ifdef CONFIG_IPV6_ROUTER_PREF 502 struct __rt6_probe_work { 503 struct work_struct work; 504 struct in6_addr target; 505 struct net_device *dev; 506 }; 507 508 static void rt6_probe_deferred(struct work_struct *w) 509 { 510 struct in6_addr mcaddr; 511 struct __rt6_probe_work *work = 512 container_of(w, struct __rt6_probe_work, work); 513 514 addrconf_addr_solict_mult(&work->target, &mcaddr); 515 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); 516 dev_put(work->dev); 517 kfree(work); 518 } 519 520 static void rt6_probe(struct fib6_info *rt) 521 { 522 struct __rt6_probe_work *work = NULL; 523 const struct in6_addr *nh_gw; 524 struct neighbour *neigh; 525 struct net_device *dev; 526 struct inet6_dev *idev; 527 528 /* 529 * Okay, this does not seem to be appropriate 530 * for now, however, we need to check if it 531 * is really so; aka Router Reachability Probing. 532 * 533 * Router Reachability Probe MUST be rate-limited 534 * to no more than one per minute. 535 */ 536 if (!rt || !(rt->fib6_flags & RTF_GATEWAY)) 537 return; 538 539 nh_gw = &rt->fib6_nh.nh_gw; 540 dev = rt->fib6_nh.nh_dev; 541 rcu_read_lock_bh(); 542 idev = __in6_dev_get(dev); 543 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); 544 if (neigh) { 545 if (neigh->nud_state & NUD_VALID) 546 goto out; 547 548 write_lock(&neigh->lock); 549 if (!(neigh->nud_state & NUD_VALID) && 550 time_after(jiffies, 551 neigh->updated + idev->cnf.rtr_probe_interval)) { 552 work = kmalloc(sizeof(*work), GFP_ATOMIC); 553 if (work) 554 __neigh_set_probe_once(neigh); 555 } 556 write_unlock(&neigh->lock); 557 } else if (time_after(jiffies, rt->last_probe + 558 idev->cnf.rtr_probe_interval)) { 559 work = kmalloc(sizeof(*work), GFP_ATOMIC); 560 } 561 562 if (work) { 563 rt->last_probe = jiffies; 564 INIT_WORK(&work->work, rt6_probe_deferred); 565 work->target = *nh_gw; 566 dev_hold(dev); 567 work->dev = dev; 568 schedule_work(&work->work); 569 } 570 571 out: 572 rcu_read_unlock_bh(); 573 } 574 #else 575 static inline void rt6_probe(struct fib6_info *rt) 576 { 577 } 578 #endif 579 580 /* 581 * Default Router Selection (RFC 2461 6.3.6) 582 */ 583 static inline int rt6_check_dev(struct fib6_info *rt, int oif) 584 { 585 const struct net_device *dev = rt->fib6_nh.nh_dev; 586 587 if (!oif || dev->ifindex == oif) 588 return 2; 589 return 0; 590 } 591 592 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) 593 { 594 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; 595 struct neighbour *neigh; 596 597 if (rt->fib6_flags & RTF_NONEXTHOP || 598 !(rt->fib6_flags & RTF_GATEWAY)) 599 return RT6_NUD_SUCCEED; 600 601 rcu_read_lock_bh(); 602 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev, 603 &rt->fib6_nh.nh_gw); 604 if (neigh) { 605 read_lock(&neigh->lock); 606 if (neigh->nud_state & NUD_VALID) 607 ret = RT6_NUD_SUCCEED; 608 #ifdef CONFIG_IPV6_ROUTER_PREF 609 else if (!(neigh->nud_state & NUD_FAILED)) 610 ret = RT6_NUD_SUCCEED; 611 else 612 ret = RT6_NUD_FAIL_PROBE; 613 #endif 614 read_unlock(&neigh->lock); 615 } else { 616 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? 617 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; 618 } 619 rcu_read_unlock_bh(); 620 621 return ret; 622 } 623 624 static int rt6_score_route(struct fib6_info *rt, int oif, int strict) 625 { 626 int m; 627 628 m = rt6_check_dev(rt, oif); 629 if (!m && (strict & RT6_LOOKUP_F_IFACE)) 630 return RT6_NUD_FAIL_HARD; 631 #ifdef CONFIG_IPV6_ROUTER_PREF 632 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; 633 #endif 634 if (strict & RT6_LOOKUP_F_REACHABLE) { 635 int n = rt6_check_neigh(rt); 636 if (n < 0) 637 return n; 638 } 639 return m; 640 } 641 642 /* called with rc_read_lock held */ 643 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i) 644 { 645 const struct net_device *dev = fib6_info_nh_dev(f6i); 646 bool rc = false; 647 648 if (dev) { 649 const struct inet6_dev *idev = __in6_dev_get(dev); 650 651 rc = !!idev->cnf.ignore_routes_with_linkdown; 652 } 653 654 return rc; 655 } 656 657 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, 658 int *mpri, struct fib6_info *match, 659 bool *do_rr) 660 { 661 int m; 662 bool match_do_rr = false; 663 664 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 665 goto out; 666 667 if (fib6_ignore_linkdown(rt) && 668 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 669 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) 670 goto out; 671 672 if (fib6_check_expired(rt)) 673 goto out; 674 675 m = rt6_score_route(rt, oif, strict); 676 if (m == RT6_NUD_FAIL_DO_RR) { 677 match_do_rr = true; 678 m = 0; /* lowest valid score */ 679 } else if (m == RT6_NUD_FAIL_HARD) { 680 goto out; 681 } 682 683 if (strict & RT6_LOOKUP_F_REACHABLE) 684 rt6_probe(rt); 685 686 /* note that m can be RT6_NUD_FAIL_PROBE at this point */ 687 if (m > *mpri) { 688 *do_rr = match_do_rr; 689 *mpri = m; 690 match = rt; 691 } 692 out: 693 return match; 694 } 695 696 static struct fib6_info *find_rr_leaf(struct fib6_node *fn, 697 struct fib6_info *leaf, 698 struct fib6_info *rr_head, 699 u32 metric, int oif, int strict, 700 bool *do_rr) 701 { 702 struct fib6_info *rt, *match, *cont; 703 int mpri = -1; 704 705 match = NULL; 706 cont = NULL; 707 for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) { 708 if (rt->fib6_metric != metric) { 709 cont = rt; 710 break; 711 } 712 713 match = find_match(rt, oif, strict, &mpri, match, do_rr); 714 } 715 716 for (rt = leaf; rt && rt != rr_head; 717 rt = rcu_dereference(rt->fib6_next)) { 718 if (rt->fib6_metric != metric) { 719 cont = rt; 720 break; 721 } 722 723 match = find_match(rt, oif, strict, &mpri, match, do_rr); 724 } 725 726 if (match || !cont) 727 return match; 728 729 for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next)) 730 match = find_match(rt, oif, strict, &mpri, match, do_rr); 731 732 return match; 733 } 734 735 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, 736 int oif, int strict) 737 { 738 struct fib6_info *leaf = rcu_dereference(fn->leaf); 739 struct fib6_info *match, *rt0; 740 bool do_rr = false; 741 int key_plen; 742 743 if (!leaf || leaf == net->ipv6.fib6_null_entry) 744 return net->ipv6.fib6_null_entry; 745 746 rt0 = rcu_dereference(fn->rr_ptr); 747 if (!rt0) 748 rt0 = leaf; 749 750 /* Double check to make sure fn is not an intermediate node 751 * and fn->leaf does not points to its child's leaf 752 * (This might happen if all routes under fn are deleted from 753 * the tree and fib6_repair_tree() is called on the node.) 754 */ 755 key_plen = rt0->fib6_dst.plen; 756 #ifdef CONFIG_IPV6_SUBTREES 757 if (rt0->fib6_src.plen) 758 key_plen = rt0->fib6_src.plen; 759 #endif 760 if (fn->fn_bit != key_plen) 761 return net->ipv6.fib6_null_entry; 762 763 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, 764 &do_rr); 765 766 if (do_rr) { 767 struct fib6_info *next = rcu_dereference(rt0->fib6_next); 768 769 /* no entries matched; do round-robin */ 770 if (!next || next->fib6_metric != rt0->fib6_metric) 771 next = leaf; 772 773 if (next != rt0) { 774 spin_lock_bh(&leaf->fib6_table->tb6_lock); 775 /* make sure next is not being deleted from the tree */ 776 if (next->fib6_node) 777 rcu_assign_pointer(fn->rr_ptr, next); 778 spin_unlock_bh(&leaf->fib6_table->tb6_lock); 779 } 780 } 781 782 return match ? match : net->ipv6.fib6_null_entry; 783 } 784 785 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) 786 { 787 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); 788 } 789 790 #ifdef CONFIG_IPV6_ROUTE_INFO 791 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, 792 const struct in6_addr *gwaddr) 793 { 794 struct net *net = dev_net(dev); 795 struct route_info *rinfo = (struct route_info *) opt; 796 struct in6_addr prefix_buf, *prefix; 797 unsigned int pref; 798 unsigned long lifetime; 799 struct fib6_info *rt; 800 801 if (len < sizeof(struct route_info)) { 802 return -EINVAL; 803 } 804 805 /* Sanity check for prefix_len and length */ 806 if (rinfo->length > 3) { 807 return -EINVAL; 808 } else if (rinfo->prefix_len > 128) { 809 return -EINVAL; 810 } else if (rinfo->prefix_len > 64) { 811 if (rinfo->length < 2) { 812 return -EINVAL; 813 } 814 } else if (rinfo->prefix_len > 0) { 815 if (rinfo->length < 1) { 816 return -EINVAL; 817 } 818 } 819 820 pref = rinfo->route_pref; 821 if (pref == ICMPV6_ROUTER_PREF_INVALID) 822 return -EINVAL; 823 824 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); 825 826 if (rinfo->length == 3) 827 prefix = (struct in6_addr *)rinfo->prefix; 828 else { 829 /* this function is safe */ 830 ipv6_addr_prefix(&prefix_buf, 831 (struct in6_addr *)rinfo->prefix, 832 rinfo->prefix_len); 833 prefix = &prefix_buf; 834 } 835 836 if (rinfo->prefix_len == 0) 837 rt = rt6_get_dflt_router(net, gwaddr, dev); 838 else 839 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, 840 gwaddr, dev); 841 842 if (rt && !lifetime) { 843 ip6_del_rt(net, rt); 844 rt = NULL; 845 } 846 847 if (!rt && lifetime) 848 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, 849 dev, pref); 850 else if (rt) 851 rt->fib6_flags = RTF_ROUTEINFO | 852 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 853 854 if (rt) { 855 if (!addrconf_finite_timeout(lifetime)) 856 fib6_clean_expires(rt); 857 else 858 fib6_set_expires(rt, jiffies + HZ * lifetime); 859 860 fib6_info_release(rt); 861 } 862 return 0; 863 } 864 #endif 865 866 /* 867 * Misc support functions 868 */ 869 870 /* called with rcu_lock held */ 871 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) 872 { 873 struct net_device *dev = rt->fib6_nh.nh_dev; 874 875 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { 876 /* for copies of local routes, dst->dev needs to be the 877 * device if it is a master device, the master device if 878 * device is enslaved, and the loopback as the default 879 */ 880 if (netif_is_l3_slave(dev) && 881 !rt6_need_strict(&rt->fib6_dst.addr)) 882 dev = l3mdev_master_dev_rcu(dev); 883 else if (!netif_is_l3_master(dev)) 884 dev = dev_net(dev)->loopback_dev; 885 /* last case is netif_is_l3_master(dev) is true in which 886 * case we want dev returned to be dev 887 */ 888 } 889 890 return dev; 891 } 892 893 static const int fib6_prop[RTN_MAX + 1] = { 894 [RTN_UNSPEC] = 0, 895 [RTN_UNICAST] = 0, 896 [RTN_LOCAL] = 0, 897 [RTN_BROADCAST] = 0, 898 [RTN_ANYCAST] = 0, 899 [RTN_MULTICAST] = 0, 900 [RTN_BLACKHOLE] = -EINVAL, 901 [RTN_UNREACHABLE] = -EHOSTUNREACH, 902 [RTN_PROHIBIT] = -EACCES, 903 [RTN_THROW] = -EAGAIN, 904 [RTN_NAT] = -EINVAL, 905 [RTN_XRESOLVE] = -EINVAL, 906 }; 907 908 static int ip6_rt_type_to_error(u8 fib6_type) 909 { 910 return fib6_prop[fib6_type]; 911 } 912 913 static unsigned short fib6_info_dst_flags(struct fib6_info *rt) 914 { 915 unsigned short flags = 0; 916 917 if (rt->dst_nocount) 918 flags |= DST_NOCOUNT; 919 if (rt->dst_nopolicy) 920 flags |= DST_NOPOLICY; 921 if (rt->dst_host) 922 flags |= DST_HOST; 923 924 return flags; 925 } 926 927 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) 928 { 929 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); 930 931 switch (ort->fib6_type) { 932 case RTN_BLACKHOLE: 933 rt->dst.output = dst_discard_out; 934 rt->dst.input = dst_discard; 935 break; 936 case RTN_PROHIBIT: 937 rt->dst.output = ip6_pkt_prohibit_out; 938 rt->dst.input = ip6_pkt_prohibit; 939 break; 940 case RTN_THROW: 941 case RTN_UNREACHABLE: 942 default: 943 rt->dst.output = ip6_pkt_discard_out; 944 rt->dst.input = ip6_pkt_discard; 945 break; 946 } 947 } 948 949 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) 950 { 951 if (ort->fib6_flags & RTF_REJECT) { 952 ip6_rt_init_dst_reject(rt, ort); 953 return; 954 } 955 956 rt->dst.error = 0; 957 rt->dst.output = ip6_output; 958 959 if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) { 960 rt->dst.input = ip6_input; 961 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { 962 rt->dst.input = ip6_mc_input; 963 } else { 964 rt->dst.input = ip6_forward; 965 } 966 967 if (ort->fib6_nh.nh_lwtstate) { 968 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); 969 lwtunnel_set_redirect(&rt->dst); 970 } 971 972 rt->dst.lastuse = jiffies; 973 } 974 975 /* Caller must already hold reference to @from */ 976 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) 977 { 978 rt->rt6i_flags &= ~RTF_EXPIRES; 979 rcu_assign_pointer(rt->from, from); 980 ip_dst_init_metrics(&rt->dst, from->fib6_metrics); 981 } 982 983 /* Caller must already hold reference to @ort */ 984 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) 985 { 986 struct net_device *dev = fib6_info_nh_dev(ort); 987 988 ip6_rt_init_dst(rt, ort); 989 990 rt->rt6i_dst = ort->fib6_dst; 991 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; 992 rt->rt6i_gateway = ort->fib6_nh.nh_gw; 993 rt->rt6i_flags = ort->fib6_flags; 994 rt6_set_from(rt, ort); 995 #ifdef CONFIG_IPV6_SUBTREES 996 rt->rt6i_src = ort->fib6_src; 997 #endif 998 } 999 1000 static struct fib6_node* fib6_backtrack(struct fib6_node *fn, 1001 struct in6_addr *saddr) 1002 { 1003 struct fib6_node *pn, *sn; 1004 while (1) { 1005 if (fn->fn_flags & RTN_TL_ROOT) 1006 return NULL; 1007 pn = rcu_dereference(fn->parent); 1008 sn = FIB6_SUBTREE(pn); 1009 if (sn && sn != fn) 1010 fn = fib6_node_lookup(sn, NULL, saddr); 1011 else 1012 fn = pn; 1013 if (fn->fn_flags & RTN_RTINFO) 1014 return fn; 1015 } 1016 } 1017 1018 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, 1019 bool null_fallback) 1020 { 1021 struct rt6_info *rt = *prt; 1022 1023 if (dst_hold_safe(&rt->dst)) 1024 return true; 1025 if (null_fallback) { 1026 rt = net->ipv6.ip6_null_entry; 1027 dst_hold(&rt->dst); 1028 } else { 1029 rt = NULL; 1030 } 1031 *prt = rt; 1032 return false; 1033 } 1034 1035 /* called with rcu_lock held */ 1036 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) 1037 { 1038 unsigned short flags = fib6_info_dst_flags(rt); 1039 struct net_device *dev = rt->fib6_nh.nh_dev; 1040 struct rt6_info *nrt; 1041 1042 if (!fib6_info_hold_safe(rt)) 1043 goto fallback; 1044 1045 nrt = ip6_dst_alloc(dev_net(dev), dev, flags); 1046 if (!nrt) { 1047 fib6_info_release(rt); 1048 goto fallback; 1049 } 1050 1051 ip6_rt_copy_init(nrt, rt); 1052 return nrt; 1053 1054 fallback: 1055 nrt = dev_net(dev)->ipv6.ip6_null_entry; 1056 dst_hold(&nrt->dst); 1057 return nrt; 1058 } 1059 1060 static struct rt6_info *ip6_pol_route_lookup(struct net *net, 1061 struct fib6_table *table, 1062 struct flowi6 *fl6, 1063 const struct sk_buff *skb, 1064 int flags) 1065 { 1066 struct fib6_info *f6i; 1067 struct fib6_node *fn; 1068 struct rt6_info *rt; 1069 1070 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1071 flags &= ~RT6_LOOKUP_F_IFACE; 1072 1073 rcu_read_lock(); 1074 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1075 restart: 1076 f6i = rcu_dereference(fn->leaf); 1077 if (!f6i) { 1078 f6i = net->ipv6.fib6_null_entry; 1079 } else { 1080 f6i = rt6_device_match(net, f6i, &fl6->saddr, 1081 fl6->flowi6_oif, flags); 1082 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) 1083 f6i = fib6_multipath_select(net, f6i, fl6, 1084 fl6->flowi6_oif, skb, 1085 flags); 1086 } 1087 if (f6i == net->ipv6.fib6_null_entry) { 1088 fn = fib6_backtrack(fn, &fl6->saddr); 1089 if (fn) 1090 goto restart; 1091 } 1092 1093 trace_fib6_table_lookup(net, f6i, table, fl6); 1094 1095 /* Search through exception table */ 1096 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1097 if (rt) { 1098 if (ip6_hold_safe(net, &rt, true)) 1099 dst_use_noref(&rt->dst, jiffies); 1100 } else if (f6i == net->ipv6.fib6_null_entry) { 1101 rt = net->ipv6.ip6_null_entry; 1102 dst_hold(&rt->dst); 1103 } else { 1104 rt = ip6_create_rt_rcu(f6i); 1105 } 1106 1107 rcu_read_unlock(); 1108 1109 return rt; 1110 } 1111 1112 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, 1113 const struct sk_buff *skb, int flags) 1114 { 1115 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); 1116 } 1117 EXPORT_SYMBOL_GPL(ip6_route_lookup); 1118 1119 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, 1120 const struct in6_addr *saddr, int oif, 1121 const struct sk_buff *skb, int strict) 1122 { 1123 struct flowi6 fl6 = { 1124 .flowi6_oif = oif, 1125 .daddr = *daddr, 1126 }; 1127 struct dst_entry *dst; 1128 int flags = strict ? RT6_LOOKUP_F_IFACE : 0; 1129 1130 if (saddr) { 1131 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 1132 flags |= RT6_LOOKUP_F_HAS_SADDR; 1133 } 1134 1135 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); 1136 if (dst->error == 0) 1137 return (struct rt6_info *) dst; 1138 1139 dst_release(dst); 1140 1141 return NULL; 1142 } 1143 EXPORT_SYMBOL(rt6_lookup); 1144 1145 /* ip6_ins_rt is called with FREE table->tb6_lock. 1146 * It takes new route entry, the addition fails by any reason the 1147 * route is released. 1148 * Caller must hold dst before calling it. 1149 */ 1150 1151 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, 1152 struct netlink_ext_ack *extack) 1153 { 1154 int err; 1155 struct fib6_table *table; 1156 1157 table = rt->fib6_table; 1158 spin_lock_bh(&table->tb6_lock); 1159 err = fib6_add(&table->tb6_root, rt, info, extack); 1160 spin_unlock_bh(&table->tb6_lock); 1161 1162 return err; 1163 } 1164 1165 int ip6_ins_rt(struct net *net, struct fib6_info *rt) 1166 { 1167 struct nl_info info = { .nl_net = net, }; 1168 1169 return __ip6_ins_rt(rt, &info, NULL); 1170 } 1171 1172 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, 1173 const struct in6_addr *daddr, 1174 const struct in6_addr *saddr) 1175 { 1176 struct net_device *dev; 1177 struct rt6_info *rt; 1178 1179 /* 1180 * Clone the route. 1181 */ 1182 1183 if (!fib6_info_hold_safe(ort)) 1184 return NULL; 1185 1186 dev = ip6_rt_get_dev_rcu(ort); 1187 rt = ip6_dst_alloc(dev_net(dev), dev, 0); 1188 if (!rt) { 1189 fib6_info_release(ort); 1190 return NULL; 1191 } 1192 1193 ip6_rt_copy_init(rt, ort); 1194 rt->rt6i_flags |= RTF_CACHE; 1195 rt->dst.flags |= DST_HOST; 1196 rt->rt6i_dst.addr = *daddr; 1197 rt->rt6i_dst.plen = 128; 1198 1199 if (!rt6_is_gw_or_nonexthop(ort)) { 1200 if (ort->fib6_dst.plen != 128 && 1201 ipv6_addr_equal(&ort->fib6_dst.addr, daddr)) 1202 rt->rt6i_flags |= RTF_ANYCAST; 1203 #ifdef CONFIG_IPV6_SUBTREES 1204 if (rt->rt6i_src.plen && saddr) { 1205 rt->rt6i_src.addr = *saddr; 1206 rt->rt6i_src.plen = 128; 1207 } 1208 #endif 1209 } 1210 1211 return rt; 1212 } 1213 1214 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) 1215 { 1216 unsigned short flags = fib6_info_dst_flags(rt); 1217 struct net_device *dev; 1218 struct rt6_info *pcpu_rt; 1219 1220 if (!fib6_info_hold_safe(rt)) 1221 return NULL; 1222 1223 rcu_read_lock(); 1224 dev = ip6_rt_get_dev_rcu(rt); 1225 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); 1226 rcu_read_unlock(); 1227 if (!pcpu_rt) { 1228 fib6_info_release(rt); 1229 return NULL; 1230 } 1231 ip6_rt_copy_init(pcpu_rt, rt); 1232 pcpu_rt->rt6i_flags |= RTF_PCPU; 1233 return pcpu_rt; 1234 } 1235 1236 /* It should be called with rcu_read_lock() acquired */ 1237 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) 1238 { 1239 struct rt6_info *pcpu_rt, **p; 1240 1241 p = this_cpu_ptr(rt->rt6i_pcpu); 1242 pcpu_rt = *p; 1243 1244 if (pcpu_rt) 1245 ip6_hold_safe(NULL, &pcpu_rt, false); 1246 1247 return pcpu_rt; 1248 } 1249 1250 static struct rt6_info *rt6_make_pcpu_route(struct net *net, 1251 struct fib6_info *rt) 1252 { 1253 struct rt6_info *pcpu_rt, *prev, **p; 1254 1255 pcpu_rt = ip6_rt_pcpu_alloc(rt); 1256 if (!pcpu_rt) { 1257 dst_hold(&net->ipv6.ip6_null_entry->dst); 1258 return net->ipv6.ip6_null_entry; 1259 } 1260 1261 dst_hold(&pcpu_rt->dst); 1262 p = this_cpu_ptr(rt->rt6i_pcpu); 1263 prev = cmpxchg(p, NULL, pcpu_rt); 1264 BUG_ON(prev); 1265 1266 return pcpu_rt; 1267 } 1268 1269 /* exception hash table implementation 1270 */ 1271 static DEFINE_SPINLOCK(rt6_exception_lock); 1272 1273 /* Remove rt6_ex from hash table and free the memory 1274 * Caller must hold rt6_exception_lock 1275 */ 1276 static void rt6_remove_exception(struct rt6_exception_bucket *bucket, 1277 struct rt6_exception *rt6_ex) 1278 { 1279 struct fib6_info *from; 1280 struct net *net; 1281 1282 if (!bucket || !rt6_ex) 1283 return; 1284 1285 net = dev_net(rt6_ex->rt6i->dst.dev); 1286 net->ipv6.rt6_stats->fib_rt_cache--; 1287 1288 /* purge completely the exception to allow releasing the held resources: 1289 * some [sk] cache may keep the dst around for unlimited time 1290 */ 1291 from = rcu_dereference_protected(rt6_ex->rt6i->from, 1292 lockdep_is_held(&rt6_exception_lock)); 1293 rcu_assign_pointer(rt6_ex->rt6i->from, NULL); 1294 fib6_info_release(from); 1295 dst_dev_put(&rt6_ex->rt6i->dst); 1296 1297 hlist_del_rcu(&rt6_ex->hlist); 1298 dst_release(&rt6_ex->rt6i->dst); 1299 kfree_rcu(rt6_ex, rcu); 1300 WARN_ON_ONCE(!bucket->depth); 1301 bucket->depth--; 1302 } 1303 1304 /* Remove oldest rt6_ex in bucket and free the memory 1305 * Caller must hold rt6_exception_lock 1306 */ 1307 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) 1308 { 1309 struct rt6_exception *rt6_ex, *oldest = NULL; 1310 1311 if (!bucket) 1312 return; 1313 1314 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1315 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) 1316 oldest = rt6_ex; 1317 } 1318 rt6_remove_exception(bucket, oldest); 1319 } 1320 1321 static u32 rt6_exception_hash(const struct in6_addr *dst, 1322 const struct in6_addr *src) 1323 { 1324 static u32 seed __read_mostly; 1325 u32 val; 1326 1327 net_get_random_once(&seed, sizeof(seed)); 1328 val = jhash(dst, sizeof(*dst), seed); 1329 1330 #ifdef CONFIG_IPV6_SUBTREES 1331 if (src) 1332 val = jhash(src, sizeof(*src), val); 1333 #endif 1334 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); 1335 } 1336 1337 /* Helper function to find the cached rt in the hash table 1338 * and update bucket pointer to point to the bucket for this 1339 * (daddr, saddr) pair 1340 * Caller must hold rt6_exception_lock 1341 */ 1342 static struct rt6_exception * 1343 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, 1344 const struct in6_addr *daddr, 1345 const struct in6_addr *saddr) 1346 { 1347 struct rt6_exception *rt6_ex; 1348 u32 hval; 1349 1350 if (!(*bucket) || !daddr) 1351 return NULL; 1352 1353 hval = rt6_exception_hash(daddr, saddr); 1354 *bucket += hval; 1355 1356 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { 1357 struct rt6_info *rt6 = rt6_ex->rt6i; 1358 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1359 1360 #ifdef CONFIG_IPV6_SUBTREES 1361 if (matched && saddr) 1362 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1363 #endif 1364 if (matched) 1365 return rt6_ex; 1366 } 1367 return NULL; 1368 } 1369 1370 /* Helper function to find the cached rt in the hash table 1371 * and update bucket pointer to point to the bucket for this 1372 * (daddr, saddr) pair 1373 * Caller must hold rcu_read_lock() 1374 */ 1375 static struct rt6_exception * 1376 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, 1377 const struct in6_addr *daddr, 1378 const struct in6_addr *saddr) 1379 { 1380 struct rt6_exception *rt6_ex; 1381 u32 hval; 1382 1383 WARN_ON_ONCE(!rcu_read_lock_held()); 1384 1385 if (!(*bucket) || !daddr) 1386 return NULL; 1387 1388 hval = rt6_exception_hash(daddr, saddr); 1389 *bucket += hval; 1390 1391 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { 1392 struct rt6_info *rt6 = rt6_ex->rt6i; 1393 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1394 1395 #ifdef CONFIG_IPV6_SUBTREES 1396 if (matched && saddr) 1397 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1398 #endif 1399 if (matched) 1400 return rt6_ex; 1401 } 1402 return NULL; 1403 } 1404 1405 static unsigned int fib6_mtu(const struct fib6_info *rt) 1406 { 1407 unsigned int mtu; 1408 1409 if (rt->fib6_pmtu) { 1410 mtu = rt->fib6_pmtu; 1411 } else { 1412 struct net_device *dev = fib6_info_nh_dev(rt); 1413 struct inet6_dev *idev; 1414 1415 rcu_read_lock(); 1416 idev = __in6_dev_get(dev); 1417 mtu = idev->cnf.mtu6; 1418 rcu_read_unlock(); 1419 } 1420 1421 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 1422 1423 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); 1424 } 1425 1426 static int rt6_insert_exception(struct rt6_info *nrt, 1427 struct fib6_info *ort) 1428 { 1429 struct net *net = dev_net(nrt->dst.dev); 1430 struct rt6_exception_bucket *bucket; 1431 struct in6_addr *src_key = NULL; 1432 struct rt6_exception *rt6_ex; 1433 int err = 0; 1434 1435 spin_lock_bh(&rt6_exception_lock); 1436 1437 if (ort->exception_bucket_flushed) { 1438 err = -EINVAL; 1439 goto out; 1440 } 1441 1442 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, 1443 lockdep_is_held(&rt6_exception_lock)); 1444 if (!bucket) { 1445 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), 1446 GFP_ATOMIC); 1447 if (!bucket) { 1448 err = -ENOMEM; 1449 goto out; 1450 } 1451 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); 1452 } 1453 1454 #ifdef CONFIG_IPV6_SUBTREES 1455 /* rt6i_src.plen != 0 indicates ort is in subtree 1456 * and exception table is indexed by a hash of 1457 * both rt6i_dst and rt6i_src. 1458 * Otherwise, the exception table is indexed by 1459 * a hash of only rt6i_dst. 1460 */ 1461 if (ort->fib6_src.plen) 1462 src_key = &nrt->rt6i_src.addr; 1463 #endif 1464 /* rt6_mtu_change() might lower mtu on ort. 1465 * Only insert this exception route if its mtu 1466 * is less than ort's mtu value. 1467 */ 1468 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { 1469 err = -EINVAL; 1470 goto out; 1471 } 1472 1473 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, 1474 src_key); 1475 if (rt6_ex) 1476 rt6_remove_exception(bucket, rt6_ex); 1477 1478 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); 1479 if (!rt6_ex) { 1480 err = -ENOMEM; 1481 goto out; 1482 } 1483 rt6_ex->rt6i = nrt; 1484 rt6_ex->stamp = jiffies; 1485 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); 1486 bucket->depth++; 1487 net->ipv6.rt6_stats->fib_rt_cache++; 1488 1489 if (bucket->depth > FIB6_MAX_DEPTH) 1490 rt6_exception_remove_oldest(bucket); 1491 1492 out: 1493 spin_unlock_bh(&rt6_exception_lock); 1494 1495 /* Update fn->fn_sernum to invalidate all cached dst */ 1496 if (!err) { 1497 spin_lock_bh(&ort->fib6_table->tb6_lock); 1498 fib6_update_sernum(net, ort); 1499 spin_unlock_bh(&ort->fib6_table->tb6_lock); 1500 fib6_force_start_gc(net); 1501 } 1502 1503 return err; 1504 } 1505 1506 void rt6_flush_exceptions(struct fib6_info *rt) 1507 { 1508 struct rt6_exception_bucket *bucket; 1509 struct rt6_exception *rt6_ex; 1510 struct hlist_node *tmp; 1511 int i; 1512 1513 spin_lock_bh(&rt6_exception_lock); 1514 /* Prevent rt6_insert_exception() to recreate the bucket list */ 1515 rt->exception_bucket_flushed = 1; 1516 1517 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1518 lockdep_is_held(&rt6_exception_lock)); 1519 if (!bucket) 1520 goto out; 1521 1522 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1523 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) 1524 rt6_remove_exception(bucket, rt6_ex); 1525 WARN_ON_ONCE(bucket->depth); 1526 bucket++; 1527 } 1528 1529 out: 1530 spin_unlock_bh(&rt6_exception_lock); 1531 } 1532 1533 /* Find cached rt in the hash table inside passed in rt 1534 * Caller has to hold rcu_read_lock() 1535 */ 1536 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 1537 struct in6_addr *daddr, 1538 struct in6_addr *saddr) 1539 { 1540 struct rt6_exception_bucket *bucket; 1541 struct in6_addr *src_key = NULL; 1542 struct rt6_exception *rt6_ex; 1543 struct rt6_info *res = NULL; 1544 1545 bucket = rcu_dereference(rt->rt6i_exception_bucket); 1546 1547 #ifdef CONFIG_IPV6_SUBTREES 1548 /* rt6i_src.plen != 0 indicates rt is in subtree 1549 * and exception table is indexed by a hash of 1550 * both rt6i_dst and rt6i_src. 1551 * Otherwise, the exception table is indexed by 1552 * a hash of only rt6i_dst. 1553 */ 1554 if (rt->fib6_src.plen) 1555 src_key = saddr; 1556 #endif 1557 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 1558 1559 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 1560 res = rt6_ex->rt6i; 1561 1562 return res; 1563 } 1564 1565 /* Remove the passed in cached rt from the hash table that contains it */ 1566 static int rt6_remove_exception_rt(struct rt6_info *rt) 1567 { 1568 struct rt6_exception_bucket *bucket; 1569 struct in6_addr *src_key = NULL; 1570 struct rt6_exception *rt6_ex; 1571 struct fib6_info *from; 1572 int err; 1573 1574 from = rcu_dereference(rt->from); 1575 if (!from || 1576 !(rt->rt6i_flags & RTF_CACHE)) 1577 return -EINVAL; 1578 1579 if (!rcu_access_pointer(from->rt6i_exception_bucket)) 1580 return -ENOENT; 1581 1582 spin_lock_bh(&rt6_exception_lock); 1583 bucket = rcu_dereference_protected(from->rt6i_exception_bucket, 1584 lockdep_is_held(&rt6_exception_lock)); 1585 #ifdef CONFIG_IPV6_SUBTREES 1586 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1587 * and exception table is indexed by a hash of 1588 * both rt6i_dst and rt6i_src. 1589 * Otherwise, the exception table is indexed by 1590 * a hash of only rt6i_dst. 1591 */ 1592 if (from->fib6_src.plen) 1593 src_key = &rt->rt6i_src.addr; 1594 #endif 1595 rt6_ex = __rt6_find_exception_spinlock(&bucket, 1596 &rt->rt6i_dst.addr, 1597 src_key); 1598 if (rt6_ex) { 1599 rt6_remove_exception(bucket, rt6_ex); 1600 err = 0; 1601 } else { 1602 err = -ENOENT; 1603 } 1604 1605 spin_unlock_bh(&rt6_exception_lock); 1606 return err; 1607 } 1608 1609 /* Find rt6_ex which contains the passed in rt cache and 1610 * refresh its stamp 1611 */ 1612 static void rt6_update_exception_stamp_rt(struct rt6_info *rt) 1613 { 1614 struct rt6_exception_bucket *bucket; 1615 struct in6_addr *src_key = NULL; 1616 struct rt6_exception *rt6_ex; 1617 struct fib6_info *from; 1618 1619 rcu_read_lock(); 1620 from = rcu_dereference(rt->from); 1621 if (!from || !(rt->rt6i_flags & RTF_CACHE)) 1622 goto unlock; 1623 1624 bucket = rcu_dereference(from->rt6i_exception_bucket); 1625 1626 #ifdef CONFIG_IPV6_SUBTREES 1627 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1628 * and exception table is indexed by a hash of 1629 * both rt6i_dst and rt6i_src. 1630 * Otherwise, the exception table is indexed by 1631 * a hash of only rt6i_dst. 1632 */ 1633 if (from->fib6_src.plen) 1634 src_key = &rt->rt6i_src.addr; 1635 #endif 1636 rt6_ex = __rt6_find_exception_rcu(&bucket, 1637 &rt->rt6i_dst.addr, 1638 src_key); 1639 if (rt6_ex) 1640 rt6_ex->stamp = jiffies; 1641 1642 unlock: 1643 rcu_read_unlock(); 1644 } 1645 1646 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, 1647 struct rt6_info *rt, int mtu) 1648 { 1649 /* If the new MTU is lower than the route PMTU, this new MTU will be the 1650 * lowest MTU in the path: always allow updating the route PMTU to 1651 * reflect PMTU decreases. 1652 * 1653 * If the new MTU is higher, and the route PMTU is equal to the local 1654 * MTU, this means the old MTU is the lowest in the path, so allow 1655 * updating it: if other nodes now have lower MTUs, PMTU discovery will 1656 * handle this. 1657 */ 1658 1659 if (dst_mtu(&rt->dst) >= mtu) 1660 return true; 1661 1662 if (dst_mtu(&rt->dst) == idev->cnf.mtu6) 1663 return true; 1664 1665 return false; 1666 } 1667 1668 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, 1669 struct fib6_info *rt, int mtu) 1670 { 1671 struct rt6_exception_bucket *bucket; 1672 struct rt6_exception *rt6_ex; 1673 int i; 1674 1675 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1676 lockdep_is_held(&rt6_exception_lock)); 1677 1678 if (!bucket) 1679 return; 1680 1681 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1682 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1683 struct rt6_info *entry = rt6_ex->rt6i; 1684 1685 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected 1686 * route), the metrics of its rt->from have already 1687 * been updated. 1688 */ 1689 if (dst_metric_raw(&entry->dst, RTAX_MTU) && 1690 rt6_mtu_change_route_allowed(idev, entry, mtu)) 1691 dst_metric_set(&entry->dst, RTAX_MTU, mtu); 1692 } 1693 bucket++; 1694 } 1695 } 1696 1697 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 1698 1699 static void rt6_exceptions_clean_tohost(struct fib6_info *rt, 1700 struct in6_addr *gateway) 1701 { 1702 struct rt6_exception_bucket *bucket; 1703 struct rt6_exception *rt6_ex; 1704 struct hlist_node *tmp; 1705 int i; 1706 1707 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1708 return; 1709 1710 spin_lock_bh(&rt6_exception_lock); 1711 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1712 lockdep_is_held(&rt6_exception_lock)); 1713 1714 if (bucket) { 1715 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1716 hlist_for_each_entry_safe(rt6_ex, tmp, 1717 &bucket->chain, hlist) { 1718 struct rt6_info *entry = rt6_ex->rt6i; 1719 1720 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == 1721 RTF_CACHE_GATEWAY && 1722 ipv6_addr_equal(gateway, 1723 &entry->rt6i_gateway)) { 1724 rt6_remove_exception(bucket, rt6_ex); 1725 } 1726 } 1727 bucket++; 1728 } 1729 } 1730 1731 spin_unlock_bh(&rt6_exception_lock); 1732 } 1733 1734 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, 1735 struct rt6_exception *rt6_ex, 1736 struct fib6_gc_args *gc_args, 1737 unsigned long now) 1738 { 1739 struct rt6_info *rt = rt6_ex->rt6i; 1740 1741 /* we are pruning and obsoleting aged-out and non gateway exceptions 1742 * even if others have still references to them, so that on next 1743 * dst_check() such references can be dropped. 1744 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when 1745 * expired, independently from their aging, as per RFC 8201 section 4 1746 */ 1747 if (!(rt->rt6i_flags & RTF_EXPIRES)) { 1748 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { 1749 RT6_TRACE("aging clone %p\n", rt); 1750 rt6_remove_exception(bucket, rt6_ex); 1751 return; 1752 } 1753 } else if (time_after(jiffies, rt->dst.expires)) { 1754 RT6_TRACE("purging expired route %p\n", rt); 1755 rt6_remove_exception(bucket, rt6_ex); 1756 return; 1757 } 1758 1759 if (rt->rt6i_flags & RTF_GATEWAY) { 1760 struct neighbour *neigh; 1761 __u8 neigh_flags = 0; 1762 1763 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); 1764 if (neigh) 1765 neigh_flags = neigh->flags; 1766 1767 if (!(neigh_flags & NTF_ROUTER)) { 1768 RT6_TRACE("purging route %p via non-router but gateway\n", 1769 rt); 1770 rt6_remove_exception(bucket, rt6_ex); 1771 return; 1772 } 1773 } 1774 1775 gc_args->more++; 1776 } 1777 1778 void rt6_age_exceptions(struct fib6_info *rt, 1779 struct fib6_gc_args *gc_args, 1780 unsigned long now) 1781 { 1782 struct rt6_exception_bucket *bucket; 1783 struct rt6_exception *rt6_ex; 1784 struct hlist_node *tmp; 1785 int i; 1786 1787 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1788 return; 1789 1790 rcu_read_lock_bh(); 1791 spin_lock(&rt6_exception_lock); 1792 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1793 lockdep_is_held(&rt6_exception_lock)); 1794 1795 if (bucket) { 1796 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1797 hlist_for_each_entry_safe(rt6_ex, tmp, 1798 &bucket->chain, hlist) { 1799 rt6_age_examine_exception(bucket, rt6_ex, 1800 gc_args, now); 1801 } 1802 bucket++; 1803 } 1804 } 1805 spin_unlock(&rt6_exception_lock); 1806 rcu_read_unlock_bh(); 1807 } 1808 1809 /* must be called with rcu lock held */ 1810 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, 1811 int oif, struct flowi6 *fl6, int strict) 1812 { 1813 struct fib6_node *fn, *saved_fn; 1814 struct fib6_info *f6i; 1815 1816 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1817 saved_fn = fn; 1818 1819 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1820 oif = 0; 1821 1822 redo_rt6_select: 1823 f6i = rt6_select(net, fn, oif, strict); 1824 if (f6i == net->ipv6.fib6_null_entry) { 1825 fn = fib6_backtrack(fn, &fl6->saddr); 1826 if (fn) 1827 goto redo_rt6_select; 1828 else if (strict & RT6_LOOKUP_F_REACHABLE) { 1829 /* also consider unreachable route */ 1830 strict &= ~RT6_LOOKUP_F_REACHABLE; 1831 fn = saved_fn; 1832 goto redo_rt6_select; 1833 } 1834 } 1835 1836 trace_fib6_table_lookup(net, f6i, table, fl6); 1837 1838 return f6i; 1839 } 1840 1841 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, 1842 int oif, struct flowi6 *fl6, 1843 const struct sk_buff *skb, int flags) 1844 { 1845 struct fib6_info *f6i; 1846 struct rt6_info *rt; 1847 int strict = 0; 1848 1849 strict |= flags & RT6_LOOKUP_F_IFACE; 1850 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; 1851 if (net->ipv6.devconf_all->forwarding == 0) 1852 strict |= RT6_LOOKUP_F_REACHABLE; 1853 1854 rcu_read_lock(); 1855 1856 f6i = fib6_table_lookup(net, table, oif, fl6, strict); 1857 if (f6i->fib6_nsiblings) 1858 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); 1859 1860 if (f6i == net->ipv6.fib6_null_entry) { 1861 rt = net->ipv6.ip6_null_entry; 1862 rcu_read_unlock(); 1863 dst_hold(&rt->dst); 1864 return rt; 1865 } 1866 1867 /*Search through exception table */ 1868 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1869 if (rt) { 1870 if (ip6_hold_safe(net, &rt, true)) 1871 dst_use_noref(&rt->dst, jiffies); 1872 1873 rcu_read_unlock(); 1874 return rt; 1875 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && 1876 !(f6i->fib6_flags & RTF_GATEWAY))) { 1877 /* Create a RTF_CACHE clone which will not be 1878 * owned by the fib6 tree. It is for the special case where 1879 * the daddr in the skb during the neighbor look-up is different 1880 * from the fl6->daddr used to look-up route here. 1881 */ 1882 struct rt6_info *uncached_rt; 1883 1884 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); 1885 1886 rcu_read_unlock(); 1887 1888 if (uncached_rt) { 1889 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() 1890 * No need for another dst_hold() 1891 */ 1892 rt6_uncached_list_add(uncached_rt); 1893 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 1894 } else { 1895 uncached_rt = net->ipv6.ip6_null_entry; 1896 dst_hold(&uncached_rt->dst); 1897 } 1898 1899 return uncached_rt; 1900 } else { 1901 /* Get a percpu copy */ 1902 1903 struct rt6_info *pcpu_rt; 1904 1905 local_bh_disable(); 1906 pcpu_rt = rt6_get_pcpu_route(f6i); 1907 1908 if (!pcpu_rt) 1909 pcpu_rt = rt6_make_pcpu_route(net, f6i); 1910 1911 local_bh_enable(); 1912 rcu_read_unlock(); 1913 1914 return pcpu_rt; 1915 } 1916 } 1917 EXPORT_SYMBOL_GPL(ip6_pol_route); 1918 1919 static struct rt6_info *ip6_pol_route_input(struct net *net, 1920 struct fib6_table *table, 1921 struct flowi6 *fl6, 1922 const struct sk_buff *skb, 1923 int flags) 1924 { 1925 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); 1926 } 1927 1928 struct dst_entry *ip6_route_input_lookup(struct net *net, 1929 struct net_device *dev, 1930 struct flowi6 *fl6, 1931 const struct sk_buff *skb, 1932 int flags) 1933 { 1934 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) 1935 flags |= RT6_LOOKUP_F_IFACE; 1936 1937 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); 1938 } 1939 EXPORT_SYMBOL_GPL(ip6_route_input_lookup); 1940 1941 static void ip6_multipath_l3_keys(const struct sk_buff *skb, 1942 struct flow_keys *keys, 1943 struct flow_keys *flkeys) 1944 { 1945 const struct ipv6hdr *outer_iph = ipv6_hdr(skb); 1946 const struct ipv6hdr *key_iph = outer_iph; 1947 struct flow_keys *_flkeys = flkeys; 1948 const struct ipv6hdr *inner_iph; 1949 const struct icmp6hdr *icmph; 1950 struct ipv6hdr _inner_iph; 1951 struct icmp6hdr _icmph; 1952 1953 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) 1954 goto out; 1955 1956 icmph = skb_header_pointer(skb, skb_transport_offset(skb), 1957 sizeof(_icmph), &_icmph); 1958 if (!icmph) 1959 goto out; 1960 1961 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && 1962 icmph->icmp6_type != ICMPV6_PKT_TOOBIG && 1963 icmph->icmp6_type != ICMPV6_TIME_EXCEED && 1964 icmph->icmp6_type != ICMPV6_PARAMPROB) 1965 goto out; 1966 1967 inner_iph = skb_header_pointer(skb, 1968 skb_transport_offset(skb) + sizeof(*icmph), 1969 sizeof(_inner_iph), &_inner_iph); 1970 if (!inner_iph) 1971 goto out; 1972 1973 key_iph = inner_iph; 1974 _flkeys = NULL; 1975 out: 1976 if (_flkeys) { 1977 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; 1978 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; 1979 keys->tags.flow_label = _flkeys->tags.flow_label; 1980 keys->basic.ip_proto = _flkeys->basic.ip_proto; 1981 } else { 1982 keys->addrs.v6addrs.src = key_iph->saddr; 1983 keys->addrs.v6addrs.dst = key_iph->daddr; 1984 keys->tags.flow_label = ip6_flowlabel(key_iph); 1985 keys->basic.ip_proto = key_iph->nexthdr; 1986 } 1987 } 1988 1989 /* if skb is set it will be used and fl6 can be NULL */ 1990 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 1991 const struct sk_buff *skb, struct flow_keys *flkeys) 1992 { 1993 struct flow_keys hash_keys; 1994 u32 mhash; 1995 1996 switch (ip6_multipath_hash_policy(net)) { 1997 case 0: 1998 memset(&hash_keys, 0, sizeof(hash_keys)); 1999 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2000 if (skb) { 2001 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2002 } else { 2003 hash_keys.addrs.v6addrs.src = fl6->saddr; 2004 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2005 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2006 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2007 } 2008 break; 2009 case 1: 2010 if (skb) { 2011 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 2012 struct flow_keys keys; 2013 2014 /* short-circuit if we already have L4 hash present */ 2015 if (skb->l4_hash) 2016 return skb_get_hash_raw(skb) >> 1; 2017 2018 memset(&hash_keys, 0, sizeof(hash_keys)); 2019 2020 if (!flkeys) { 2021 skb_flow_dissect_flow_keys(skb, &keys, flag); 2022 flkeys = &keys; 2023 } 2024 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2025 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2026 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2027 hash_keys.ports.src = flkeys->ports.src; 2028 hash_keys.ports.dst = flkeys->ports.dst; 2029 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2030 } else { 2031 memset(&hash_keys, 0, sizeof(hash_keys)); 2032 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2033 hash_keys.addrs.v6addrs.src = fl6->saddr; 2034 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2035 hash_keys.ports.src = fl6->fl6_sport; 2036 hash_keys.ports.dst = fl6->fl6_dport; 2037 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2038 } 2039 break; 2040 } 2041 mhash = flow_hash_from_keys(&hash_keys); 2042 2043 return mhash >> 1; 2044 } 2045 2046 void ip6_route_input(struct sk_buff *skb) 2047 { 2048 const struct ipv6hdr *iph = ipv6_hdr(skb); 2049 struct net *net = dev_net(skb->dev); 2050 int flags = RT6_LOOKUP_F_HAS_SADDR; 2051 struct ip_tunnel_info *tun_info; 2052 struct flowi6 fl6 = { 2053 .flowi6_iif = skb->dev->ifindex, 2054 .daddr = iph->daddr, 2055 .saddr = iph->saddr, 2056 .flowlabel = ip6_flowinfo(iph), 2057 .flowi6_mark = skb->mark, 2058 .flowi6_proto = iph->nexthdr, 2059 }; 2060 struct flow_keys *flkeys = NULL, _flkeys; 2061 2062 tun_info = skb_tunnel_info(skb); 2063 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2064 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2065 2066 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2067 flkeys = &_flkeys; 2068 2069 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2070 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2071 skb_dst_drop(skb); 2072 skb_dst_set(skb, 2073 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); 2074 } 2075 2076 static struct rt6_info *ip6_pol_route_output(struct net *net, 2077 struct fib6_table *table, 2078 struct flowi6 *fl6, 2079 const struct sk_buff *skb, 2080 int flags) 2081 { 2082 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2083 } 2084 2085 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, 2086 struct flowi6 *fl6, int flags) 2087 { 2088 bool any_src; 2089 2090 if (ipv6_addr_type(&fl6->daddr) & 2091 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { 2092 struct dst_entry *dst; 2093 2094 dst = l3mdev_link_scope_lookup(net, fl6); 2095 if (dst) 2096 return dst; 2097 } 2098 2099 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2100 2101 any_src = ipv6_addr_any(&fl6->saddr); 2102 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2103 (fl6->flowi6_oif && any_src)) 2104 flags |= RT6_LOOKUP_F_IFACE; 2105 2106 if (!any_src) 2107 flags |= RT6_LOOKUP_F_HAS_SADDR; 2108 else if (sk) 2109 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2110 2111 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2112 } 2113 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2114 2115 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2116 { 2117 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2118 struct net_device *loopback_dev = net->loopback_dev; 2119 struct dst_entry *new = NULL; 2120 2121 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2122 DST_OBSOLETE_DEAD, 0); 2123 if (rt) { 2124 rt6_info_init(rt); 2125 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2126 2127 new = &rt->dst; 2128 new->__use = 1; 2129 new->input = dst_discard; 2130 new->output = dst_discard_out; 2131 2132 dst_copy_metrics(new, &ort->dst); 2133 2134 rt->rt6i_idev = in6_dev_get(loopback_dev); 2135 rt->rt6i_gateway = ort->rt6i_gateway; 2136 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2137 2138 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2139 #ifdef CONFIG_IPV6_SUBTREES 2140 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2141 #endif 2142 } 2143 2144 dst_release(dst_orig); 2145 return new ? new : ERR_PTR(-ENOMEM); 2146 } 2147 2148 /* 2149 * Destination cache support functions 2150 */ 2151 2152 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2153 { 2154 u32 rt_cookie = 0; 2155 2156 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2157 return false; 2158 2159 if (fib6_check_expired(f6i)) 2160 return false; 2161 2162 return true; 2163 } 2164 2165 static struct dst_entry *rt6_check(struct rt6_info *rt, 2166 struct fib6_info *from, 2167 u32 cookie) 2168 { 2169 u32 rt_cookie = 0; 2170 2171 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2172 rt_cookie != cookie) 2173 return NULL; 2174 2175 if (rt6_check_expired(rt)) 2176 return NULL; 2177 2178 return &rt->dst; 2179 } 2180 2181 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2182 struct fib6_info *from, 2183 u32 cookie) 2184 { 2185 if (!__rt6_check_expired(rt) && 2186 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2187 fib6_check(from, cookie)) 2188 return &rt->dst; 2189 else 2190 return NULL; 2191 } 2192 2193 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2194 { 2195 struct dst_entry *dst_ret; 2196 struct fib6_info *from; 2197 struct rt6_info *rt; 2198 2199 rt = container_of(dst, struct rt6_info, dst); 2200 2201 rcu_read_lock(); 2202 2203 /* All IPV6 dsts are created with ->obsolete set to the value 2204 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2205 * into this function always. 2206 */ 2207 2208 from = rcu_dereference(rt->from); 2209 2210 if (from && (rt->rt6i_flags & RTF_PCPU || 2211 unlikely(!list_empty(&rt->rt6i_uncached)))) 2212 dst_ret = rt6_dst_from_check(rt, from, cookie); 2213 else 2214 dst_ret = rt6_check(rt, from, cookie); 2215 2216 rcu_read_unlock(); 2217 2218 return dst_ret; 2219 } 2220 2221 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2222 { 2223 struct rt6_info *rt = (struct rt6_info *) dst; 2224 2225 if (rt) { 2226 if (rt->rt6i_flags & RTF_CACHE) { 2227 rcu_read_lock(); 2228 if (rt6_check_expired(rt)) { 2229 rt6_remove_exception_rt(rt); 2230 dst = NULL; 2231 } 2232 rcu_read_unlock(); 2233 } else { 2234 dst_release(dst); 2235 dst = NULL; 2236 } 2237 } 2238 return dst; 2239 } 2240 2241 static void ip6_link_failure(struct sk_buff *skb) 2242 { 2243 struct rt6_info *rt; 2244 2245 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2246 2247 rt = (struct rt6_info *) skb_dst(skb); 2248 if (rt) { 2249 rcu_read_lock(); 2250 if (rt->rt6i_flags & RTF_CACHE) { 2251 rt6_remove_exception_rt(rt); 2252 } else { 2253 struct fib6_info *from; 2254 struct fib6_node *fn; 2255 2256 from = rcu_dereference(rt->from); 2257 if (from) { 2258 fn = rcu_dereference(from->fib6_node); 2259 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2260 fn->fn_sernum = -1; 2261 } 2262 } 2263 rcu_read_unlock(); 2264 } 2265 } 2266 2267 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2268 { 2269 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2270 struct fib6_info *from; 2271 2272 rcu_read_lock(); 2273 from = rcu_dereference(rt0->from); 2274 if (from) 2275 rt0->dst.expires = from->expires; 2276 rcu_read_unlock(); 2277 } 2278 2279 dst_set_expires(&rt0->dst, timeout); 2280 rt0->rt6i_flags |= RTF_EXPIRES; 2281 } 2282 2283 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2284 { 2285 struct net *net = dev_net(rt->dst.dev); 2286 2287 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2288 rt->rt6i_flags |= RTF_MODIFIED; 2289 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2290 } 2291 2292 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2293 { 2294 return !(rt->rt6i_flags & RTF_CACHE) && 2295 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2296 } 2297 2298 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2299 const struct ipv6hdr *iph, u32 mtu) 2300 { 2301 const struct in6_addr *daddr, *saddr; 2302 struct rt6_info *rt6 = (struct rt6_info *)dst; 2303 2304 if (dst_metric_locked(dst, RTAX_MTU)) 2305 return; 2306 2307 if (iph) { 2308 daddr = &iph->daddr; 2309 saddr = &iph->saddr; 2310 } else if (sk) { 2311 daddr = &sk->sk_v6_daddr; 2312 saddr = &inet6_sk(sk)->saddr; 2313 } else { 2314 daddr = NULL; 2315 saddr = NULL; 2316 } 2317 dst_confirm_neigh(dst, daddr); 2318 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2319 if (mtu >= dst_mtu(dst)) 2320 return; 2321 2322 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2323 rt6_do_update_pmtu(rt6, mtu); 2324 /* update rt6_ex->stamp for cache */ 2325 if (rt6->rt6i_flags & RTF_CACHE) 2326 rt6_update_exception_stamp_rt(rt6); 2327 } else if (daddr) { 2328 struct fib6_info *from; 2329 struct rt6_info *nrt6; 2330 2331 rcu_read_lock(); 2332 from = rcu_dereference(rt6->from); 2333 if (!from) { 2334 rcu_read_unlock(); 2335 return; 2336 } 2337 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2338 if (nrt6) { 2339 rt6_do_update_pmtu(nrt6, mtu); 2340 if (rt6_insert_exception(nrt6, from)) 2341 dst_release_immediate(&nrt6->dst); 2342 } 2343 rcu_read_unlock(); 2344 } 2345 } 2346 2347 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2348 struct sk_buff *skb, u32 mtu) 2349 { 2350 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2351 } 2352 2353 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2354 int oif, u32 mark, kuid_t uid) 2355 { 2356 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2357 struct dst_entry *dst; 2358 struct flowi6 fl6 = { 2359 .flowi6_oif = oif, 2360 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2361 .daddr = iph->daddr, 2362 .saddr = iph->saddr, 2363 .flowlabel = ip6_flowinfo(iph), 2364 .flowi6_uid = uid, 2365 }; 2366 2367 dst = ip6_route_output(net, NULL, &fl6); 2368 if (!dst->error) 2369 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2370 dst_release(dst); 2371 } 2372 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2373 2374 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2375 { 2376 int oif = sk->sk_bound_dev_if; 2377 struct dst_entry *dst; 2378 2379 if (!oif && skb->dev) 2380 oif = l3mdev_master_ifindex(skb->dev); 2381 2382 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2383 2384 dst = __sk_dst_get(sk); 2385 if (!dst || !dst->obsolete || 2386 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2387 return; 2388 2389 bh_lock_sock(sk); 2390 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2391 ip6_datagram_dst_update(sk, false); 2392 bh_unlock_sock(sk); 2393 } 2394 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2395 2396 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2397 const struct flowi6 *fl6) 2398 { 2399 #ifdef CONFIG_IPV6_SUBTREES 2400 struct ipv6_pinfo *np = inet6_sk(sk); 2401 #endif 2402 2403 ip6_dst_store(sk, dst, 2404 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2405 &sk->sk_v6_daddr : NULL, 2406 #ifdef CONFIG_IPV6_SUBTREES 2407 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2408 &np->saddr : 2409 #endif 2410 NULL); 2411 } 2412 2413 /* Handle redirects */ 2414 struct ip6rd_flowi { 2415 struct flowi6 fl6; 2416 struct in6_addr gateway; 2417 }; 2418 2419 static struct rt6_info *__ip6_route_redirect(struct net *net, 2420 struct fib6_table *table, 2421 struct flowi6 *fl6, 2422 const struct sk_buff *skb, 2423 int flags) 2424 { 2425 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2426 struct rt6_info *ret = NULL, *rt_cache; 2427 struct fib6_info *rt; 2428 struct fib6_node *fn; 2429 2430 /* Get the "current" route for this destination and 2431 * check if the redirect has come from appropriate router. 2432 * 2433 * RFC 4861 specifies that redirects should only be 2434 * accepted if they come from the nexthop to the target. 2435 * Due to the way the routes are chosen, this notion 2436 * is a bit fuzzy and one might need to check all possible 2437 * routes. 2438 */ 2439 2440 rcu_read_lock(); 2441 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2442 restart: 2443 for_each_fib6_node_rt_rcu(fn) { 2444 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2445 continue; 2446 if (fib6_check_expired(rt)) 2447 continue; 2448 if (rt->fib6_flags & RTF_REJECT) 2449 break; 2450 if (!(rt->fib6_flags & RTF_GATEWAY)) 2451 continue; 2452 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2453 continue; 2454 /* rt_cache's gateway might be different from its 'parent' 2455 * in the case of an ip redirect. 2456 * So we keep searching in the exception table if the gateway 2457 * is different. 2458 */ 2459 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2460 rt_cache = rt6_find_cached_rt(rt, 2461 &fl6->daddr, 2462 &fl6->saddr); 2463 if (rt_cache && 2464 ipv6_addr_equal(&rdfl->gateway, 2465 &rt_cache->rt6i_gateway)) { 2466 ret = rt_cache; 2467 break; 2468 } 2469 continue; 2470 } 2471 break; 2472 } 2473 2474 if (!rt) 2475 rt = net->ipv6.fib6_null_entry; 2476 else if (rt->fib6_flags & RTF_REJECT) { 2477 ret = net->ipv6.ip6_null_entry; 2478 goto out; 2479 } 2480 2481 if (rt == net->ipv6.fib6_null_entry) { 2482 fn = fib6_backtrack(fn, &fl6->saddr); 2483 if (fn) 2484 goto restart; 2485 } 2486 2487 out: 2488 if (ret) 2489 ip6_hold_safe(net, &ret, true); 2490 else 2491 ret = ip6_create_rt_rcu(rt); 2492 2493 rcu_read_unlock(); 2494 2495 trace_fib6_table_lookup(net, rt, table, fl6); 2496 return ret; 2497 }; 2498 2499 static struct dst_entry *ip6_route_redirect(struct net *net, 2500 const struct flowi6 *fl6, 2501 const struct sk_buff *skb, 2502 const struct in6_addr *gateway) 2503 { 2504 int flags = RT6_LOOKUP_F_HAS_SADDR; 2505 struct ip6rd_flowi rdfl; 2506 2507 rdfl.fl6 = *fl6; 2508 rdfl.gateway = *gateway; 2509 2510 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2511 flags, __ip6_route_redirect); 2512 } 2513 2514 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2515 kuid_t uid) 2516 { 2517 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2518 struct dst_entry *dst; 2519 struct flowi6 fl6 = { 2520 .flowi6_iif = LOOPBACK_IFINDEX, 2521 .flowi6_oif = oif, 2522 .flowi6_mark = mark, 2523 .daddr = iph->daddr, 2524 .saddr = iph->saddr, 2525 .flowlabel = ip6_flowinfo(iph), 2526 .flowi6_uid = uid, 2527 }; 2528 2529 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2530 rt6_do_redirect(dst, NULL, skb); 2531 dst_release(dst); 2532 } 2533 EXPORT_SYMBOL_GPL(ip6_redirect); 2534 2535 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 2536 { 2537 const struct ipv6hdr *iph = ipv6_hdr(skb); 2538 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2539 struct dst_entry *dst; 2540 struct flowi6 fl6 = { 2541 .flowi6_iif = LOOPBACK_IFINDEX, 2542 .flowi6_oif = oif, 2543 .daddr = msg->dest, 2544 .saddr = iph->daddr, 2545 .flowi6_uid = sock_net_uid(net, NULL), 2546 }; 2547 2548 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2549 rt6_do_redirect(dst, NULL, skb); 2550 dst_release(dst); 2551 } 2552 2553 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2554 { 2555 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2556 sk->sk_uid); 2557 } 2558 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2559 2560 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2561 { 2562 struct net_device *dev = dst->dev; 2563 unsigned int mtu = dst_mtu(dst); 2564 struct net *net = dev_net(dev); 2565 2566 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2567 2568 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2569 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2570 2571 /* 2572 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2573 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 2574 * IPV6_MAXPLEN is also valid and means: "any MSS, 2575 * rely only on pmtu discovery" 2576 */ 2577 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2578 mtu = IPV6_MAXPLEN; 2579 return mtu; 2580 } 2581 2582 static unsigned int ip6_mtu(const struct dst_entry *dst) 2583 { 2584 struct inet6_dev *idev; 2585 unsigned int mtu; 2586 2587 mtu = dst_metric_raw(dst, RTAX_MTU); 2588 if (mtu) 2589 goto out; 2590 2591 mtu = IPV6_MIN_MTU; 2592 2593 rcu_read_lock(); 2594 idev = __in6_dev_get(dst->dev); 2595 if (idev) 2596 mtu = idev->cnf.mtu6; 2597 rcu_read_unlock(); 2598 2599 out: 2600 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2601 2602 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2603 } 2604 2605 /* MTU selection: 2606 * 1. mtu on route is locked - use it 2607 * 2. mtu from nexthop exception 2608 * 3. mtu from egress device 2609 * 2610 * based on ip6_dst_mtu_forward and exception logic of 2611 * rt6_find_cached_rt; called with rcu_read_lock 2612 */ 2613 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2614 struct in6_addr *saddr) 2615 { 2616 struct rt6_exception_bucket *bucket; 2617 struct rt6_exception *rt6_ex; 2618 struct in6_addr *src_key; 2619 struct inet6_dev *idev; 2620 u32 mtu = 0; 2621 2622 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2623 mtu = f6i->fib6_pmtu; 2624 if (mtu) 2625 goto out; 2626 } 2627 2628 src_key = NULL; 2629 #ifdef CONFIG_IPV6_SUBTREES 2630 if (f6i->fib6_src.plen) 2631 src_key = saddr; 2632 #endif 2633 2634 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2635 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2636 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2637 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2638 2639 if (likely(!mtu)) { 2640 struct net_device *dev = fib6_info_nh_dev(f6i); 2641 2642 mtu = IPV6_MIN_MTU; 2643 idev = __in6_dev_get(dev); 2644 if (idev && idev->cnf.mtu6 > mtu) 2645 mtu = idev->cnf.mtu6; 2646 } 2647 2648 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2649 out: 2650 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2651 } 2652 2653 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2654 struct flowi6 *fl6) 2655 { 2656 struct dst_entry *dst; 2657 struct rt6_info *rt; 2658 struct inet6_dev *idev = in6_dev_get(dev); 2659 struct net *net = dev_net(dev); 2660 2661 if (unlikely(!idev)) 2662 return ERR_PTR(-ENODEV); 2663 2664 rt = ip6_dst_alloc(net, dev, 0); 2665 if (unlikely(!rt)) { 2666 in6_dev_put(idev); 2667 dst = ERR_PTR(-ENOMEM); 2668 goto out; 2669 } 2670 2671 rt->dst.flags |= DST_HOST; 2672 rt->dst.input = ip6_input; 2673 rt->dst.output = ip6_output; 2674 rt->rt6i_gateway = fl6->daddr; 2675 rt->rt6i_dst.addr = fl6->daddr; 2676 rt->rt6i_dst.plen = 128; 2677 rt->rt6i_idev = idev; 2678 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2679 2680 /* Add this dst into uncached_list so that rt6_disable_ip() can 2681 * do proper release of the net_device 2682 */ 2683 rt6_uncached_list_add(rt); 2684 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2685 2686 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2687 2688 out: 2689 return dst; 2690 } 2691 2692 static int ip6_dst_gc(struct dst_ops *ops) 2693 { 2694 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2695 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2696 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2697 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2698 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2699 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2700 int entries; 2701 2702 entries = dst_entries_get_fast(ops); 2703 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2704 entries <= rt_max_size) 2705 goto out; 2706 2707 net->ipv6.ip6_rt_gc_expire++; 2708 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2709 entries = dst_entries_get_slow(ops); 2710 if (entries < ops->gc_thresh) 2711 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2712 out: 2713 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2714 return entries > rt_max_size; 2715 } 2716 2717 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2718 struct fib6_config *cfg, 2719 const struct in6_addr *gw_addr, 2720 u32 tbid, int flags) 2721 { 2722 struct flowi6 fl6 = { 2723 .flowi6_oif = cfg->fc_ifindex, 2724 .daddr = *gw_addr, 2725 .saddr = cfg->fc_prefsrc, 2726 }; 2727 struct fib6_table *table; 2728 struct rt6_info *rt; 2729 2730 table = fib6_get_table(net, tbid); 2731 if (!table) 2732 return NULL; 2733 2734 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2735 flags |= RT6_LOOKUP_F_HAS_SADDR; 2736 2737 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2738 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2739 2740 /* if table lookup failed, fall back to full lookup */ 2741 if (rt == net->ipv6.ip6_null_entry) { 2742 ip6_rt_put(rt); 2743 rt = NULL; 2744 } 2745 2746 return rt; 2747 } 2748 2749 static int ip6_route_check_nh_onlink(struct net *net, 2750 struct fib6_config *cfg, 2751 const struct net_device *dev, 2752 struct netlink_ext_ack *extack) 2753 { 2754 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2755 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2756 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2757 struct fib6_info *from; 2758 struct rt6_info *grt; 2759 int err; 2760 2761 err = 0; 2762 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2763 if (grt) { 2764 rcu_read_lock(); 2765 from = rcu_dereference(grt->from); 2766 if (!grt->dst.error && 2767 /* ignore match if it is the default route */ 2768 from && !ipv6_addr_any(&from->fib6_dst.addr) && 2769 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2770 NL_SET_ERR_MSG(extack, 2771 "Nexthop has invalid gateway or device mismatch"); 2772 err = -EINVAL; 2773 } 2774 rcu_read_unlock(); 2775 2776 ip6_rt_put(grt); 2777 } 2778 2779 return err; 2780 } 2781 2782 static int ip6_route_check_nh(struct net *net, 2783 struct fib6_config *cfg, 2784 struct net_device **_dev, 2785 struct inet6_dev **idev) 2786 { 2787 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2788 struct net_device *dev = _dev ? *_dev : NULL; 2789 struct rt6_info *grt = NULL; 2790 int err = -EHOSTUNREACH; 2791 2792 if (cfg->fc_table) { 2793 int flags = RT6_LOOKUP_F_IFACE; 2794 2795 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2796 cfg->fc_table, flags); 2797 if (grt) { 2798 if (grt->rt6i_flags & RTF_GATEWAY || 2799 (dev && dev != grt->dst.dev)) { 2800 ip6_rt_put(grt); 2801 grt = NULL; 2802 } 2803 } 2804 } 2805 2806 if (!grt) 2807 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2808 2809 if (!grt) 2810 goto out; 2811 2812 if (dev) { 2813 if (dev != grt->dst.dev) { 2814 ip6_rt_put(grt); 2815 goto out; 2816 } 2817 } else { 2818 *_dev = dev = grt->dst.dev; 2819 *idev = grt->rt6i_idev; 2820 dev_hold(dev); 2821 in6_dev_hold(grt->rt6i_idev); 2822 } 2823 2824 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2825 err = 0; 2826 2827 ip6_rt_put(grt); 2828 2829 out: 2830 return err; 2831 } 2832 2833 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2834 struct net_device **_dev, struct inet6_dev **idev, 2835 struct netlink_ext_ack *extack) 2836 { 2837 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2838 int gwa_type = ipv6_addr_type(gw_addr); 2839 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2840 const struct net_device *dev = *_dev; 2841 bool need_addr_check = !dev; 2842 int err = -EINVAL; 2843 2844 /* if gw_addr is local we will fail to detect this in case 2845 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2846 * will return already-added prefix route via interface that 2847 * prefix route was assigned to, which might be non-loopback. 2848 */ 2849 if (dev && 2850 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2851 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2852 goto out; 2853 } 2854 2855 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2856 /* IPv6 strictly inhibits using not link-local 2857 * addresses as nexthop address. 2858 * Otherwise, router will not able to send redirects. 2859 * It is very good, but in some (rare!) circumstances 2860 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2861 * some exceptions. --ANK 2862 * We allow IPv4-mapped nexthops to support RFC4798-type 2863 * addressing 2864 */ 2865 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2866 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2867 goto out; 2868 } 2869 2870 if (cfg->fc_flags & RTNH_F_ONLINK) 2871 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2872 else 2873 err = ip6_route_check_nh(net, cfg, _dev, idev); 2874 2875 if (err) 2876 goto out; 2877 } 2878 2879 /* reload in case device was changed */ 2880 dev = *_dev; 2881 2882 err = -EINVAL; 2883 if (!dev) { 2884 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2885 goto out; 2886 } else if (dev->flags & IFF_LOOPBACK) { 2887 NL_SET_ERR_MSG(extack, 2888 "Egress device can not be loopback device for this route"); 2889 goto out; 2890 } 2891 2892 /* if we did not check gw_addr above, do so now that the 2893 * egress device has been resolved. 2894 */ 2895 if (need_addr_check && 2896 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2897 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2898 goto out; 2899 } 2900 2901 err = 0; 2902 out: 2903 return err; 2904 } 2905 2906 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2907 gfp_t gfp_flags, 2908 struct netlink_ext_ack *extack) 2909 { 2910 struct net *net = cfg->fc_nlinfo.nl_net; 2911 struct fib6_info *rt = NULL; 2912 struct net_device *dev = NULL; 2913 struct inet6_dev *idev = NULL; 2914 struct fib6_table *table; 2915 int addr_type; 2916 int err = -EINVAL; 2917 2918 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2919 if (cfg->fc_flags & RTF_PCPU) { 2920 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2921 goto out; 2922 } 2923 2924 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2925 if (cfg->fc_flags & RTF_CACHE) { 2926 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2927 goto out; 2928 } 2929 2930 if (cfg->fc_type > RTN_MAX) { 2931 NL_SET_ERR_MSG(extack, "Invalid route type"); 2932 goto out; 2933 } 2934 2935 if (cfg->fc_dst_len > 128) { 2936 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2937 goto out; 2938 } 2939 if (cfg->fc_src_len > 128) { 2940 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2941 goto out; 2942 } 2943 #ifndef CONFIG_IPV6_SUBTREES 2944 if (cfg->fc_src_len) { 2945 NL_SET_ERR_MSG(extack, 2946 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2947 goto out; 2948 } 2949 #endif 2950 if (cfg->fc_ifindex) { 2951 err = -ENODEV; 2952 dev = dev_get_by_index(net, cfg->fc_ifindex); 2953 if (!dev) 2954 goto out; 2955 idev = in6_dev_get(dev); 2956 if (!idev) 2957 goto out; 2958 } 2959 2960 if (cfg->fc_metric == 0) 2961 cfg->fc_metric = IP6_RT_PRIO_USER; 2962 2963 if (cfg->fc_flags & RTNH_F_ONLINK) { 2964 if (!dev) { 2965 NL_SET_ERR_MSG(extack, 2966 "Nexthop device required for onlink"); 2967 err = -ENODEV; 2968 goto out; 2969 } 2970 2971 if (!(dev->flags & IFF_UP)) { 2972 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2973 err = -ENETDOWN; 2974 goto out; 2975 } 2976 } 2977 2978 err = -ENOBUFS; 2979 if (cfg->fc_nlinfo.nlh && 2980 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2981 table = fib6_get_table(net, cfg->fc_table); 2982 if (!table) { 2983 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2984 table = fib6_new_table(net, cfg->fc_table); 2985 } 2986 } else { 2987 table = fib6_new_table(net, cfg->fc_table); 2988 } 2989 2990 if (!table) 2991 goto out; 2992 2993 err = -ENOMEM; 2994 rt = fib6_info_alloc(gfp_flags); 2995 if (!rt) 2996 goto out; 2997 2998 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 2999 extack); 3000 if (IS_ERR(rt->fib6_metrics)) { 3001 err = PTR_ERR(rt->fib6_metrics); 3002 /* Do not leave garbage there. */ 3003 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3004 goto out; 3005 } 3006 3007 if (cfg->fc_flags & RTF_ADDRCONF) 3008 rt->dst_nocount = true; 3009 3010 if (cfg->fc_flags & RTF_EXPIRES) 3011 fib6_set_expires(rt, jiffies + 3012 clock_t_to_jiffies(cfg->fc_expires)); 3013 else 3014 fib6_clean_expires(rt); 3015 3016 if (cfg->fc_protocol == RTPROT_UNSPEC) 3017 cfg->fc_protocol = RTPROT_BOOT; 3018 rt->fib6_protocol = cfg->fc_protocol; 3019 3020 addr_type = ipv6_addr_type(&cfg->fc_dst); 3021 3022 if (cfg->fc_encap) { 3023 struct lwtunnel_state *lwtstate; 3024 3025 err = lwtunnel_build_state(cfg->fc_encap_type, 3026 cfg->fc_encap, AF_INET6, cfg, 3027 &lwtstate, extack); 3028 if (err) 3029 goto out; 3030 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 3031 } 3032 3033 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3034 rt->fib6_dst.plen = cfg->fc_dst_len; 3035 if (rt->fib6_dst.plen == 128) 3036 rt->dst_host = true; 3037 3038 #ifdef CONFIG_IPV6_SUBTREES 3039 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3040 rt->fib6_src.plen = cfg->fc_src_len; 3041 #endif 3042 3043 rt->fib6_metric = cfg->fc_metric; 3044 rt->fib6_nh.nh_weight = 1; 3045 3046 rt->fib6_type = cfg->fc_type; 3047 3048 /* We cannot add true routes via loopback here, 3049 they would result in kernel looping; promote them to reject routes 3050 */ 3051 if ((cfg->fc_flags & RTF_REJECT) || 3052 (dev && (dev->flags & IFF_LOOPBACK) && 3053 !(addr_type & IPV6_ADDR_LOOPBACK) && 3054 !(cfg->fc_flags & RTF_LOCAL))) { 3055 /* hold loopback dev/idev if we haven't done so. */ 3056 if (dev != net->loopback_dev) { 3057 if (dev) { 3058 dev_put(dev); 3059 in6_dev_put(idev); 3060 } 3061 dev = net->loopback_dev; 3062 dev_hold(dev); 3063 idev = in6_dev_get(dev); 3064 if (!idev) { 3065 err = -ENODEV; 3066 goto out; 3067 } 3068 } 3069 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3070 goto install_route; 3071 } 3072 3073 if (cfg->fc_flags & RTF_GATEWAY) { 3074 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3075 if (err) 3076 goto out; 3077 3078 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3079 } 3080 3081 err = -ENODEV; 3082 if (!dev) 3083 goto out; 3084 3085 if (idev->cnf.disable_ipv6) { 3086 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3087 err = -EACCES; 3088 goto out; 3089 } 3090 3091 if (!(dev->flags & IFF_UP)) { 3092 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3093 err = -ENETDOWN; 3094 goto out; 3095 } 3096 3097 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3098 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3099 NL_SET_ERR_MSG(extack, "Invalid source address"); 3100 err = -EINVAL; 3101 goto out; 3102 } 3103 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3104 rt->fib6_prefsrc.plen = 128; 3105 } else 3106 rt->fib6_prefsrc.plen = 0; 3107 3108 rt->fib6_flags = cfg->fc_flags; 3109 3110 install_route: 3111 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3112 !netif_carrier_ok(dev)) 3113 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3114 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3115 rt->fib6_nh.nh_dev = dev; 3116 rt->fib6_table = table; 3117 3118 if (idev) 3119 in6_dev_put(idev); 3120 3121 return rt; 3122 out: 3123 if (dev) 3124 dev_put(dev); 3125 if (idev) 3126 in6_dev_put(idev); 3127 3128 fib6_info_release(rt); 3129 return ERR_PTR(err); 3130 } 3131 3132 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3133 struct netlink_ext_ack *extack) 3134 { 3135 struct fib6_info *rt; 3136 int err; 3137 3138 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3139 if (IS_ERR(rt)) 3140 return PTR_ERR(rt); 3141 3142 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3143 fib6_info_release(rt); 3144 3145 return err; 3146 } 3147 3148 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3149 { 3150 struct net *net = info->nl_net; 3151 struct fib6_table *table; 3152 int err; 3153 3154 if (rt == net->ipv6.fib6_null_entry) { 3155 err = -ENOENT; 3156 goto out; 3157 } 3158 3159 table = rt->fib6_table; 3160 spin_lock_bh(&table->tb6_lock); 3161 err = fib6_del(rt, info); 3162 spin_unlock_bh(&table->tb6_lock); 3163 3164 out: 3165 fib6_info_release(rt); 3166 return err; 3167 } 3168 3169 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3170 { 3171 struct nl_info info = { .nl_net = net }; 3172 3173 return __ip6_del_rt(rt, &info); 3174 } 3175 3176 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3177 { 3178 struct nl_info *info = &cfg->fc_nlinfo; 3179 struct net *net = info->nl_net; 3180 struct sk_buff *skb = NULL; 3181 struct fib6_table *table; 3182 int err = -ENOENT; 3183 3184 if (rt == net->ipv6.fib6_null_entry) 3185 goto out_put; 3186 table = rt->fib6_table; 3187 spin_lock_bh(&table->tb6_lock); 3188 3189 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3190 struct fib6_info *sibling, *next_sibling; 3191 3192 /* prefer to send a single notification with all hops */ 3193 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3194 if (skb) { 3195 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3196 3197 if (rt6_fill_node(net, skb, rt, NULL, 3198 NULL, NULL, 0, RTM_DELROUTE, 3199 info->portid, seq, 0) < 0) { 3200 kfree_skb(skb); 3201 skb = NULL; 3202 } else 3203 info->skip_notify = 1; 3204 } 3205 3206 list_for_each_entry_safe(sibling, next_sibling, 3207 &rt->fib6_siblings, 3208 fib6_siblings) { 3209 err = fib6_del(sibling, info); 3210 if (err) 3211 goto out_unlock; 3212 } 3213 } 3214 3215 err = fib6_del(rt, info); 3216 out_unlock: 3217 spin_unlock_bh(&table->tb6_lock); 3218 out_put: 3219 fib6_info_release(rt); 3220 3221 if (skb) { 3222 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3223 info->nlh, gfp_any()); 3224 } 3225 return err; 3226 } 3227 3228 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3229 { 3230 int rc = -ESRCH; 3231 3232 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3233 goto out; 3234 3235 if (cfg->fc_flags & RTF_GATEWAY && 3236 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3237 goto out; 3238 3239 rc = rt6_remove_exception_rt(rt); 3240 out: 3241 return rc; 3242 } 3243 3244 static int ip6_route_del(struct fib6_config *cfg, 3245 struct netlink_ext_ack *extack) 3246 { 3247 struct rt6_info *rt_cache; 3248 struct fib6_table *table; 3249 struct fib6_info *rt; 3250 struct fib6_node *fn; 3251 int err = -ESRCH; 3252 3253 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3254 if (!table) { 3255 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3256 return err; 3257 } 3258 3259 rcu_read_lock(); 3260 3261 fn = fib6_locate(&table->tb6_root, 3262 &cfg->fc_dst, cfg->fc_dst_len, 3263 &cfg->fc_src, cfg->fc_src_len, 3264 !(cfg->fc_flags & RTF_CACHE)); 3265 3266 if (fn) { 3267 for_each_fib6_node_rt_rcu(fn) { 3268 if (cfg->fc_flags & RTF_CACHE) { 3269 int rc; 3270 3271 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3272 &cfg->fc_src); 3273 if (rt_cache) { 3274 rc = ip6_del_cached_rt(rt_cache, cfg); 3275 if (rc != -ESRCH) { 3276 rcu_read_unlock(); 3277 return rc; 3278 } 3279 } 3280 continue; 3281 } 3282 if (cfg->fc_ifindex && 3283 (!rt->fib6_nh.nh_dev || 3284 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3285 continue; 3286 if (cfg->fc_flags & RTF_GATEWAY && 3287 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3288 continue; 3289 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3290 continue; 3291 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3292 continue; 3293 if (!fib6_info_hold_safe(rt)) 3294 continue; 3295 rcu_read_unlock(); 3296 3297 /* if gateway was specified only delete the one hop */ 3298 if (cfg->fc_flags & RTF_GATEWAY) 3299 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3300 3301 return __ip6_del_rt_siblings(rt, cfg); 3302 } 3303 } 3304 rcu_read_unlock(); 3305 3306 return err; 3307 } 3308 3309 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3310 { 3311 struct netevent_redirect netevent; 3312 struct rt6_info *rt, *nrt = NULL; 3313 struct ndisc_options ndopts; 3314 struct inet6_dev *in6_dev; 3315 struct neighbour *neigh; 3316 struct fib6_info *from; 3317 struct rd_msg *msg; 3318 int optlen, on_link; 3319 u8 *lladdr; 3320 3321 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3322 optlen -= sizeof(*msg); 3323 3324 if (optlen < 0) { 3325 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3326 return; 3327 } 3328 3329 msg = (struct rd_msg *)icmp6_hdr(skb); 3330 3331 if (ipv6_addr_is_multicast(&msg->dest)) { 3332 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3333 return; 3334 } 3335 3336 on_link = 0; 3337 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3338 on_link = 1; 3339 } else if (ipv6_addr_type(&msg->target) != 3340 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3341 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3342 return; 3343 } 3344 3345 in6_dev = __in6_dev_get(skb->dev); 3346 if (!in6_dev) 3347 return; 3348 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3349 return; 3350 3351 /* RFC2461 8.1: 3352 * The IP source address of the Redirect MUST be the same as the current 3353 * first-hop router for the specified ICMP Destination Address. 3354 */ 3355 3356 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3357 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3358 return; 3359 } 3360 3361 lladdr = NULL; 3362 if (ndopts.nd_opts_tgt_lladdr) { 3363 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3364 skb->dev); 3365 if (!lladdr) { 3366 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3367 return; 3368 } 3369 } 3370 3371 rt = (struct rt6_info *) dst; 3372 if (rt->rt6i_flags & RTF_REJECT) { 3373 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3374 return; 3375 } 3376 3377 /* Redirect received -> path was valid. 3378 * Look, redirects are sent only in response to data packets, 3379 * so that this nexthop apparently is reachable. --ANK 3380 */ 3381 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3382 3383 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3384 if (!neigh) 3385 return; 3386 3387 /* 3388 * We have finally decided to accept it. 3389 */ 3390 3391 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3392 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3393 NEIGH_UPDATE_F_OVERRIDE| 3394 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3395 NEIGH_UPDATE_F_ISROUTER)), 3396 NDISC_REDIRECT, &ndopts); 3397 3398 rcu_read_lock(); 3399 from = rcu_dereference(rt->from); 3400 /* This fib6_info_hold() is safe here because we hold reference to rt 3401 * and rt already holds reference to fib6_info. 3402 */ 3403 fib6_info_hold(from); 3404 rcu_read_unlock(); 3405 3406 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3407 if (!nrt) 3408 goto out; 3409 3410 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3411 if (on_link) 3412 nrt->rt6i_flags &= ~RTF_GATEWAY; 3413 3414 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3415 3416 /* No need to remove rt from the exception table if rt is 3417 * a cached route because rt6_insert_exception() will 3418 * takes care of it 3419 */ 3420 if (rt6_insert_exception(nrt, from)) { 3421 dst_release_immediate(&nrt->dst); 3422 goto out; 3423 } 3424 3425 netevent.old = &rt->dst; 3426 netevent.new = &nrt->dst; 3427 netevent.daddr = &msg->dest; 3428 netevent.neigh = neigh; 3429 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3430 3431 out: 3432 fib6_info_release(from); 3433 neigh_release(neigh); 3434 } 3435 3436 #ifdef CONFIG_IPV6_ROUTE_INFO 3437 static struct fib6_info *rt6_get_route_info(struct net *net, 3438 const struct in6_addr *prefix, int prefixlen, 3439 const struct in6_addr *gwaddr, 3440 struct net_device *dev) 3441 { 3442 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3443 int ifindex = dev->ifindex; 3444 struct fib6_node *fn; 3445 struct fib6_info *rt = NULL; 3446 struct fib6_table *table; 3447 3448 table = fib6_get_table(net, tb_id); 3449 if (!table) 3450 return NULL; 3451 3452 rcu_read_lock(); 3453 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3454 if (!fn) 3455 goto out; 3456 3457 for_each_fib6_node_rt_rcu(fn) { 3458 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3459 continue; 3460 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3461 continue; 3462 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3463 continue; 3464 if (!fib6_info_hold_safe(rt)) 3465 continue; 3466 break; 3467 } 3468 out: 3469 rcu_read_unlock(); 3470 return rt; 3471 } 3472 3473 static struct fib6_info *rt6_add_route_info(struct net *net, 3474 const struct in6_addr *prefix, int prefixlen, 3475 const struct in6_addr *gwaddr, 3476 struct net_device *dev, 3477 unsigned int pref) 3478 { 3479 struct fib6_config cfg = { 3480 .fc_metric = IP6_RT_PRIO_USER, 3481 .fc_ifindex = dev->ifindex, 3482 .fc_dst_len = prefixlen, 3483 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3484 RTF_UP | RTF_PREF(pref), 3485 .fc_protocol = RTPROT_RA, 3486 .fc_type = RTN_UNICAST, 3487 .fc_nlinfo.portid = 0, 3488 .fc_nlinfo.nlh = NULL, 3489 .fc_nlinfo.nl_net = net, 3490 }; 3491 3492 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3493 cfg.fc_dst = *prefix; 3494 cfg.fc_gateway = *gwaddr; 3495 3496 /* We should treat it as a default route if prefix length is 0. */ 3497 if (!prefixlen) 3498 cfg.fc_flags |= RTF_DEFAULT; 3499 3500 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3501 3502 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3503 } 3504 #endif 3505 3506 struct fib6_info *rt6_get_dflt_router(struct net *net, 3507 const struct in6_addr *addr, 3508 struct net_device *dev) 3509 { 3510 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3511 struct fib6_info *rt; 3512 struct fib6_table *table; 3513 3514 table = fib6_get_table(net, tb_id); 3515 if (!table) 3516 return NULL; 3517 3518 rcu_read_lock(); 3519 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3520 if (dev == rt->fib6_nh.nh_dev && 3521 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3522 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3523 break; 3524 } 3525 if (rt && !fib6_info_hold_safe(rt)) 3526 rt = NULL; 3527 rcu_read_unlock(); 3528 return rt; 3529 } 3530 3531 struct fib6_info *rt6_add_dflt_router(struct net *net, 3532 const struct in6_addr *gwaddr, 3533 struct net_device *dev, 3534 unsigned int pref) 3535 { 3536 struct fib6_config cfg = { 3537 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3538 .fc_metric = IP6_RT_PRIO_USER, 3539 .fc_ifindex = dev->ifindex, 3540 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3541 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3542 .fc_protocol = RTPROT_RA, 3543 .fc_type = RTN_UNICAST, 3544 .fc_nlinfo.portid = 0, 3545 .fc_nlinfo.nlh = NULL, 3546 .fc_nlinfo.nl_net = net, 3547 }; 3548 3549 cfg.fc_gateway = *gwaddr; 3550 3551 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3552 struct fib6_table *table; 3553 3554 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3555 if (table) 3556 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3557 } 3558 3559 return rt6_get_dflt_router(net, gwaddr, dev); 3560 } 3561 3562 static void __rt6_purge_dflt_routers(struct net *net, 3563 struct fib6_table *table) 3564 { 3565 struct fib6_info *rt; 3566 3567 restart: 3568 rcu_read_lock(); 3569 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3570 struct net_device *dev = fib6_info_nh_dev(rt); 3571 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3572 3573 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3574 (!idev || idev->cnf.accept_ra != 2) && 3575 fib6_info_hold_safe(rt)) { 3576 rcu_read_unlock(); 3577 ip6_del_rt(net, rt); 3578 goto restart; 3579 } 3580 } 3581 rcu_read_unlock(); 3582 3583 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3584 } 3585 3586 void rt6_purge_dflt_routers(struct net *net) 3587 { 3588 struct fib6_table *table; 3589 struct hlist_head *head; 3590 unsigned int h; 3591 3592 rcu_read_lock(); 3593 3594 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3595 head = &net->ipv6.fib_table_hash[h]; 3596 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3597 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3598 __rt6_purge_dflt_routers(net, table); 3599 } 3600 } 3601 3602 rcu_read_unlock(); 3603 } 3604 3605 static void rtmsg_to_fib6_config(struct net *net, 3606 struct in6_rtmsg *rtmsg, 3607 struct fib6_config *cfg) 3608 { 3609 *cfg = (struct fib6_config){ 3610 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3611 : RT6_TABLE_MAIN, 3612 .fc_ifindex = rtmsg->rtmsg_ifindex, 3613 .fc_metric = rtmsg->rtmsg_metric, 3614 .fc_expires = rtmsg->rtmsg_info, 3615 .fc_dst_len = rtmsg->rtmsg_dst_len, 3616 .fc_src_len = rtmsg->rtmsg_src_len, 3617 .fc_flags = rtmsg->rtmsg_flags, 3618 .fc_type = rtmsg->rtmsg_type, 3619 3620 .fc_nlinfo.nl_net = net, 3621 3622 .fc_dst = rtmsg->rtmsg_dst, 3623 .fc_src = rtmsg->rtmsg_src, 3624 .fc_gateway = rtmsg->rtmsg_gateway, 3625 }; 3626 } 3627 3628 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3629 { 3630 struct fib6_config cfg; 3631 struct in6_rtmsg rtmsg; 3632 int err; 3633 3634 switch (cmd) { 3635 case SIOCADDRT: /* Add a route */ 3636 case SIOCDELRT: /* Delete a route */ 3637 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3638 return -EPERM; 3639 err = copy_from_user(&rtmsg, arg, 3640 sizeof(struct in6_rtmsg)); 3641 if (err) 3642 return -EFAULT; 3643 3644 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3645 3646 rtnl_lock(); 3647 switch (cmd) { 3648 case SIOCADDRT: 3649 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3650 break; 3651 case SIOCDELRT: 3652 err = ip6_route_del(&cfg, NULL); 3653 break; 3654 default: 3655 err = -EINVAL; 3656 } 3657 rtnl_unlock(); 3658 3659 return err; 3660 } 3661 3662 return -EINVAL; 3663 } 3664 3665 /* 3666 * Drop the packet on the floor 3667 */ 3668 3669 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3670 { 3671 int type; 3672 struct dst_entry *dst = skb_dst(skb); 3673 switch (ipstats_mib_noroutes) { 3674 case IPSTATS_MIB_INNOROUTES: 3675 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3676 if (type == IPV6_ADDR_ANY) { 3677 IP6_INC_STATS(dev_net(dst->dev), 3678 __in6_dev_get_safely(skb->dev), 3679 IPSTATS_MIB_INADDRERRORS); 3680 break; 3681 } 3682 /* FALLTHROUGH */ 3683 case IPSTATS_MIB_OUTNOROUTES: 3684 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3685 ipstats_mib_noroutes); 3686 break; 3687 } 3688 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3689 kfree_skb(skb); 3690 return 0; 3691 } 3692 3693 static int ip6_pkt_discard(struct sk_buff *skb) 3694 { 3695 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3696 } 3697 3698 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3699 { 3700 skb->dev = skb_dst(skb)->dev; 3701 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3702 } 3703 3704 static int ip6_pkt_prohibit(struct sk_buff *skb) 3705 { 3706 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3707 } 3708 3709 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3710 { 3711 skb->dev = skb_dst(skb)->dev; 3712 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3713 } 3714 3715 /* 3716 * Allocate a dst for local (unicast / anycast) address. 3717 */ 3718 3719 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3720 struct inet6_dev *idev, 3721 const struct in6_addr *addr, 3722 bool anycast, gfp_t gfp_flags) 3723 { 3724 u32 tb_id; 3725 struct net_device *dev = idev->dev; 3726 struct fib6_info *f6i; 3727 3728 f6i = fib6_info_alloc(gfp_flags); 3729 if (!f6i) 3730 return ERR_PTR(-ENOMEM); 3731 3732 f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL); 3733 f6i->dst_nocount = true; 3734 f6i->dst_host = true; 3735 f6i->fib6_protocol = RTPROT_KERNEL; 3736 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3737 if (anycast) { 3738 f6i->fib6_type = RTN_ANYCAST; 3739 f6i->fib6_flags |= RTF_ANYCAST; 3740 } else { 3741 f6i->fib6_type = RTN_LOCAL; 3742 f6i->fib6_flags |= RTF_LOCAL; 3743 } 3744 3745 f6i->fib6_nh.nh_gw = *addr; 3746 dev_hold(dev); 3747 f6i->fib6_nh.nh_dev = dev; 3748 f6i->fib6_dst.addr = *addr; 3749 f6i->fib6_dst.plen = 128; 3750 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3751 f6i->fib6_table = fib6_get_table(net, tb_id); 3752 3753 return f6i; 3754 } 3755 3756 /* remove deleted ip from prefsrc entries */ 3757 struct arg_dev_net_ip { 3758 struct net_device *dev; 3759 struct net *net; 3760 struct in6_addr *addr; 3761 }; 3762 3763 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3764 { 3765 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3766 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3767 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3768 3769 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3770 rt != net->ipv6.fib6_null_entry && 3771 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3772 spin_lock_bh(&rt6_exception_lock); 3773 /* remove prefsrc entry */ 3774 rt->fib6_prefsrc.plen = 0; 3775 spin_unlock_bh(&rt6_exception_lock); 3776 } 3777 return 0; 3778 } 3779 3780 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3781 { 3782 struct net *net = dev_net(ifp->idev->dev); 3783 struct arg_dev_net_ip adni = { 3784 .dev = ifp->idev->dev, 3785 .net = net, 3786 .addr = &ifp->addr, 3787 }; 3788 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3789 } 3790 3791 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3792 3793 /* Remove routers and update dst entries when gateway turn into host. */ 3794 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3795 { 3796 struct in6_addr *gateway = (struct in6_addr *)arg; 3797 3798 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3799 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3800 return -1; 3801 } 3802 3803 /* Further clean up cached routes in exception table. 3804 * This is needed because cached route may have a different 3805 * gateway than its 'parent' in the case of an ip redirect. 3806 */ 3807 rt6_exceptions_clean_tohost(rt, gateway); 3808 3809 return 0; 3810 } 3811 3812 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3813 { 3814 fib6_clean_all(net, fib6_clean_tohost, gateway); 3815 } 3816 3817 struct arg_netdev_event { 3818 const struct net_device *dev; 3819 union { 3820 unsigned int nh_flags; 3821 unsigned long event; 3822 }; 3823 }; 3824 3825 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3826 { 3827 struct fib6_info *iter; 3828 struct fib6_node *fn; 3829 3830 fn = rcu_dereference_protected(rt->fib6_node, 3831 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3832 iter = rcu_dereference_protected(fn->leaf, 3833 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3834 while (iter) { 3835 if (iter->fib6_metric == rt->fib6_metric && 3836 rt6_qualify_for_ecmp(iter)) 3837 return iter; 3838 iter = rcu_dereference_protected(iter->fib6_next, 3839 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3840 } 3841 3842 return NULL; 3843 } 3844 3845 static bool rt6_is_dead(const struct fib6_info *rt) 3846 { 3847 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3848 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3849 fib6_ignore_linkdown(rt))) 3850 return true; 3851 3852 return false; 3853 } 3854 3855 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3856 { 3857 struct fib6_info *iter; 3858 int total = 0; 3859 3860 if (!rt6_is_dead(rt)) 3861 total += rt->fib6_nh.nh_weight; 3862 3863 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3864 if (!rt6_is_dead(iter)) 3865 total += iter->fib6_nh.nh_weight; 3866 } 3867 3868 return total; 3869 } 3870 3871 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3872 { 3873 int upper_bound = -1; 3874 3875 if (!rt6_is_dead(rt)) { 3876 *weight += rt->fib6_nh.nh_weight; 3877 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3878 total) - 1; 3879 } 3880 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3881 } 3882 3883 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3884 { 3885 struct fib6_info *iter; 3886 int weight = 0; 3887 3888 rt6_upper_bound_set(rt, &weight, total); 3889 3890 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3891 rt6_upper_bound_set(iter, &weight, total); 3892 } 3893 3894 void rt6_multipath_rebalance(struct fib6_info *rt) 3895 { 3896 struct fib6_info *first; 3897 int total; 3898 3899 /* In case the entire multipath route was marked for flushing, 3900 * then there is no need to rebalance upon the removal of every 3901 * sibling route. 3902 */ 3903 if (!rt->fib6_nsiblings || rt->should_flush) 3904 return; 3905 3906 /* During lookup routes are evaluated in order, so we need to 3907 * make sure upper bounds are assigned from the first sibling 3908 * onwards. 3909 */ 3910 first = rt6_multipath_first_sibling(rt); 3911 if (WARN_ON_ONCE(!first)) 3912 return; 3913 3914 total = rt6_multipath_total_weight(first); 3915 rt6_multipath_upper_bound_set(first, total); 3916 } 3917 3918 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3919 { 3920 const struct arg_netdev_event *arg = p_arg; 3921 struct net *net = dev_net(arg->dev); 3922 3923 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3924 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3925 fib6_update_sernum_upto_root(net, rt); 3926 rt6_multipath_rebalance(rt); 3927 } 3928 3929 return 0; 3930 } 3931 3932 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3933 { 3934 struct arg_netdev_event arg = { 3935 .dev = dev, 3936 { 3937 .nh_flags = nh_flags, 3938 }, 3939 }; 3940 3941 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3942 arg.nh_flags |= RTNH_F_LINKDOWN; 3943 3944 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3945 } 3946 3947 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3948 const struct net_device *dev) 3949 { 3950 struct fib6_info *iter; 3951 3952 if (rt->fib6_nh.nh_dev == dev) 3953 return true; 3954 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3955 if (iter->fib6_nh.nh_dev == dev) 3956 return true; 3957 3958 return false; 3959 } 3960 3961 static void rt6_multipath_flush(struct fib6_info *rt) 3962 { 3963 struct fib6_info *iter; 3964 3965 rt->should_flush = 1; 3966 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3967 iter->should_flush = 1; 3968 } 3969 3970 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3971 const struct net_device *down_dev) 3972 { 3973 struct fib6_info *iter; 3974 unsigned int dead = 0; 3975 3976 if (rt->fib6_nh.nh_dev == down_dev || 3977 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3978 dead++; 3979 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3980 if (iter->fib6_nh.nh_dev == down_dev || 3981 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3982 dead++; 3983 3984 return dead; 3985 } 3986 3987 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 3988 const struct net_device *dev, 3989 unsigned int nh_flags) 3990 { 3991 struct fib6_info *iter; 3992 3993 if (rt->fib6_nh.nh_dev == dev) 3994 rt->fib6_nh.nh_flags |= nh_flags; 3995 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3996 if (iter->fib6_nh.nh_dev == dev) 3997 iter->fib6_nh.nh_flags |= nh_flags; 3998 } 3999 4000 /* called with write lock held for table with rt */ 4001 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4002 { 4003 const struct arg_netdev_event *arg = p_arg; 4004 const struct net_device *dev = arg->dev; 4005 struct net *net = dev_net(dev); 4006 4007 if (rt == net->ipv6.fib6_null_entry) 4008 return 0; 4009 4010 switch (arg->event) { 4011 case NETDEV_UNREGISTER: 4012 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4013 case NETDEV_DOWN: 4014 if (rt->should_flush) 4015 return -1; 4016 if (!rt->fib6_nsiblings) 4017 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4018 if (rt6_multipath_uses_dev(rt, dev)) { 4019 unsigned int count; 4020 4021 count = rt6_multipath_dead_count(rt, dev); 4022 if (rt->fib6_nsiblings + 1 == count) { 4023 rt6_multipath_flush(rt); 4024 return -1; 4025 } 4026 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4027 RTNH_F_LINKDOWN); 4028 fib6_update_sernum(net, rt); 4029 rt6_multipath_rebalance(rt); 4030 } 4031 return -2; 4032 case NETDEV_CHANGE: 4033 if (rt->fib6_nh.nh_dev != dev || 4034 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4035 break; 4036 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 4037 rt6_multipath_rebalance(rt); 4038 break; 4039 } 4040 4041 return 0; 4042 } 4043 4044 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4045 { 4046 struct arg_netdev_event arg = { 4047 .dev = dev, 4048 { 4049 .event = event, 4050 }, 4051 }; 4052 struct net *net = dev_net(dev); 4053 4054 if (net->ipv6.sysctl.skip_notify_on_dev_down) 4055 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); 4056 else 4057 fib6_clean_all(net, fib6_ifdown, &arg); 4058 } 4059 4060 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4061 { 4062 rt6_sync_down_dev(dev, event); 4063 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4064 neigh_ifdown(&nd_tbl, dev); 4065 } 4066 4067 struct rt6_mtu_change_arg { 4068 struct net_device *dev; 4069 unsigned int mtu; 4070 }; 4071 4072 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4073 { 4074 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4075 struct inet6_dev *idev; 4076 4077 /* In IPv6 pmtu discovery is not optional, 4078 so that RTAX_MTU lock cannot disable it. 4079 We still use this lock to block changes 4080 caused by addrconf/ndisc. 4081 */ 4082 4083 idev = __in6_dev_get(arg->dev); 4084 if (!idev) 4085 return 0; 4086 4087 /* For administrative MTU increase, there is no way to discover 4088 IPv6 PMTU increase, so PMTU increase should be updated here. 4089 Since RFC 1981 doesn't include administrative MTU increase 4090 update PMTU increase is a MUST. (i.e. jumbo frame) 4091 */ 4092 if (rt->fib6_nh.nh_dev == arg->dev && 4093 !fib6_metric_locked(rt, RTAX_MTU)) { 4094 u32 mtu = rt->fib6_pmtu; 4095 4096 if (mtu >= arg->mtu || 4097 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4098 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4099 4100 spin_lock_bh(&rt6_exception_lock); 4101 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4102 spin_unlock_bh(&rt6_exception_lock); 4103 } 4104 return 0; 4105 } 4106 4107 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4108 { 4109 struct rt6_mtu_change_arg arg = { 4110 .dev = dev, 4111 .mtu = mtu, 4112 }; 4113 4114 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4115 } 4116 4117 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4118 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4119 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4120 [RTA_OIF] = { .type = NLA_U32 }, 4121 [RTA_IIF] = { .type = NLA_U32 }, 4122 [RTA_PRIORITY] = { .type = NLA_U32 }, 4123 [RTA_METRICS] = { .type = NLA_NESTED }, 4124 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4125 [RTA_PREF] = { .type = NLA_U8 }, 4126 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4127 [RTA_ENCAP] = { .type = NLA_NESTED }, 4128 [RTA_EXPIRES] = { .type = NLA_U32 }, 4129 [RTA_UID] = { .type = NLA_U32 }, 4130 [RTA_MARK] = { .type = NLA_U32 }, 4131 [RTA_TABLE] = { .type = NLA_U32 }, 4132 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4133 [RTA_SPORT] = { .type = NLA_U16 }, 4134 [RTA_DPORT] = { .type = NLA_U16 }, 4135 }; 4136 4137 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4138 struct fib6_config *cfg, 4139 struct netlink_ext_ack *extack) 4140 { 4141 struct rtmsg *rtm; 4142 struct nlattr *tb[RTA_MAX+1]; 4143 unsigned int pref; 4144 int err; 4145 4146 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4147 extack); 4148 if (err < 0) 4149 goto errout; 4150 4151 err = -EINVAL; 4152 rtm = nlmsg_data(nlh); 4153 4154 *cfg = (struct fib6_config){ 4155 .fc_table = rtm->rtm_table, 4156 .fc_dst_len = rtm->rtm_dst_len, 4157 .fc_src_len = rtm->rtm_src_len, 4158 .fc_flags = RTF_UP, 4159 .fc_protocol = rtm->rtm_protocol, 4160 .fc_type = rtm->rtm_type, 4161 4162 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 4163 .fc_nlinfo.nlh = nlh, 4164 .fc_nlinfo.nl_net = sock_net(skb->sk), 4165 }; 4166 4167 if (rtm->rtm_type == RTN_UNREACHABLE || 4168 rtm->rtm_type == RTN_BLACKHOLE || 4169 rtm->rtm_type == RTN_PROHIBIT || 4170 rtm->rtm_type == RTN_THROW) 4171 cfg->fc_flags |= RTF_REJECT; 4172 4173 if (rtm->rtm_type == RTN_LOCAL) 4174 cfg->fc_flags |= RTF_LOCAL; 4175 4176 if (rtm->rtm_flags & RTM_F_CLONED) 4177 cfg->fc_flags |= RTF_CACHE; 4178 4179 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4180 4181 if (tb[RTA_GATEWAY]) { 4182 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4183 cfg->fc_flags |= RTF_GATEWAY; 4184 } 4185 if (tb[RTA_VIA]) { 4186 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 4187 goto errout; 4188 } 4189 4190 if (tb[RTA_DST]) { 4191 int plen = (rtm->rtm_dst_len + 7) >> 3; 4192 4193 if (nla_len(tb[RTA_DST]) < plen) 4194 goto errout; 4195 4196 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4197 } 4198 4199 if (tb[RTA_SRC]) { 4200 int plen = (rtm->rtm_src_len + 7) >> 3; 4201 4202 if (nla_len(tb[RTA_SRC]) < plen) 4203 goto errout; 4204 4205 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4206 } 4207 4208 if (tb[RTA_PREFSRC]) 4209 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4210 4211 if (tb[RTA_OIF]) 4212 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4213 4214 if (tb[RTA_PRIORITY]) 4215 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4216 4217 if (tb[RTA_METRICS]) { 4218 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4219 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4220 } 4221 4222 if (tb[RTA_TABLE]) 4223 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4224 4225 if (tb[RTA_MULTIPATH]) { 4226 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4227 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4228 4229 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4230 cfg->fc_mp_len, extack); 4231 if (err < 0) 4232 goto errout; 4233 } 4234 4235 if (tb[RTA_PREF]) { 4236 pref = nla_get_u8(tb[RTA_PREF]); 4237 if (pref != ICMPV6_ROUTER_PREF_LOW && 4238 pref != ICMPV6_ROUTER_PREF_HIGH) 4239 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4240 cfg->fc_flags |= RTF_PREF(pref); 4241 } 4242 4243 if (tb[RTA_ENCAP]) 4244 cfg->fc_encap = tb[RTA_ENCAP]; 4245 4246 if (tb[RTA_ENCAP_TYPE]) { 4247 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4248 4249 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4250 if (err < 0) 4251 goto errout; 4252 } 4253 4254 if (tb[RTA_EXPIRES]) { 4255 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4256 4257 if (addrconf_finite_timeout(timeout)) { 4258 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4259 cfg->fc_flags |= RTF_EXPIRES; 4260 } 4261 } 4262 4263 err = 0; 4264 errout: 4265 return err; 4266 } 4267 4268 struct rt6_nh { 4269 struct fib6_info *fib6_info; 4270 struct fib6_config r_cfg; 4271 struct list_head next; 4272 }; 4273 4274 static int ip6_route_info_append(struct net *net, 4275 struct list_head *rt6_nh_list, 4276 struct fib6_info *rt, 4277 struct fib6_config *r_cfg) 4278 { 4279 struct rt6_nh *nh; 4280 int err = -EEXIST; 4281 4282 list_for_each_entry(nh, rt6_nh_list, next) { 4283 /* check if fib6_info already exists */ 4284 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4285 return err; 4286 } 4287 4288 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4289 if (!nh) 4290 return -ENOMEM; 4291 nh->fib6_info = rt; 4292 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4293 list_add_tail(&nh->next, rt6_nh_list); 4294 4295 return 0; 4296 } 4297 4298 static void ip6_route_mpath_notify(struct fib6_info *rt, 4299 struct fib6_info *rt_last, 4300 struct nl_info *info, 4301 __u16 nlflags) 4302 { 4303 /* if this is an APPEND route, then rt points to the first route 4304 * inserted and rt_last points to last route inserted. Userspace 4305 * wants a consistent dump of the route which starts at the first 4306 * nexthop. Since sibling routes are always added at the end of 4307 * the list, find the first sibling of the last route appended 4308 */ 4309 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4310 rt = list_first_entry(&rt_last->fib6_siblings, 4311 struct fib6_info, 4312 fib6_siblings); 4313 } 4314 4315 if (rt) 4316 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4317 } 4318 4319 static int ip6_route_multipath_add(struct fib6_config *cfg, 4320 struct netlink_ext_ack *extack) 4321 { 4322 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4323 struct nl_info *info = &cfg->fc_nlinfo; 4324 struct fib6_config r_cfg; 4325 struct rtnexthop *rtnh; 4326 struct fib6_info *rt; 4327 struct rt6_nh *err_nh; 4328 struct rt6_nh *nh, *nh_safe; 4329 __u16 nlflags; 4330 int remaining; 4331 int attrlen; 4332 int err = 1; 4333 int nhn = 0; 4334 int replace = (cfg->fc_nlinfo.nlh && 4335 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4336 LIST_HEAD(rt6_nh_list); 4337 4338 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 4339 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4340 nlflags |= NLM_F_APPEND; 4341 4342 remaining = cfg->fc_mp_len; 4343 rtnh = (struct rtnexthop *)cfg->fc_mp; 4344 4345 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4346 * fib6_info structs per nexthop 4347 */ 4348 while (rtnh_ok(rtnh, remaining)) { 4349 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4350 if (rtnh->rtnh_ifindex) 4351 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4352 4353 attrlen = rtnh_attrlen(rtnh); 4354 if (attrlen > 0) { 4355 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4356 4357 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4358 if (nla) { 4359 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4360 r_cfg.fc_flags |= RTF_GATEWAY; 4361 } 4362 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4363 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4364 if (nla) 4365 r_cfg.fc_encap_type = nla_get_u16(nla); 4366 } 4367 4368 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4369 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4370 if (IS_ERR(rt)) { 4371 err = PTR_ERR(rt); 4372 rt = NULL; 4373 goto cleanup; 4374 } 4375 if (!rt6_qualify_for_ecmp(rt)) { 4376 err = -EINVAL; 4377 NL_SET_ERR_MSG(extack, 4378 "Device only routes can not be added for IPv6 using the multipath API."); 4379 fib6_info_release(rt); 4380 goto cleanup; 4381 } 4382 4383 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; 4384 4385 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4386 rt, &r_cfg); 4387 if (err) { 4388 fib6_info_release(rt); 4389 goto cleanup; 4390 } 4391 4392 rtnh = rtnh_next(rtnh, &remaining); 4393 } 4394 4395 /* for add and replace send one notification with all nexthops. 4396 * Skip the notification in fib6_add_rt2node and send one with 4397 * the full route when done 4398 */ 4399 info->skip_notify = 1; 4400 4401 err_nh = NULL; 4402 list_for_each_entry(nh, &rt6_nh_list, next) { 4403 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4404 fib6_info_release(nh->fib6_info); 4405 4406 if (!err) { 4407 /* save reference to last route successfully inserted */ 4408 rt_last = nh->fib6_info; 4409 4410 /* save reference to first route for notification */ 4411 if (!rt_notif) 4412 rt_notif = nh->fib6_info; 4413 } 4414 4415 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 4416 nh->fib6_info = NULL; 4417 if (err) { 4418 if (replace && nhn) 4419 NL_SET_ERR_MSG_MOD(extack, 4420 "multipath route replace failed (check consistency of installed routes)"); 4421 err_nh = nh; 4422 goto add_errout; 4423 } 4424 4425 /* Because each route is added like a single route we remove 4426 * these flags after the first nexthop: if there is a collision, 4427 * we have already failed to add the first nexthop: 4428 * fib6_add_rt2node() has rejected it; when replacing, old 4429 * nexthops have been replaced by first new, the rest should 4430 * be added to it. 4431 */ 4432 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4433 NLM_F_REPLACE); 4434 nhn++; 4435 } 4436 4437 /* success ... tell user about new route */ 4438 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4439 goto cleanup; 4440 4441 add_errout: 4442 /* send notification for routes that were added so that 4443 * the delete notifications sent by ip6_route_del are 4444 * coherent 4445 */ 4446 if (rt_notif) 4447 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4448 4449 /* Delete routes that were already added */ 4450 list_for_each_entry(nh, &rt6_nh_list, next) { 4451 if (err_nh == nh) 4452 break; 4453 ip6_route_del(&nh->r_cfg, extack); 4454 } 4455 4456 cleanup: 4457 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4458 if (nh->fib6_info) 4459 fib6_info_release(nh->fib6_info); 4460 list_del(&nh->next); 4461 kfree(nh); 4462 } 4463 4464 return err; 4465 } 4466 4467 static int ip6_route_multipath_del(struct fib6_config *cfg, 4468 struct netlink_ext_ack *extack) 4469 { 4470 struct fib6_config r_cfg; 4471 struct rtnexthop *rtnh; 4472 int remaining; 4473 int attrlen; 4474 int err = 1, last_err = 0; 4475 4476 remaining = cfg->fc_mp_len; 4477 rtnh = (struct rtnexthop *)cfg->fc_mp; 4478 4479 /* Parse a Multipath Entry */ 4480 while (rtnh_ok(rtnh, remaining)) { 4481 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4482 if (rtnh->rtnh_ifindex) 4483 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4484 4485 attrlen = rtnh_attrlen(rtnh); 4486 if (attrlen > 0) { 4487 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4488 4489 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4490 if (nla) { 4491 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4492 r_cfg.fc_flags |= RTF_GATEWAY; 4493 } 4494 } 4495 err = ip6_route_del(&r_cfg, extack); 4496 if (err) 4497 last_err = err; 4498 4499 rtnh = rtnh_next(rtnh, &remaining); 4500 } 4501 4502 return last_err; 4503 } 4504 4505 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4506 struct netlink_ext_ack *extack) 4507 { 4508 struct fib6_config cfg; 4509 int err; 4510 4511 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4512 if (err < 0) 4513 return err; 4514 4515 if (cfg.fc_mp) 4516 return ip6_route_multipath_del(&cfg, extack); 4517 else { 4518 cfg.fc_delete_all_nh = 1; 4519 return ip6_route_del(&cfg, extack); 4520 } 4521 } 4522 4523 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4524 struct netlink_ext_ack *extack) 4525 { 4526 struct fib6_config cfg; 4527 int err; 4528 4529 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4530 if (err < 0) 4531 return err; 4532 4533 if (cfg.fc_mp) 4534 return ip6_route_multipath_add(&cfg, extack); 4535 else 4536 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4537 } 4538 4539 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4540 { 4541 int nexthop_len = 0; 4542 4543 if (rt->fib6_nsiblings) { 4544 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4545 + NLA_ALIGN(sizeof(struct rtnexthop)) 4546 + nla_total_size(16) /* RTA_GATEWAY */ 4547 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); 4548 4549 nexthop_len *= rt->fib6_nsiblings; 4550 } 4551 4552 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4553 + nla_total_size(16) /* RTA_SRC */ 4554 + nla_total_size(16) /* RTA_DST */ 4555 + nla_total_size(16) /* RTA_GATEWAY */ 4556 + nla_total_size(16) /* RTA_PREFSRC */ 4557 + nla_total_size(4) /* RTA_TABLE */ 4558 + nla_total_size(4) /* RTA_IIF */ 4559 + nla_total_size(4) /* RTA_OIF */ 4560 + nla_total_size(4) /* RTA_PRIORITY */ 4561 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4562 + nla_total_size(sizeof(struct rta_cacheinfo)) 4563 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4564 + nla_total_size(1) /* RTA_PREF */ 4565 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) 4566 + nexthop_len; 4567 } 4568 4569 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, 4570 unsigned int *flags, bool skip_oif) 4571 { 4572 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4573 *flags |= RTNH_F_DEAD; 4574 4575 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { 4576 *flags |= RTNH_F_LINKDOWN; 4577 4578 rcu_read_lock(); 4579 if (fib6_ignore_linkdown(rt)) 4580 *flags |= RTNH_F_DEAD; 4581 rcu_read_unlock(); 4582 } 4583 4584 if (rt->fib6_flags & RTF_GATEWAY) { 4585 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) 4586 goto nla_put_failure; 4587 } 4588 4589 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); 4590 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) 4591 *flags |= RTNH_F_OFFLOAD; 4592 4593 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4594 if (!skip_oif && rt->fib6_nh.nh_dev && 4595 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) 4596 goto nla_put_failure; 4597 4598 if (rt->fib6_nh.nh_lwtstate && 4599 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) 4600 goto nla_put_failure; 4601 4602 return 0; 4603 4604 nla_put_failure: 4605 return -EMSGSIZE; 4606 } 4607 4608 /* add multipath next hop */ 4609 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) 4610 { 4611 const struct net_device *dev = rt->fib6_nh.nh_dev; 4612 struct rtnexthop *rtnh; 4613 unsigned int flags = 0; 4614 4615 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4616 if (!rtnh) 4617 goto nla_put_failure; 4618 4619 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; 4620 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; 4621 4622 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4623 goto nla_put_failure; 4624 4625 rtnh->rtnh_flags = flags; 4626 4627 /* length of rtnetlink header + attributes */ 4628 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4629 4630 return 0; 4631 4632 nla_put_failure: 4633 return -EMSGSIZE; 4634 } 4635 4636 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4637 struct fib6_info *rt, struct dst_entry *dst, 4638 struct in6_addr *dest, struct in6_addr *src, 4639 int iif, int type, u32 portid, u32 seq, 4640 unsigned int flags) 4641 { 4642 struct rt6_info *rt6 = (struct rt6_info *)dst; 4643 struct rt6key *rt6_dst, *rt6_src; 4644 u32 *pmetrics, table, rt6_flags; 4645 struct nlmsghdr *nlh; 4646 struct rtmsg *rtm; 4647 long expires = 0; 4648 4649 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4650 if (!nlh) 4651 return -EMSGSIZE; 4652 4653 if (rt6) { 4654 rt6_dst = &rt6->rt6i_dst; 4655 rt6_src = &rt6->rt6i_src; 4656 rt6_flags = rt6->rt6i_flags; 4657 } else { 4658 rt6_dst = &rt->fib6_dst; 4659 rt6_src = &rt->fib6_src; 4660 rt6_flags = rt->fib6_flags; 4661 } 4662 4663 rtm = nlmsg_data(nlh); 4664 rtm->rtm_family = AF_INET6; 4665 rtm->rtm_dst_len = rt6_dst->plen; 4666 rtm->rtm_src_len = rt6_src->plen; 4667 rtm->rtm_tos = 0; 4668 if (rt->fib6_table) 4669 table = rt->fib6_table->tb6_id; 4670 else 4671 table = RT6_TABLE_UNSPEC; 4672 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; 4673 if (nla_put_u32(skb, RTA_TABLE, table)) 4674 goto nla_put_failure; 4675 4676 rtm->rtm_type = rt->fib6_type; 4677 rtm->rtm_flags = 0; 4678 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4679 rtm->rtm_protocol = rt->fib6_protocol; 4680 4681 if (rt6_flags & RTF_CACHE) 4682 rtm->rtm_flags |= RTM_F_CLONED; 4683 4684 if (dest) { 4685 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4686 goto nla_put_failure; 4687 rtm->rtm_dst_len = 128; 4688 } else if (rtm->rtm_dst_len) 4689 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 4690 goto nla_put_failure; 4691 #ifdef CONFIG_IPV6_SUBTREES 4692 if (src) { 4693 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4694 goto nla_put_failure; 4695 rtm->rtm_src_len = 128; 4696 } else if (rtm->rtm_src_len && 4697 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 4698 goto nla_put_failure; 4699 #endif 4700 if (iif) { 4701 #ifdef CONFIG_IPV6_MROUTE 4702 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 4703 int err = ip6mr_get_route(net, skb, rtm, portid); 4704 4705 if (err == 0) 4706 return 0; 4707 if (err < 0) 4708 goto nla_put_failure; 4709 } else 4710 #endif 4711 if (nla_put_u32(skb, RTA_IIF, iif)) 4712 goto nla_put_failure; 4713 } else if (dest) { 4714 struct in6_addr saddr_buf; 4715 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4716 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4717 goto nla_put_failure; 4718 } 4719 4720 if (rt->fib6_prefsrc.plen) { 4721 struct in6_addr saddr_buf; 4722 saddr_buf = rt->fib6_prefsrc.addr; 4723 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4724 goto nla_put_failure; 4725 } 4726 4727 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4728 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4729 goto nla_put_failure; 4730 4731 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4732 goto nla_put_failure; 4733 4734 /* For multipath routes, walk the siblings list and add 4735 * each as a nexthop within RTA_MULTIPATH. 4736 */ 4737 if (rt6) { 4738 if (rt6_flags & RTF_GATEWAY && 4739 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 4740 goto nla_put_failure; 4741 4742 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 4743 goto nla_put_failure; 4744 } else if (rt->fib6_nsiblings) { 4745 struct fib6_info *sibling, *next_sibling; 4746 struct nlattr *mp; 4747 4748 mp = nla_nest_start(skb, RTA_MULTIPATH); 4749 if (!mp) 4750 goto nla_put_failure; 4751 4752 if (rt6_add_nexthop(skb, rt) < 0) 4753 goto nla_put_failure; 4754 4755 list_for_each_entry_safe(sibling, next_sibling, 4756 &rt->fib6_siblings, fib6_siblings) { 4757 if (rt6_add_nexthop(skb, sibling) < 0) 4758 goto nla_put_failure; 4759 } 4760 4761 nla_nest_end(skb, mp); 4762 } else { 4763 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4764 goto nla_put_failure; 4765 } 4766 4767 if (rt6_flags & RTF_EXPIRES) { 4768 expires = dst ? dst->expires : rt->expires; 4769 expires -= jiffies; 4770 } 4771 4772 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4773 goto nla_put_failure; 4774 4775 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 4776 goto nla_put_failure; 4777 4778 4779 nlmsg_end(skb, nlh); 4780 return 0; 4781 4782 nla_put_failure: 4783 nlmsg_cancel(skb, nlh); 4784 return -EMSGSIZE; 4785 } 4786 4787 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 4788 const struct net_device *dev) 4789 { 4790 if (f6i->fib6_nh.nh_dev == dev) 4791 return true; 4792 4793 if (f6i->fib6_nsiblings) { 4794 struct fib6_info *sibling, *next_sibling; 4795 4796 list_for_each_entry_safe(sibling, next_sibling, 4797 &f6i->fib6_siblings, fib6_siblings) { 4798 if (sibling->fib6_nh.nh_dev == dev) 4799 return true; 4800 } 4801 } 4802 4803 return false; 4804 } 4805 4806 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4807 { 4808 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4809 struct fib_dump_filter *filter = &arg->filter; 4810 unsigned int flags = NLM_F_MULTI; 4811 struct net *net = arg->net; 4812 4813 if (rt == net->ipv6.fib6_null_entry) 4814 return 0; 4815 4816 if ((filter->flags & RTM_F_PREFIX) && 4817 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4818 /* success since this is not a prefix route */ 4819 return 1; 4820 } 4821 if (filter->filter_set) { 4822 if ((filter->rt_type && rt->fib6_type != filter->rt_type) || 4823 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 4824 (filter->protocol && rt->fib6_protocol != filter->protocol)) { 4825 return 1; 4826 } 4827 flags |= NLM_F_DUMP_FILTERED; 4828 } 4829 4830 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4831 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4832 arg->cb->nlh->nlmsg_seq, flags); 4833 } 4834 4835 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, 4836 const struct nlmsghdr *nlh, 4837 struct nlattr **tb, 4838 struct netlink_ext_ack *extack) 4839 { 4840 struct rtmsg *rtm; 4841 int i, err; 4842 4843 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 4844 NL_SET_ERR_MSG_MOD(extack, 4845 "Invalid header for get route request"); 4846 return -EINVAL; 4847 } 4848 4849 if (!netlink_strict_get_check(skb)) 4850 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, 4851 rtm_ipv6_policy, extack); 4852 4853 rtm = nlmsg_data(nlh); 4854 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || 4855 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || 4856 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || 4857 rtm->rtm_type) { 4858 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); 4859 return -EINVAL; 4860 } 4861 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { 4862 NL_SET_ERR_MSG_MOD(extack, 4863 "Invalid flags for get route request"); 4864 return -EINVAL; 4865 } 4866 4867 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 4868 rtm_ipv6_policy, extack); 4869 if (err) 4870 return err; 4871 4872 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 4873 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 4874 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); 4875 return -EINVAL; 4876 } 4877 4878 for (i = 0; i <= RTA_MAX; i++) { 4879 if (!tb[i]) 4880 continue; 4881 4882 switch (i) { 4883 case RTA_SRC: 4884 case RTA_DST: 4885 case RTA_IIF: 4886 case RTA_OIF: 4887 case RTA_MARK: 4888 case RTA_UID: 4889 case RTA_SPORT: 4890 case RTA_DPORT: 4891 case RTA_IP_PROTO: 4892 break; 4893 default: 4894 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); 4895 return -EINVAL; 4896 } 4897 } 4898 4899 return 0; 4900 } 4901 4902 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4903 struct netlink_ext_ack *extack) 4904 { 4905 struct net *net = sock_net(in_skb->sk); 4906 struct nlattr *tb[RTA_MAX+1]; 4907 int err, iif = 0, oif = 0; 4908 struct fib6_info *from; 4909 struct dst_entry *dst; 4910 struct rt6_info *rt; 4911 struct sk_buff *skb; 4912 struct rtmsg *rtm; 4913 struct flowi6 fl6 = {}; 4914 bool fibmatch; 4915 4916 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 4917 if (err < 0) 4918 goto errout; 4919 4920 err = -EINVAL; 4921 rtm = nlmsg_data(nlh); 4922 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4923 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4924 4925 if (tb[RTA_SRC]) { 4926 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4927 goto errout; 4928 4929 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4930 } 4931 4932 if (tb[RTA_DST]) { 4933 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4934 goto errout; 4935 4936 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4937 } 4938 4939 if (tb[RTA_IIF]) 4940 iif = nla_get_u32(tb[RTA_IIF]); 4941 4942 if (tb[RTA_OIF]) 4943 oif = nla_get_u32(tb[RTA_OIF]); 4944 4945 if (tb[RTA_MARK]) 4946 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4947 4948 if (tb[RTA_UID]) 4949 fl6.flowi6_uid = make_kuid(current_user_ns(), 4950 nla_get_u32(tb[RTA_UID])); 4951 else 4952 fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); 4953 4954 if (tb[RTA_SPORT]) 4955 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 4956 4957 if (tb[RTA_DPORT]) 4958 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 4959 4960 if (tb[RTA_IP_PROTO]) { 4961 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 4962 &fl6.flowi6_proto, AF_INET6, 4963 extack); 4964 if (err) 4965 goto errout; 4966 } 4967 4968 if (iif) { 4969 struct net_device *dev; 4970 int flags = 0; 4971 4972 rcu_read_lock(); 4973 4974 dev = dev_get_by_index_rcu(net, iif); 4975 if (!dev) { 4976 rcu_read_unlock(); 4977 err = -ENODEV; 4978 goto errout; 4979 } 4980 4981 fl6.flowi6_iif = iif; 4982 4983 if (!ipv6_addr_any(&fl6.saddr)) 4984 flags |= RT6_LOOKUP_F_HAS_SADDR; 4985 4986 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 4987 4988 rcu_read_unlock(); 4989 } else { 4990 fl6.flowi6_oif = oif; 4991 4992 dst = ip6_route_output(net, NULL, &fl6); 4993 } 4994 4995 4996 rt = container_of(dst, struct rt6_info, dst); 4997 if (rt->dst.error) { 4998 err = rt->dst.error; 4999 ip6_rt_put(rt); 5000 goto errout; 5001 } 5002 5003 if (rt == net->ipv6.ip6_null_entry) { 5004 err = rt->dst.error; 5005 ip6_rt_put(rt); 5006 goto errout; 5007 } 5008 5009 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 5010 if (!skb) { 5011 ip6_rt_put(rt); 5012 err = -ENOBUFS; 5013 goto errout; 5014 } 5015 5016 skb_dst_set(skb, &rt->dst); 5017 5018 rcu_read_lock(); 5019 from = rcu_dereference(rt->from); 5020 5021 if (fibmatch) 5022 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 5023 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 5024 nlh->nlmsg_seq, 0); 5025 else 5026 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 5027 &fl6.saddr, iif, RTM_NEWROUTE, 5028 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 5029 0); 5030 rcu_read_unlock(); 5031 5032 if (err < 0) { 5033 kfree_skb(skb); 5034 goto errout; 5035 } 5036 5037 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 5038 errout: 5039 return err; 5040 } 5041 5042 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 5043 unsigned int nlm_flags) 5044 { 5045 struct sk_buff *skb; 5046 struct net *net = info->nl_net; 5047 u32 seq; 5048 int err; 5049 5050 err = -ENOBUFS; 5051 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 5052 5053 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 5054 if (!skb) 5055 goto errout; 5056 5057 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 5058 event, info->portid, seq, nlm_flags); 5059 if (err < 0) { 5060 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 5061 WARN_ON(err == -EMSGSIZE); 5062 kfree_skb(skb); 5063 goto errout; 5064 } 5065 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 5066 info->nlh, gfp_any()); 5067 return; 5068 errout: 5069 if (err < 0) 5070 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 5071 } 5072 5073 static int ip6_route_dev_notify(struct notifier_block *this, 5074 unsigned long event, void *ptr) 5075 { 5076 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 5077 struct net *net = dev_net(dev); 5078 5079 if (!(dev->flags & IFF_LOOPBACK)) 5080 return NOTIFY_OK; 5081 5082 if (event == NETDEV_REGISTER) { 5083 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; 5084 net->ipv6.ip6_null_entry->dst.dev = dev; 5085 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 5086 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5087 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 5088 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 5089 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 5090 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 5091 #endif 5092 } else if (event == NETDEV_UNREGISTER && 5093 dev->reg_state != NETREG_UNREGISTERED) { 5094 /* NETDEV_UNREGISTER could be fired for multiple times by 5095 * netdev_wait_allrefs(). Make sure we only call this once. 5096 */ 5097 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 5098 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5099 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 5100 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 5101 #endif 5102 } 5103 5104 return NOTIFY_OK; 5105 } 5106 5107 /* 5108 * /proc 5109 */ 5110 5111 #ifdef CONFIG_PROC_FS 5112 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 5113 { 5114 struct net *net = (struct net *)seq->private; 5115 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 5116 net->ipv6.rt6_stats->fib_nodes, 5117 net->ipv6.rt6_stats->fib_route_nodes, 5118 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 5119 net->ipv6.rt6_stats->fib_rt_entries, 5120 net->ipv6.rt6_stats->fib_rt_cache, 5121 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 5122 net->ipv6.rt6_stats->fib_discarded_routes); 5123 5124 return 0; 5125 } 5126 #endif /* CONFIG_PROC_FS */ 5127 5128 #ifdef CONFIG_SYSCTL 5129 5130 static 5131 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 5132 void __user *buffer, size_t *lenp, loff_t *ppos) 5133 { 5134 struct net *net; 5135 int delay; 5136 int ret; 5137 if (!write) 5138 return -EINVAL; 5139 5140 net = (struct net *)ctl->extra1; 5141 delay = net->ipv6.sysctl.flush_delay; 5142 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 5143 if (ret) 5144 return ret; 5145 5146 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); 5147 return 0; 5148 } 5149 5150 static int zero; 5151 static int one = 1; 5152 5153 static struct ctl_table ipv6_route_table_template[] = { 5154 { 5155 .procname = "flush", 5156 .data = &init_net.ipv6.sysctl.flush_delay, 5157 .maxlen = sizeof(int), 5158 .mode = 0200, 5159 .proc_handler = ipv6_sysctl_rtcache_flush 5160 }, 5161 { 5162 .procname = "gc_thresh", 5163 .data = &ip6_dst_ops_template.gc_thresh, 5164 .maxlen = sizeof(int), 5165 .mode = 0644, 5166 .proc_handler = proc_dointvec, 5167 }, 5168 { 5169 .procname = "max_size", 5170 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5171 .maxlen = sizeof(int), 5172 .mode = 0644, 5173 .proc_handler = proc_dointvec, 5174 }, 5175 { 5176 .procname = "gc_min_interval", 5177 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5178 .maxlen = sizeof(int), 5179 .mode = 0644, 5180 .proc_handler = proc_dointvec_jiffies, 5181 }, 5182 { 5183 .procname = "gc_timeout", 5184 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5185 .maxlen = sizeof(int), 5186 .mode = 0644, 5187 .proc_handler = proc_dointvec_jiffies, 5188 }, 5189 { 5190 .procname = "gc_interval", 5191 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5192 .maxlen = sizeof(int), 5193 .mode = 0644, 5194 .proc_handler = proc_dointvec_jiffies, 5195 }, 5196 { 5197 .procname = "gc_elasticity", 5198 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5199 .maxlen = sizeof(int), 5200 .mode = 0644, 5201 .proc_handler = proc_dointvec, 5202 }, 5203 { 5204 .procname = "mtu_expires", 5205 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5206 .maxlen = sizeof(int), 5207 .mode = 0644, 5208 .proc_handler = proc_dointvec_jiffies, 5209 }, 5210 { 5211 .procname = "min_adv_mss", 5212 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5213 .maxlen = sizeof(int), 5214 .mode = 0644, 5215 .proc_handler = proc_dointvec, 5216 }, 5217 { 5218 .procname = "gc_min_interval_ms", 5219 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5220 .maxlen = sizeof(int), 5221 .mode = 0644, 5222 .proc_handler = proc_dointvec_ms_jiffies, 5223 }, 5224 { 5225 .procname = "skip_notify_on_dev_down", 5226 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, 5227 .maxlen = sizeof(int), 5228 .mode = 0644, 5229 .proc_handler = proc_dointvec, 5230 .extra1 = &zero, 5231 .extra2 = &one, 5232 }, 5233 { } 5234 }; 5235 5236 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5237 { 5238 struct ctl_table *table; 5239 5240 table = kmemdup(ipv6_route_table_template, 5241 sizeof(ipv6_route_table_template), 5242 GFP_KERNEL); 5243 5244 if (table) { 5245 table[0].data = &net->ipv6.sysctl.flush_delay; 5246 table[0].extra1 = net; 5247 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5248 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5249 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5250 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5251 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5252 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5253 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5254 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5255 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5256 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; 5257 5258 /* Don't export sysctls to unprivileged users */ 5259 if (net->user_ns != &init_user_ns) 5260 table[0].procname = NULL; 5261 } 5262 5263 return table; 5264 } 5265 #endif 5266 5267 static int __net_init ip6_route_net_init(struct net *net) 5268 { 5269 int ret = -ENOMEM; 5270 5271 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5272 sizeof(net->ipv6.ip6_dst_ops)); 5273 5274 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5275 goto out_ip6_dst_ops; 5276 5277 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5278 sizeof(*net->ipv6.fib6_null_entry), 5279 GFP_KERNEL); 5280 if (!net->ipv6.fib6_null_entry) 5281 goto out_ip6_dst_entries; 5282 5283 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5284 sizeof(*net->ipv6.ip6_null_entry), 5285 GFP_KERNEL); 5286 if (!net->ipv6.ip6_null_entry) 5287 goto out_fib6_null_entry; 5288 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5289 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5290 ip6_template_metrics, true); 5291 5292 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5293 net->ipv6.fib6_has_custom_rules = false; 5294 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5295 sizeof(*net->ipv6.ip6_prohibit_entry), 5296 GFP_KERNEL); 5297 if (!net->ipv6.ip6_prohibit_entry) 5298 goto out_ip6_null_entry; 5299 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5300 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5301 ip6_template_metrics, true); 5302 5303 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5304 sizeof(*net->ipv6.ip6_blk_hole_entry), 5305 GFP_KERNEL); 5306 if (!net->ipv6.ip6_blk_hole_entry) 5307 goto out_ip6_prohibit_entry; 5308 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5309 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5310 ip6_template_metrics, true); 5311 #endif 5312 5313 net->ipv6.sysctl.flush_delay = 0; 5314 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5315 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5316 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5317 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5318 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5319 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5320 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5321 net->ipv6.sysctl.skip_notify_on_dev_down = 0; 5322 5323 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5324 5325 ret = 0; 5326 out: 5327 return ret; 5328 5329 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5330 out_ip6_prohibit_entry: 5331 kfree(net->ipv6.ip6_prohibit_entry); 5332 out_ip6_null_entry: 5333 kfree(net->ipv6.ip6_null_entry); 5334 #endif 5335 out_fib6_null_entry: 5336 kfree(net->ipv6.fib6_null_entry); 5337 out_ip6_dst_entries: 5338 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5339 out_ip6_dst_ops: 5340 goto out; 5341 } 5342 5343 static void __net_exit ip6_route_net_exit(struct net *net) 5344 { 5345 kfree(net->ipv6.fib6_null_entry); 5346 kfree(net->ipv6.ip6_null_entry); 5347 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5348 kfree(net->ipv6.ip6_prohibit_entry); 5349 kfree(net->ipv6.ip6_blk_hole_entry); 5350 #endif 5351 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5352 } 5353 5354 static int __net_init ip6_route_net_init_late(struct net *net) 5355 { 5356 #ifdef CONFIG_PROC_FS 5357 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5358 sizeof(struct ipv6_route_iter)); 5359 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5360 rt6_stats_seq_show, NULL); 5361 #endif 5362 return 0; 5363 } 5364 5365 static void __net_exit ip6_route_net_exit_late(struct net *net) 5366 { 5367 #ifdef CONFIG_PROC_FS 5368 remove_proc_entry("ipv6_route", net->proc_net); 5369 remove_proc_entry("rt6_stats", net->proc_net); 5370 #endif 5371 } 5372 5373 static struct pernet_operations ip6_route_net_ops = { 5374 .init = ip6_route_net_init, 5375 .exit = ip6_route_net_exit, 5376 }; 5377 5378 static int __net_init ipv6_inetpeer_init(struct net *net) 5379 { 5380 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5381 5382 if (!bp) 5383 return -ENOMEM; 5384 inet_peer_base_init(bp); 5385 net->ipv6.peers = bp; 5386 return 0; 5387 } 5388 5389 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5390 { 5391 struct inet_peer_base *bp = net->ipv6.peers; 5392 5393 net->ipv6.peers = NULL; 5394 inetpeer_invalidate_tree(bp); 5395 kfree(bp); 5396 } 5397 5398 static struct pernet_operations ipv6_inetpeer_ops = { 5399 .init = ipv6_inetpeer_init, 5400 .exit = ipv6_inetpeer_exit, 5401 }; 5402 5403 static struct pernet_operations ip6_route_net_late_ops = { 5404 .init = ip6_route_net_init_late, 5405 .exit = ip6_route_net_exit_late, 5406 }; 5407 5408 static struct notifier_block ip6_route_dev_notifier = { 5409 .notifier_call = ip6_route_dev_notify, 5410 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5411 }; 5412 5413 void __init ip6_route_init_special_entries(void) 5414 { 5415 /* Registering of the loopback is done before this portion of code, 5416 * the loopback reference in rt6_info will not be taken, do it 5417 * manually for init_net */ 5418 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; 5419 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5420 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5421 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5422 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5423 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5424 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5425 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5426 #endif 5427 } 5428 5429 int __init ip6_route_init(void) 5430 { 5431 int ret; 5432 int cpu; 5433 5434 ret = -ENOMEM; 5435 ip6_dst_ops_template.kmem_cachep = 5436 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5437 SLAB_HWCACHE_ALIGN, NULL); 5438 if (!ip6_dst_ops_template.kmem_cachep) 5439 goto out; 5440 5441 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5442 if (ret) 5443 goto out_kmem_cache; 5444 5445 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5446 if (ret) 5447 goto out_dst_entries; 5448 5449 ret = register_pernet_subsys(&ip6_route_net_ops); 5450 if (ret) 5451 goto out_register_inetpeer; 5452 5453 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5454 5455 ret = fib6_init(); 5456 if (ret) 5457 goto out_register_subsys; 5458 5459 ret = xfrm6_init(); 5460 if (ret) 5461 goto out_fib6_init; 5462 5463 ret = fib6_rules_init(); 5464 if (ret) 5465 goto xfrm6_init; 5466 5467 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5468 if (ret) 5469 goto fib6_rules_init; 5470 5471 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5472 inet6_rtm_newroute, NULL, 0); 5473 if (ret < 0) 5474 goto out_register_late_subsys; 5475 5476 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5477 inet6_rtm_delroute, NULL, 0); 5478 if (ret < 0) 5479 goto out_register_late_subsys; 5480 5481 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5482 inet6_rtm_getroute, NULL, 5483 RTNL_FLAG_DOIT_UNLOCKED); 5484 if (ret < 0) 5485 goto out_register_late_subsys; 5486 5487 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5488 if (ret) 5489 goto out_register_late_subsys; 5490 5491 for_each_possible_cpu(cpu) { 5492 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 5493 5494 INIT_LIST_HEAD(&ul->head); 5495 spin_lock_init(&ul->lock); 5496 } 5497 5498 out: 5499 return ret; 5500 5501 out_register_late_subsys: 5502 rtnl_unregister_all(PF_INET6); 5503 unregister_pernet_subsys(&ip6_route_net_late_ops); 5504 fib6_rules_init: 5505 fib6_rules_cleanup(); 5506 xfrm6_init: 5507 xfrm6_fini(); 5508 out_fib6_init: 5509 fib6_gc_cleanup(); 5510 out_register_subsys: 5511 unregister_pernet_subsys(&ip6_route_net_ops); 5512 out_register_inetpeer: 5513 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5514 out_dst_entries: 5515 dst_entries_destroy(&ip6_dst_blackhole_ops); 5516 out_kmem_cache: 5517 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5518 goto out; 5519 } 5520 5521 void ip6_route_cleanup(void) 5522 { 5523 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5524 unregister_pernet_subsys(&ip6_route_net_late_ops); 5525 fib6_rules_cleanup(); 5526 xfrm6_fini(); 5527 fib6_gc_cleanup(); 5528 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5529 unregister_pernet_subsys(&ip6_route_net_ops); 5530 dst_entries_destroy(&ip6_dst_blackhole_ops); 5531 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5532 } 5533