/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
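
/*
 * Pick the key for a neighbour-cache lookup: the configured gateway
 * when one exists, otherwise the destination address taken from the
 * packet, otherwise the caller-supplied daddr.
 */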

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */
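
/*
 * rt6_device_match() walks the fib6_next chain of a leaf and returns
 * the first entry whose nexthop device matches @oif or, when no oif is
 * given, an entry whose device owns @saddr. Dead nexthops are skipped;
 * with RT6_LOOKUP_F_IFACE a missing device match is a hard failure
 * (fib6_null_entry).
 */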

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.fib_nh_dev;

		if (sprt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
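
/*
 * rt6_probe() (below) starts reachability probing of a gateway whose
 * neighbour entry is missing or no longer NUD_VALID. The Neighbor
 * Solicitation is sent from process context via the work item above,
 * addressed to the target's solicited-node multicast address, because
 * rt6_probe() itself runs in the packet path under RCU.
 */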

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !rt->fib6_nh.fib_nh_has_gw)
		return;

	nh_gw = &rt->fib6_nh.fib_nh_gw6;
	dev = rt->fib6_nh.fib_nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 *	Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.fib_nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !rt->fib6_nh.fib_nh_has_gw)
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.fib_nh_dev,
					  &rt->fib6_nh.fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
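
/*
 * Score a candidate route: 2 points for matching the requested device,
 * plus the decoded (RFC 4191) router preference shifted left by two
 * bits. A negative return is one of the RT6_NUD_FAIL_* values from
 * rt6_check_neigh().
 */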

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev) &&
	    rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
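
/*
 * rt6_select() implements the round-robin part of default router
 * selection: fn->rr_ptr remembers where the previous search stopped,
 * and when find_match() reports RT6_NUD_FAIL_DO_RR the pointer is
 * advanced to the next sibling of the same metric so that an
 * unreachable router is not retried immediately.
 */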

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_has_gw;
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.fib_nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = ort->fib6_flags;
	if (ort->fib6_nh.fib_nh_has_gw) {
		rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}
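
/*
 * On a lookup miss, back out of the current node: climb to the parent,
 * descending into the parent's source-address subtree when one exists,
 * until a node carrying routes (RTN_RTINFO) or the tree root is
 * reached.
 */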

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.fib_nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		goto fallback;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(rt);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, rt);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
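
/*
 * rt6_lookup() is the "give me a route or NULL" wrapper: it converts a
 * successful lookup into a referenced rt6_info and swallows errors.
 * A typical call (sketch only) looks like:
 *
 *	rt = rt6_lookup(net, &daddr, NULL, skb->dev->ifindex, skb, 0);
 *	if (rt) {
 *		... use rt ...
 *		ip6_rt_put(rt);
 *	}
 *
 * The caller must drop the reference with ip6_rt_put().
 */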

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}
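
/*
 * Each fib6_info keeps a per-cpu cache of rt6_info clones. A clone is
 * created lazily the first time a CPU resolves the entry and installed
 * with cmpxchg(); callers run with BHs disabled, so the slot cannot be
 * filled behind our back on this CPU, hence the BUG_ON() below.
 */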

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
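
/*
 * Effective MTU of a fib6_info: the explicit route PMTU when one is
 * set, otherwise the device MTU, clamped to IP6_MAX_MTU and reduced by
 * any lightweight-tunnel encapsulation headroom.
 */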

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
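
/*
 * Called under rt6_exception_lock when an MTU change comes in: walk
 * every cached exception route hanging off @rt and update its RTAX_MTU
 * metric wherever rt6_mtu_change_route_allowed() permits it.
 */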

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others still hold references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
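
/*
 * ip6_pol_route() turns a fib6_info lookup result into a dst the stack
 * can use. Three outcomes are possible: a matching cached exception
 * route, an uncached RTF_CACHE clone for FLOWI_FLAG_KNOWN_NH lookups
 * on gateway-less routes, or, in the common case, the per-cpu copy of
 * the fib entry.
 */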

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !f6i->fib6_nh.fib_nh_has_gw)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
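
/*
 * Multipath hash policies (net.ipv6.fib_multipath_hash_policy):
 *   0: L3 only: addresses, flow label and next header. For ICMPv6
 *      errors the embedded (inner) header is hashed instead, so the
 *      error follows the flow that triggered it.
 *   1: L4: addresses plus transport ports and protocol.
 */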

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
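
/*
 * Build a blackhole copy of @dst_orig: it keeps the metrics and
 * addressing of the original, but ->input/->output drop every packet
 * and the entry is born obsolete (DST_OBSOLETE_DEAD).
 */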
2082 if (dst) 2083 return dst; 2084 } 2085 2086 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2087 2088 any_src = ipv6_addr_any(&fl6->saddr); 2089 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2090 (fl6->flowi6_oif && any_src)) 2091 flags |= RT6_LOOKUP_F_IFACE; 2092 2093 if (!any_src) 2094 flags |= RT6_LOOKUP_F_HAS_SADDR; 2095 else if (sk) 2096 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2097 2098 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2099 } 2100 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2101 2102 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2103 { 2104 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2105 struct net_device *loopback_dev = net->loopback_dev; 2106 struct dst_entry *new = NULL; 2107 2108 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2109 DST_OBSOLETE_DEAD, 0); 2110 if (rt) { 2111 rt6_info_init(rt); 2112 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2113 2114 new = &rt->dst; 2115 new->__use = 1; 2116 new->input = dst_discard; 2117 new->output = dst_discard_out; 2118 2119 dst_copy_metrics(new, &ort->dst); 2120 2121 rt->rt6i_idev = in6_dev_get(loopback_dev); 2122 rt->rt6i_gateway = ort->rt6i_gateway; 2123 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2124 2125 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2126 #ifdef CONFIG_IPV6_SUBTREES 2127 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2128 #endif 2129 } 2130 2131 dst_release(dst_orig); 2132 return new ? new : ERR_PTR(-ENOMEM); 2133 } 2134 2135 /* 2136 * Destination cache support functions 2137 */ 2138 2139 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2140 { 2141 u32 rt_cookie = 0; 2142 2143 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2144 return false; 2145 2146 if (fib6_check_expired(f6i)) 2147 return false; 2148 2149 return true; 2150 } 2151 2152 static struct dst_entry *rt6_check(struct rt6_info *rt, 2153 struct fib6_info *from, 2154 u32 cookie) 2155 { 2156 u32 rt_cookie = 0; 2157 2158 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2159 rt_cookie != cookie) 2160 return NULL; 2161 2162 if (rt6_check_expired(rt)) 2163 return NULL; 2164 2165 return &rt->dst; 2166 } 2167 2168 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2169 struct fib6_info *from, 2170 u32 cookie) 2171 { 2172 if (!__rt6_check_expired(rt) && 2173 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2174 fib6_check(from, cookie)) 2175 return &rt->dst; 2176 else 2177 return NULL; 2178 } 2179 2180 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2181 { 2182 struct dst_entry *dst_ret; 2183 struct fib6_info *from; 2184 struct rt6_info *rt; 2185 2186 rt = container_of(dst, struct rt6_info, dst); 2187 2188 rcu_read_lock(); 2189 2190 /* All IPV6 dsts are created with ->obsolete set to the value 2191 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2192 * into this function always. 
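 *
 * A caller that caches a dst revalidates it through this hook; a
 * minimal sketch of the pattern (as used by ip6_sk_update_pmtu()
 * further down, relookup() being a hypothetical helper):
 *
 *	dst = __sk_dst_get(sk);
 *	if (dst && dst->obsolete &&
 *	    !dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
 *		relookup();	// stale: cookie or expiry check failed
 *
 * Returning NULL here is what forces that relookup.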
2193 */ 2194 2195 from = rcu_dereference(rt->from); 2196 2197 if (from && (rt->rt6i_flags & RTF_PCPU || 2198 unlikely(!list_empty(&rt->rt6i_uncached)))) 2199 dst_ret = rt6_dst_from_check(rt, from, cookie); 2200 else 2201 dst_ret = rt6_check(rt, from, cookie); 2202 2203 rcu_read_unlock(); 2204 2205 return dst_ret; 2206 } 2207 2208 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2209 { 2210 struct rt6_info *rt = (struct rt6_info *) dst; 2211 2212 if (rt) { 2213 if (rt->rt6i_flags & RTF_CACHE) { 2214 rcu_read_lock(); 2215 if (rt6_check_expired(rt)) { 2216 rt6_remove_exception_rt(rt); 2217 dst = NULL; 2218 } 2219 rcu_read_unlock(); 2220 } else { 2221 dst_release(dst); 2222 dst = NULL; 2223 } 2224 } 2225 return dst; 2226 } 2227 2228 static void ip6_link_failure(struct sk_buff *skb) 2229 { 2230 struct rt6_info *rt; 2231 2232 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2233 2234 rt = (struct rt6_info *) skb_dst(skb); 2235 if (rt) { 2236 rcu_read_lock(); 2237 if (rt->rt6i_flags & RTF_CACHE) { 2238 rt6_remove_exception_rt(rt); 2239 } else { 2240 struct fib6_info *from; 2241 struct fib6_node *fn; 2242 2243 from = rcu_dereference(rt->from); 2244 if (from) { 2245 fn = rcu_dereference(from->fib6_node); 2246 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2247 fn->fn_sernum = -1; 2248 } 2249 } 2250 rcu_read_unlock(); 2251 } 2252 } 2253 2254 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2255 { 2256 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2257 struct fib6_info *from; 2258 2259 rcu_read_lock(); 2260 from = rcu_dereference(rt0->from); 2261 if (from) 2262 rt0->dst.expires = from->expires; 2263 rcu_read_unlock(); 2264 } 2265 2266 dst_set_expires(&rt0->dst, timeout); 2267 rt0->rt6i_flags |= RTF_EXPIRES; 2268 } 2269 2270 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2271 { 2272 struct net *net = dev_net(rt->dst.dev); 2273 2274 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2275 rt->rt6i_flags |= RTF_MODIFIED; 2276 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2277 } 2278 2279 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2280 { 2281 return !(rt->rt6i_flags & RTF_CACHE) && 2282 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2283 } 2284 2285 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2286 const struct ipv6hdr *iph, u32 mtu) 2287 { 2288 const struct in6_addr *daddr, *saddr; 2289 struct rt6_info *rt6 = (struct rt6_info *)dst; 2290 2291 if (dst_metric_locked(dst, RTAX_MTU)) 2292 return; 2293 2294 if (iph) { 2295 daddr = &iph->daddr; 2296 saddr = &iph->saddr; 2297 } else if (sk) { 2298 daddr = &sk->sk_v6_daddr; 2299 saddr = &inet6_sk(sk)->saddr; 2300 } else { 2301 daddr = NULL; 2302 saddr = NULL; 2303 } 2304 dst_confirm_neigh(dst, daddr); 2305 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2306 if (mtu >= dst_mtu(dst)) 2307 return; 2308 2309 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2310 rt6_do_update_pmtu(rt6, mtu); 2311 /* update rt6_ex->stamp for cache */ 2312 if (rt6->rt6i_flags & RTF_CACHE) 2313 rt6_update_exception_stamp_rt(rt6); 2314 } else if (daddr) { 2315 struct fib6_info *from; 2316 struct rt6_info *nrt6; 2317 2318 rcu_read_lock(); 2319 from = rcu_dereference(rt6->from); 2320 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2321 if (nrt6) { 2322 rt6_do_update_pmtu(nrt6, mtu); 2323 if (rt6_insert_exception(nrt6, from)) 2324 dst_release_immediate(&nrt6->dst); 2325 } 2326 rcu_read_unlock(); 2327 } 2328 } 2329 2330 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock 
*sk, 2331 struct sk_buff *skb, u32 mtu) 2332 { 2333 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2334 } 2335 2336 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2337 int oif, u32 mark, kuid_t uid) 2338 { 2339 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2340 struct dst_entry *dst; 2341 struct flowi6 fl6 = { 2342 .flowi6_oif = oif, 2343 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2344 .daddr = iph->daddr, 2345 .saddr = iph->saddr, 2346 .flowlabel = ip6_flowinfo(iph), 2347 .flowi6_uid = uid, 2348 }; 2349 2350 dst = ip6_route_output(net, NULL, &fl6); 2351 if (!dst->error) 2352 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2353 dst_release(dst); 2354 } 2355 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2356 2357 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2358 { 2359 int oif = sk->sk_bound_dev_if; 2360 struct dst_entry *dst; 2361 2362 if (!oif && skb->dev) 2363 oif = l3mdev_master_ifindex(skb->dev); 2364 2365 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2366 2367 dst = __sk_dst_get(sk); 2368 if (!dst || !dst->obsolete || 2369 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2370 return; 2371 2372 bh_lock_sock(sk); 2373 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2374 ip6_datagram_dst_update(sk, false); 2375 bh_unlock_sock(sk); 2376 } 2377 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2378 2379 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2380 const struct flowi6 *fl6) 2381 { 2382 #ifdef CONFIG_IPV6_SUBTREES 2383 struct ipv6_pinfo *np = inet6_sk(sk); 2384 #endif 2385 2386 ip6_dst_store(sk, dst, 2387 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2388 &sk->sk_v6_daddr : NULL, 2389 #ifdef CONFIG_IPV6_SUBTREES 2390 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2391 &np->saddr : 2392 #endif 2393 NULL); 2394 } 2395 2396 /* Handle redirects */ 2397 struct ip6rd_flowi { 2398 struct flowi6 fl6; 2399 struct in6_addr gateway; 2400 }; 2401 2402 static struct rt6_info *__ip6_route_redirect(struct net *net, 2403 struct fib6_table *table, 2404 struct flowi6 *fl6, 2405 const struct sk_buff *skb, 2406 int flags) 2407 { 2408 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2409 struct rt6_info *ret = NULL, *rt_cache; 2410 struct fib6_info *rt; 2411 struct fib6_node *fn; 2412 2413 /* Get the "current" route for this destination and 2414 * check if the redirect has come from appropriate router. 2415 * 2416 * RFC 4861 specifies that redirects should only be 2417 * accepted if they come from the nexthop to the target. 2418 * Due to the way the routes are chosen, this notion 2419 * is a bit fuzzy and one might need to check all possible 2420 * routes. 2421 */ 2422 2423 rcu_read_lock(); 2424 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2425 restart: 2426 for_each_fib6_node_rt_rcu(fn) { 2427 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 2428 continue; 2429 if (fib6_check_expired(rt)) 2430 continue; 2431 if (rt->fib6_flags & RTF_REJECT) 2432 break; 2433 if (!rt->fib6_nh.fib_nh_has_gw) 2434 continue; 2435 if (fl6->flowi6_oif != rt->fib6_nh.fib_nh_dev->ifindex) 2436 continue; 2437 /* rt_cache's gateway might be different from its 'parent' 2438 * in the case of an ip redirect. 2439 * So we keep searching in the exception table if the gateway 2440 * is different. 
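 * For example, after a redirect moves a destination from router A
 * to router B, this FIB entry still points at A while the RTF_CACHE
 * exception carries B, so only the exception can match rdfl->gateway.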
2441 */ 2442 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.fib_nh_gw6)) { 2443 rt_cache = rt6_find_cached_rt(rt, 2444 &fl6->daddr, 2445 &fl6->saddr); 2446 if (rt_cache && 2447 ipv6_addr_equal(&rdfl->gateway, 2448 &rt_cache->rt6i_gateway)) { 2449 ret = rt_cache; 2450 break; 2451 } 2452 continue; 2453 } 2454 break; 2455 } 2456 2457 if (!rt) 2458 rt = net->ipv6.fib6_null_entry; 2459 else if (rt->fib6_flags & RTF_REJECT) { 2460 ret = net->ipv6.ip6_null_entry; 2461 goto out; 2462 } 2463 2464 if (rt == net->ipv6.fib6_null_entry) { 2465 fn = fib6_backtrack(fn, &fl6->saddr); 2466 if (fn) 2467 goto restart; 2468 } 2469 2470 out: 2471 if (ret) 2472 ip6_hold_safe(net, &ret); 2473 else 2474 ret = ip6_create_rt_rcu(rt); 2475 2476 rcu_read_unlock(); 2477 2478 trace_fib6_table_lookup(net, rt, table, fl6); 2479 return ret; 2480 }; 2481 2482 static struct dst_entry *ip6_route_redirect(struct net *net, 2483 const struct flowi6 *fl6, 2484 const struct sk_buff *skb, 2485 const struct in6_addr *gateway) 2486 { 2487 int flags = RT6_LOOKUP_F_HAS_SADDR; 2488 struct ip6rd_flowi rdfl; 2489 2490 rdfl.fl6 = *fl6; 2491 rdfl.gateway = *gateway; 2492 2493 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2494 flags, __ip6_route_redirect); 2495 } 2496 2497 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2498 kuid_t uid) 2499 { 2500 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2501 struct dst_entry *dst; 2502 struct flowi6 fl6 = { 2503 .flowi6_iif = LOOPBACK_IFINDEX, 2504 .flowi6_oif = oif, 2505 .flowi6_mark = mark, 2506 .daddr = iph->daddr, 2507 .saddr = iph->saddr, 2508 .flowlabel = ip6_flowinfo(iph), 2509 .flowi6_uid = uid, 2510 }; 2511 2512 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2513 rt6_do_redirect(dst, NULL, skb); 2514 dst_release(dst); 2515 } 2516 EXPORT_SYMBOL_GPL(ip6_redirect); 2517 2518 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 2519 { 2520 const struct ipv6hdr *iph = ipv6_hdr(skb); 2521 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2522 struct dst_entry *dst; 2523 struct flowi6 fl6 = { 2524 .flowi6_iif = LOOPBACK_IFINDEX, 2525 .flowi6_oif = oif, 2526 .daddr = msg->dest, 2527 .saddr = iph->daddr, 2528 .flowi6_uid = sock_net_uid(net, NULL), 2529 }; 2530 2531 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2532 rt6_do_redirect(dst, NULL, skb); 2533 dst_release(dst); 2534 } 2535 2536 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2537 { 2538 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2539 sk->sk_uid); 2540 } 2541 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2542 2543 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2544 { 2545 struct net_device *dev = dst->dev; 2546 unsigned int mtu = dst_mtu(dst); 2547 struct net *net = dev_net(dev); 2548 2549 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2550 2551 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2552 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2553 2554 /* 2555 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2556 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
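 * (with a bare 20-byte TCP header that is 65535 - 20 = 65515 bytes).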
2557 * IPV6_MAXPLEN is also valid and means: "any MSS, 2558 * rely only on pmtu discovery" 2559 */ 2560 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2561 mtu = IPV6_MAXPLEN; 2562 return mtu; 2563 } 2564 2565 static unsigned int ip6_mtu(const struct dst_entry *dst) 2566 { 2567 struct inet6_dev *idev; 2568 unsigned int mtu; 2569 2570 mtu = dst_metric_raw(dst, RTAX_MTU); 2571 if (mtu) 2572 goto out; 2573 2574 mtu = IPV6_MIN_MTU; 2575 2576 rcu_read_lock(); 2577 idev = __in6_dev_get(dst->dev); 2578 if (idev) 2579 mtu = idev->cnf.mtu6; 2580 rcu_read_unlock(); 2581 2582 out: 2583 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2584 2585 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2586 } 2587 2588 /* MTU selection: 2589 * 1. mtu on route is locked - use it 2590 * 2. mtu from nexthop exception 2591 * 3. mtu from egress device 2592 * 2593 * based on ip6_dst_mtu_forward and exception logic of 2594 * rt6_find_cached_rt; called with rcu_read_lock 2595 */ 2596 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2597 struct in6_addr *saddr) 2598 { 2599 struct rt6_exception_bucket *bucket; 2600 struct rt6_exception *rt6_ex; 2601 struct in6_addr *src_key; 2602 struct inet6_dev *idev; 2603 u32 mtu = 0; 2604 2605 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2606 mtu = f6i->fib6_pmtu; 2607 if (mtu) 2608 goto out; 2609 } 2610 2611 src_key = NULL; 2612 #ifdef CONFIG_IPV6_SUBTREES 2613 if (f6i->fib6_src.plen) 2614 src_key = saddr; 2615 #endif 2616 2617 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2618 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2619 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2620 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2621 2622 if (likely(!mtu)) { 2623 struct net_device *dev = fib6_info_nh_dev(f6i); 2624 2625 mtu = IPV6_MIN_MTU; 2626 idev = __in6_dev_get(dev); 2627 if (idev && idev->cnf.mtu6 > mtu) 2628 mtu = idev->cnf.mtu6; 2629 } 2630 2631 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2632 out: 2633 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2634 } 2635 2636 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2637 struct flowi6 *fl6) 2638 { 2639 struct dst_entry *dst; 2640 struct rt6_info *rt; 2641 struct inet6_dev *idev = in6_dev_get(dev); 2642 struct net *net = dev_net(dev); 2643 2644 if (unlikely(!idev)) 2645 return ERR_PTR(-ENODEV); 2646 2647 rt = ip6_dst_alloc(net, dev, 0); 2648 if (unlikely(!rt)) { 2649 in6_dev_put(idev); 2650 dst = ERR_PTR(-ENOMEM); 2651 goto out; 2652 } 2653 2654 rt->dst.flags |= DST_HOST; 2655 rt->dst.input = ip6_input; 2656 rt->dst.output = ip6_output; 2657 rt->rt6i_gateway = fl6->daddr; 2658 rt->rt6i_dst.addr = fl6->daddr; 2659 rt->rt6i_dst.plen = 128; 2660 rt->rt6i_idev = idev; 2661 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2662 2663 /* Add this dst into uncached_list so that rt6_disable_ip() can 2664 * do proper release of the net_device 2665 */ 2666 rt6_uncached_list_add(rt); 2667 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2668 2669 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2670 2671 out: 2672 return dst; 2673 } 2674 2675 static int ip6_dst_gc(struct dst_ops *ops) 2676 { 2677 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2678 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2679 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2680 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2681 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2682 unsigned long rt_last_gc = 
net->ipv6.ip6_rt_last_gc; 2683 int entries; 2684 2685 entries = dst_entries_get_fast(ops); 2686 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2687 entries <= rt_max_size) 2688 goto out; 2689 2690 net->ipv6.ip6_rt_gc_expire++; 2691 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2692 entries = dst_entries_get_slow(ops); 2693 if (entries < ops->gc_thresh) 2694 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2695 out: 2696 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2697 return entries > rt_max_size; 2698 } 2699 2700 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2701 struct fib6_config *cfg, 2702 const struct in6_addr *gw_addr, 2703 u32 tbid, int flags) 2704 { 2705 struct flowi6 fl6 = { 2706 .flowi6_oif = cfg->fc_ifindex, 2707 .daddr = *gw_addr, 2708 .saddr = cfg->fc_prefsrc, 2709 }; 2710 struct fib6_table *table; 2711 struct rt6_info *rt; 2712 2713 table = fib6_get_table(net, tbid); 2714 if (!table) 2715 return NULL; 2716 2717 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2718 flags |= RT6_LOOKUP_F_HAS_SADDR; 2719 2720 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2721 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2722 2723 /* if table lookup failed, fall back to full lookup */ 2724 if (rt == net->ipv6.ip6_null_entry) { 2725 ip6_rt_put(rt); 2726 rt = NULL; 2727 } 2728 2729 return rt; 2730 } 2731 2732 static int ip6_route_check_nh_onlink(struct net *net, 2733 struct fib6_config *cfg, 2734 const struct net_device *dev, 2735 struct netlink_ext_ack *extack) 2736 { 2737 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2738 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2739 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2740 struct fib6_info *from; 2741 struct rt6_info *grt; 2742 int err; 2743 2744 err = 0; 2745 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2746 if (grt) { 2747 rcu_read_lock(); 2748 from = rcu_dereference(grt->from); 2749 if (!grt->dst.error && 2750 /* ignore match if it is the default route */ 2751 from && !ipv6_addr_any(&from->fib6_dst.addr) && 2752 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2753 NL_SET_ERR_MSG(extack, 2754 "Nexthop has invalid gateway or device mismatch"); 2755 err = -EINVAL; 2756 } 2757 rcu_read_unlock(); 2758 2759 ip6_rt_put(grt); 2760 } 2761 2762 return err; 2763 } 2764 2765 static int ip6_route_check_nh(struct net *net, 2766 struct fib6_config *cfg, 2767 struct net_device **_dev, 2768 struct inet6_dev **idev) 2769 { 2770 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2771 struct net_device *dev = _dev ? 
*_dev : NULL; 2772 struct rt6_info *grt = NULL; 2773 int err = -EHOSTUNREACH; 2774 2775 if (cfg->fc_table) { 2776 int flags = RT6_LOOKUP_F_IFACE; 2777 2778 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2779 cfg->fc_table, flags); 2780 if (grt) { 2781 if (grt->rt6i_flags & RTF_GATEWAY || 2782 (dev && dev != grt->dst.dev)) { 2783 ip6_rt_put(grt); 2784 grt = NULL; 2785 } 2786 } 2787 } 2788 2789 if (!grt) 2790 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2791 2792 if (!grt) 2793 goto out; 2794 2795 if (dev) { 2796 if (dev != grt->dst.dev) { 2797 ip6_rt_put(grt); 2798 goto out; 2799 } 2800 } else { 2801 *_dev = dev = grt->dst.dev; 2802 *idev = grt->rt6i_idev; 2803 dev_hold(dev); 2804 in6_dev_hold(grt->rt6i_idev); 2805 } 2806 2807 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2808 err = 0; 2809 2810 ip6_rt_put(grt); 2811 2812 out: 2813 return err; 2814 } 2815 2816 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2817 struct net_device **_dev, struct inet6_dev **idev, 2818 struct netlink_ext_ack *extack) 2819 { 2820 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2821 int gwa_type = ipv6_addr_type(gw_addr); 2822 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2823 const struct net_device *dev = *_dev; 2824 bool need_addr_check = !dev; 2825 int err = -EINVAL; 2826 2827 /* if gw_addr is local we will fail to detect this in case the 2828 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2829 * will return the already-added prefix route via the interface 2830 * that the prefix route was assigned to, which might be non-loopback. 2831 */ 2832 if (dev && 2833 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2834 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2835 goto out; 2836 } 2837 2838 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2839 /* IPv6 strictly inhibits using non-link-local 2840 * addresses as nexthop addresses. 2841 * Otherwise, the router will not be able to send redirects. 2842 * That is a good rule, but in some (rare!) circumstances 2843 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2844 * some exceptions. --ANK 2845 * We allow IPv4-mapped nexthops to support RFC 4798-style 2846 * addressing. 2847 */ 2848 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2849 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2850 goto out; 2851 } 2852 2853 if (cfg->fc_flags & RTNH_F_ONLINK) 2854 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2855 else 2856 err = ip6_route_check_nh(net, cfg, _dev, idev); 2857 2858 if (err) 2859 goto out; 2860 } 2861 2862 /* reload in case the device was changed */ 2863 dev = *_dev; 2864 2865 err = -EINVAL; 2866 if (!dev) { 2867 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2868 goto out; 2869 } else if (dev->flags & IFF_LOOPBACK) { 2870 NL_SET_ERR_MSG(extack, 2871 "Egress device can not be loopback device for this route"); 2872 goto out; 2873 } 2874 2875 /* if we did not check gw_addr above, do so now that the 2876 * egress device has been resolved.
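 * That can only happen for a global gateway: a link-local gateway
 * must come with an explicit device, while a global gateway may have
 * had its device resolved by ip6_route_check_nh() above.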
2877 */ 2878 if (need_addr_check && 2879 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2880 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2881 goto out; 2882 } 2883 2884 err = 0; 2885 out: 2886 return err; 2887 } 2888 2889 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) 2890 { 2891 if ((flags & RTF_REJECT) || 2892 (dev && (dev->flags & IFF_LOOPBACK) && 2893 !(addr_type & IPV6_ADDR_LOOPBACK) && 2894 !(flags & RTF_LOCAL))) 2895 return true; 2896 2897 return false; 2898 } 2899 2900 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, 2901 struct fib6_config *cfg, gfp_t gfp_flags, 2902 struct netlink_ext_ack *extack) 2903 { 2904 struct net_device *dev = NULL; 2905 struct inet6_dev *idev = NULL; 2906 int addr_type; 2907 int err; 2908 2909 fib6_nh->fib_nh_family = AF_INET6; 2910 2911 err = -ENODEV; 2912 if (cfg->fc_ifindex) { 2913 dev = dev_get_by_index(net, cfg->fc_ifindex); 2914 if (!dev) 2915 goto out; 2916 idev = in6_dev_get(dev); 2917 if (!idev) 2918 goto out; 2919 } 2920 2921 if (cfg->fc_flags & RTNH_F_ONLINK) { 2922 if (!dev) { 2923 NL_SET_ERR_MSG(extack, 2924 "Nexthop device required for onlink"); 2925 goto out; 2926 } 2927 2928 if (!(dev->flags & IFF_UP)) { 2929 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2930 err = -ENETDOWN; 2931 goto out; 2932 } 2933 2934 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; 2935 } 2936 2937 fib6_nh->fib_nh_weight = 1; 2938 2939 /* We cannot add true routes via loopback here, 2940 * they would result in kernel looping; promote them to reject routes 2941 */ 2942 addr_type = ipv6_addr_type(&cfg->fc_dst); 2943 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { 2944 /* hold loopback dev/idev if we haven't done so. */ 2945 if (dev != net->loopback_dev) { 2946 if (dev) { 2947 dev_put(dev); 2948 in6_dev_put(idev); 2949 } 2950 dev = net->loopback_dev; 2951 dev_hold(dev); 2952 idev = in6_dev_get(dev); 2953 if (!idev) { 2954 err = -ENODEV; 2955 goto out; 2956 } 2957 } 2958 goto set_dev; 2959 } 2960 2961 if (cfg->fc_flags & RTF_GATEWAY) { 2962 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 2963 if (err) 2964 goto out; 2965 2966 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 2967 fib6_nh->fib_nh_has_gw = 1; 2968 } 2969 2970 err = -ENODEV; 2971 if (!dev) 2972 goto out; 2973 2974 if (idev->cnf.disable_ipv6) { 2975 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 2976 err = -EACCES; 2977 goto out; 2978 } 2979 2980 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { 2981 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2982 err = -ENETDOWN; 2983 goto out; 2984 } 2985 2986 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && 2987 !netif_carrier_ok(dev)) 2988 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 2989 2990 err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap, 2991 cfg->fc_encap_type, cfg, gfp_flags, extack); 2992 if (err) 2993 goto out; 2994 set_dev: 2995 fib6_nh->fib_nh_dev = dev; 2996 fib6_nh->fib_nh_oif = dev->ifindex; 2997 err = 0; 2998 out: 2999 if (idev) 3000 in6_dev_put(idev); 3001 3002 if (err) { 3003 lwtstate_put(fib6_nh->fib_nh_lws); 3004 fib6_nh->fib_nh_lws = NULL; 3005 if (dev) 3006 dev_put(dev); 3007 } 3008 3009 return err; 3010 } 3011 3012 void fib6_nh_release(struct fib6_nh *fib6_nh) 3013 { 3014 fib_nh_common_release(&fib6_nh->nh_common); 3015 } 3016 3017 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3018 gfp_t gfp_flags, 3019 struct netlink_ext_ack *extack) 3020 { 3021 struct net *net = cfg->fc_nlinfo.nl_net; 3022 struct fib6_info *rt 
= NULL; 3023 struct fib6_table *table; 3024 int err = -EINVAL; 3025 int addr_type; 3026 3027 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3028 if (cfg->fc_flags & RTF_PCPU) { 3029 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3030 goto out; 3031 } 3032 3033 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3034 if (cfg->fc_flags & RTF_CACHE) { 3035 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3036 goto out; 3037 } 3038 3039 if (cfg->fc_type > RTN_MAX) { 3040 NL_SET_ERR_MSG(extack, "Invalid route type"); 3041 goto out; 3042 } 3043 3044 if (cfg->fc_dst_len > 128) { 3045 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3046 goto out; 3047 } 3048 if (cfg->fc_src_len > 128) { 3049 NL_SET_ERR_MSG(extack, "Invalid source address length"); 3050 goto out; 3051 } 3052 #ifndef CONFIG_IPV6_SUBTREES 3053 if (cfg->fc_src_len) { 3054 NL_SET_ERR_MSG(extack, 3055 "Specifying source address requires IPV6_SUBTREES to be enabled"); 3056 goto out; 3057 } 3058 #endif 3059 3060 err = -ENOBUFS; 3061 if (cfg->fc_nlinfo.nlh && 3062 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3063 table = fib6_get_table(net, cfg->fc_table); 3064 if (!table) { 3065 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3066 table = fib6_new_table(net, cfg->fc_table); 3067 } 3068 } else { 3069 table = fib6_new_table(net, cfg->fc_table); 3070 } 3071 3072 if (!table) 3073 goto out; 3074 3075 err = -ENOMEM; 3076 rt = fib6_info_alloc(gfp_flags); 3077 if (!rt) 3078 goto out; 3079 3080 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 3081 extack); 3082 if (IS_ERR(rt->fib6_metrics)) { 3083 err = PTR_ERR(rt->fib6_metrics); 3084 /* Do not leave garbage there. */ 3085 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3086 goto out; 3087 } 3088 3089 if (cfg->fc_flags & RTF_ADDRCONF) 3090 rt->dst_nocount = true; 3091 3092 if (cfg->fc_flags & RTF_EXPIRES) 3093 fib6_set_expires(rt, jiffies + 3094 clock_t_to_jiffies(cfg->fc_expires)); 3095 else 3096 fib6_clean_expires(rt); 3097 3098 if (cfg->fc_protocol == RTPROT_UNSPEC) 3099 cfg->fc_protocol = RTPROT_BOOT; 3100 rt->fib6_protocol = cfg->fc_protocol; 3101 3102 rt->fib6_table = table; 3103 rt->fib6_metric = cfg->fc_metric; 3104 rt->fib6_type = cfg->fc_type; 3105 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3106 3107 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3108 rt->fib6_dst.plen = cfg->fc_dst_len; 3109 if (rt->fib6_dst.plen == 128) 3110 rt->dst_host = true; 3111 3112 #ifdef CONFIG_IPV6_SUBTREES 3113 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3114 rt->fib6_src.plen = cfg->fc_src_len; 3115 #endif 3116 err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); 3117 if (err) 3118 goto out; 3119 3120 /* We cannot add true routes via loopback here, 3121 * they would result in kernel looping; promote them to reject routes 3122 */ 3123 addr_type = ipv6_addr_type(&cfg->fc_dst); 3124 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) 3125 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3126 3127 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3128 struct net_device *dev = fib6_info_nh_dev(rt); 3129 3130 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3131 NL_SET_ERR_MSG(extack, "Invalid source address"); 3132 err = -EINVAL; 3133 goto out; 3134 } 3135 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3136 rt->fib6_prefsrc.plen = 128; 3137 } else 3138 rt->fib6_prefsrc.plen = 0; 3139 3140 return rt; 3141 out: 3142 
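	/* rt may still be NULL here (early failure);
	 * fib6_info_release() tolerates that.
	 */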
fib6_info_release(rt); 3143 return ERR_PTR(err); 3144 } 3145 3146 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3147 struct netlink_ext_ack *extack) 3148 { 3149 struct fib6_info *rt; 3150 int err; 3151 3152 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3153 if (IS_ERR(rt)) 3154 return PTR_ERR(rt); 3155 3156 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3157 fib6_info_release(rt); 3158 3159 return err; 3160 } 3161 3162 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3163 { 3164 struct net *net = info->nl_net; 3165 struct fib6_table *table; 3166 int err; 3167 3168 if (rt == net->ipv6.fib6_null_entry) { 3169 err = -ENOENT; 3170 goto out; 3171 } 3172 3173 table = rt->fib6_table; 3174 spin_lock_bh(&table->tb6_lock); 3175 err = fib6_del(rt, info); 3176 spin_unlock_bh(&table->tb6_lock); 3177 3178 out: 3179 fib6_info_release(rt); 3180 return err; 3181 } 3182 3183 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3184 { 3185 struct nl_info info = { .nl_net = net }; 3186 3187 return __ip6_del_rt(rt, &info); 3188 } 3189 3190 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3191 { 3192 struct nl_info *info = &cfg->fc_nlinfo; 3193 struct net *net = info->nl_net; 3194 struct sk_buff *skb = NULL; 3195 struct fib6_table *table; 3196 int err = -ENOENT; 3197 3198 if (rt == net->ipv6.fib6_null_entry) 3199 goto out_put; 3200 table = rt->fib6_table; 3201 spin_lock_bh(&table->tb6_lock); 3202 3203 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3204 struct fib6_info *sibling, *next_sibling; 3205 3206 /* prefer to send a single notification with all hops */ 3207 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3208 if (skb) { 3209 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3210 3211 if (rt6_fill_node(net, skb, rt, NULL, 3212 NULL, NULL, 0, RTM_DELROUTE, 3213 info->portid, seq, 0) < 0) { 3214 kfree_skb(skb); 3215 skb = NULL; 3216 } else 3217 info->skip_notify = 1; 3218 } 3219 3220 list_for_each_entry_safe(sibling, next_sibling, 3221 &rt->fib6_siblings, 3222 fib6_siblings) { 3223 err = fib6_del(sibling, info); 3224 if (err) 3225 goto out_unlock; 3226 } 3227 } 3228 3229 err = fib6_del(rt, info); 3230 out_unlock: 3231 spin_unlock_bh(&table->tb6_lock); 3232 out_put: 3233 fib6_info_release(rt); 3234 3235 if (skb) { 3236 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3237 info->nlh, gfp_any()); 3238 } 3239 return err; 3240 } 3241 3242 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3243 { 3244 int rc = -ESRCH; 3245 3246 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3247 goto out; 3248 3249 if (cfg->fc_flags & RTF_GATEWAY && 3250 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3251 goto out; 3252 3253 rc = rt6_remove_exception_rt(rt); 3254 out: 3255 return rc; 3256 } 3257 3258 static int ip6_route_del(struct fib6_config *cfg, 3259 struct netlink_ext_ack *extack) 3260 { 3261 struct rt6_info *rt_cache; 3262 struct fib6_table *table; 3263 struct fib6_info *rt; 3264 struct fib6_node *fn; 3265 int err = -ESRCH; 3266 3267 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3268 if (!table) { 3269 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3270 return err; 3271 } 3272 3273 rcu_read_lock(); 3274 3275 fn = fib6_locate(&table->tb6_root, 3276 &cfg->fc_dst, cfg->fc_dst_len, 3277 &cfg->fc_src, cfg->fc_src_len, 3278 !(cfg->fc_flags & RTF_CACHE)); 3279 3280 if (fn) { 3281 for_each_fib6_node_rt_rcu(fn) { 3282 struct fib6_nh *nh; 3283 3284 if (cfg->fc_flags & RTF_CACHE) { 
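			/* RTM_F_CLONED delete: the target is a cached
			 * clone, so match it in the exception table
			 * rather than in the FIB itself.
			 */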
3285 int rc; 3286 3287 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3288 &cfg->fc_src); 3289 if (rt_cache) { 3290 rc = ip6_del_cached_rt(rt_cache, cfg); 3291 if (rc != -ESRCH) { 3292 rcu_read_unlock(); 3293 return rc; 3294 } 3295 } 3296 continue; 3297 } 3298 3299 nh = &rt->fib6_nh; 3300 if (cfg->fc_ifindex && 3301 (!nh->fib_nh_dev || 3302 nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) 3303 continue; 3304 if (cfg->fc_flags & RTF_GATEWAY && 3305 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) 3306 continue; 3307 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3308 continue; 3309 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3310 continue; 3311 if (!fib6_info_hold_safe(rt)) 3312 continue; 3313 rcu_read_unlock(); 3314 3315 /* if gateway was specified only delete the one hop */ 3316 if (cfg->fc_flags & RTF_GATEWAY) 3317 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3318 3319 return __ip6_del_rt_siblings(rt, cfg); 3320 } 3321 } 3322 rcu_read_unlock(); 3323 3324 return err; 3325 } 3326 3327 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3328 { 3329 struct netevent_redirect netevent; 3330 struct rt6_info *rt, *nrt = NULL; 3331 struct ndisc_options ndopts; 3332 struct inet6_dev *in6_dev; 3333 struct neighbour *neigh; 3334 struct fib6_info *from; 3335 struct rd_msg *msg; 3336 int optlen, on_link; 3337 u8 *lladdr; 3338 3339 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3340 optlen -= sizeof(*msg); 3341 3342 if (optlen < 0) { 3343 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3344 return; 3345 } 3346 3347 msg = (struct rd_msg *)icmp6_hdr(skb); 3348 3349 if (ipv6_addr_is_multicast(&msg->dest)) { 3350 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3351 return; 3352 } 3353 3354 on_link = 0; 3355 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3356 on_link = 1; 3357 } else if (ipv6_addr_type(&msg->target) != 3358 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3359 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3360 return; 3361 } 3362 3363 in6_dev = __in6_dev_get(skb->dev); 3364 if (!in6_dev) 3365 return; 3366 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3367 return; 3368 3369 /* RFC2461 8.1: 3370 * The IP source address of the Redirect MUST be the same as the current 3371 * first-hop router for the specified ICMP Destination Address. 3372 */ 3373 3374 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3375 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3376 return; 3377 } 3378 3379 lladdr = NULL; 3380 if (ndopts.nd_opts_tgt_lladdr) { 3381 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3382 skb->dev); 3383 if (!lladdr) { 3384 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3385 return; 3386 } 3387 } 3388 3389 rt = (struct rt6_info *) dst; 3390 if (rt->rt6i_flags & RTF_REJECT) { 3391 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3392 return; 3393 } 3394 3395 /* Redirect received -> path was valid. 3396 * Look, redirects are sent only in response to data packets, 3397 * so that this nexthop apparently is reachable. --ANK 3398 */ 3399 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3400 3401 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3402 if (!neigh) 3403 return; 3404 3405 /* 3406 * We have finally decided to accept it. 
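 * Accepting means: update the neighbour entry for the new router
 * (NUD_STALE), clone an RTF_CACHE route pointing at it, hook the
 * clone into the parent fib6_info's exception table, and announce
 * the change via NETEVENT_REDIRECT.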
3407 */ 3408 3409 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3410 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3411 NEIGH_UPDATE_F_OVERRIDE| 3412 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3413 NEIGH_UPDATE_F_ISROUTER)), 3414 NDISC_REDIRECT, &ndopts); 3415 3416 rcu_read_lock(); 3417 from = rcu_dereference(rt->from); 3418 /* This fib6_info_hold() is safe here because we hold a reference to rt 3419 * and rt already holds a reference to fib6_info. 3420 */ 3421 fib6_info_hold(from); 3422 rcu_read_unlock(); 3423 3424 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3425 if (!nrt) 3426 goto out; 3427 3428 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3429 if (on_link) 3430 nrt->rt6i_flags &= ~RTF_GATEWAY; 3431 3432 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3433 3434 /* No need to remove rt from the exception table if rt is 3435 * a cached route because rt6_insert_exception() 3436 * takes care of it 3437 */ 3438 if (rt6_insert_exception(nrt, from)) { 3439 dst_release_immediate(&nrt->dst); 3440 goto out; 3441 } 3442 3443 netevent.old = &rt->dst; 3444 netevent.new = &nrt->dst; 3445 netevent.daddr = &msg->dest; 3446 netevent.neigh = neigh; 3447 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3448 3449 out: 3450 fib6_info_release(from); 3451 neigh_release(neigh); 3452 } 3453 3454 #ifdef CONFIG_IPV6_ROUTE_INFO 3455 static struct fib6_info *rt6_get_route_info(struct net *net, 3456 const struct in6_addr *prefix, int prefixlen, 3457 const struct in6_addr *gwaddr, 3458 struct net_device *dev) 3459 { 3460 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3461 int ifindex = dev->ifindex; 3462 struct fib6_node *fn; 3463 struct fib6_info *rt = NULL; 3464 struct fib6_table *table; 3465 3466 table = fib6_get_table(net, tb_id); 3467 if (!table) 3468 return NULL; 3469 3470 rcu_read_lock(); 3471 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3472 if (!fn) 3473 goto out; 3474 3475 for_each_fib6_node_rt_rcu(fn) { 3476 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) 3477 continue; 3478 if (!(rt->fib6_flags & RTF_ROUTEINFO) || 3479 !rt->fib6_nh.fib_nh_has_gw) 3480 continue; 3481 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) 3482 continue; 3483 if (!fib6_info_hold_safe(rt)) 3484 continue; 3485 break; 3486 } 3487 out: 3488 rcu_read_unlock(); 3489 return rt; 3490 } 3491 3492 static struct fib6_info *rt6_add_route_info(struct net *net, 3493 const struct in6_addr *prefix, int prefixlen, 3494 const struct in6_addr *gwaddr, 3495 struct net_device *dev, 3496 unsigned int pref) 3497 { 3498 struct fib6_config cfg = { 3499 .fc_metric = IP6_RT_PRIO_USER, 3500 .fc_ifindex = dev->ifindex, 3501 .fc_dst_len = prefixlen, 3502 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3503 RTF_UP | RTF_PREF(pref), 3504 .fc_protocol = RTPROT_RA, 3505 .fc_type = RTN_UNICAST, 3506 .fc_nlinfo.portid = 0, 3507 .fc_nlinfo.nlh = NULL, 3508 .fc_nlinfo.nl_net = net, 3509 }; 3510 3511 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3512 cfg.fc_dst = *prefix; 3513 cfg.fc_gateway = *gwaddr; 3514 3515 /* We should treat it as a default route if prefix length is 0. */ 3516 if (!prefixlen) 3517 cfg.fc_flags |= RTF_DEFAULT; 3518 3519 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3520 3521 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3522 } 3523 #endif 3524 3525 struct fib6_info *rt6_get_dflt_router(struct net *net, 3526 const struct in6_addr *addr, 3527 struct net_device *dev) 3528 { 3529 u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_DFLT; 3530 struct fib6_info *rt; 3531 struct fib6_table *table; 3532 3533 table = fib6_get_table(net, tb_id); 3534 if (!table) 3535 return NULL; 3536 3537 rcu_read_lock(); 3538 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3539 struct fib6_nh *nh = &rt->fib6_nh; 3540 3541 if (dev == nh->fib_nh_dev && 3542 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3543 ipv6_addr_equal(&nh->fib_nh_gw6, addr)) 3544 break; 3545 } 3546 if (rt && !fib6_info_hold_safe(rt)) 3547 rt = NULL; 3548 rcu_read_unlock(); 3549 return rt; 3550 } 3551 3552 struct fib6_info *rt6_add_dflt_router(struct net *net, 3553 const struct in6_addr *gwaddr, 3554 struct net_device *dev, 3555 unsigned int pref) 3556 { 3557 struct fib6_config cfg = { 3558 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3559 .fc_metric = IP6_RT_PRIO_USER, 3560 .fc_ifindex = dev->ifindex, 3561 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3562 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3563 .fc_protocol = RTPROT_RA, 3564 .fc_type = RTN_UNICAST, 3565 .fc_nlinfo.portid = 0, 3566 .fc_nlinfo.nlh = NULL, 3567 .fc_nlinfo.nl_net = net, 3568 }; 3569 3570 cfg.fc_gateway = *gwaddr; 3571 3572 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3573 struct fib6_table *table; 3574 3575 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3576 if (table) 3577 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3578 } 3579 3580 return rt6_get_dflt_router(net, gwaddr, dev); 3581 } 3582 3583 static void __rt6_purge_dflt_routers(struct net *net, 3584 struct fib6_table *table) 3585 { 3586 struct fib6_info *rt; 3587 3588 restart: 3589 rcu_read_lock(); 3590 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3591 struct net_device *dev = fib6_info_nh_dev(rt); 3592 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3593 3594 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3595 (!idev || idev->cnf.accept_ra != 2) && 3596 fib6_info_hold_safe(rt)) { 3597 rcu_read_unlock(); 3598 ip6_del_rt(net, rt); 3599 goto restart; 3600 } 3601 } 3602 rcu_read_unlock(); 3603 3604 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3605 } 3606 3607 void rt6_purge_dflt_routers(struct net *net) 3608 { 3609 struct fib6_table *table; 3610 struct hlist_head *head; 3611 unsigned int h; 3612 3613 rcu_read_lock(); 3614 3615 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3616 head = &net->ipv6.fib_table_hash[h]; 3617 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3618 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3619 __rt6_purge_dflt_routers(net, table); 3620 } 3621 } 3622 3623 rcu_read_unlock(); 3624 } 3625 3626 static void rtmsg_to_fib6_config(struct net *net, 3627 struct in6_rtmsg *rtmsg, 3628 struct fib6_config *cfg) 3629 { 3630 *cfg = (struct fib6_config){ 3631 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3632 : RT6_TABLE_MAIN, 3633 .fc_ifindex = rtmsg->rtmsg_ifindex, 3634 .fc_metric = rtmsg->rtmsg_metric ? 
: IP6_RT_PRIO_USER, 3635 .fc_expires = rtmsg->rtmsg_info, 3636 .fc_dst_len = rtmsg->rtmsg_dst_len, 3637 .fc_src_len = rtmsg->rtmsg_src_len, 3638 .fc_flags = rtmsg->rtmsg_flags, 3639 .fc_type = rtmsg->rtmsg_type, 3640 3641 .fc_nlinfo.nl_net = net, 3642 3643 .fc_dst = rtmsg->rtmsg_dst, 3644 .fc_src = rtmsg->rtmsg_src, 3645 .fc_gateway = rtmsg->rtmsg_gateway, 3646 }; 3647 } 3648 3649 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3650 { 3651 struct fib6_config cfg; 3652 struct in6_rtmsg rtmsg; 3653 int err; 3654 3655 switch (cmd) { 3656 case SIOCADDRT: /* Add a route */ 3657 case SIOCDELRT: /* Delete a route */ 3658 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3659 return -EPERM; 3660 err = copy_from_user(&rtmsg, arg, 3661 sizeof(struct in6_rtmsg)); 3662 if (err) 3663 return -EFAULT; 3664 3665 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3666 3667 rtnl_lock(); 3668 switch (cmd) { 3669 case SIOCADDRT: 3670 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3671 break; 3672 case SIOCDELRT: 3673 err = ip6_route_del(&cfg, NULL); 3674 break; 3675 default: 3676 err = -EINVAL; 3677 } 3678 rtnl_unlock(); 3679 3680 return err; 3681 } 3682 3683 return -EINVAL; 3684 } 3685 3686 /* 3687 * Drop the packet on the floor 3688 */ 3689 3690 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3691 { 3692 int type; 3693 struct dst_entry *dst = skb_dst(skb); 3694 switch (ipstats_mib_noroutes) { 3695 case IPSTATS_MIB_INNOROUTES: 3696 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3697 if (type == IPV6_ADDR_ANY) { 3698 IP6_INC_STATS(dev_net(dst->dev), 3699 __in6_dev_get_safely(skb->dev), 3700 IPSTATS_MIB_INADDRERRORS); 3701 break; 3702 } 3703 /* FALLTHROUGH */ 3704 case IPSTATS_MIB_OUTNOROUTES: 3705 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3706 ipstats_mib_noroutes); 3707 break; 3708 } 3709 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3710 kfree_skb(skb); 3711 return 0; 3712 } 3713 3714 static int ip6_pkt_discard(struct sk_buff *skb) 3715 { 3716 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3717 } 3718 3719 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3720 { 3721 skb->dev = skb_dst(skb)->dev; 3722 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3723 } 3724 3725 static int ip6_pkt_prohibit(struct sk_buff *skb) 3726 { 3727 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3728 } 3729 3730 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3731 { 3732 skb->dev = skb_dst(skb)->dev; 3733 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3734 } 3735 3736 /* 3737 * Allocate a dst for local (unicast / anycast) address. 3738 */ 3739 3740 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3741 struct inet6_dev *idev, 3742 const struct in6_addr *addr, 3743 bool anycast, gfp_t gfp_flags) 3744 { 3745 struct fib6_config cfg = { 3746 .fc_table = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL, 3747 .fc_ifindex = idev->dev->ifindex, 3748 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP, 3749 .fc_dst = *addr, 3750 .fc_dst_len = 128, 3751 .fc_protocol = RTPROT_KERNEL, 3752 .fc_nlinfo.nl_net = net, 3753 .fc_ignore_dev_down = true, 3754 }; 3755 3756 if (anycast) { 3757 cfg.fc_type = RTN_ANYCAST; 3758 cfg.fc_flags |= RTF_ANYCAST; 3759 } else { 3760 cfg.fc_type = RTN_LOCAL; 3761 cfg.fc_flags |= RTF_LOCAL; 3762 } 3763 3764 return ip6_route_info_create(&cfg, gfp_flags, NULL); 3765 } 3766 3767 /* Remove a deleted IP from prefsrc entries */ 3768 struct arg_dev_net_ip { 3769 struct net_device *dev; 3770 struct net *net; 3771 struct in6_addr *addr; 3772 }; 3773 3774 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3775 { 3776 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3777 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3778 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3779 3780 if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && 3781 rt != net->ipv6.fib6_null_entry && 3782 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3783 spin_lock_bh(&rt6_exception_lock); 3784 /* remove prefsrc entry */ 3785 rt->fib6_prefsrc.plen = 0; 3786 spin_unlock_bh(&rt6_exception_lock); 3787 } 3788 return 0; 3789 } 3790 3791 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3792 { 3793 struct net *net = dev_net(ifp->idev->dev); 3794 struct arg_dev_net_ip adni = { 3795 .dev = ifp->idev->dev, 3796 .net = net, 3797 .addr = &ifp->addr, 3798 }; 3799 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3800 } 3801 3802 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) 3803 3804 /* Remove routers and update dst entries when a gateway turns into a host. */ 3805 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3806 { 3807 struct in6_addr *gateway = (struct in6_addr *)arg; 3808 3809 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3810 rt->fib6_nh.fib_nh_has_gw && 3811 ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { 3812 return -1; 3813 } 3814 3815 /* Further clean up cached routes in the exception table. 3816 * This is needed because a cached route may have a different 3817 * gateway than its 'parent' in the case of an ip redirect.
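 * Note that returning -1 above makes the fib6_clean_all() walker
 * delete the matching router entry, while routes that merely carry
 * stale exceptions only get those exceptions scrubbed here.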
3818 */ 3819 rt6_exceptions_clean_tohost(rt, gateway); 3820 3821 return 0; 3822 } 3823 3824 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3825 { 3826 fib6_clean_all(net, fib6_clean_tohost, gateway); 3827 } 3828 3829 struct arg_netdev_event { 3830 const struct net_device *dev; 3831 union { 3832 unsigned int nh_flags; 3833 unsigned long event; 3834 }; 3835 }; 3836 3837 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3838 { 3839 struct fib6_info *iter; 3840 struct fib6_node *fn; 3841 3842 fn = rcu_dereference_protected(rt->fib6_node, 3843 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3844 iter = rcu_dereference_protected(fn->leaf, 3845 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3846 while (iter) { 3847 if (iter->fib6_metric == rt->fib6_metric && 3848 rt6_qualify_for_ecmp(iter)) 3849 return iter; 3850 iter = rcu_dereference_protected(iter->fib6_next, 3851 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3852 } 3853 3854 return NULL; 3855 } 3856 3857 static bool rt6_is_dead(const struct fib6_info *rt) 3858 { 3859 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || 3860 (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && 3861 ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) 3862 return true; 3863 3864 return false; 3865 } 3866 3867 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3868 { 3869 struct fib6_info *iter; 3870 int total = 0; 3871 3872 if (!rt6_is_dead(rt)) 3873 total += rt->fib6_nh.fib_nh_weight; 3874 3875 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3876 if (!rt6_is_dead(iter)) 3877 total += iter->fib6_nh.fib_nh_weight; 3878 } 3879 3880 return total; 3881 } 3882 3883 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3884 { 3885 int upper_bound = -1; 3886 3887 if (!rt6_is_dead(rt)) { 3888 *weight += rt->fib6_nh.fib_nh_weight; 3889 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3890 total) - 1; 3891 } 3892 atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); 3893 } 3894 3895 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3896 { 3897 struct fib6_info *iter; 3898 int weight = 0; 3899 3900 rt6_upper_bound_set(rt, &weight, total); 3901 3902 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3903 rt6_upper_bound_set(iter, &weight, total); 3904 } 3905 3906 void rt6_multipath_rebalance(struct fib6_info *rt) 3907 { 3908 struct fib6_info *first; 3909 int total; 3910 3911 /* In case the entire multipath route was marked for flushing, 3912 * then there is no need to rebalance upon the removal of every 3913 * sibling route. 3914 */ 3915 if (!rt->fib6_nsiblings || rt->should_flush) 3916 return; 3917 3918 /* During lookup routes are evaluated in order, so we need to 3919 * make sure upper bounds are assigned from the first sibling 3920 * onwards. 
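 * Worked example: with sibling weights 1 and 3, the cumulative upper
 * bounds become (1 << 31) / 4 - 1 and (4 << 31) / 4 - 1, so about a
 * quarter of the 31-bit multipath hash space selects the first
 * nexthop and the rest selects the second.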
3921 */ 3922 first = rt6_multipath_first_sibling(rt); 3923 if (WARN_ON_ONCE(!first)) 3924 return; 3925 3926 total = rt6_multipath_total_weight(first); 3927 rt6_multipath_upper_bound_set(first, total); 3928 } 3929 3930 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3931 { 3932 const struct arg_netdev_event *arg = p_arg; 3933 struct net *net = dev_net(arg->dev); 3934 3935 if (rt != net->ipv6.fib6_null_entry && 3936 rt->fib6_nh.fib_nh_dev == arg->dev) { 3937 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; 3938 fib6_update_sernum_upto_root(net, rt); 3939 rt6_multipath_rebalance(rt); 3940 } 3941 3942 return 0; 3943 } 3944 3945 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3946 { 3947 struct arg_netdev_event arg = { 3948 .dev = dev, 3949 { 3950 .nh_flags = nh_flags, 3951 }, 3952 }; 3953 3954 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3955 arg.nh_flags |= RTNH_F_LINKDOWN; 3956 3957 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3958 } 3959 3960 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3961 const struct net_device *dev) 3962 { 3963 struct fib6_info *iter; 3964 3965 if (rt->fib6_nh.fib_nh_dev == dev) 3966 return true; 3967 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3968 if (iter->fib6_nh.fib_nh_dev == dev) 3969 return true; 3970 3971 return false; 3972 } 3973 3974 static void rt6_multipath_flush(struct fib6_info *rt) 3975 { 3976 struct fib6_info *iter; 3977 3978 rt->should_flush = 1; 3979 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3980 iter->should_flush = 1; 3981 } 3982 3983 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3984 const struct net_device *down_dev) 3985 { 3986 struct fib6_info *iter; 3987 unsigned int dead = 0; 3988 3989 if (rt->fib6_nh.fib_nh_dev == down_dev || 3990 rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 3991 dead++; 3992 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3993 if (iter->fib6_nh.fib_nh_dev == down_dev || 3994 iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) 3995 dead++; 3996 3997 return dead; 3998 } 3999 4000 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4001 const struct net_device *dev, 4002 unsigned int nh_flags) 4003 { 4004 struct fib6_info *iter; 4005 4006 if (rt->fib6_nh.fib_nh_dev == dev) 4007 rt->fib6_nh.fib_nh_flags |= nh_flags; 4008 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4009 if (iter->fib6_nh.fib_nh_dev == dev) 4010 iter->fib6_nh.fib_nh_flags |= nh_flags; 4011 } 4012 4013 /* called with write lock held for table with rt */ 4014 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4015 { 4016 const struct arg_netdev_event *arg = p_arg; 4017 const struct net_device *dev = arg->dev; 4018 struct net *net = dev_net(dev); 4019 4020 if (rt == net->ipv6.fib6_null_entry) 4021 return 0; 4022 4023 switch (arg->event) { 4024 case NETDEV_UNREGISTER: 4025 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; 4026 case NETDEV_DOWN: 4027 if (rt->should_flush) 4028 return -1; 4029 if (!rt->fib6_nsiblings) 4030 return rt->fib6_nh.fib_nh_dev == dev ? 
-1 : 0; 4031 if (rt6_multipath_uses_dev(rt, dev)) { 4032 unsigned int count; 4033 4034 count = rt6_multipath_dead_count(rt, dev); 4035 if (rt->fib6_nsiblings + 1 == count) { 4036 rt6_multipath_flush(rt); 4037 return -1; 4038 } 4039 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4040 RTNH_F_LINKDOWN); 4041 fib6_update_sernum(net, rt); 4042 rt6_multipath_rebalance(rt); 4043 } 4044 return -2; 4045 case NETDEV_CHANGE: 4046 if (rt->fib6_nh.fib_nh_dev != dev || 4047 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4048 break; 4049 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; 4050 rt6_multipath_rebalance(rt); 4051 break; 4052 } 4053 4054 return 0; 4055 } 4056 4057 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4058 { 4059 struct arg_netdev_event arg = { 4060 .dev = dev, 4061 { 4062 .event = event, 4063 }, 4064 }; 4065 struct net *net = dev_net(dev); 4066 4067 if (net->ipv6.sysctl.skip_notify_on_dev_down) 4068 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); 4069 else 4070 fib6_clean_all(net, fib6_ifdown, &arg); 4071 } 4072 4073 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4074 { 4075 rt6_sync_down_dev(dev, event); 4076 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4077 neigh_ifdown(&nd_tbl, dev); 4078 } 4079 4080 struct rt6_mtu_change_arg { 4081 struct net_device *dev; 4082 unsigned int mtu; 4083 }; 4084 4085 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4086 { 4087 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4088 struct inet6_dev *idev; 4089 4090 /* In IPv6, PMTU discovery is not optional, 4091 so the RTAX_MTU lock cannot disable it. 4092 We still use this lock to block changes 4093 caused by addrconf/ndisc. 4094 */ 4095 4096 idev = __in6_dev_get(arg->dev); 4097 if (!idev) 4098 return 0; 4099 4100 /* For an administrative MTU increase, there is no way to discover 4101 the IPv6 PMTU increase, so the PMTU should be updated here. 4102 Since RFC 1981 doesn't cover administrative MTU increases, 4103 updating the PMTU on such an increase is a MUST. (i.e.
jumbo frame) 4104 */ 4105 if (rt->fib6_nh.fib_nh_dev == arg->dev && 4106 !fib6_metric_locked(rt, RTAX_MTU)) { 4107 u32 mtu = rt->fib6_pmtu; 4108 4109 if (mtu >= arg->mtu || 4110 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4111 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4112 4113 spin_lock_bh(&rt6_exception_lock); 4114 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4115 spin_unlock_bh(&rt6_exception_lock); 4116 } 4117 return 0; 4118 } 4119 4120 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4121 { 4122 struct rt6_mtu_change_arg arg = { 4123 .dev = dev, 4124 .mtu = mtu, 4125 }; 4126 4127 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4128 } 4129 4130 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4131 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4132 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4133 [RTA_OIF] = { .type = NLA_U32 }, 4134 [RTA_IIF] = { .type = NLA_U32 }, 4135 [RTA_PRIORITY] = { .type = NLA_U32 }, 4136 [RTA_METRICS] = { .type = NLA_NESTED }, 4137 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4138 [RTA_PREF] = { .type = NLA_U8 }, 4139 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4140 [RTA_ENCAP] = { .type = NLA_NESTED }, 4141 [RTA_EXPIRES] = { .type = NLA_U32 }, 4142 [RTA_UID] = { .type = NLA_U32 }, 4143 [RTA_MARK] = { .type = NLA_U32 }, 4144 [RTA_TABLE] = { .type = NLA_U32 }, 4145 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4146 [RTA_SPORT] = { .type = NLA_U16 }, 4147 [RTA_DPORT] = { .type = NLA_U16 }, 4148 }; 4149 4150 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4151 struct fib6_config *cfg, 4152 struct netlink_ext_ack *extack) 4153 { 4154 struct rtmsg *rtm; 4155 struct nlattr *tb[RTA_MAX+1]; 4156 unsigned int pref; 4157 int err; 4158 4159 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4160 extack); 4161 if (err < 0) 4162 goto errout; 4163 4164 err = -EINVAL; 4165 rtm = nlmsg_data(nlh); 4166 4167 *cfg = (struct fib6_config){ 4168 .fc_table = rtm->rtm_table, 4169 .fc_dst_len = rtm->rtm_dst_len, 4170 .fc_src_len = rtm->rtm_src_len, 4171 .fc_flags = RTF_UP, 4172 .fc_protocol = rtm->rtm_protocol, 4173 .fc_type = rtm->rtm_type, 4174 4175 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 4176 .fc_nlinfo.nlh = nlh, 4177 .fc_nlinfo.nl_net = sock_net(skb->sk), 4178 }; 4179 4180 if (rtm->rtm_type == RTN_UNREACHABLE || 4181 rtm->rtm_type == RTN_BLACKHOLE || 4182 rtm->rtm_type == RTN_PROHIBIT || 4183 rtm->rtm_type == RTN_THROW) 4184 cfg->fc_flags |= RTF_REJECT; 4185 4186 if (rtm->rtm_type == RTN_LOCAL) 4187 cfg->fc_flags |= RTF_LOCAL; 4188 4189 if (rtm->rtm_flags & RTM_F_CLONED) 4190 cfg->fc_flags |= RTF_CACHE; 4191 4192 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4193 4194 if (tb[RTA_GATEWAY]) { 4195 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4196 cfg->fc_flags |= RTF_GATEWAY; 4197 } 4198 if (tb[RTA_VIA]) { 4199 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 4200 goto errout; 4201 } 4202 4203 if (tb[RTA_DST]) { 4204 int plen = (rtm->rtm_dst_len + 7) >> 3; 4205 4206 if (nla_len(tb[RTA_DST]) < plen) 4207 goto errout; 4208 4209 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4210 } 4211 4212 if (tb[RTA_SRC]) { 4213 int plen = (rtm->rtm_src_len + 7) >> 3; 4214 4215 if (nla_len(tb[RTA_SRC]) < plen) 4216 goto errout; 4217 4218 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4219 } 4220 4221 if (tb[RTA_PREFSRC]) 4222 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4223 4224 if (tb[RTA_OIF]) 4225 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4226 4227 if 
(tb[RTA_PRIORITY]) 4228 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4229 4230 if (tb[RTA_METRICS]) { 4231 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4232 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4233 } 4234 4235 if (tb[RTA_TABLE]) 4236 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4237 4238 if (tb[RTA_MULTIPATH]) { 4239 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4240 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4241 4242 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4243 cfg->fc_mp_len, extack); 4244 if (err < 0) 4245 goto errout; 4246 } 4247 4248 if (tb[RTA_PREF]) { 4249 pref = nla_get_u8(tb[RTA_PREF]); 4250 if (pref != ICMPV6_ROUTER_PREF_LOW && 4251 pref != ICMPV6_ROUTER_PREF_HIGH) 4252 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4253 cfg->fc_flags |= RTF_PREF(pref); 4254 } 4255 4256 if (tb[RTA_ENCAP]) 4257 cfg->fc_encap = tb[RTA_ENCAP]; 4258 4259 if (tb[RTA_ENCAP_TYPE]) { 4260 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4261 4262 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4263 if (err < 0) 4264 goto errout; 4265 } 4266 4267 if (tb[RTA_EXPIRES]) { 4268 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4269 4270 if (addrconf_finite_timeout(timeout)) { 4271 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4272 cfg->fc_flags |= RTF_EXPIRES; 4273 } 4274 } 4275 4276 err = 0; 4277 errout: 4278 return err; 4279 } 4280 4281 struct rt6_nh { 4282 struct fib6_info *fib6_info; 4283 struct fib6_config r_cfg; 4284 struct list_head next; 4285 }; 4286 4287 static int ip6_route_info_append(struct net *net, 4288 struct list_head *rt6_nh_list, 4289 struct fib6_info *rt, 4290 struct fib6_config *r_cfg) 4291 { 4292 struct rt6_nh *nh; 4293 int err = -EEXIST; 4294 4295 list_for_each_entry(nh, rt6_nh_list, next) { 4296 /* check if fib6_info already exists */ 4297 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4298 return err; 4299 } 4300 4301 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4302 if (!nh) 4303 return -ENOMEM; 4304 nh->fib6_info = rt; 4305 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4306 list_add_tail(&nh->next, rt6_nh_list); 4307 4308 return 0; 4309 } 4310 4311 static void ip6_route_mpath_notify(struct fib6_info *rt, 4312 struct fib6_info *rt_last, 4313 struct nl_info *info, 4314 __u16 nlflags) 4315 { 4316 /* if this is an APPEND route, then rt points to the first route 4317 * inserted and rt_last points to last route inserted. Userspace 4318 * wants a consistent dump of the route which starts at the first 4319 * nexthop. Since sibling routes are always added at the end of 4320 * the list, find the first sibling of the last route appended 4321 */ 4322 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4323 rt = list_first_entry(&rt_last->fib6_siblings, 4324 struct fib6_info, 4325 fib6_siblings); 4326 } 4327 4328 if (rt) 4329 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4330 } 4331 4332 static int ip6_route_multipath_add(struct fib6_config *cfg, 4333 struct netlink_ext_ack *extack) 4334 { 4335 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4336 struct nl_info *info = &cfg->fc_nlinfo; 4337 struct fib6_config r_cfg; 4338 struct rtnexthop *rtnh; 4339 struct fib6_info *rt; 4340 struct rt6_nh *err_nh; 4341 struct rt6_nh *nh, *nh_safe; 4342 __u16 nlflags; 4343 int remaining; 4344 int attrlen; 4345 int err = 1; 4346 int nhn = 0; 4347 int replace = (cfg->fc_nlinfo.nlh && 4348 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4349 LIST_HEAD(rt6_nh_list); 4350 4351 nlflags = replace ? 
NLM_F_REPLACE : NLM_F_CREATE; 4352 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4353 nlflags |= NLM_F_APPEND; 4354 4355 remaining = cfg->fc_mp_len; 4356 rtnh = (struct rtnexthop *)cfg->fc_mp; 4357 4358 /* Parse the multipath entries and build a list (rt6_nh_list) of 4359 * fib6_info structs, one per nexthop 4360 */ 4361 while (rtnh_ok(rtnh, remaining)) { 4362 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4363 if (rtnh->rtnh_ifindex) 4364 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4365 4366 attrlen = rtnh_attrlen(rtnh); 4367 if (attrlen > 0) { 4368 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4369 4370 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4371 if (nla) { 4372 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4373 r_cfg.fc_flags |= RTF_GATEWAY; 4374 } 4375 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4376 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4377 if (nla) 4378 r_cfg.fc_encap_type = nla_get_u16(nla); 4379 } 4380 4381 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4382 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4383 if (IS_ERR(rt)) { 4384 err = PTR_ERR(rt); 4385 rt = NULL; 4386 goto cleanup; 4387 } 4388 if (!rt6_qualify_for_ecmp(rt)) { 4389 err = -EINVAL; 4390 NL_SET_ERR_MSG(extack, 4391 "Device-only routes cannot be added for IPv6 using the multipath API."); 4392 fib6_info_release(rt); 4393 goto cleanup; 4394 } 4395 4396 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; 4397 4398 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4399 rt, &r_cfg); 4400 if (err) { 4401 fib6_info_release(rt); 4402 goto cleanup; 4403 } 4404 4405 rtnh = rtnh_next(rtnh, &remaining); 4406 } 4407 4408 /* For add and replace, send one notification with all nexthops: 4409 * skip the notification in fib6_add_rt2node and send one with 4410 * the full route when done 4411 */ 4412 info->skip_notify = 1; 4413 4414 err_nh = NULL; 4415 list_for_each_entry(nh, &rt6_nh_list, next) { 4416 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4417 fib6_info_release(nh->fib6_info); 4418 4419 if (!err) { 4420 /* save reference to last route successfully inserted */ 4421 rt_last = nh->fib6_info; 4422 4423 /* save reference to first route for notification */ 4424 if (!rt_notif) 4425 rt_notif = nh->fib6_info; 4426 } 4427 4428 /* nh->fib6_info is used or freed at this point, reset to NULL */ 4429 nh->fib6_info = NULL; 4430 if (err) { 4431 if (replace && nhn) 4432 NL_SET_ERR_MSG_MOD(extack, 4433 "multipath route replace failed (check consistency of installed routes)"); 4434 err_nh = nh; 4435 goto add_errout; 4436 } 4437 4438 /* Because each nexthop is added like a standalone route, we drop 4439 * these flags after the first one: if there is a collision, we 4440 * have already failed to add the first nexthop and 4441 * fib6_add_rt2node() has rejected it; when replacing, the old 4442 * nexthops have been replaced by the first new one, and the rest 4443 * should simply be appended to it. 4444 */ 4445 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4446 NLM_F_REPLACE); 4447 nhn++; 4448 } 4449 4450 /* success ... 
tell user about new route */ 4451 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4452 goto cleanup; 4453 4454 add_errout: 4455 /* send notification for routes that were added so that 4456 * the delete notifications sent by ip6_route_del are 4457 * coherent 4458 */ 4459 if (rt_notif) 4460 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4461 4462 /* Delete routes that were already added */ 4463 list_for_each_entry(nh, &rt6_nh_list, next) { 4464 if (err_nh == nh) 4465 break; 4466 ip6_route_del(&nh->r_cfg, extack); 4467 } 4468 4469 cleanup: 4470 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4471 if (nh->fib6_info) 4472 fib6_info_release(nh->fib6_info); 4473 list_del(&nh->next); 4474 kfree(nh); 4475 } 4476 4477 return err; 4478 } 4479 4480 static int ip6_route_multipath_del(struct fib6_config *cfg, 4481 struct netlink_ext_ack *extack) 4482 { 4483 struct fib6_config r_cfg; 4484 struct rtnexthop *rtnh; 4485 int remaining; 4486 int attrlen; 4487 int err = 1, last_err = 0; 4488 4489 remaining = cfg->fc_mp_len; 4490 rtnh = (struct rtnexthop *)cfg->fc_mp; 4491 4492 /* Parse a Multipath Entry */ 4493 while (rtnh_ok(rtnh, remaining)) { 4494 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4495 if (rtnh->rtnh_ifindex) 4496 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4497 4498 attrlen = rtnh_attrlen(rtnh); 4499 if (attrlen > 0) { 4500 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4501 4502 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4503 if (nla) { 4504 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4505 r_cfg.fc_flags |= RTF_GATEWAY; 4506 } 4507 } 4508 err = ip6_route_del(&r_cfg, extack); 4509 if (err) 4510 last_err = err; 4511 4512 rtnh = rtnh_next(rtnh, &remaining); 4513 } 4514 4515 return last_err; 4516 } 4517 4518 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4519 struct netlink_ext_ack *extack) 4520 { 4521 struct fib6_config cfg; 4522 int err; 4523 4524 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4525 if (err < 0) 4526 return err; 4527 4528 if (cfg.fc_mp) 4529 return ip6_route_multipath_del(&cfg, extack); 4530 else { 4531 cfg.fc_delete_all_nh = 1; 4532 return ip6_route_del(&cfg, extack); 4533 } 4534 } 4535 4536 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4537 struct netlink_ext_ack *extack) 4538 { 4539 struct fib6_config cfg; 4540 int err; 4541 4542 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4543 if (err < 0) 4544 return err; 4545 4546 if (cfg.fc_metric == 0) 4547 cfg.fc_metric = IP6_RT_PRIO_USER; 4548 4549 if (cfg.fc_mp) 4550 return ip6_route_multipath_add(&cfg, extack); 4551 else 4552 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4553 } 4554 4555 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4556 { 4557 int nexthop_len = 0; 4558 4559 if (rt->fib6_nsiblings) { 4560 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4561 + NLA_ALIGN(sizeof(struct rtnexthop)) 4562 + nla_total_size(16) /* RTA_GATEWAY */ 4563 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); 4564 4565 nexthop_len *= rt->fib6_nsiblings; 4566 } 4567 4568 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4569 + nla_total_size(16) /* RTA_SRC */ 4570 + nla_total_size(16) /* RTA_DST */ 4571 + nla_total_size(16) /* RTA_GATEWAY */ 4572 + nla_total_size(16) /* RTA_PREFSRC */ 4573 + nla_total_size(4) /* RTA_TABLE */ 4574 + nla_total_size(4) /* RTA_IIF */ 4575 + nla_total_size(4) /* RTA_OIF */ 4576 + nla_total_size(4) /* RTA_PRIORITY */ 4577 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4578 + nla_total_size(sizeof(struct rta_cacheinfo)) 4579 + 
nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4580 + nla_total_size(1) /* RTA_PREF */ 4581 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) 4582 + nexthop_len; 4583 } 4584 4585 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4586 struct fib6_info *rt, struct dst_entry *dst, 4587 struct in6_addr *dest, struct in6_addr *src, 4588 int iif, int type, u32 portid, u32 seq, 4589 unsigned int flags) 4590 { 4591 struct rt6_info *rt6 = (struct rt6_info *)dst; 4592 struct rt6key *rt6_dst, *rt6_src; 4593 u32 *pmetrics, table, rt6_flags; 4594 struct nlmsghdr *nlh; 4595 struct rtmsg *rtm; 4596 long expires = 0; 4597 4598 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4599 if (!nlh) 4600 return -EMSGSIZE; 4601 4602 if (rt6) { 4603 rt6_dst = &rt6->rt6i_dst; 4604 rt6_src = &rt6->rt6i_src; 4605 rt6_flags = rt6->rt6i_flags; 4606 } else { 4607 rt6_dst = &rt->fib6_dst; 4608 rt6_src = &rt->fib6_src; 4609 rt6_flags = rt->fib6_flags; 4610 } 4611 4612 rtm = nlmsg_data(nlh); 4613 rtm->rtm_family = AF_INET6; 4614 rtm->rtm_dst_len = rt6_dst->plen; 4615 rtm->rtm_src_len = rt6_src->plen; 4616 rtm->rtm_tos = 0; 4617 if (rt->fib6_table) 4618 table = rt->fib6_table->tb6_id; 4619 else 4620 table = RT6_TABLE_UNSPEC; 4621 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; 4622 if (nla_put_u32(skb, RTA_TABLE, table)) 4623 goto nla_put_failure; 4624 4625 rtm->rtm_type = rt->fib6_type; 4626 rtm->rtm_flags = 0; 4627 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4628 rtm->rtm_protocol = rt->fib6_protocol; 4629 4630 if (rt6_flags & RTF_CACHE) 4631 rtm->rtm_flags |= RTM_F_CLONED; 4632 4633 if (dest) { 4634 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4635 goto nla_put_failure; 4636 rtm->rtm_dst_len = 128; 4637 } else if (rtm->rtm_dst_len) 4638 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 4639 goto nla_put_failure; 4640 #ifdef CONFIG_IPV6_SUBTREES 4641 if (src) { 4642 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4643 goto nla_put_failure; 4644 rtm->rtm_src_len = 128; 4645 } else if (rtm->rtm_src_len && 4646 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 4647 goto nla_put_failure; 4648 #endif 4649 if (iif) { 4650 #ifdef CONFIG_IPV6_MROUTE 4651 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 4652 int err = ip6mr_get_route(net, skb, rtm, portid); 4653 4654 if (err == 0) 4655 return 0; 4656 if (err < 0) 4657 goto nla_put_failure; 4658 } else 4659 #endif 4660 if (nla_put_u32(skb, RTA_IIF, iif)) 4661 goto nla_put_failure; 4662 } else if (dest) { 4663 struct in6_addr saddr_buf; 4664 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4665 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4666 goto nla_put_failure; 4667 } 4668 4669 if (rt->fib6_prefsrc.plen) { 4670 struct in6_addr saddr_buf; 4671 saddr_buf = rt->fib6_prefsrc.addr; 4672 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4673 goto nla_put_failure; 4674 } 4675 4676 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4677 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4678 goto nla_put_failure; 4679 4680 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4681 goto nla_put_failure; 4682 4683 /* For multipath routes, walk the siblings list and add 4684 * each as a nexthop within RTA_MULTIPATH. 
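* As an illustrative sketch (addresses and devices here are hypothetical), a route created from userspace with * ip -6 route add 2001:db8::/64 nexthop via fe80::1 dev eth0 nexthop via fe80::2 dev eth1 * is stored as one fib6_info plus its siblings, and is dumped back as a single RTM_NEWROUTE message whose RTA_MULTIPATH attribute carries one rtnexthop entry per sibling. 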
4685 */ 4686 if (rt6) { 4687 if (rt6_flags & RTF_GATEWAY && 4688 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 4689 goto nla_put_failure; 4690 4691 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 4692 goto nla_put_failure; 4693 } else if (rt->fib6_nsiblings) { 4694 struct fib6_info *sibling, *next_sibling; 4695 struct nlattr *mp; 4696 4697 mp = nla_nest_start(skb, RTA_MULTIPATH); 4698 if (!mp) 4699 goto nla_put_failure; 4700 4701 if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, 4702 rt->fib6_nh.fib_nh_weight) < 0) 4703 goto nla_put_failure; 4704 4705 list_for_each_entry_safe(sibling, next_sibling, 4706 &rt->fib6_siblings, fib6_siblings) { 4707 if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, 4708 sibling->fib6_nh.fib_nh_weight) < 0) 4709 goto nla_put_failure; 4710 } 4711 4712 nla_nest_end(skb, mp); 4713 } else { 4714 if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, 4715 &rtm->rtm_flags, false) < 0) 4716 goto nla_put_failure; 4717 } 4718 4719 if (rt6_flags & RTF_EXPIRES) { 4720 expires = dst ? dst->expires : rt->expires; 4721 expires -= jiffies; 4722 } 4723 4724 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4725 goto nla_put_failure; 4726 4727 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 4728 goto nla_put_failure; 4729 4730 4731 nlmsg_end(skb, nlh); 4732 return 0; 4733 4734 nla_put_failure: 4735 nlmsg_cancel(skb, nlh); 4736 return -EMSGSIZE; 4737 } 4738 4739 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 4740 const struct net_device *dev) 4741 { 4742 if (f6i->fib6_nh.fib_nh_dev == dev) 4743 return true; 4744 4745 if (f6i->fib6_nsiblings) { 4746 struct fib6_info *sibling, *next_sibling; 4747 4748 list_for_each_entry_safe(sibling, next_sibling, 4749 &f6i->fib6_siblings, fib6_siblings) { 4750 if (sibling->fib6_nh.fib_nh_dev == dev) 4751 return true; 4752 } 4753 } 4754 4755 return false; 4756 } 4757 4758 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4759 { 4760 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4761 struct fib_dump_filter *filter = &arg->filter; 4762 unsigned int flags = NLM_F_MULTI; 4763 struct net *net = arg->net; 4764 4765 if (rt == net->ipv6.fib6_null_entry) 4766 return 0; 4767 4768 if ((filter->flags & RTM_F_PREFIX) && 4769 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4770 /* success since this is not a prefix route */ 4771 return 1; 4772 } 4773 if (filter->filter_set) { 4774 if ((filter->rt_type && rt->fib6_type != filter->rt_type) || 4775 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 4776 (filter->protocol && rt->fib6_protocol != filter->protocol)) { 4777 return 1; 4778 } 4779 flags |= NLM_F_DUMP_FILTERED; 4780 } 4781 4782 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4783 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4784 arg->cb->nlh->nlmsg_seq, flags); 4785 } 4786 4787 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, 4788 const struct nlmsghdr *nlh, 4789 struct nlattr **tb, 4790 struct netlink_ext_ack *extack) 4791 { 4792 struct rtmsg *rtm; 4793 int i, err; 4794 4795 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 4796 NL_SET_ERR_MSG_MOD(extack, 4797 "Invalid header for get route request"); 4798 return -EINVAL; 4799 } 4800 4801 if (!netlink_strict_get_check(skb)) 4802 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, 4803 rtm_ipv6_policy, extack); 4804 4805 rtm = nlmsg_data(nlh); 4806 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || 4807 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || 4808 rtm->rtm_table || 
rtm->rtm_protocol || rtm->rtm_scope || 4809 rtm->rtm_type) { 4810 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); 4811 return -EINVAL; 4812 } 4813 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { 4814 NL_SET_ERR_MSG_MOD(extack, 4815 "Invalid flags for get route request"); 4816 return -EINVAL; 4817 } 4818 4819 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 4820 rtm_ipv6_policy, extack); 4821 if (err) 4822 return err; 4823 4824 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 4825 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 4826 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); 4827 return -EINVAL; 4828 } 4829 4830 for (i = 0; i <= RTA_MAX; i++) { 4831 if (!tb[i]) 4832 continue; 4833 4834 switch (i) { 4835 case RTA_SRC: 4836 case RTA_DST: 4837 case RTA_IIF: 4838 case RTA_OIF: 4839 case RTA_MARK: 4840 case RTA_UID: 4841 case RTA_SPORT: 4842 case RTA_DPORT: 4843 case RTA_IP_PROTO: 4844 break; 4845 default: 4846 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); 4847 return -EINVAL; 4848 } 4849 } 4850 4851 return 0; 4852 } 4853 4854 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4855 struct netlink_ext_ack *extack) 4856 { 4857 struct net *net = sock_net(in_skb->sk); 4858 struct nlattr *tb[RTA_MAX+1]; 4859 int err, iif = 0, oif = 0; 4860 struct fib6_info *from; 4861 struct dst_entry *dst; 4862 struct rt6_info *rt; 4863 struct sk_buff *skb; 4864 struct rtmsg *rtm; 4865 struct flowi6 fl6 = {}; 4866 bool fibmatch; 4867 4868 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 4869 if (err < 0) 4870 goto errout; 4871 4872 err = -EINVAL; 4873 rtm = nlmsg_data(nlh); 4874 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4875 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4876 4877 if (tb[RTA_SRC]) { 4878 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4879 goto errout; 4880 4881 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4882 } 4883 4884 if (tb[RTA_DST]) { 4885 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4886 goto errout; 4887 4888 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4889 } 4890 4891 if (tb[RTA_IIF]) 4892 iif = nla_get_u32(tb[RTA_IIF]); 4893 4894 if (tb[RTA_OIF]) 4895 oif = nla_get_u32(tb[RTA_OIF]); 4896 4897 if (tb[RTA_MARK]) 4898 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4899 4900 if (tb[RTA_UID]) 4901 fl6.flowi6_uid = make_kuid(current_user_ns(), 4902 nla_get_u32(tb[RTA_UID])); 4903 else 4904 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 4905 4906 if (tb[RTA_SPORT]) 4907 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 4908 4909 if (tb[RTA_DPORT]) 4910 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 4911 4912 if (tb[RTA_IP_PROTO]) { 4913 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 4914 &fl6.flowi6_proto, AF_INET6, 4915 extack); 4916 if (err) 4917 goto errout; 4918 } 4919 4920 if (iif) { 4921 struct net_device *dev; 4922 int flags = 0; 4923 4924 rcu_read_lock(); 4925 4926 dev = dev_get_by_index_rcu(net, iif); 4927 if (!dev) { 4928 rcu_read_unlock(); 4929 err = -ENODEV; 4930 goto errout; 4931 } 4932 4933 fl6.flowi6_iif = iif; 4934 4935 if (!ipv6_addr_any(&fl6.saddr)) 4936 flags |= RT6_LOOKUP_F_HAS_SADDR; 4937 4938 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 4939 4940 rcu_read_unlock(); 4941 } else { 4942 fl6.flowi6_oif = oif; 4943 4944 dst = ip6_route_output(net, NULL, &fl6); 4945 } 4946 4947 4948 rt = container_of(dst, struct rt6_info, dst); 4949 if (rt->dst.error) { 4950 err = rt->dst.error; 4951 ip6_rt_put(rt); 4952 goto errout; 4953 } 4954 4955 if (rt == net->ipv6.ip6_null_entry) { 4956 err = rt->dst.error; 4957 ip6_rt_put(rt); 4958 goto errout; 4959 } 4960 4961 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 4962 if (!skb) { 4963 ip6_rt_put(rt); 4964 err = -ENOBUFS; 4965 goto errout; 4966 } 4967 4968 skb_dst_set(skb, &rt->dst); 4969 4970 rcu_read_lock(); 4971 from = rcu_dereference(rt->from); 4972 4973 if (fibmatch) 4974 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 4975 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4976 nlh->nlmsg_seq, 0); 4977 else 4978 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 4979 &fl6.saddr, iif, RTM_NEWROUTE, 4980 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 4981 0); 4982 rcu_read_unlock(); 4983 4984 if (err < 0) { 4985 kfree_skb(skb); 4986 goto errout; 4987 } 4988 4989 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 4990 errout: 4991 return err; 4992 } 4993 4994 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 4995 unsigned int nlm_flags) 4996 { 4997 struct sk_buff *skb; 4998 struct net *net = info->nl_net; 4999 u32 seq; 5000 int err; 5001 5002 err = -ENOBUFS; 5003 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; 5004 5005 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 5006 if (!skb) 5007 goto errout; 5008 5009 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 5010 event, info->portid, seq, nlm_flags); 5011 if (err < 0) { 5012 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 5013 WARN_ON(err == -EMSGSIZE); 5014 kfree_skb(skb); 5015 goto errout; 5016 } 5017 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 5018 info->nlh, gfp_any()); 5019 return; 5020 errout: 5021 if (err < 0) 5022 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 5023 } 5024 5025 static int ip6_route_dev_notify(struct notifier_block *this, 5026 unsigned long event, void *ptr) 5027 { 5028 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 5029 struct net *net = dev_net(dev); 5030 5031 if (!(dev->flags & IFF_LOOPBACK)) 5032 return NOTIFY_OK; 5033 5034 if (event == NETDEV_REGISTER) { 5035 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; 5036 net->ipv6.ip6_null_entry->dst.dev = dev; 5037 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 5038 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5039 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 5040 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 5041 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 5042 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 5043 #endif 5044 } else if (event == NETDEV_UNREGISTER && 5045 dev->reg_state != NETREG_UNREGISTERED) { 5046 /* NETDEV_UNREGISTER can be fired multiple times by 5047 * netdev_wait_allrefs(). Make sure we only call this once. 5048 */ 5049 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 5050 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5051 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 5052 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 5053 #endif 5054 } 5055 5056 return NOTIFY_OK; 5057 } 5058 5059 /* 5060 * /proc 5061 */ 5062 5063 #ifdef CONFIG_PROC_FS 5064 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 5065 { 5066 struct net *net = (struct net *)seq->private; 5067 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 5068 net->ipv6.rt6_stats->fib_nodes, 5069 net->ipv6.rt6_stats->fib_route_nodes, 5070 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 5071 net->ipv6.rt6_stats->fib_rt_entries, 5072 net->ipv6.rt6_stats->fib_rt_cache, 5073 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 5074 net->ipv6.rt6_stats->fib_discarded_routes); 5075 5076 return 0; 5077 } 5078 #endif /* CONFIG_PROC_FS */ 5079 5080 #ifdef CONFIG_SYSCTL 5081 5082 static 5083 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 5084 void __user *buffer, size_t *lenp, loff_t *ppos) 5085 { 5086 struct net *net; 5087 int delay; 5088 int ret; 5089 if (!write) 5090 return -EINVAL; 5091 5092 net = (struct net *)ctl->extra1; 5093 delay = net->ipv6.sysctl.flush_delay; 5094 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 5095 if (ret) 5096 return ret; 5097 5098 fib6_run_gc(delay <= 0 ? 
0 : (unsigned long)delay, net, delay > 0); 5099 return 0; 5100 } 5101 5102 static int zero; 5103 static int one = 1; 5104 5105 static struct ctl_table ipv6_route_table_template[] = { 5106 { 5107 .procname = "flush", 5108 .data = &init_net.ipv6.sysctl.flush_delay, 5109 .maxlen = sizeof(int), 5110 .mode = 0200, 5111 .proc_handler = ipv6_sysctl_rtcache_flush 5112 }, 5113 { 5114 .procname = "gc_thresh", 5115 .data = &ip6_dst_ops_template.gc_thresh, 5116 .maxlen = sizeof(int), 5117 .mode = 0644, 5118 .proc_handler = proc_dointvec, 5119 }, 5120 { 5121 .procname = "max_size", 5122 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5123 .maxlen = sizeof(int), 5124 .mode = 0644, 5125 .proc_handler = proc_dointvec, 5126 }, 5127 { 5128 .procname = "gc_min_interval", 5129 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5130 .maxlen = sizeof(int), 5131 .mode = 0644, 5132 .proc_handler = proc_dointvec_jiffies, 5133 }, 5134 { 5135 .procname = "gc_timeout", 5136 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5137 .maxlen = sizeof(int), 5138 .mode = 0644, 5139 .proc_handler = proc_dointvec_jiffies, 5140 }, 5141 { 5142 .procname = "gc_interval", 5143 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5144 .maxlen = sizeof(int), 5145 .mode = 0644, 5146 .proc_handler = proc_dointvec_jiffies, 5147 }, 5148 { 5149 .procname = "gc_elasticity", 5150 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5151 .maxlen = sizeof(int), 5152 .mode = 0644, 5153 .proc_handler = proc_dointvec, 5154 }, 5155 { 5156 .procname = "mtu_expires", 5157 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5158 .maxlen = sizeof(int), 5159 .mode = 0644, 5160 .proc_handler = proc_dointvec_jiffies, 5161 }, 5162 { 5163 .procname = "min_adv_mss", 5164 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5165 .maxlen = sizeof(int), 5166 .mode = 0644, 5167 .proc_handler = proc_dointvec, 5168 }, 5169 { 5170 .procname = "gc_min_interval_ms", 5171 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5172 .maxlen = sizeof(int), 5173 .mode = 0644, 5174 .proc_handler = proc_dointvec_ms_jiffies, 5175 }, 5176 { 5177 .procname = "skip_notify_on_dev_down", 5178 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, 5179 .maxlen = sizeof(int), 5180 .mode = 0644, 5181 .proc_handler = proc_dointvec, 5182 .extra1 = &zero, 5183 .extra2 = &one, 5184 }, 5185 { } 5186 }; 5187 5188 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5189 { 5190 struct ctl_table *table; 5191 5192 table = kmemdup(ipv6_route_table_template, 5193 sizeof(ipv6_route_table_template), 5194 GFP_KERNEL); 5195 5196 if (table) { 5197 table[0].data = &net->ipv6.sysctl.flush_delay; 5198 table[0].extra1 = net; 5199 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5200 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5201 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5202 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5203 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5204 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5205 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5206 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5207 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5208 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; 5209 5210 /* Don't export sysctls to unprivileged users */ 5211 if (net->user_ns != &init_user_ns) 5212 table[0].procname = NULL; 5213 } 5214 5215 return table; 5216 } 5217 #endif 5218 5219 static int __net_init ip6_route_net_init(struct net *net) 5220 { 5221 int ret = -ENOMEM; 5222 
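/* A sketch of what this per-netns init sets up (grounded in the code below): the dst_ops template is copied, the null route template is allocated (plus prohibit/blackhole templates under CONFIG_IPV6_MULTIPLE_TABLES), and the sysctl defaults are seeded. Once ipv6_route_sysctl_init() above registers the table, those defaults surface under /proc/sys/net/ipv6/route/, e.g. max_size defaults to 4096 and gc_timeout to 60*HZ jiffies (60 seconds). */ 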
5223 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5224 sizeof(net->ipv6.ip6_dst_ops)); 5225 5226 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5227 goto out_ip6_dst_ops; 5228 5229 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5230 sizeof(*net->ipv6.fib6_null_entry), 5231 GFP_KERNEL); 5232 if (!net->ipv6.fib6_null_entry) 5233 goto out_ip6_dst_entries; 5234 5235 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5236 sizeof(*net->ipv6.ip6_null_entry), 5237 GFP_KERNEL); 5238 if (!net->ipv6.ip6_null_entry) 5239 goto out_fib6_null_entry; 5240 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5241 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5242 ip6_template_metrics, true); 5243 5244 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5245 net->ipv6.fib6_has_custom_rules = false; 5246 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5247 sizeof(*net->ipv6.ip6_prohibit_entry), 5248 GFP_KERNEL); 5249 if (!net->ipv6.ip6_prohibit_entry) 5250 goto out_ip6_null_entry; 5251 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5252 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5253 ip6_template_metrics, true); 5254 5255 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5256 sizeof(*net->ipv6.ip6_blk_hole_entry), 5257 GFP_KERNEL); 5258 if (!net->ipv6.ip6_blk_hole_entry) 5259 goto out_ip6_prohibit_entry; 5260 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5261 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5262 ip6_template_metrics, true); 5263 #endif 5264 5265 net->ipv6.sysctl.flush_delay = 0; 5266 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5267 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5268 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5269 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5270 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5271 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5272 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5273 net->ipv6.sysctl.skip_notify_on_dev_down = 0; 5274 5275 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5276 5277 ret = 0; 5278 out: 5279 return ret; 5280 5281 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5282 out_ip6_prohibit_entry: 5283 kfree(net->ipv6.ip6_prohibit_entry); 5284 out_ip6_null_entry: 5285 kfree(net->ipv6.ip6_null_entry); 5286 #endif 5287 out_fib6_null_entry: 5288 kfree(net->ipv6.fib6_null_entry); 5289 out_ip6_dst_entries: 5290 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5291 out_ip6_dst_ops: 5292 goto out; 5293 } 5294 5295 static void __net_exit ip6_route_net_exit(struct net *net) 5296 { 5297 kfree(net->ipv6.fib6_null_entry); 5298 kfree(net->ipv6.ip6_null_entry); 5299 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5300 kfree(net->ipv6.ip6_prohibit_entry); 5301 kfree(net->ipv6.ip6_blk_hole_entry); 5302 #endif 5303 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5304 } 5305 5306 static int __net_init ip6_route_net_init_late(struct net *net) 5307 { 5308 #ifdef CONFIG_PROC_FS 5309 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5310 sizeof(struct ipv6_route_iter)); 5311 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5312 rt6_stats_seq_show, NULL); 5313 #endif 5314 return 0; 5315 } 5316 5317 static void __net_exit ip6_route_net_exit_late(struct net *net) 5318 { 5319 #ifdef CONFIG_PROC_FS 5320 remove_proc_entry("ipv6_route", net->proc_net); 5321 remove_proc_entry("rt6_stats", net->proc_net); 5322 #endif 5323 } 5324 5325 static struct pernet_operations ip6_route_net_ops = { 5326 .init = ip6_route_net_init, 5327 .exit = 
ip6_route_net_exit, 5328 }; 5329 5330 static int __net_init ipv6_inetpeer_init(struct net *net) 5331 { 5332 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5333 5334 if (!bp) 5335 return -ENOMEM; 5336 inet_peer_base_init(bp); 5337 net->ipv6.peers = bp; 5338 return 0; 5339 } 5340 5341 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5342 { 5343 struct inet_peer_base *bp = net->ipv6.peers; 5344 5345 net->ipv6.peers = NULL; 5346 inetpeer_invalidate_tree(bp); 5347 kfree(bp); 5348 } 5349 5350 static struct pernet_operations ipv6_inetpeer_ops = { 5351 .init = ipv6_inetpeer_init, 5352 .exit = ipv6_inetpeer_exit, 5353 }; 5354 5355 static struct pernet_operations ip6_route_net_late_ops = { 5356 .init = ip6_route_net_init_late, 5357 .exit = ip6_route_net_exit_late, 5358 }; 5359 5360 static struct notifier_block ip6_route_dev_notifier = { 5361 .notifier_call = ip6_route_dev_notify, 5362 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5363 }; 5364 5365 void __init ip6_route_init_special_entries(void) 5366 { 5367 /* The loopback device is registered before this code runs, so the 5368 * special rt6_info entries could not take their loopback reference; 5369 * take those references manually for init_net */ 5370 init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; 5371 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5372 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5373 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5374 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5375 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5376 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5377 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5378 #endif 5379 } 5380 5381 int __init ip6_route_init(void) 5382 { 5383 int ret; 5384 int cpu; 5385 5386 ret = -ENOMEM; 5387 ip6_dst_ops_template.kmem_cachep = 5388 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5389 SLAB_HWCACHE_ALIGN, NULL); 5390 if (!ip6_dst_ops_template.kmem_cachep) 5391 goto out; 5392 5393 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5394 if (ret) 5395 goto out_kmem_cache; 5396 5397 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5398 if (ret) 5399 goto out_dst_entries; 5400 5401 ret = register_pernet_subsys(&ip6_route_net_ops); 5402 if (ret) 5403 goto out_register_inetpeer; 5404 5405 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5406 5407 ret = fib6_init(); 5408 if (ret) 5409 goto out_register_subsys; 5410 5411 ret = xfrm6_init(); 5412 if (ret) 5413 goto out_fib6_init; 5414 5415 ret = fib6_rules_init(); 5416 if (ret) 5417 goto xfrm6_init; 5418 5419 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5420 if (ret) 5421 goto fib6_rules_init; 5422 5423 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5424 inet6_rtm_newroute, NULL, 0); 5425 if (ret < 0) 5426 goto out_register_late_subsys; 5427 5428 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5429 inet6_rtm_delroute, NULL, 0); 5430 if (ret < 0) 5431 goto out_register_late_subsys; 5432 5433 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5434 inet6_rtm_getroute, NULL, 5435 RTNL_FLAG_DOIT_UNLOCKED); 5436 if (ret < 0) 5437 goto out_register_late_subsys; 5438 5439 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5440 if (ret) 5441 goto out_register_late_subsys; 5442 5443 for_each_possible_cpu(cpu) { 5444 struct uncached_list *ul = 
per_cpu_ptr(&rt6_uncached_list, cpu); 5445 5446 INIT_LIST_HEAD(&ul->head); 5447 spin_lock_init(&ul->lock); 5448 } 5449 5450 out: 5451 return ret; 5452 5453 out_register_late_subsys: 5454 rtnl_unregister_all(PF_INET6); 5455 unregister_pernet_subsys(&ip6_route_net_late_ops); 5456 fib6_rules_init: 5457 fib6_rules_cleanup(); 5458 xfrm6_init: 5459 xfrm6_fini(); 5460 out_fib6_init: 5461 fib6_gc_cleanup(); 5462 out_register_subsys: 5463 unregister_pernet_subsys(&ip6_route_net_ops); 5464 out_register_inetpeer: 5465 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5466 out_dst_entries: 5467 dst_entries_destroy(&ip6_dst_blackhole_ops); 5468 out_kmem_cache: 5469 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5470 goto out; 5471 } 5472 5473 void ip6_route_cleanup(void) 5474 { 5475 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5476 unregister_pernet_subsys(&ip6_route_net_late_ops); 5477 fib6_rules_cleanup(); 5478 xfrm6_fini(); 5479 fib6_gc_cleanup(); 5480 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5481 unregister_pernet_subsys(&ip6_route_net_ops); 5482 dst_entries_destroy(&ip6_dst_blackhole_ops); 5483 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5484 } 5485
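/* Usage sketch (illustrative only, not part of the build): the rtnl_register_module() calls above are the entry points exercised by ordinary userspace commands. For example, with documentation-prefix addresses: "ip -6 route add 2001:db8::/64 via fe80::1 dev eth0" lands in inet6_rtm_newroute(), "ip -6 route del 2001:db8::/64" in inet6_rtm_delroute(), and "ip -6 route get 2001:db8::1 fibmatch" in inet6_rtm_getroute() with RTM_F_FIB_MATCH set, so the reply carries the matched FIB entry rather than the resolved dst. */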