/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
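
/* RT6_NUD_FAIL_HARD disqualifies a candidate route outright,
 * RT6_NUD_FAIL_DO_RR keeps it but asks rt6_select() to round-robin
 * the rr_ptr, RT6_NUD_FAIL_PROBE is a low score for a nexthop whose
 * neighbour entry is in NUD_FAILED, and RT6_NUD_SUCCEED marks a
 * (probably) reachable nexthop; see rt6_check_neigh()/find_match().
 */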

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
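
/* Cached routes that are not attached to any fib6 node (e.g. the
 * RTF_CACHE clones created for FLOWI_FLAG_KNOWN_NH) live on this
 * per-cpu list, so that rt6_uncached_list_flush_dev() can retarget
 * their device references when a netdevice goes away.
 */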

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
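
/* Sibling selection follows the hash-threshold method (RFC 2992):
 * each nexthop owns a slice of the hash space bounded above by
 * fib6_nh.nh_upper_bound, and the flow hash picks the slice. With
 * two equal-weight siblings, for example, the first covers roughly
 * the lower half of the 31-bit hash range.
 */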

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
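
/* Fold device match and router preference into one score:
 * rt6_check_dev() contributes 2 for an interface match, and the
 * decoded RFC 4191 router preference is placed at bits 2-3. Negative
 * return values are rt6_nud_state failures from rt6_check_neigh().
 */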

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
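
/* Walk one metric group of the leaf list in round-robin order: first
 * from rr_head to the end of the group, then wrap around from the
 * leaf head back to rr_head. Entries with a different metric (saved
 * in 'cont') are only scanned when nothing else matched.
 */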

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}
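
/* Route Information Option handling (RFC 4191, section 2.3):
 * rinfo->length is in units of 8 octets, so values 1/2/3 carry
 * 0/8/16 bytes of prefix; a zero lifetime withdraws the route.
 */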

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_read_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}

static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_read_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
	else
		fib6_info_release(rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason
 * the route is released.
 * Caller must hold a dst reference before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation
 */
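
/* Exception (RTF_CACHE) routes are hashed per fib6_info into
 * FIB6_EXCEPTION_BUCKET_SIZE buckets, keyed by the destination (and,
 * with subtrees, the source) address. Writers serialize on
 * rt6_exception_lock; readers walk the chains under rcu_read_lock().
 */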

static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* We are pruning and obsoleting aged-out and non-gateway
	 * exceptions even if others still hold references to them, so
	 * that on the next dst_check() such references can be dropped.
	 * RTF_EXPIRES exceptions - e.g. pmtu-generated ones - are pruned
	 * when expired, independently of their aging, as per RFC 8201
	 * section 4.
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu_read_lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
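
/* ip6_pol_route() returns, in order of preference: a matching cached
 * exception, a new uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH set
 * and no gateway), or the percpu copy of the matched fib6_info.
 */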

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
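
/* Multipath hash policy 0 (the default) hashes the L3 header only
 * (addresses, flow label, protocol); policy 1 hashes the 5-tuple.
 * For ICMPv6 errors, ip6_multipath_l3_keys() keys on the offending
 * packet carried inside the error, so the error follows the same
 * path as the flow it reports on.
 */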

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
l3mdev_link_scope_lookup(net, fl6); 2093 if (dst) 2094 return dst; 2095 } 2096 2097 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2098 2099 any_src = ipv6_addr_any(&fl6->saddr); 2100 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2101 (fl6->flowi6_oif && any_src)) 2102 flags |= RT6_LOOKUP_F_IFACE; 2103 2104 if (!any_src) 2105 flags |= RT6_LOOKUP_F_HAS_SADDR; 2106 else if (sk) 2107 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2108 2109 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2110 } 2111 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2112 2113 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2114 { 2115 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2116 struct net_device *loopback_dev = net->loopback_dev; 2117 struct dst_entry *new = NULL; 2118 2119 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2120 DST_OBSOLETE_DEAD, 0); 2121 if (rt) { 2122 rt6_info_init(rt); 2123 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2124 2125 new = &rt->dst; 2126 new->__use = 1; 2127 new->input = dst_discard; 2128 new->output = dst_discard_out; 2129 2130 dst_copy_metrics(new, &ort->dst); 2131 2132 rt->rt6i_idev = in6_dev_get(loopback_dev); 2133 rt->rt6i_gateway = ort->rt6i_gateway; 2134 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2135 2136 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2137 #ifdef CONFIG_IPV6_SUBTREES 2138 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2139 #endif 2140 } 2141 2142 dst_release(dst_orig); 2143 return new ? new : ERR_PTR(-ENOMEM); 2144 } 2145 2146 /* 2147 * Destination cache support functions 2148 */ 2149 2150 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2151 { 2152 u32 rt_cookie = 0; 2153 2154 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2155 return false; 2156 2157 if (fib6_check_expired(f6i)) 2158 return false; 2159 2160 return true; 2161 } 2162 2163 static struct dst_entry *rt6_check(struct rt6_info *rt, 2164 struct fib6_info *from, 2165 u32 cookie) 2166 { 2167 u32 rt_cookie = 0; 2168 2169 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2170 rt_cookie != cookie) 2171 return NULL; 2172 2173 if (rt6_check_expired(rt)) 2174 return NULL; 2175 2176 return &rt->dst; 2177 } 2178 2179 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2180 struct fib6_info *from, 2181 u32 cookie) 2182 { 2183 if (!__rt6_check_expired(rt) && 2184 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2185 fib6_check(from, cookie)) 2186 return &rt->dst; 2187 else 2188 return NULL; 2189 } 2190 2191 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2192 { 2193 struct dst_entry *dst_ret; 2194 struct fib6_info *from; 2195 struct rt6_info *rt; 2196 2197 rt = container_of(dst, struct rt6_info, dst); 2198 2199 rcu_read_lock(); 2200 2201 /* All IPV6 dsts are created with ->obsolete set to the value 2202 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2203 * into this function always. 
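* A sketch of the resulting contract (illustrative): callers sample a cookie (the fib6 tree sernum) when the dst is created and hand it back through dst_check(); if the tree has been modified since, the sernum no longer matches and rt6_check()/fib6_check() below return NULL, forcing the caller to repeat the route lookup.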
2204 */ 2205 2206 from = rcu_dereference(rt->from); 2207 2208 if (from && (rt->rt6i_flags & RTF_PCPU || 2209 unlikely(!list_empty(&rt->rt6i_uncached)))) 2210 dst_ret = rt6_dst_from_check(rt, from, cookie); 2211 else 2212 dst_ret = rt6_check(rt, from, cookie); 2213 2214 rcu_read_unlock(); 2215 2216 return dst_ret; 2217 } 2218 2219 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2220 { 2221 struct rt6_info *rt = (struct rt6_info *) dst; 2222 2223 if (rt) { 2224 if (rt->rt6i_flags & RTF_CACHE) { 2225 rcu_read_lock(); 2226 if (rt6_check_expired(rt)) { 2227 rt6_remove_exception_rt(rt); 2228 dst = NULL; 2229 } 2230 rcu_read_unlock(); 2231 } else { 2232 dst_release(dst); 2233 dst = NULL; 2234 } 2235 } 2236 return dst; 2237 } 2238 2239 static void ip6_link_failure(struct sk_buff *skb) 2240 { 2241 struct rt6_info *rt; 2242 2243 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2244 2245 rt = (struct rt6_info *) skb_dst(skb); 2246 if (rt) { 2247 rcu_read_lock(); 2248 if (rt->rt6i_flags & RTF_CACHE) { 2249 rt6_remove_exception_rt(rt); 2250 } else { 2251 struct fib6_info *from; 2252 struct fib6_node *fn; 2253 2254 from = rcu_dereference(rt->from); 2255 if (from) { 2256 fn = rcu_dereference(from->fib6_node); 2257 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2258 fn->fn_sernum = -1; 2259 } 2260 } 2261 rcu_read_unlock(); 2262 } 2263 } 2264 2265 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2266 { 2267 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2268 struct fib6_info *from; 2269 2270 rcu_read_lock(); 2271 from = rcu_dereference(rt0->from); 2272 if (from) 2273 rt0->dst.expires = from->expires; 2274 rcu_read_unlock(); 2275 } 2276 2277 dst_set_expires(&rt0->dst, timeout); 2278 rt0->rt6i_flags |= RTF_EXPIRES; 2279 } 2280 2281 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2282 { 2283 struct net *net = dev_net(rt->dst.dev); 2284 2285 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2286 rt->rt6i_flags |= RTF_MODIFIED; 2287 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2288 } 2289 2290 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2291 { 2292 bool from_set; 2293 2294 rcu_read_lock(); 2295 from_set = !!rcu_dereference(rt->from); 2296 rcu_read_unlock(); 2297 2298 return !(rt->rt6i_flags & RTF_CACHE) && 2299 (rt->rt6i_flags & RTF_PCPU || from_set); 2300 } 2301 2302 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2303 const struct ipv6hdr *iph, u32 mtu) 2304 { 2305 const struct in6_addr *daddr, *saddr; 2306 struct rt6_info *rt6 = (struct rt6_info *)dst; 2307 2308 if (dst_metric_locked(dst, RTAX_MTU)) 2309 return; 2310 2311 if (iph) { 2312 daddr = &iph->daddr; 2313 saddr = &iph->saddr; 2314 } else if (sk) { 2315 daddr = &sk->sk_v6_daddr; 2316 saddr = &inet6_sk(sk)->saddr; 2317 } else { 2318 daddr = NULL; 2319 saddr = NULL; 2320 } 2321 dst_confirm_neigh(dst, daddr); 2322 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2323 if (mtu >= dst_mtu(dst)) 2324 return; 2325 2326 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2327 rt6_do_update_pmtu(rt6, mtu); 2328 /* update rt6_ex->stamp for cache */ 2329 if (rt6->rt6i_flags & RTF_CACHE) 2330 rt6_update_exception_stamp_rt(rt6); 2331 } else if (daddr) { 2332 struct fib6_info *from; 2333 struct rt6_info *nrt6; 2334 2335 rcu_read_lock(); 2336 from = rcu_dereference(rt6->from); 2337 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2338 if (nrt6) { 2339 rt6_do_update_pmtu(nrt6, mtu); 2340 if (rt6_insert_exception(nrt6, from)) 2341 dst_release_immediate(&nrt6->dst); 2342 } 2343 
rcu_read_unlock(); 2344 } 2345 } 2346 2347 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2348 struct sk_buff *skb, u32 mtu) 2349 { 2350 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2351 } 2352 2353 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2354 int oif, u32 mark, kuid_t uid) 2355 { 2356 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2357 struct dst_entry *dst; 2358 struct flowi6 fl6 = { 2359 .flowi6_oif = oif, 2360 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2361 .daddr = iph->daddr, 2362 .saddr = iph->saddr, 2363 .flowlabel = ip6_flowinfo(iph), 2364 .flowi6_uid = uid, 2365 }; 2366 2367 dst = ip6_route_output(net, NULL, &fl6); 2368 if (!dst->error) 2369 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2370 dst_release(dst); 2371 } 2372 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2373 2374 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2375 { 2376 int oif = sk->sk_bound_dev_if; 2377 struct dst_entry *dst; 2378 2379 if (!oif && skb->dev) 2380 oif = l3mdev_master_ifindex(skb->dev); 2381 2382 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2383 2384 dst = __sk_dst_get(sk); 2385 if (!dst || !dst->obsolete || 2386 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2387 return; 2388 2389 bh_lock_sock(sk); 2390 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2391 ip6_datagram_dst_update(sk, false); 2392 bh_unlock_sock(sk); 2393 } 2394 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2395 2396 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2397 const struct flowi6 *fl6) 2398 { 2399 #ifdef CONFIG_IPV6_SUBTREES 2400 struct ipv6_pinfo *np = inet6_sk(sk); 2401 #endif 2402 2403 ip6_dst_store(sk, dst, 2404 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2405 &sk->sk_v6_daddr : NULL, 2406 #ifdef CONFIG_IPV6_SUBTREES 2407 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2408 &np->saddr : 2409 #endif 2410 NULL); 2411 } 2412 2413 /* Handle redirects */ 2414 struct ip6rd_flowi { 2415 struct flowi6 fl6; 2416 struct in6_addr gateway; 2417 }; 2418 2419 static struct rt6_info *__ip6_route_redirect(struct net *net, 2420 struct fib6_table *table, 2421 struct flowi6 *fl6, 2422 const struct sk_buff *skb, 2423 int flags) 2424 { 2425 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2426 struct rt6_info *ret = NULL, *rt_cache; 2427 struct fib6_info *rt; 2428 struct fib6_node *fn; 2429 2430 /* Get the "current" route for this destination and 2431 * check whether the redirect has come from the appropriate router. 2432 * 2433 * RFC 4861 specifies that redirects should only be 2434 * accepted if they come from the nexthop to the target. 2435 * Due to the way the routes are chosen, this notion 2436 * is a bit fuzzy and one might need to check all possible 2437 * routes. 2438 */ 2439 2440 rcu_read_lock(); 2441 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2442 restart: 2443 for_each_fib6_node_rt_rcu(fn) { 2444 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2445 continue; 2446 if (fib6_check_expired(rt)) 2447 continue; 2448 if (rt->fib6_flags & RTF_REJECT) 2449 break; 2450 if (!(rt->fib6_flags & RTF_GATEWAY)) 2451 continue; 2452 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2453 continue; 2454 /* rt_cache's gateway might be different from its 'parent' 2455 * in the case of an IP redirect. 2456 * So we keep searching in the exception table if the gateway 2457 * is different.
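* E.g. (hypothetical): a redirect for 2001:db8::1 installs an exception route whose rt6i_gateway is the new first hop, while the parent fib6_info keeps its original nh_gw; only the exception entry can match rdfl->gateway here.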
2458 */ 2459 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2460 rt_cache = rt6_find_cached_rt(rt, 2461 &fl6->daddr, 2462 &fl6->saddr); 2463 if (rt_cache && 2464 ipv6_addr_equal(&rdfl->gateway, 2465 &rt_cache->rt6i_gateway)) { 2466 ret = rt_cache; 2467 break; 2468 } 2469 continue; 2470 } 2471 break; 2472 } 2473 2474 if (!rt) 2475 rt = net->ipv6.fib6_null_entry; 2476 else if (rt->fib6_flags & RTF_REJECT) { 2477 ret = net->ipv6.ip6_null_entry; 2478 goto out; 2479 } 2480 2481 if (rt == net->ipv6.fib6_null_entry) { 2482 fn = fib6_backtrack(fn, &fl6->saddr); 2483 if (fn) 2484 goto restart; 2485 } 2486 2487 out: 2488 if (ret) 2489 ip6_hold_safe(net, &ret, true); 2490 else 2491 ret = ip6_create_rt_rcu(rt); 2492 2493 rcu_read_unlock(); 2494 2495 trace_fib6_table_lookup(net, rt, table, fl6); 2496 return ret; 2497 }; 2498 2499 static struct dst_entry *ip6_route_redirect(struct net *net, 2500 const struct flowi6 *fl6, 2501 const struct sk_buff *skb, 2502 const struct in6_addr *gateway) 2503 { 2504 int flags = RT6_LOOKUP_F_HAS_SADDR; 2505 struct ip6rd_flowi rdfl; 2506 2507 rdfl.fl6 = *fl6; 2508 rdfl.gateway = *gateway; 2509 2510 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2511 flags, __ip6_route_redirect); 2512 } 2513 2514 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2515 kuid_t uid) 2516 { 2517 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2518 struct dst_entry *dst; 2519 struct flowi6 fl6 = { 2520 .flowi6_iif = LOOPBACK_IFINDEX, 2521 .flowi6_oif = oif, 2522 .flowi6_mark = mark, 2523 .daddr = iph->daddr, 2524 .saddr = iph->saddr, 2525 .flowlabel = ip6_flowinfo(iph), 2526 .flowi6_uid = uid, 2527 }; 2528 2529 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2530 rt6_do_redirect(dst, NULL, skb); 2531 dst_release(dst); 2532 } 2533 EXPORT_SYMBOL_GPL(ip6_redirect); 2534 2535 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 2536 { 2537 const struct ipv6hdr *iph = ipv6_hdr(skb); 2538 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2539 struct dst_entry *dst; 2540 struct flowi6 fl6 = { 2541 .flowi6_iif = LOOPBACK_IFINDEX, 2542 .flowi6_oif = oif, 2543 .daddr = msg->dest, 2544 .saddr = iph->daddr, 2545 .flowi6_uid = sock_net_uid(net, NULL), 2546 }; 2547 2548 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2549 rt6_do_redirect(dst, NULL, skb); 2550 dst_release(dst); 2551 } 2552 2553 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2554 { 2555 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2556 sk->sk_uid); 2557 } 2558 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2559 2560 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2561 { 2562 struct net_device *dev = dst->dev; 2563 unsigned int mtu = dst_mtu(dst); 2564 struct net *net = dev_net(dev); 2565 2566 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2567 2568 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2569 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2570 2571 /* 2572 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2573 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
2574 * IPV6_MAXPLEN is also valid and means: "any MSS, 2575 * rely only on pmtu discovery" 2576 */ 2577 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2578 mtu = IPV6_MAXPLEN; 2579 return mtu; 2580 } 2581 2582 static unsigned int ip6_mtu(const struct dst_entry *dst) 2583 { 2584 struct inet6_dev *idev; 2585 unsigned int mtu; 2586 2587 mtu = dst_metric_raw(dst, RTAX_MTU); 2588 if (mtu) 2589 goto out; 2590 2591 mtu = IPV6_MIN_MTU; 2592 2593 rcu_read_lock(); 2594 idev = __in6_dev_get(dst->dev); 2595 if (idev) 2596 mtu = idev->cnf.mtu6; 2597 rcu_read_unlock(); 2598 2599 out: 2600 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2601 2602 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2603 } 2604 2605 /* MTU selection: 2606 * 1. mtu on route is locked - use it 2607 * 2. mtu from nexthop exception 2608 * 3. mtu from egress device 2609 * 2610 * based on ip6_dst_mtu_forward and exception logic of 2611 * rt6_find_cached_rt; called with rcu_read_lock 2612 */ 2613 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2614 struct in6_addr *saddr) 2615 { 2616 struct rt6_exception_bucket *bucket; 2617 struct rt6_exception *rt6_ex; 2618 struct in6_addr *src_key; 2619 struct inet6_dev *idev; 2620 u32 mtu = 0; 2621 2622 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2623 mtu = f6i->fib6_pmtu; 2624 if (mtu) 2625 goto out; 2626 } 2627 2628 src_key = NULL; 2629 #ifdef CONFIG_IPV6_SUBTREES 2630 if (f6i->fib6_src.plen) 2631 src_key = saddr; 2632 #endif 2633 2634 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2635 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2636 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2637 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2638 2639 if (likely(!mtu)) { 2640 struct net_device *dev = fib6_info_nh_dev(f6i); 2641 2642 mtu = IPV6_MIN_MTU; 2643 idev = __in6_dev_get(dev); 2644 if (idev && idev->cnf.mtu6 > mtu) 2645 mtu = idev->cnf.mtu6; 2646 } 2647 2648 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2649 out: 2650 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2651 } 2652 2653 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2654 struct flowi6 *fl6) 2655 { 2656 struct dst_entry *dst; 2657 struct rt6_info *rt; 2658 struct inet6_dev *idev = in6_dev_get(dev); 2659 struct net *net = dev_net(dev); 2660 2661 if (unlikely(!idev)) 2662 return ERR_PTR(-ENODEV); 2663 2664 rt = ip6_dst_alloc(net, dev, 0); 2665 if (unlikely(!rt)) { 2666 in6_dev_put(idev); 2667 dst = ERR_PTR(-ENOMEM); 2668 goto out; 2669 } 2670 2671 rt->dst.flags |= DST_HOST; 2672 rt->dst.input = ip6_input; 2673 rt->dst.output = ip6_output; 2674 rt->rt6i_gateway = fl6->daddr; 2675 rt->rt6i_dst.addr = fl6->daddr; 2676 rt->rt6i_dst.plen = 128; 2677 rt->rt6i_idev = idev; 2678 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2679 2680 /* Add this dst into uncached_list so that rt6_disable_ip() can 2681 * do proper release of the net_device 2682 */ 2683 rt6_uncached_list_add(rt); 2684 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2685 2686 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2687 2688 out: 2689 return dst; 2690 } 2691 2692 static int ip6_dst_gc(struct dst_ops *ops) 2693 { 2694 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2695 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2696 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2697 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2698 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2699 unsigned long rt_last_gc = 
net->ipv6.ip6_rt_last_gc; 2700 int entries; 2701 2702 entries = dst_entries_get_fast(ops); 2703 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2704 entries <= rt_max_size) 2705 goto out; 2706 2707 net->ipv6.ip6_rt_gc_expire++; 2708 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2709 entries = dst_entries_get_slow(ops); 2710 if (entries < ops->gc_thresh) 2711 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2712 out: 2713 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2714 return entries > rt_max_size; 2715 } 2716 2717 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2718 struct fib6_config *cfg, 2719 const struct in6_addr *gw_addr, 2720 u32 tbid, int flags) 2721 { 2722 struct flowi6 fl6 = { 2723 .flowi6_oif = cfg->fc_ifindex, 2724 .daddr = *gw_addr, 2725 .saddr = cfg->fc_prefsrc, 2726 }; 2727 struct fib6_table *table; 2728 struct rt6_info *rt; 2729 2730 table = fib6_get_table(net, tbid); 2731 if (!table) 2732 return NULL; 2733 2734 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2735 flags |= RT6_LOOKUP_F_HAS_SADDR; 2736 2737 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2738 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2739 2740 /* if table lookup failed, fall back to full lookup */ 2741 if (rt == net->ipv6.ip6_null_entry) { 2742 ip6_rt_put(rt); 2743 rt = NULL; 2744 } 2745 2746 return rt; 2747 } 2748 2749 static int ip6_route_check_nh_onlink(struct net *net, 2750 struct fib6_config *cfg, 2751 const struct net_device *dev, 2752 struct netlink_ext_ack *extack) 2753 { 2754 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2755 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2756 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2757 struct fib6_info *from; 2758 struct rt6_info *grt; 2759 int err; 2760 2761 err = 0; 2762 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2763 if (grt) { 2764 rcu_read_lock(); 2765 from = rcu_dereference(grt->from); 2766 if (!grt->dst.error && 2767 /* ignore match if it is the default route */ 2768 from && !ipv6_addr_any(&from->fib6_dst.addr) && 2769 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2770 NL_SET_ERR_MSG(extack, 2771 "Nexthop has invalid gateway or device mismatch"); 2772 err = -EINVAL; 2773 } 2774 rcu_read_unlock(); 2775 2776 ip6_rt_put(grt); 2777 } 2778 2779 return err; 2780 } 2781 2782 static int ip6_route_check_nh(struct net *net, 2783 struct fib6_config *cfg, 2784 struct net_device **_dev, 2785 struct inet6_dev **idev) 2786 { 2787 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2788 struct net_device *dev = _dev ? 
*_dev : NULL; 2789 struct rt6_info *grt = NULL; 2790 int err = -EHOSTUNREACH; 2791 2792 if (cfg->fc_table) { 2793 int flags = RT6_LOOKUP_F_IFACE; 2794 2795 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2796 cfg->fc_table, flags); 2797 if (grt) { 2798 if (grt->rt6i_flags & RTF_GATEWAY || 2799 (dev && dev != grt->dst.dev)) { 2800 ip6_rt_put(grt); 2801 grt = NULL; 2802 } 2803 } 2804 } 2805 2806 if (!grt) 2807 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2808 2809 if (!grt) 2810 goto out; 2811 2812 if (dev) { 2813 if (dev != grt->dst.dev) { 2814 ip6_rt_put(grt); 2815 goto out; 2816 } 2817 } else { 2818 *_dev = dev = grt->dst.dev; 2819 *idev = grt->rt6i_idev; 2820 dev_hold(dev); 2821 in6_dev_hold(grt->rt6i_idev); 2822 } 2823 2824 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2825 err = 0; 2826 2827 ip6_rt_put(grt); 2828 2829 out: 2830 return err; 2831 } 2832 2833 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2834 struct net_device **_dev, struct inet6_dev **idev, 2835 struct netlink_ext_ack *extack) 2836 { 2837 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2838 int gwa_type = ipv6_addr_type(gw_addr); 2839 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2840 const struct net_device *dev = *_dev; 2841 bool need_addr_check = !dev; 2842 int err = -EINVAL; 2843 2844 /* If gw_addr is local, we will fail to detect this in case the 2845 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2846 * will return the already-added prefix route via the interface 2847 * the prefix route was assigned to, which might be non-loopback. 2848 */ 2849 if (dev && 2850 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2851 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2852 goto out; 2853 } 2854 2855 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2856 /* IPv6 strictly forbids using non-link-local 2857 * addresses as nexthop addresses. 2858 * Otherwise, the router will not be able to send redirects. 2859 * It is very good, but in some (rare!) circumstances 2860 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2861 * some exceptions. --ANK 2862 * We allow IPv4-mapped nexthops to support RFC 4798-style 2863 * addressing. 2864 */ 2865 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2866 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2867 goto out; 2868 } 2869 2870 if (cfg->fc_flags & RTNH_F_ONLINK) 2871 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2872 else 2873 err = ip6_route_check_nh(net, cfg, _dev, idev); 2874 2875 if (err) 2876 goto out; 2877 } 2878 2879 /* reload in case device was changed */ 2880 dev = *_dev; 2881 2882 err = -EINVAL; 2883 if (!dev) { 2884 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2885 goto out; 2886 } else if (dev->flags & IFF_LOOPBACK) { 2887 NL_SET_ERR_MSG(extack, 2888 "Egress device can not be loopback device for this route"); 2889 goto out; 2890 } 2891 2892 /* if we did not check gw_addr above, do so now that the 2893 * egress device has been resolved.
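* (need_addr_check was recorded at entry, when *_dev could still be NULL, so this second check only runs for routes that did not name an egress device up front.)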
2894 */ 2895 if (need_addr_check && 2896 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2897 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2898 goto out; 2899 } 2900 2901 err = 0; 2902 out: 2903 return err; 2904 } 2905 2906 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2907 gfp_t gfp_flags, 2908 struct netlink_ext_ack *extack) 2909 { 2910 struct net *net = cfg->fc_nlinfo.nl_net; 2911 struct fib6_info *rt = NULL; 2912 struct net_device *dev = NULL; 2913 struct inet6_dev *idev = NULL; 2914 struct fib6_table *table; 2915 int addr_type; 2916 int err = -EINVAL; 2917 2918 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2919 if (cfg->fc_flags & RTF_PCPU) { 2920 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2921 goto out; 2922 } 2923 2924 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2925 if (cfg->fc_flags & RTF_CACHE) { 2926 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2927 goto out; 2928 } 2929 2930 if (cfg->fc_type > RTN_MAX) { 2931 NL_SET_ERR_MSG(extack, "Invalid route type"); 2932 goto out; 2933 } 2934 2935 if (cfg->fc_dst_len > 128) { 2936 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2937 goto out; 2938 } 2939 if (cfg->fc_src_len > 128) { 2940 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2941 goto out; 2942 } 2943 #ifndef CONFIG_IPV6_SUBTREES 2944 if (cfg->fc_src_len) { 2945 NL_SET_ERR_MSG(extack, 2946 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2947 goto out; 2948 } 2949 #endif 2950 if (cfg->fc_ifindex) { 2951 err = -ENODEV; 2952 dev = dev_get_by_index(net, cfg->fc_ifindex); 2953 if (!dev) 2954 goto out; 2955 idev = in6_dev_get(dev); 2956 if (!idev) 2957 goto out; 2958 } 2959 2960 if (cfg->fc_metric == 0) 2961 cfg->fc_metric = IP6_RT_PRIO_USER; 2962 2963 if (cfg->fc_flags & RTNH_F_ONLINK) { 2964 if (!dev) { 2965 NL_SET_ERR_MSG(extack, 2966 "Nexthop device required for onlink"); 2967 err = -ENODEV; 2968 goto out; 2969 } 2970 2971 if (!(dev->flags & IFF_UP)) { 2972 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2973 err = -ENETDOWN; 2974 goto out; 2975 } 2976 } 2977 2978 err = -ENOBUFS; 2979 if (cfg->fc_nlinfo.nlh && 2980 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2981 table = fib6_get_table(net, cfg->fc_table); 2982 if (!table) { 2983 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2984 table = fib6_new_table(net, cfg->fc_table); 2985 } 2986 } else { 2987 table = fib6_new_table(net, cfg->fc_table); 2988 } 2989 2990 if (!table) 2991 goto out; 2992 2993 err = -ENOMEM; 2994 rt = fib6_info_alloc(gfp_flags); 2995 if (!rt) 2996 goto out; 2997 2998 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 2999 extack); 3000 if (IS_ERR(rt->fib6_metrics)) { 3001 err = PTR_ERR(rt->fib6_metrics); 3002 /* Do not leave garbage there. 
*/ 3003 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3004 goto out; 3005 } 3006 3007 if (cfg->fc_flags & RTF_ADDRCONF) 3008 rt->dst_nocount = true; 3009 3010 if (cfg->fc_flags & RTF_EXPIRES) 3011 fib6_set_expires(rt, jiffies + 3012 clock_t_to_jiffies(cfg->fc_expires)); 3013 else 3014 fib6_clean_expires(rt); 3015 3016 if (cfg->fc_protocol == RTPROT_UNSPEC) 3017 cfg->fc_protocol = RTPROT_BOOT; 3018 rt->fib6_protocol = cfg->fc_protocol; 3019 3020 addr_type = ipv6_addr_type(&cfg->fc_dst); 3021 3022 if (cfg->fc_encap) { 3023 struct lwtunnel_state *lwtstate; 3024 3025 err = lwtunnel_build_state(cfg->fc_encap_type, 3026 cfg->fc_encap, AF_INET6, cfg, 3027 &lwtstate, extack); 3028 if (err) 3029 goto out; 3030 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 3031 } 3032 3033 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3034 rt->fib6_dst.plen = cfg->fc_dst_len; 3035 if (rt->fib6_dst.plen == 128) 3036 rt->dst_host = true; 3037 3038 #ifdef CONFIG_IPV6_SUBTREES 3039 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3040 rt->fib6_src.plen = cfg->fc_src_len; 3041 #endif 3042 3043 rt->fib6_metric = cfg->fc_metric; 3044 rt->fib6_nh.nh_weight = 1; 3045 3046 rt->fib6_type = cfg->fc_type; 3047 3048 /* We cannot add true routes via loopback here, 3049 they would result in kernel looping; promote them to reject routes 3050 */ 3051 if ((cfg->fc_flags & RTF_REJECT) || 3052 (dev && (dev->flags & IFF_LOOPBACK) && 3053 !(addr_type & IPV6_ADDR_LOOPBACK) && 3054 !(cfg->fc_flags & RTF_LOCAL))) { 3055 /* hold loopback dev/idev if we haven't done so. */ 3056 if (dev != net->loopback_dev) { 3057 if (dev) { 3058 dev_put(dev); 3059 in6_dev_put(idev); 3060 } 3061 dev = net->loopback_dev; 3062 dev_hold(dev); 3063 idev = in6_dev_get(dev); 3064 if (!idev) { 3065 err = -ENODEV; 3066 goto out; 3067 } 3068 } 3069 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3070 goto install_route; 3071 } 3072 3073 if (cfg->fc_flags & RTF_GATEWAY) { 3074 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3075 if (err) 3076 goto out; 3077 3078 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3079 } 3080 3081 err = -ENODEV; 3082 if (!dev) 3083 goto out; 3084 3085 if (idev->cnf.disable_ipv6) { 3086 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3087 err = -EACCES; 3088 goto out; 3089 } 3090 3091 if (!(dev->flags & IFF_UP)) { 3092 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3093 err = -ENETDOWN; 3094 goto out; 3095 } 3096 3097 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3098 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3099 NL_SET_ERR_MSG(extack, "Invalid source address"); 3100 err = -EINVAL; 3101 goto out; 3102 } 3103 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3104 rt->fib6_prefsrc.plen = 128; 3105 } else 3106 rt->fib6_prefsrc.plen = 0; 3107 3108 rt->fib6_flags = cfg->fc_flags; 3109 3110 install_route: 3111 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3112 !netif_carrier_ok(dev)) 3113 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3114 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3115 rt->fib6_nh.nh_dev = dev; 3116 rt->fib6_table = table; 3117 3118 if (idev) 3119 in6_dev_put(idev); 3120 3121 return rt; 3122 out: 3123 if (dev) 3124 dev_put(dev); 3125 if (idev) 3126 in6_dev_put(idev); 3127 3128 fib6_info_release(rt); 3129 return ERR_PTR(err); 3130 } 3131 3132 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3133 struct netlink_ext_ack *extack) 3134 { 3135 struct fib6_info *rt; 3136 int err; 3137 3138 rt = ip6_route_info_create(cfg, gfp_flags, extack); 
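/* ip6_route_info_create() returns either a fully initialized fib6_info or an ERR_PTR. On success, __ip6_ins_rt() links the entry into the table and takes its own reference, so the local reference is dropped in both outcomes. */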
3139 if (IS_ERR(rt)) 3140 return PTR_ERR(rt); 3141 3142 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3143 fib6_info_release(rt); 3144 3145 return err; 3146 } 3147 3148 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3149 { 3150 struct net *net = info->nl_net; 3151 struct fib6_table *table; 3152 int err; 3153 3154 if (rt == net->ipv6.fib6_null_entry) { 3155 err = -ENOENT; 3156 goto out; 3157 } 3158 3159 table = rt->fib6_table; 3160 spin_lock_bh(&table->tb6_lock); 3161 err = fib6_del(rt, info); 3162 spin_unlock_bh(&table->tb6_lock); 3163 3164 out: 3165 fib6_info_release(rt); 3166 return err; 3167 } 3168 3169 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3170 { 3171 struct nl_info info = { .nl_net = net }; 3172 3173 return __ip6_del_rt(rt, &info); 3174 } 3175 3176 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3177 { 3178 struct nl_info *info = &cfg->fc_nlinfo; 3179 struct net *net = info->nl_net; 3180 struct sk_buff *skb = NULL; 3181 struct fib6_table *table; 3182 int err = -ENOENT; 3183 3184 if (rt == net->ipv6.fib6_null_entry) 3185 goto out_put; 3186 table = rt->fib6_table; 3187 spin_lock_bh(&table->tb6_lock); 3188 3189 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3190 struct fib6_info *sibling, *next_sibling; 3191 3192 /* prefer to send a single notification with all hops */ 3193 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3194 if (skb) { 3195 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3196 3197 if (rt6_fill_node(net, skb, rt, NULL, 3198 NULL, NULL, 0, RTM_DELROUTE, 3199 info->portid, seq, 0) < 0) { 3200 kfree_skb(skb); 3201 skb = NULL; 3202 } else 3203 info->skip_notify = 1; 3204 } 3205 3206 list_for_each_entry_safe(sibling, next_sibling, 3207 &rt->fib6_siblings, 3208 fib6_siblings) { 3209 err = fib6_del(sibling, info); 3210 if (err) 3211 goto out_unlock; 3212 } 3213 } 3214 3215 err = fib6_del(rt, info); 3216 out_unlock: 3217 spin_unlock_bh(&table->tb6_lock); 3218 out_put: 3219 fib6_info_release(rt); 3220 3221 if (skb) { 3222 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3223 info->nlh, gfp_any()); 3224 } 3225 return err; 3226 } 3227 3228 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3229 { 3230 int rc = -ESRCH; 3231 3232 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3233 goto out; 3234 3235 if (cfg->fc_flags & RTF_GATEWAY && 3236 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3237 goto out; 3238 3239 rc = rt6_remove_exception_rt(rt); 3240 out: 3241 return rc; 3242 } 3243 3244 static int ip6_route_del(struct fib6_config *cfg, 3245 struct netlink_ext_ack *extack) 3246 { 3247 struct rt6_info *rt_cache; 3248 struct fib6_table *table; 3249 struct fib6_info *rt; 3250 struct fib6_node *fn; 3251 int err = -ESRCH; 3252 3253 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3254 if (!table) { 3255 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3256 return err; 3257 } 3258 3259 rcu_read_lock(); 3260 3261 fn = fib6_locate(&table->tb6_root, 3262 &cfg->fc_dst, cfg->fc_dst_len, 3263 &cfg->fc_src, cfg->fc_src_len, 3264 !(cfg->fc_flags & RTF_CACHE)); 3265 3266 if (fn) { 3267 for_each_fib6_node_rt_rcu(fn) { 3268 if (cfg->fc_flags & RTF_CACHE) { 3269 int rc; 3270 3271 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3272 &cfg->fc_src); 3273 if (rt_cache) { 3274 rc = ip6_del_cached_rt(rt_cache, cfg); 3275 if (rc != -ESRCH) { 3276 rcu_read_unlock(); 3277 return rc; 3278 } 3279 } 3280 continue; 3281 } 3282 if (cfg->fc_ifindex && 3283 
(!rt->fib6_nh.nh_dev || 3284 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3285 continue; 3286 if (cfg->fc_flags & RTF_GATEWAY && 3287 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3288 continue; 3289 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3290 continue; 3291 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3292 continue; 3293 if (!fib6_info_hold_safe(rt)) 3294 continue; 3295 rcu_read_unlock(); 3296 3297 /* if gateway was specified only delete the one hop */ 3298 if (cfg->fc_flags & RTF_GATEWAY) 3299 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3300 3301 return __ip6_del_rt_siblings(rt, cfg); 3302 } 3303 } 3304 rcu_read_unlock(); 3305 3306 return err; 3307 } 3308 3309 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3310 { 3311 struct netevent_redirect netevent; 3312 struct rt6_info *rt, *nrt = NULL; 3313 struct ndisc_options ndopts; 3314 struct inet6_dev *in6_dev; 3315 struct neighbour *neigh; 3316 struct fib6_info *from; 3317 struct rd_msg *msg; 3318 int optlen, on_link; 3319 u8 *lladdr; 3320 3321 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3322 optlen -= sizeof(*msg); 3323 3324 if (optlen < 0) { 3325 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3326 return; 3327 } 3328 3329 msg = (struct rd_msg *)icmp6_hdr(skb); 3330 3331 if (ipv6_addr_is_multicast(&msg->dest)) { 3332 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3333 return; 3334 } 3335 3336 on_link = 0; 3337 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3338 on_link = 1; 3339 } else if (ipv6_addr_type(&msg->target) != 3340 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3341 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3342 return; 3343 } 3344 3345 in6_dev = __in6_dev_get(skb->dev); 3346 if (!in6_dev) 3347 return; 3348 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3349 return; 3350 3351 /* RFC2461 8.1: 3352 * The IP source address of the Redirect MUST be the same as the current 3353 * first-hop router for the specified ICMP Destination Address. 3354 */ 3355 3356 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3357 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3358 return; 3359 } 3360 3361 lladdr = NULL; 3362 if (ndopts.nd_opts_tgt_lladdr) { 3363 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3364 skb->dev); 3365 if (!lladdr) { 3366 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3367 return; 3368 } 3369 } 3370 3371 rt = (struct rt6_info *) dst; 3372 if (rt->rt6i_flags & RTF_REJECT) { 3373 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3374 return; 3375 } 3376 3377 /* Redirect received -> path was valid. 3378 * Look, redirects are sent only in response to data packets, 3379 * so that this nexthop apparently is reachable. --ANK 3380 */ 3381 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3382 3383 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3384 if (!neigh) 3385 return; 3386 3387 /* 3388 * We have finally decided to accept it. 3389 */ 3390 3391 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3392 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3393 NEIGH_UPDATE_F_OVERRIDE| 3394 (on_link ? 
0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3395 NEIGH_UPDATE_F_ISROUTER)), 3396 NDISC_REDIRECT, &ndopts); 3397 3398 rcu_read_lock(); 3399 from = rcu_dereference(rt->from); 3400 /* This fib6_info_hold() is safe here because we hold reference to rt 3401 * and rt already holds reference to fib6_info. 3402 */ 3403 fib6_info_hold(from); 3404 rcu_read_unlock(); 3405 3406 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3407 if (!nrt) 3408 goto out; 3409 3410 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3411 if (on_link) 3412 nrt->rt6i_flags &= ~RTF_GATEWAY; 3413 3414 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3415 3416 /* No need to remove rt from the exception table if rt is 3417 * a cached route because rt6_insert_exception() 3418 * takes care of it. 3419 */ 3420 if (rt6_insert_exception(nrt, from)) { 3421 dst_release_immediate(&nrt->dst); 3422 goto out; 3423 } 3424 3425 netevent.old = &rt->dst; 3426 netevent.new = &nrt->dst; 3427 netevent.daddr = &msg->dest; 3428 netevent.neigh = neigh; 3429 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3430 3431 out: 3432 fib6_info_release(from); 3433 neigh_release(neigh); 3434 } 3435 3436 #ifdef CONFIG_IPV6_ROUTE_INFO 3437 static struct fib6_info *rt6_get_route_info(struct net *net, 3438 const struct in6_addr *prefix, int prefixlen, 3439 const struct in6_addr *gwaddr, 3440 struct net_device *dev) 3441 { 3442 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3443 int ifindex = dev->ifindex; 3444 struct fib6_node *fn; 3445 struct fib6_info *rt = NULL; 3446 struct fib6_table *table; 3447 3448 table = fib6_get_table(net, tb_id); 3449 if (!table) 3450 return NULL; 3451 3452 rcu_read_lock(); 3453 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3454 if (!fn) 3455 goto out; 3456 3457 for_each_fib6_node_rt_rcu(fn) { 3458 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3459 continue; 3460 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3461 continue; 3462 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3463 continue; 3464 if (!fib6_info_hold_safe(rt)) 3465 continue; 3466 break; 3467 } 3468 out: 3469 rcu_read_unlock(); 3470 return rt; 3471 } 3472 3473 static struct fib6_info *rt6_add_route_info(struct net *net, 3474 const struct in6_addr *prefix, int prefixlen, 3475 const struct in6_addr *gwaddr, 3476 struct net_device *dev, 3477 unsigned int pref) 3478 { 3479 struct fib6_config cfg = { 3480 .fc_metric = IP6_RT_PRIO_USER, 3481 .fc_ifindex = dev->ifindex, 3482 .fc_dst_len = prefixlen, 3483 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3484 RTF_UP | RTF_PREF(pref), 3485 .fc_protocol = RTPROT_RA, 3486 .fc_type = RTN_UNICAST, 3487 .fc_nlinfo.portid = 0, 3488 .fc_nlinfo.nlh = NULL, 3489 .fc_nlinfo.nl_net = net, 3490 }; 3491 3492 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3493 cfg.fc_dst = *prefix; 3494 cfg.fc_gateway = *gwaddr; 3495 3496 /* We should treat it as a default route if prefix length is 0. */ 3497 if (!prefixlen) 3498 cfg.fc_flags |= RTF_DEFAULT; 3499 3500 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3501 3502 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3503 } 3504 #endif 3505 3506 struct fib6_info *rt6_get_dflt_router(struct net *net, 3507 const struct in6_addr *addr, 3508 struct net_device *dev) 3509 { 3510 u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_DFLT; 3511 struct fib6_info *rt; 3512 struct fib6_table *table; 3513 3514 table = fib6_get_table(net, tb_id); 3515 if (!table) 3516 return NULL; 3517 3518 rcu_read_lock(); 3519 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3520 if (dev == rt->fib6_nh.nh_dev && 3521 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3522 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3523 break; 3524 } 3525 if (rt && !fib6_info_hold_safe(rt)) 3526 rt = NULL; 3527 rcu_read_unlock(); 3528 return rt; 3529 } 3530 3531 struct fib6_info *rt6_add_dflt_router(struct net *net, 3532 const struct in6_addr *gwaddr, 3533 struct net_device *dev, 3534 unsigned int pref) 3535 { 3536 struct fib6_config cfg = { 3537 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3538 .fc_metric = IP6_RT_PRIO_USER, 3539 .fc_ifindex = dev->ifindex, 3540 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3541 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3542 .fc_protocol = RTPROT_RA, 3543 .fc_type = RTN_UNICAST, 3544 .fc_nlinfo.portid = 0, 3545 .fc_nlinfo.nlh = NULL, 3546 .fc_nlinfo.nl_net = net, 3547 }; 3548 3549 cfg.fc_gateway = *gwaddr; 3550 3551 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3552 struct fib6_table *table; 3553 3554 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3555 if (table) 3556 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3557 } 3558 3559 return rt6_get_dflt_router(net, gwaddr, dev); 3560 } 3561 3562 static void __rt6_purge_dflt_routers(struct net *net, 3563 struct fib6_table *table) 3564 { 3565 struct fib6_info *rt; 3566 3567 restart: 3568 rcu_read_lock(); 3569 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3570 struct net_device *dev = fib6_info_nh_dev(rt); 3571 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3572 3573 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3574 (!idev || idev->cnf.accept_ra != 2) && 3575 fib6_info_hold_safe(rt)) { 3576 rcu_read_unlock(); 3577 ip6_del_rt(net, rt); 3578 goto restart; 3579 } 3580 } 3581 rcu_read_unlock(); 3582 3583 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3584 } 3585 3586 void rt6_purge_dflt_routers(struct net *net) 3587 { 3588 struct fib6_table *table; 3589 struct hlist_head *head; 3590 unsigned int h; 3591 3592 rcu_read_lock(); 3593 3594 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3595 head = &net->ipv6.fib_table_hash[h]; 3596 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3597 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3598 __rt6_purge_dflt_routers(net, table); 3599 } 3600 } 3601 3602 rcu_read_unlock(); 3603 } 3604 3605 static void rtmsg_to_fib6_config(struct net *net, 3606 struct in6_rtmsg *rtmsg, 3607 struct fib6_config *cfg) 3608 { 3609 *cfg = (struct fib6_config){ 3610 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3611 : RT6_TABLE_MAIN, 3612 .fc_ifindex = rtmsg->rtmsg_ifindex, 3613 .fc_metric = rtmsg->rtmsg_metric, 3614 .fc_expires = rtmsg->rtmsg_info, 3615 .fc_dst_len = rtmsg->rtmsg_dst_len, 3616 .fc_src_len = rtmsg->rtmsg_src_len, 3617 .fc_flags = rtmsg->rtmsg_flags, 3618 .fc_type = rtmsg->rtmsg_type, 3619 3620 .fc_nlinfo.nl_net = net, 3621 3622 .fc_dst = rtmsg->rtmsg_dst, 3623 .fc_src = rtmsg->rtmsg_src, 3624 .fc_gateway = rtmsg->rtmsg_gateway, 3625 }; 3626 } 3627 3628 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3629 { 3630 struct fib6_config cfg; 3631 struct in6_rtmsg rtmsg; 3632 int err; 3633 3634 switch (cmd) { 3635 case SIOCADDRT: /* Add a route */ 3636 case SIOCDELRT: /* Delete a route */ 3637 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3638 return -EPERM; 3639 err = copy_from_user(&rtmsg, arg, 3640 sizeof(struct in6_rtmsg)); 3641 if (err) 3642 return -EFAULT; 3643 3644 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3645 3646 rtnl_lock(); 3647 switch (cmd) { 3648 case SIOCADDRT: 3649 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3650 break; 3651 case SIOCDELRT: 3652 err = ip6_route_del(&cfg, NULL); 3653 break; 3654 default: 3655 err = -EINVAL; 3656 } 3657 rtnl_unlock(); 3658 3659 return err; 3660 } 3661 3662 return -EINVAL; 3663 } 3664 3665 /* 3666 * Drop the packet on the floor 3667 */ 3668 3669 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3670 { 3671 int type; 3672 struct dst_entry *dst = skb_dst(skb); 3673 switch (ipstats_mib_noroutes) { 3674 case IPSTATS_MIB_INNOROUTES: 3675 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3676 if (type == IPV6_ADDR_ANY) { 3677 IP6_INC_STATS(dev_net(dst->dev), 3678 __in6_dev_get_safely(skb->dev), 3679 IPSTATS_MIB_INADDRERRORS); 3680 break; 3681 } 3682 /* FALLTHROUGH */ 3683 case IPSTATS_MIB_OUTNOROUTES: 3684 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3685 ipstats_mib_noroutes); 3686 break; 3687 } 3688 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3689 kfree_skb(skb); 3690 return 0; 3691 } 3692 3693 static int ip6_pkt_discard(struct sk_buff *skb) 3694 { 3695 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3696 } 3697 3698 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3699 { 3700 skb->dev = skb_dst(skb)->dev; 3701 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3702 } 3703 3704 static int ip6_pkt_prohibit(struct sk_buff *skb) 3705 { 3706 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3707 } 3708 3709 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3710 { 3711 skb->dev = skb_dst(skb)->dev; 3712 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3713 } 3714 3715 /* 3716 * Allocate a dst for local (unicast / anycast) address. 
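* A minimal usage sketch (illustrative only; addrconf is the real caller and does more setup and error handling): f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false, GFP_ATOMIC); if (!IS_ERR(f6i)) ip6_ins_rt(net, f6i);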
3717 */ 3718 3719 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3720 struct inet6_dev *idev, 3721 const struct in6_addr *addr, 3722 bool anycast, gfp_t gfp_flags) 3723 { 3724 u32 tb_id; 3725 struct net_device *dev = idev->dev; 3726 struct fib6_info *f6i; 3727 3728 f6i = fib6_info_alloc(gfp_flags); 3729 if (!f6i) 3730 return ERR_PTR(-ENOMEM); 3731 3732 f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL); 3733 f6i->dst_nocount = true; 3734 f6i->dst_host = true; 3735 f6i->fib6_protocol = RTPROT_KERNEL; 3736 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3737 if (anycast) { 3738 f6i->fib6_type = RTN_ANYCAST; 3739 f6i->fib6_flags |= RTF_ANYCAST; 3740 } else { 3741 f6i->fib6_type = RTN_LOCAL; 3742 f6i->fib6_flags |= RTF_LOCAL; 3743 } 3744 3745 f6i->fib6_nh.nh_gw = *addr; 3746 dev_hold(dev); 3747 f6i->fib6_nh.nh_dev = dev; 3748 f6i->fib6_dst.addr = *addr; 3749 f6i->fib6_dst.plen = 128; 3750 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3751 f6i->fib6_table = fib6_get_table(net, tb_id); 3752 3753 return f6i; 3754 } 3755 3756 /* remove deleted ip from prefsrc entries */ 3757 struct arg_dev_net_ip { 3758 struct net_device *dev; 3759 struct net *net; 3760 struct in6_addr *addr; 3761 }; 3762 3763 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3764 { 3765 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3766 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3767 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3768 3769 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3770 rt != net->ipv6.fib6_null_entry && 3771 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3772 spin_lock_bh(&rt6_exception_lock); 3773 /* remove prefsrc entry */ 3774 rt->fib6_prefsrc.plen = 0; 3775 spin_unlock_bh(&rt6_exception_lock); 3776 } 3777 return 0; 3778 } 3779 3780 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3781 { 3782 struct net *net = dev_net(ifp->idev->dev); 3783 struct arg_dev_net_ip adni = { 3784 .dev = ifp->idev->dev, 3785 .net = net, 3786 .addr = &ifp->addr, 3787 }; 3788 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3789 } 3790 3791 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3792 3793 /* Remove routers and update dst entries when gateway turn into host. */ 3794 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3795 { 3796 struct in6_addr *gateway = (struct in6_addr *)arg; 3797 3798 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3799 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3800 return -1; 3801 } 3802 3803 /* Further clean up cached routes in exception table. 3804 * This is needed because cached route may have a different 3805 * gateway than its 'parent' in the case of an ip redirect. 
3806 */ 3807 rt6_exceptions_clean_tohost(rt, gateway); 3808 3809 return 0; 3810 } 3811 3812 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3813 { 3814 fib6_clean_all(net, fib6_clean_tohost, gateway); 3815 } 3816 3817 struct arg_netdev_event { 3818 const struct net_device *dev; 3819 union { 3820 unsigned int nh_flags; 3821 unsigned long event; 3822 }; 3823 }; 3824 3825 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3826 { 3827 struct fib6_info *iter; 3828 struct fib6_node *fn; 3829 3830 fn = rcu_dereference_protected(rt->fib6_node, 3831 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3832 iter = rcu_dereference_protected(fn->leaf, 3833 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3834 while (iter) { 3835 if (iter->fib6_metric == rt->fib6_metric && 3836 rt6_qualify_for_ecmp(iter)) 3837 return iter; 3838 iter = rcu_dereference_protected(iter->fib6_next, 3839 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3840 } 3841 3842 return NULL; 3843 } 3844 3845 static bool rt6_is_dead(const struct fib6_info *rt) 3846 { 3847 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3848 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3849 fib6_ignore_linkdown(rt))) 3850 return true; 3851 3852 return false; 3853 } 3854 3855 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3856 { 3857 struct fib6_info *iter; 3858 int total = 0; 3859 3860 if (!rt6_is_dead(rt)) 3861 total += rt->fib6_nh.nh_weight; 3862 3863 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3864 if (!rt6_is_dead(iter)) 3865 total += iter->fib6_nh.nh_weight; 3866 } 3867 3868 return total; 3869 } 3870 3871 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3872 { 3873 int upper_bound = -1; 3874 3875 if (!rt6_is_dead(rt)) { 3876 *weight += rt->fib6_nh.nh_weight; 3877 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3878 total) - 1; 3879 } 3880 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3881 } 3882 3883 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3884 { 3885 struct fib6_info *iter; 3886 int weight = 0; 3887 3888 rt6_upper_bound_set(rt, &weight, total); 3889 3890 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3891 rt6_upper_bound_set(iter, &weight, total); 3892 } 3893 3894 void rt6_multipath_rebalance(struct fib6_info *rt) 3895 { 3896 struct fib6_info *first; 3897 int total; 3898 3899 /* In case the entire multipath route was marked for flushing, 3900 * then there is no need to rebalance upon the removal of every 3901 * sibling route. 3902 */ 3903 if (!rt->fib6_nsiblings || rt->should_flush) 3904 return; 3905 3906 /* During lookup routes are evaluated in order, so we need to 3907 * make sure upper bounds are assigned from the first sibling 3908 * onwards. 
3909 */ 3910 first = rt6_multipath_first_sibling(rt); 3911 if (WARN_ON_ONCE(!first)) 3912 return; 3913 3914 total = rt6_multipath_total_weight(first); 3915 rt6_multipath_upper_bound_set(first, total); 3916 } 3917 3918 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3919 { 3920 const struct arg_netdev_event *arg = p_arg; 3921 struct net *net = dev_net(arg->dev); 3922 3923 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3924 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3925 fib6_update_sernum_upto_root(net, rt); 3926 rt6_multipath_rebalance(rt); 3927 } 3928 3929 return 0; 3930 } 3931 3932 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3933 { 3934 struct arg_netdev_event arg = { 3935 .dev = dev, 3936 { 3937 .nh_flags = nh_flags, 3938 }, 3939 }; 3940 3941 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3942 arg.nh_flags |= RTNH_F_LINKDOWN; 3943 3944 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3945 } 3946 3947 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3948 const struct net_device *dev) 3949 { 3950 struct fib6_info *iter; 3951 3952 if (rt->fib6_nh.nh_dev == dev) 3953 return true; 3954 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3955 if (iter->fib6_nh.nh_dev == dev) 3956 return true; 3957 3958 return false; 3959 } 3960 3961 static void rt6_multipath_flush(struct fib6_info *rt) 3962 { 3963 struct fib6_info *iter; 3964 3965 rt->should_flush = 1; 3966 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3967 iter->should_flush = 1; 3968 } 3969 3970 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3971 const struct net_device *down_dev) 3972 { 3973 struct fib6_info *iter; 3974 unsigned int dead = 0; 3975 3976 if (rt->fib6_nh.nh_dev == down_dev || 3977 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3978 dead++; 3979 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3980 if (iter->fib6_nh.nh_dev == down_dev || 3981 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3982 dead++; 3983 3984 return dead; 3985 } 3986 3987 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 3988 const struct net_device *dev, 3989 unsigned int nh_flags) 3990 { 3991 struct fib6_info *iter; 3992 3993 if (rt->fib6_nh.nh_dev == dev) 3994 rt->fib6_nh.nh_flags |= nh_flags; 3995 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3996 if (iter->fib6_nh.nh_dev == dev) 3997 iter->fib6_nh.nh_flags |= nh_flags; 3998 } 3999 4000 /* called with write lock held for table with rt */ 4001 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4002 { 4003 const struct arg_netdev_event *arg = p_arg; 4004 const struct net_device *dev = arg->dev; 4005 struct net *net = dev_net(dev); 4006 4007 if (rt == net->ipv6.fib6_null_entry) 4008 return 0; 4009 4010 switch (arg->event) { 4011 case NETDEV_UNREGISTER: 4012 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4013 case NETDEV_DOWN: 4014 if (rt->should_flush) 4015 return -1; 4016 if (!rt->fib6_nsiblings) 4017 return rt->fib6_nh.nh_dev == dev ? 
-1 : 0; 4018 if (rt6_multipath_uses_dev(rt, dev)) { 4019 unsigned int count; 4020 4021 count = rt6_multipath_dead_count(rt, dev); 4022 if (rt->fib6_nsiblings + 1 == count) { 4023 rt6_multipath_flush(rt); 4024 return -1; 4025 } 4026 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4027 RTNH_F_LINKDOWN); 4028 fib6_update_sernum(net, rt); 4029 rt6_multipath_rebalance(rt); 4030 } 4031 return -2; 4032 case NETDEV_CHANGE: 4033 if (rt->fib6_nh.nh_dev != dev || 4034 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4035 break; 4036 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 4037 rt6_multipath_rebalance(rt); 4038 break; 4039 } 4040 4041 return 0; 4042 } 4043 4044 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4045 { 4046 struct arg_netdev_event arg = { 4047 .dev = dev, 4048 { 4049 .event = event, 4050 }, 4051 }; 4052 struct net *net = dev_net(dev); 4053 4054 if (net->ipv6.sysctl.skip_notify_on_dev_down) 4055 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); 4056 else 4057 fib6_clean_all(net, fib6_ifdown, &arg); 4058 } 4059 4060 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4061 { 4062 rt6_sync_down_dev(dev, event); 4063 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4064 neigh_ifdown(&nd_tbl, dev); 4065 } 4066 4067 struct rt6_mtu_change_arg { 4068 struct net_device *dev; 4069 unsigned int mtu; 4070 }; 4071 4072 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4073 { 4074 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4075 struct inet6_dev *idev; 4076 4077 /* In IPv6 PMTU discovery is not optional, 4078 so the RTAX_MTU lock cannot disable it. 4079 We still use this lock to block changes 4080 caused by addrconf/ndisc. 4081 */ 4082 4083 idev = __in6_dev_get(arg->dev); 4084 if (!idev) 4085 return 0; 4086 4087 /* For an administrative MTU increase, there is no way to discover 4088 the IPv6 PMTU increase, so the PMTU should be updated here. 4089 Since RFC 1981 doesn't include administrative MTU increases, 4090 updating the PMTU on an increase is a MUST (e.g. for a
jumbo frame) 4091 */ 4092 if (rt->fib6_nh.nh_dev == arg->dev && 4093 !fib6_metric_locked(rt, RTAX_MTU)) { 4094 u32 mtu = rt->fib6_pmtu; 4095 4096 if (mtu >= arg->mtu || 4097 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4098 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4099 4100 spin_lock_bh(&rt6_exception_lock); 4101 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4102 spin_unlock_bh(&rt6_exception_lock); 4103 } 4104 return 0; 4105 } 4106 4107 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4108 { 4109 struct rt6_mtu_change_arg arg = { 4110 .dev = dev, 4111 .mtu = mtu, 4112 }; 4113 4114 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4115 } 4116 4117 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4118 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4119 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4120 [RTA_OIF] = { .type = NLA_U32 }, 4121 [RTA_IIF] = { .type = NLA_U32 }, 4122 [RTA_PRIORITY] = { .type = NLA_U32 }, 4123 [RTA_METRICS] = { .type = NLA_NESTED }, 4124 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4125 [RTA_PREF] = { .type = NLA_U8 }, 4126 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4127 [RTA_ENCAP] = { .type = NLA_NESTED }, 4128 [RTA_EXPIRES] = { .type = NLA_U32 }, 4129 [RTA_UID] = { .type = NLA_U32 }, 4130 [RTA_MARK] = { .type = NLA_U32 }, 4131 [RTA_TABLE] = { .type = NLA_U32 }, 4132 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4133 [RTA_SPORT] = { .type = NLA_U16 }, 4134 [RTA_DPORT] = { .type = NLA_U16 }, 4135 }; 4136 4137 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4138 struct fib6_config *cfg, 4139 struct netlink_ext_ack *extack) 4140 { 4141 struct rtmsg *rtm; 4142 struct nlattr *tb[RTA_MAX+1]; 4143 unsigned int pref; 4144 int err; 4145 4146 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4147 extack); 4148 if (err < 0) 4149 goto errout; 4150 4151 err = -EINVAL; 4152 rtm = nlmsg_data(nlh); 4153 4154 *cfg = (struct fib6_config){ 4155 .fc_table = rtm->rtm_table, 4156 .fc_dst_len = rtm->rtm_dst_len, 4157 .fc_src_len = rtm->rtm_src_len, 4158 .fc_flags = RTF_UP, 4159 .fc_protocol = rtm->rtm_protocol, 4160 .fc_type = rtm->rtm_type, 4161 4162 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 4163 .fc_nlinfo.nlh = nlh, 4164 .fc_nlinfo.nl_net = sock_net(skb->sk), 4165 }; 4166 4167 if (rtm->rtm_type == RTN_UNREACHABLE || 4168 rtm->rtm_type == RTN_BLACKHOLE || 4169 rtm->rtm_type == RTN_PROHIBIT || 4170 rtm->rtm_type == RTN_THROW) 4171 cfg->fc_flags |= RTF_REJECT; 4172 4173 if (rtm->rtm_type == RTN_LOCAL) 4174 cfg->fc_flags |= RTF_LOCAL; 4175 4176 if (rtm->rtm_flags & RTM_F_CLONED) 4177 cfg->fc_flags |= RTF_CACHE; 4178 4179 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4180 4181 if (tb[RTA_GATEWAY]) { 4182 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4183 cfg->fc_flags |= RTF_GATEWAY; 4184 } 4185 4186 if (tb[RTA_DST]) { 4187 int plen = (rtm->rtm_dst_len + 7) >> 3; 4188 4189 if (nla_len(tb[RTA_DST]) < plen) 4190 goto errout; 4191 4192 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4193 } 4194 4195 if (tb[RTA_SRC]) { 4196 int plen = (rtm->rtm_src_len + 7) >> 3; 4197 4198 if (nla_len(tb[RTA_SRC]) < plen) 4199 goto errout; 4200 4201 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4202 } 4203 4204 if (tb[RTA_PREFSRC]) 4205 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4206 4207 if (tb[RTA_OIF]) 4208 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4209 4210 if (tb[RTA_PRIORITY]) 4211 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4212 4213 if (tb[RTA_METRICS]) { 4214 cfg->fc_mx = 
nla_data(tb[RTA_METRICS]); 4215 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4216 } 4217 4218 if (tb[RTA_TABLE]) 4219 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4220 4221 if (tb[RTA_MULTIPATH]) { 4222 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4223 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4224 4225 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4226 cfg->fc_mp_len, extack); 4227 if (err < 0) 4228 goto errout; 4229 } 4230 4231 if (tb[RTA_PREF]) { 4232 pref = nla_get_u8(tb[RTA_PREF]); 4233 if (pref != ICMPV6_ROUTER_PREF_LOW && 4234 pref != ICMPV6_ROUTER_PREF_HIGH) 4235 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4236 cfg->fc_flags |= RTF_PREF(pref); 4237 } 4238 4239 if (tb[RTA_ENCAP]) 4240 cfg->fc_encap = tb[RTA_ENCAP]; 4241 4242 if (tb[RTA_ENCAP_TYPE]) { 4243 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4244 4245 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4246 if (err < 0) 4247 goto errout; 4248 } 4249 4250 if (tb[RTA_EXPIRES]) { 4251 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4252 4253 if (addrconf_finite_timeout(timeout)) { 4254 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4255 cfg->fc_flags |= RTF_EXPIRES; 4256 } 4257 } 4258 4259 err = 0; 4260 errout: 4261 return err; 4262 } 4263 4264 struct rt6_nh { 4265 struct fib6_info *fib6_info; 4266 struct fib6_config r_cfg; 4267 struct list_head next; 4268 }; 4269 4270 static int ip6_route_info_append(struct net *net, 4271 struct list_head *rt6_nh_list, 4272 struct fib6_info *rt, 4273 struct fib6_config *r_cfg) 4274 { 4275 struct rt6_nh *nh; 4276 int err = -EEXIST; 4277 4278 list_for_each_entry(nh, rt6_nh_list, next) { 4279 /* check if fib6_info already exists */ 4280 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4281 return err; 4282 } 4283 4284 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4285 if (!nh) 4286 return -ENOMEM; 4287 nh->fib6_info = rt; 4288 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4289 list_add_tail(&nh->next, rt6_nh_list); 4290 4291 return 0; 4292 } 4293 4294 static void ip6_route_mpath_notify(struct fib6_info *rt, 4295 struct fib6_info *rt_last, 4296 struct nl_info *info, 4297 __u16 nlflags) 4298 { 4299 /* if this is an APPEND route, then rt points to the first route 4300 * inserted and rt_last points to last route inserted. Userspace 4301 * wants a consistent dump of the route which starts at the first 4302 * nexthop. Since sibling routes are always added at the end of 4303 * the list, find the first sibling of the last route appended 4304 */ 4305 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4306 rt = list_first_entry(&rt_last->fib6_siblings, 4307 struct fib6_info, 4308 fib6_siblings); 4309 } 4310 4311 if (rt) 4312 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4313 } 4314 4315 static int ip6_route_multipath_add(struct fib6_config *cfg, 4316 struct netlink_ext_ack *extack) 4317 { 4318 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4319 struct nl_info *info = &cfg->fc_nlinfo; 4320 struct fib6_config r_cfg; 4321 struct rtnexthop *rtnh; 4322 struct fib6_info *rt; 4323 struct rt6_nh *err_nh; 4324 struct rt6_nh *nh, *nh_safe; 4325 __u16 nlflags; 4326 int remaining; 4327 int attrlen; 4328 int err = 1; 4329 int nhn = 0; 4330 int replace = (cfg->fc_nlinfo.nlh && 4331 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4332 LIST_HEAD(rt6_nh_list); 4333 4334 nlflags = replace ? 
NLM_F_REPLACE : NLM_F_CREATE; 4335 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4336 nlflags |= NLM_F_APPEND; 4337 4338 remaining = cfg->fc_mp_len; 4339 rtnh = (struct rtnexthop *)cfg->fc_mp; 4340 4341 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4342 * fib6_info structs per nexthop 4343 */ 4344 while (rtnh_ok(rtnh, remaining)) { 4345 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4346 if (rtnh->rtnh_ifindex) 4347 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4348 4349 attrlen = rtnh_attrlen(rtnh); 4350 if (attrlen > 0) { 4351 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4352 4353 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4354 if (nla) { 4355 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4356 r_cfg.fc_flags |= RTF_GATEWAY; 4357 } 4358 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4359 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4360 if (nla) 4361 r_cfg.fc_encap_type = nla_get_u16(nla); 4362 } 4363 4364 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4365 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4366 if (IS_ERR(rt)) { 4367 err = PTR_ERR(rt); 4368 rt = NULL; 4369 goto cleanup; 4370 } 4371 if (!rt6_qualify_for_ecmp(rt)) { 4372 err = -EINVAL; 4373 NL_SET_ERR_MSG(extack, 4374 "Device only routes can not be added for IPv6 using the multipath API."); 4375 fib6_info_release(rt); 4376 goto cleanup; 4377 } 4378 4379 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; 4380 4381 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4382 rt, &r_cfg); 4383 if (err) { 4384 fib6_info_release(rt); 4385 goto cleanup; 4386 } 4387 4388 rtnh = rtnh_next(rtnh, &remaining); 4389 } 4390 4391 /* for add and replace send one notification with all nexthops. 4392 * Skip the notification in fib6_add_rt2node and send one with 4393 * the full route when done 4394 */ 4395 info->skip_notify = 1; 4396 4397 err_nh = NULL; 4398 list_for_each_entry(nh, &rt6_nh_list, next) { 4399 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4400 fib6_info_release(nh->fib6_info); 4401 4402 if (!err) { 4403 /* save reference to last route successfully inserted */ 4404 rt_last = nh->fib6_info; 4405 4406 /* save reference to first route for notification */ 4407 if (!rt_notif) 4408 rt_notif = nh->fib6_info; 4409 } 4410 4411 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 4412 nh->fib6_info = NULL; 4413 if (err) { 4414 if (replace && nhn) 4415 NL_SET_ERR_MSG_MOD(extack, 4416 "multipath route replace failed (check consistency of installed routes)"); 4417 err_nh = nh; 4418 goto add_errout; 4419 } 4420 4421 /* Because each route is added like a single route we remove 4422 * these flags after the first nexthop: if there is a collision, 4423 * we have already failed to add the first nexthop: 4424 * fib6_add_rt2node() has rejected it; when replacing, old 4425 * nexthops have been replaced by first new, the rest should 4426 * be added to it. 4427 */ 4428 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4429 NLM_F_REPLACE); 4430 nhn++; 4431 } 4432 4433 /* success ... 
tell user about new route */ 4434 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4435 goto cleanup; 4436 4437 add_errout: 4438 /* send notification for routes that were added so that 4439 * the delete notifications sent by ip6_route_del are 4440 * coherent 4441 */ 4442 if (rt_notif) 4443 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4444 4445 /* Delete routes that were already added */ 4446 list_for_each_entry(nh, &rt6_nh_list, next) { 4447 if (err_nh == nh) 4448 break; 4449 ip6_route_del(&nh->r_cfg, extack); 4450 } 4451 4452 cleanup: 4453 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4454 if (nh->fib6_info) 4455 fib6_info_release(nh->fib6_info); 4456 list_del(&nh->next); 4457 kfree(nh); 4458 } 4459 4460 return err; 4461 } 4462 4463 static int ip6_route_multipath_del(struct fib6_config *cfg, 4464 struct netlink_ext_ack *extack) 4465 { 4466 struct fib6_config r_cfg; 4467 struct rtnexthop *rtnh; 4468 int remaining; 4469 int attrlen; 4470 int err = 1, last_err = 0; 4471 4472 remaining = cfg->fc_mp_len; 4473 rtnh = (struct rtnexthop *)cfg->fc_mp; 4474 4475 /* Parse a Multipath Entry */ 4476 while (rtnh_ok(rtnh, remaining)) { 4477 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4478 if (rtnh->rtnh_ifindex) 4479 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4480 4481 attrlen = rtnh_attrlen(rtnh); 4482 if (attrlen > 0) { 4483 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4484 4485 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4486 if (nla) { 4487 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4488 r_cfg.fc_flags |= RTF_GATEWAY; 4489 } 4490 } 4491 err = ip6_route_del(&r_cfg, extack); 4492 if (err) 4493 last_err = err; 4494 4495 rtnh = rtnh_next(rtnh, &remaining); 4496 } 4497 4498 return last_err; 4499 } 4500 4501 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4502 struct netlink_ext_ack *extack) 4503 { 4504 struct fib6_config cfg; 4505 int err; 4506 4507 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4508 if (err < 0) 4509 return err; 4510 4511 if (cfg.fc_mp) 4512 return ip6_route_multipath_del(&cfg, extack); 4513 else { 4514 cfg.fc_delete_all_nh = 1; 4515 return ip6_route_del(&cfg, extack); 4516 } 4517 } 4518 4519 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4520 struct netlink_ext_ack *extack) 4521 { 4522 struct fib6_config cfg; 4523 int err; 4524 4525 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4526 if (err < 0) 4527 return err; 4528 4529 if (cfg.fc_mp) 4530 return ip6_route_multipath_add(&cfg, extack); 4531 else 4532 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4533 } 4534 4535 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4536 { 4537 int nexthop_len = 0; 4538 4539 if (rt->fib6_nsiblings) { 4540 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4541 + NLA_ALIGN(sizeof(struct rtnexthop)) 4542 + nla_total_size(16) /* RTA_GATEWAY */ 4543 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); 4544 4545 nexthop_len *= rt->fib6_nsiblings; 4546 } 4547 4548 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4549 + nla_total_size(16) /* RTA_SRC */ 4550 + nla_total_size(16) /* RTA_DST */ 4551 + nla_total_size(16) /* RTA_GATEWAY */ 4552 + nla_total_size(16) /* RTA_PREFSRC */ 4553 + nla_total_size(4) /* RTA_TABLE */ 4554 + nla_total_size(4) /* RTA_IIF */ 4555 + nla_total_size(4) /* RTA_OIF */ 4556 + nla_total_size(4) /* RTA_PRIORITY */ 4557 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4558 + nla_total_size(sizeof(struct rta_cacheinfo)) 4559 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4560 + nla_total_size(1) /* RTA_PREF */ 
4561 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) 4562 + nexthop_len; 4563 } 4564 4565 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, 4566 unsigned int *flags, bool skip_oif) 4567 { 4568 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4569 *flags |= RTNH_F_DEAD; 4570 4571 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { 4572 *flags |= RTNH_F_LINKDOWN; 4573 4574 rcu_read_lock(); 4575 if (fib6_ignore_linkdown(rt)) 4576 *flags |= RTNH_F_DEAD; 4577 rcu_read_unlock(); 4578 } 4579 4580 if (rt->fib6_flags & RTF_GATEWAY) { 4581 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) 4582 goto nla_put_failure; 4583 } 4584 4585 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); 4586 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) 4587 *flags |= RTNH_F_OFFLOAD; 4588 4589 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4590 if (!skip_oif && rt->fib6_nh.nh_dev && 4591 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) 4592 goto nla_put_failure; 4593 4594 if (rt->fib6_nh.nh_lwtstate && 4595 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) 4596 goto nla_put_failure; 4597 4598 return 0; 4599 4600 nla_put_failure: 4601 return -EMSGSIZE; 4602 } 4603 4604 /* add multipath next hop */ 4605 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) 4606 { 4607 const struct net_device *dev = rt->fib6_nh.nh_dev; 4608 struct rtnexthop *rtnh; 4609 unsigned int flags = 0; 4610 4611 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4612 if (!rtnh) 4613 goto nla_put_failure; 4614 4615 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; 4616 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; 4617 4618 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4619 goto nla_put_failure; 4620 4621 rtnh->rtnh_flags = flags; 4622 4623 /* length of rtnetlink header + attributes */ 4624 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4625 4626 return 0; 4627 4628 nla_put_failure: 4629 return -EMSGSIZE; 4630 } 4631 4632 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4633 struct fib6_info *rt, struct dst_entry *dst, 4634 struct in6_addr *dest, struct in6_addr *src, 4635 int iif, int type, u32 portid, u32 seq, 4636 unsigned int flags) 4637 { 4638 struct rt6_info *rt6 = (struct rt6_info *)dst; 4639 struct rt6key *rt6_dst, *rt6_src; 4640 u32 *pmetrics, table, rt6_flags; 4641 struct nlmsghdr *nlh; 4642 struct rtmsg *rtm; 4643 long expires = 0; 4644 4645 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4646 if (!nlh) 4647 return -EMSGSIZE; 4648 4649 if (rt6) { 4650 rt6_dst = &rt6->rt6i_dst; 4651 rt6_src = &rt6->rt6i_src; 4652 rt6_flags = rt6->rt6i_flags; 4653 } else { 4654 rt6_dst = &rt->fib6_dst; 4655 rt6_src = &rt->fib6_src; 4656 rt6_flags = rt->fib6_flags; 4657 } 4658 4659 rtm = nlmsg_data(nlh); 4660 rtm->rtm_family = AF_INET6; 4661 rtm->rtm_dst_len = rt6_dst->plen; 4662 rtm->rtm_src_len = rt6_src->plen; 4663 rtm->rtm_tos = 0; 4664 if (rt->fib6_table) 4665 table = rt->fib6_table->tb6_id; 4666 else 4667 table = RT6_TABLE_UNSPEC; 4668 rtm->rtm_table = table < 256 ? 
table : RT_TABLE_COMPAT; 4669 if (nla_put_u32(skb, RTA_TABLE, table)) 4670 goto nla_put_failure; 4671 4672 rtm->rtm_type = rt->fib6_type; 4673 rtm->rtm_flags = 0; 4674 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4675 rtm->rtm_protocol = rt->fib6_protocol; 4676 4677 if (rt6_flags & RTF_CACHE) 4678 rtm->rtm_flags |= RTM_F_CLONED; 4679 4680 if (dest) { 4681 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4682 goto nla_put_failure; 4683 rtm->rtm_dst_len = 128; 4684 } else if (rtm->rtm_dst_len) 4685 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 4686 goto nla_put_failure; 4687 #ifdef CONFIG_IPV6_SUBTREES 4688 if (src) { 4689 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4690 goto nla_put_failure; 4691 rtm->rtm_src_len = 128; 4692 } else if (rtm->rtm_src_len && 4693 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 4694 goto nla_put_failure; 4695 #endif 4696 if (iif) { 4697 #ifdef CONFIG_IPV6_MROUTE 4698 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 4699 int err = ip6mr_get_route(net, skb, rtm, portid); 4700 4701 if (err == 0) 4702 return 0; 4703 if (err < 0) 4704 goto nla_put_failure; 4705 } else 4706 #endif 4707 if (nla_put_u32(skb, RTA_IIF, iif)) 4708 goto nla_put_failure; 4709 } else if (dest) { 4710 struct in6_addr saddr_buf; 4711 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4712 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4713 goto nla_put_failure; 4714 } 4715 4716 if (rt->fib6_prefsrc.plen) { 4717 struct in6_addr saddr_buf; 4718 saddr_buf = rt->fib6_prefsrc.addr; 4719 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4720 goto nla_put_failure; 4721 } 4722 4723 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4724 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4725 goto nla_put_failure; 4726 4727 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4728 goto nla_put_failure; 4729 4730 /* For multipath routes, walk the siblings list and add 4731 * each as a nexthop within RTA_MULTIPATH. 4732 */ 4733 if (rt6) { 4734 if (rt6_flags & RTF_GATEWAY && 4735 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 4736 goto nla_put_failure; 4737 4738 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 4739 goto nla_put_failure; 4740 } else if (rt->fib6_nsiblings) { 4741 struct fib6_info *sibling, *next_sibling; 4742 struct nlattr *mp; 4743 4744 mp = nla_nest_start(skb, RTA_MULTIPATH); 4745 if (!mp) 4746 goto nla_put_failure; 4747 4748 if (rt6_add_nexthop(skb, rt) < 0) 4749 goto nla_put_failure; 4750 4751 list_for_each_entry_safe(sibling, next_sibling, 4752 &rt->fib6_siblings, fib6_siblings) { 4753 if (rt6_add_nexthop(skb, sibling) < 0) 4754 goto nla_put_failure; 4755 } 4756 4757 nla_nest_end(skb, mp); 4758 } else { 4759 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4760 goto nla_put_failure; 4761 } 4762 4763 if (rt6_flags & RTF_EXPIRES) { 4764 expires = dst ? dst->expires : rt->expires; 4765 expires -= jiffies; 4766 } 4767 4768 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? 
dst->error : 0) < 0) 4769 goto nla_put_failure; 4770 4771 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 4772 goto nla_put_failure; 4773 4774 4775 nlmsg_end(skb, nlh); 4776 return 0; 4777 4778 nla_put_failure: 4779 nlmsg_cancel(skb, nlh); 4780 return -EMSGSIZE; 4781 } 4782 4783 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 4784 const struct net_device *dev) 4785 { 4786 if (f6i->fib6_nh.nh_dev == dev) 4787 return true; 4788 4789 if (f6i->fib6_nsiblings) { 4790 struct fib6_info *sibling, *next_sibling; 4791 4792 list_for_each_entry_safe(sibling, next_sibling, 4793 &f6i->fib6_siblings, fib6_siblings) { 4794 if (sibling->fib6_nh.nh_dev == dev) 4795 return true; 4796 } 4797 } 4798 4799 return false; 4800 } 4801 4802 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4803 { 4804 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4805 struct fib_dump_filter *filter = &arg->filter; 4806 unsigned int flags = NLM_F_MULTI; 4807 struct net *net = arg->net; 4808 4809 if (rt == net->ipv6.fib6_null_entry) 4810 return 0; 4811 4812 if ((filter->flags & RTM_F_PREFIX) && 4813 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4814 /* success since this is not a prefix route */ 4815 return 1; 4816 } 4817 if (filter->filter_set) { 4818 if ((filter->rt_type && rt->fib6_type != filter->rt_type) || 4819 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 4820 (filter->protocol && rt->fib6_protocol != filter->protocol)) { 4821 return 1; 4822 } 4823 flags |= NLM_F_DUMP_FILTERED; 4824 } 4825 4826 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4827 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4828 arg->cb->nlh->nlmsg_seq, flags); 4829 } 4830 4831 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4832 struct netlink_ext_ack *extack) 4833 { 4834 struct net *net = sock_net(in_skb->sk); 4835 struct nlattr *tb[RTA_MAX+1]; 4836 int err, iif = 0, oif = 0; 4837 struct fib6_info *from; 4838 struct dst_entry *dst; 4839 struct rt6_info *rt; 4840 struct sk_buff *skb; 4841 struct rtmsg *rtm; 4842 struct flowi6 fl6 = {}; 4843 bool fibmatch; 4844 4845 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4846 extack); 4847 if (err < 0) 4848 goto errout; 4849 4850 err = -EINVAL; 4851 rtm = nlmsg_data(nlh); 4852 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4853 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4854 4855 if (tb[RTA_SRC]) { 4856 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4857 goto errout; 4858 4859 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4860 } 4861 4862 if (tb[RTA_DST]) { 4863 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4864 goto errout; 4865 4866 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4867 } 4868 4869 if (tb[RTA_IIF]) 4870 iif = nla_get_u32(tb[RTA_IIF]); 4871 4872 if (tb[RTA_OIF]) 4873 oif = nla_get_u32(tb[RTA_OIF]); 4874 4875 if (tb[RTA_MARK]) 4876 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4877 4878 if (tb[RTA_UID]) 4879 fl6.flowi6_uid = make_kuid(current_user_ns(), 4880 nla_get_u32(tb[RTA_UID])); 4881 else 4882 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 4883 4884 if (tb[RTA_SPORT]) 4885 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 4886 4887 if (tb[RTA_DPORT]) 4888 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 4889 4890 if (tb[RTA_IP_PROTO]) { 4891 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 4892 &fl6.flowi6_proto, extack); 4893 if (err) 4894 goto errout; 4895 } 4896 4897 if (iif) { 4898 struct net_device *dev; 4899 int flags = 0; 4900 4901 rcu_read_lock(); 4902 4903 dev = dev_get_by_index_rcu(net, iif); 4904 if (!dev) { 4905 rcu_read_unlock(); 4906 err = -ENODEV; 4907 goto errout; 4908 } 4909 4910 fl6.flowi6_iif = iif; 4911 4912 if (!ipv6_addr_any(&fl6.saddr)) 4913 flags |= RT6_LOOKUP_F_HAS_SADDR; 4914 4915 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 4916 4917 rcu_read_unlock(); 4918 } else { 4919 fl6.flowi6_oif = oif; 4920 4921 dst = ip6_route_output(net, NULL, &fl6); 4922 } 4923 4924 4925 rt = container_of(dst, struct rt6_info, dst); 4926 if (rt->dst.error) { 4927 err = rt->dst.error; 4928 ip6_rt_put(rt); 4929 goto errout; 4930 } 4931 4932 if (rt == net->ipv6.ip6_null_entry) { 4933 err = rt->dst.error; 4934 ip6_rt_put(rt); 4935 goto errout; 4936 } 4937 4938 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 4939 if (!skb) { 4940 ip6_rt_put(rt); 4941 err = -ENOBUFS; 4942 goto errout; 4943 } 4944 4945 skb_dst_set(skb, &rt->dst); 4946 4947 rcu_read_lock(); 4948 from = rcu_dereference(rt->from); 4949 4950 if (fibmatch) 4951 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 4952 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4953 nlh->nlmsg_seq, 0); 4954 else 4955 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 4956 &fl6.saddr, iif, RTM_NEWROUTE, 4957 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 4958 0); 4959 rcu_read_unlock(); 4960 4961 if (err < 0) { 4962 kfree_skb(skb); 4963 goto errout; 4964 } 4965 4966 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 4967 errout: 4968 return err; 4969 } 4970 4971 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 4972 unsigned int nlm_flags) 4973 { 4974 struct sk_buff *skb; 4975 struct net *net = info->nl_net; 4976 u32 seq; 4977 int err; 4978 4979 err = -ENOBUFS; 4980 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0;
4981
4982 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4983 	if (!skb)
4984 		goto errout;
4985
4986 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4987 			    event, info->portid, seq, nlm_flags);
4988 	if (err < 0) {
4989 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4990 		WARN_ON(err == -EMSGSIZE);
4991 		kfree_skb(skb);
4992 		goto errout;
4993 	}
4994 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4995 		    info->nlh, gfp_any());
4996 	return;
4997 errout:
4998 	if (err < 0)
4999 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5000 }
5001
5002 static int ip6_route_dev_notify(struct notifier_block *this,
5003 				unsigned long event, void *ptr)
5004 {
5005 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5006 	struct net *net = dev_net(dev);
5007
5008 	if (!(dev->flags & IFF_LOOPBACK))
5009 		return NOTIFY_OK;
5010
5011 	if (event == NETDEV_REGISTER) {
5012 		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5013 		net->ipv6.ip6_null_entry->dst.dev = dev;
5014 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5015 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5016 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5017 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5018 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5019 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5020 #endif
5021 	} else if (event == NETDEV_UNREGISTER &&
5022 		   dev->reg_state != NETREG_UNREGISTERED) {
5023 		/* NETDEV_UNREGISTER can be fired multiple times by
5024 		 * netdev_wait_allrefs(). Make sure we only call this once.
5025 		 */
5026 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5027 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5028 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5029 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5030 #endif
5031 	}
5032
5033 	return NOTIFY_OK;
5034 }
5035
5036 /*
5037  * /proc
5038  */
5039
5040 #ifdef CONFIG_PROC_FS
5041 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5042 {
5043 	struct net *net = (struct net *)seq->private;
5044 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5045 		   net->ipv6.rt6_stats->fib_nodes,
5046 		   net->ipv6.rt6_stats->fib_route_nodes,
5047 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5048 		   net->ipv6.rt6_stats->fib_rt_entries,
5049 		   net->ipv6.rt6_stats->fib_rt_cache,
5050 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5051 		   net->ipv6.rt6_stats->fib_discarded_routes);
5052
5053 	return 0;
5054 }
5055 #endif	/* CONFIG_PROC_FS */
5056
5057 #ifdef CONFIG_SYSCTL
5058
5059 static
5060 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5061 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5062 {
5063 	struct net *net;
5064 	int delay;
5065 	int ret;
5066 	if (!write)
5067 		return -EINVAL;
5068
5069 	net = (struct net *)ctl->extra1;
5070 	delay = net->ipv6.sysctl.flush_delay;
5071 	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5072 	if (ret)
5073 		return ret;
5074
5075 	fib6_run_gc(delay <= 0 ?
0 : (unsigned long)delay, net, delay > 0); 5076 return 0; 5077 } 5078 5079 static int zero; 5080 static int one = 1; 5081 5082 static struct ctl_table ipv6_route_table_template[] = { 5083 { 5084 .procname = "flush", 5085 .data = &init_net.ipv6.sysctl.flush_delay, 5086 .maxlen = sizeof(int), 5087 .mode = 0200, 5088 .proc_handler = ipv6_sysctl_rtcache_flush 5089 }, 5090 { 5091 .procname = "gc_thresh", 5092 .data = &ip6_dst_ops_template.gc_thresh, 5093 .maxlen = sizeof(int), 5094 .mode = 0644, 5095 .proc_handler = proc_dointvec, 5096 }, 5097 { 5098 .procname = "max_size", 5099 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5100 .maxlen = sizeof(int), 5101 .mode = 0644, 5102 .proc_handler = proc_dointvec, 5103 }, 5104 { 5105 .procname = "gc_min_interval", 5106 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5107 .maxlen = sizeof(int), 5108 .mode = 0644, 5109 .proc_handler = proc_dointvec_jiffies, 5110 }, 5111 { 5112 .procname = "gc_timeout", 5113 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5114 .maxlen = sizeof(int), 5115 .mode = 0644, 5116 .proc_handler = proc_dointvec_jiffies, 5117 }, 5118 { 5119 .procname = "gc_interval", 5120 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5121 .maxlen = sizeof(int), 5122 .mode = 0644, 5123 .proc_handler = proc_dointvec_jiffies, 5124 }, 5125 { 5126 .procname = "gc_elasticity", 5127 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5128 .maxlen = sizeof(int), 5129 .mode = 0644, 5130 .proc_handler = proc_dointvec, 5131 }, 5132 { 5133 .procname = "mtu_expires", 5134 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5135 .maxlen = sizeof(int), 5136 .mode = 0644, 5137 .proc_handler = proc_dointvec_jiffies, 5138 }, 5139 { 5140 .procname = "min_adv_mss", 5141 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5142 .maxlen = sizeof(int), 5143 .mode = 0644, 5144 .proc_handler = proc_dointvec, 5145 }, 5146 { 5147 .procname = "gc_min_interval_ms", 5148 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5149 .maxlen = sizeof(int), 5150 .mode = 0644, 5151 .proc_handler = proc_dointvec_ms_jiffies, 5152 }, 5153 { 5154 .procname = "skip_notify_on_dev_down", 5155 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, 5156 .maxlen = sizeof(int), 5157 .mode = 0644, 5158 .proc_handler = proc_dointvec, 5159 .extra1 = &zero, 5160 .extra2 = &one, 5161 }, 5162 { } 5163 }; 5164 5165 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5166 { 5167 struct ctl_table *table; 5168 5169 table = kmemdup(ipv6_route_table_template, 5170 sizeof(ipv6_route_table_template), 5171 GFP_KERNEL); 5172 5173 if (table) { 5174 table[0].data = &net->ipv6.sysctl.flush_delay; 5175 table[0].extra1 = net; 5176 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5177 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5178 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5179 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5180 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5181 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5182 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5183 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5184 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5185 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; 5186 5187 /* Don't export sysctls to unprivileged users */ 5188 if (net->user_ns != &init_user_ns) 5189 table[0].procname = NULL; 5190 } 5191 5192 return table; 5193 } 5194 #endif 5195 5196 static int __net_init ip6_route_net_init(struct net *net) 5197 { 5198 int ret = -ENOMEM; 5199 
5200 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5201 sizeof(net->ipv6.ip6_dst_ops)); 5202 5203 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5204 goto out_ip6_dst_ops; 5205 5206 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5207 sizeof(*net->ipv6.fib6_null_entry), 5208 GFP_KERNEL); 5209 if (!net->ipv6.fib6_null_entry) 5210 goto out_ip6_dst_entries; 5211 5212 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5213 sizeof(*net->ipv6.ip6_null_entry), 5214 GFP_KERNEL); 5215 if (!net->ipv6.ip6_null_entry) 5216 goto out_fib6_null_entry; 5217 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5218 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5219 ip6_template_metrics, true); 5220 5221 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5222 net->ipv6.fib6_has_custom_rules = false; 5223 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5224 sizeof(*net->ipv6.ip6_prohibit_entry), 5225 GFP_KERNEL); 5226 if (!net->ipv6.ip6_prohibit_entry) 5227 goto out_ip6_null_entry; 5228 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5229 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5230 ip6_template_metrics, true); 5231 5232 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5233 sizeof(*net->ipv6.ip6_blk_hole_entry), 5234 GFP_KERNEL); 5235 if (!net->ipv6.ip6_blk_hole_entry) 5236 goto out_ip6_prohibit_entry; 5237 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5238 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5239 ip6_template_metrics, true); 5240 #endif 5241 5242 net->ipv6.sysctl.flush_delay = 0; 5243 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5244 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5245 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5246 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5247 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5248 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5249 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5250 net->ipv6.sysctl.skip_notify_on_dev_down = 0; 5251 5252 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5253 5254 ret = 0; 5255 out: 5256 return ret; 5257 5258 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5259 out_ip6_prohibit_entry: 5260 kfree(net->ipv6.ip6_prohibit_entry); 5261 out_ip6_null_entry: 5262 kfree(net->ipv6.ip6_null_entry); 5263 #endif 5264 out_fib6_null_entry: 5265 kfree(net->ipv6.fib6_null_entry); 5266 out_ip6_dst_entries: 5267 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5268 out_ip6_dst_ops: 5269 goto out; 5270 } 5271 5272 static void __net_exit ip6_route_net_exit(struct net *net) 5273 { 5274 kfree(net->ipv6.fib6_null_entry); 5275 kfree(net->ipv6.ip6_null_entry); 5276 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5277 kfree(net->ipv6.ip6_prohibit_entry); 5278 kfree(net->ipv6.ip6_blk_hole_entry); 5279 #endif 5280 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5281 } 5282 5283 static int __net_init ip6_route_net_init_late(struct net *net) 5284 { 5285 #ifdef CONFIG_PROC_FS 5286 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5287 sizeof(struct ipv6_route_iter)); 5288 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5289 rt6_stats_seq_show, NULL); 5290 #endif 5291 return 0; 5292 } 5293 5294 static void __net_exit ip6_route_net_exit_late(struct net *net) 5295 { 5296 #ifdef CONFIG_PROC_FS 5297 remove_proc_entry("ipv6_route", net->proc_net); 5298 remove_proc_entry("rt6_stats", net->proc_net); 5299 #endif 5300 } 5301 5302 static struct pernet_operations ip6_route_net_ops = { 5303 .init = ip6_route_net_init, 5304 .exit = 
ip6_route_net_exit, 5305 }; 5306 5307 static int __net_init ipv6_inetpeer_init(struct net *net) 5308 { 5309 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5310 5311 if (!bp) 5312 return -ENOMEM; 5313 inet_peer_base_init(bp); 5314 net->ipv6.peers = bp; 5315 return 0; 5316 } 5317 5318 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5319 { 5320 struct inet_peer_base *bp = net->ipv6.peers; 5321 5322 net->ipv6.peers = NULL; 5323 inetpeer_invalidate_tree(bp); 5324 kfree(bp); 5325 } 5326 5327 static struct pernet_operations ipv6_inetpeer_ops = { 5328 .init = ipv6_inetpeer_init, 5329 .exit = ipv6_inetpeer_exit, 5330 }; 5331 5332 static struct pernet_operations ip6_route_net_late_ops = { 5333 .init = ip6_route_net_init_late, 5334 .exit = ip6_route_net_exit_late, 5335 }; 5336 5337 static struct notifier_block ip6_route_dev_notifier = { 5338 .notifier_call = ip6_route_dev_notify, 5339 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5340 }; 5341 5342 void __init ip6_route_init_special_entries(void) 5343 { 5344 /* Registering of the loopback is done before this portion of code, 5345 * the loopback reference in rt6_info will not be taken, do it 5346 * manually for init_net */ 5347 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; 5348 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5349 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5350 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5351 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5352 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5353 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5354 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5355 #endif 5356 } 5357 5358 int __init ip6_route_init(void) 5359 { 5360 int ret; 5361 int cpu; 5362 5363 ret = -ENOMEM; 5364 ip6_dst_ops_template.kmem_cachep = 5365 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5366 SLAB_HWCACHE_ALIGN, NULL); 5367 if (!ip6_dst_ops_template.kmem_cachep) 5368 goto out; 5369 5370 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5371 if (ret) 5372 goto out_kmem_cache; 5373 5374 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5375 if (ret) 5376 goto out_dst_entries; 5377 5378 ret = register_pernet_subsys(&ip6_route_net_ops); 5379 if (ret) 5380 goto out_register_inetpeer; 5381 5382 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5383 5384 ret = fib6_init(); 5385 if (ret) 5386 goto out_register_subsys; 5387 5388 ret = xfrm6_init(); 5389 if (ret) 5390 goto out_fib6_init; 5391 5392 ret = fib6_rules_init(); 5393 if (ret) 5394 goto xfrm6_init; 5395 5396 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5397 if (ret) 5398 goto fib6_rules_init; 5399 5400 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5401 inet6_rtm_newroute, NULL, 0); 5402 if (ret < 0) 5403 goto out_register_late_subsys; 5404 5405 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5406 inet6_rtm_delroute, NULL, 0); 5407 if (ret < 0) 5408 goto out_register_late_subsys; 5409 5410 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5411 inet6_rtm_getroute, NULL, 5412 RTNL_FLAG_DOIT_UNLOCKED); 5413 if (ret < 0) 5414 goto out_register_late_subsys; 5415 5416 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5417 if (ret) 5418 goto out_register_late_subsys; 5419 5420 for_each_possible_cpu(cpu) { 5421 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, 
cpu); 5422 5423 INIT_LIST_HEAD(&ul->head); 5424 spin_lock_init(&ul->lock); 5425 } 5426 5427 out: 5428 return ret; 5429 5430 out_register_late_subsys: 5431 rtnl_unregister_all(PF_INET6); 5432 unregister_pernet_subsys(&ip6_route_net_late_ops); 5433 fib6_rules_init: 5434 fib6_rules_cleanup(); 5435 xfrm6_init: 5436 xfrm6_fini(); 5437 out_fib6_init: 5438 fib6_gc_cleanup(); 5439 out_register_subsys: 5440 unregister_pernet_subsys(&ip6_route_net_ops); 5441 out_register_inetpeer: 5442 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5443 out_dst_entries: 5444 dst_entries_destroy(&ip6_dst_blackhole_ops); 5445 out_kmem_cache: 5446 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5447 goto out; 5448 } 5449 5450 void ip6_route_cleanup(void) 5451 { 5452 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5453 unregister_pernet_subsys(&ip6_route_net_late_ops); 5454 fib6_rules_cleanup(); 5455 xfrm6_fini(); 5456 fib6_gc_cleanup(); 5457 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5458 unregister_pernet_subsys(&ip6_route_net_ops); 5459 dst_entries_destroy(&ip6_dst_blackhole_ops); 5460 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5461 } 5462
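
/* Illustrative userspace sketch (not part of route.c): an administrative MTU
 * change of the kind that funnels into rt6_mtu_change() above via addrconf's
 * NETDEV_CHANGEMTU handling. As the comment in rt6_mtu_change_route() notes,
 * an IPv6 PMTU *increase* cannot be discovered on the wire, so the FIB walk
 * must apply it. The interface name is a placeholder.
 */
#include <net/if.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int set_mtu(const char *ifname, int mtu)
{
	struct ifreq ifr = {0};
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	ifr.ifr_mtu = mtu;
	if (ioctl(fd, SIOCSIFMTU, &ifr) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;	/* e.g. set_mtu("eth0", 9000) for jumbo frames */
}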
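
/* Illustrative userspace sketch (not part of route.c): building the kind of
 * RTM_NEWROUTE request that rtm_to_fib6_config() above parses. The attribute
 * layout follows rtm_ipv6_policy: RTA_DST/RTA_GATEWAY carry a struct
 * in6_addr, RTA_OIF a u32. Addresses use the 2001:db8:: documentation prefix
 * and the ifindex is a placeholder; ack handling is elided and CAP_NET_ADMIN
 * is needed for the request to succeed.
 */
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <arpa/inet.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static void add_rtattr(struct nlmsghdr *nlh, unsigned short type,
		       const void *data, unsigned short len)
{
	struct rtattr *rta = (struct rtattr *)
		((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	memcpy(RTA_DATA(rta), data, len);
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

int main(void)
{
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	char buf[512] = {0};
	struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
	struct rtmsg *rtm;
	struct in6_addr dst, gw;
	unsigned int oif = 2;	/* placeholder ifindex */
	int fd;

	nlh->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
	nlh->nlmsg_type = RTM_NEWROUTE;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;

	rtm = NLMSG_DATA(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = 64;			/* a /64 prefix route */
	rtm->rtm_table = RT_TABLE_MAIN;
	rtm->rtm_protocol = RTPROT_STATIC;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_type = RTN_UNICAST;

	inet_pton(AF_INET6, "2001:db8:1::", &dst);
	inet_pton(AF_INET6, "fe80::1", &gw);
	add_rtattr(nlh, RTA_DST, &dst, sizeof(dst));
	add_rtattr(nlh, RTA_GATEWAY, &gw, sizeof(gw));	/* implies RTF_GATEWAY */
	add_rtattr(nlh, RTA_OIF, &oif, sizeof(oif));

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return 1;
	sendto(fd, nlh, nlh->nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa));
	close(fd);
	return 0;
}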
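
/* Illustrative userspace sketch (not part of route.c): the RTA_MULTIPATH
 * payload walked by ip6_route_multipath_add() above is an array of struct
 * rtnexthop, each immediately followed by its own nested attributes such as
 * RTA_GATEWAY. A command like
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 nexthop via fe80::2 dev eth1
 * (device names are placeholders) produces such a payload. The RTNH_*/RTA_*
 * accessors below come from <linux/rtnetlink.h> and mirror the kernel's
 * rtnh_ok()/rtnh_next() loop.
 */
#include <linux/rtnetlink.h>
#include <stdio.h>

void walk_multipath(const struct rtattr *mp)
{
	const struct rtnexthop *rtnh = RTA_DATA(mp);
	int remaining = RTA_PAYLOAD(mp);

	while (RTNH_OK(rtnh, remaining)) {
		int attrlen = rtnh->rtnh_len - RTNH_LENGTH(0);

		/* rtnh_hops is weight - 1, as in rt6_add_nexthop() above */
		printf("nexthop: ifindex %d weight %d\n",
		       rtnh->rtnh_ifindex, rtnh->rtnh_hops + 1);

		if (attrlen > 0) {
			const struct rtattr *rta = RTNH_DATA(rtnh);

			for (; RTA_OK(rta, attrlen);
			     rta = RTA_NEXT(rta, attrlen))
				if (rta->rta_type == RTA_GATEWAY)
					printf("  gateway attribute, %d bytes\n",
					       (int)RTA_PAYLOAD(rta));
		}

		remaining -= RTNH_ALIGN(rtnh->rtnh_len);
		rtnh = RTNH_NEXT(rtnh);
	}
}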
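
/* Illustrative sketch (not part of route.c): the arithmetic in
 * rt6_nlmsg_size() above is plain header-plus-padding accounting. The same
 * numbers fall out of the NLA_HDRLEN/NLA_ALIGN macros in <linux/netlink.h>:
 * nla_total_size(payload) is NLA_ALIGN(NLA_HDRLEN + payload), so a 16-byte
 * address attribute costs 20 bytes and a u32 costs 8.
 */
#include <linux/netlink.h>
#include <stdio.h>

static int nla_total_size_user(int payload)	/* userspace twin of the helper */
{
	return NLA_ALIGN(NLA_HDRLEN + payload);
}

int main(void)
{
	printf("in6_addr attr (e.g. RTA_GATEWAY): %d bytes\n",
	       nla_total_size_user(16));	/* 20 */
	printf("u32 attr (e.g. RTA_OIF): %d bytes\n",
	       nla_total_size_user(4));		/* 8 */
	return 0;
}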
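
/* Illustrative userspace sketch (not part of route.c): a minimal
 * RTM_GETROUTE request against inet6_rtm_getroute() above. Setting
 * RTM_F_FIB_MATCH in rtm_flags selects the fibmatch branch, so the reply
 * describes the matched FIB entry rather than the computed dst (what
 * `ip -6 route get fibmatch <addr>` does). The address is a
 * documentation-prefix placeholder and reply parsing is elided.
 */
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		struct rtattr rta;
		struct in6_addr dst;
	} req = {0};
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	char reply[4096];
	ssize_t len;
	int fd;

	req.nlh.nlmsg_len = sizeof(req);
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET6;
	req.rtm.rtm_dst_len = 128;
	req.rtm.rtm_flags = RTM_F_FIB_MATCH;
	req.rta.rta_type = RTA_DST;
	req.rta.rta_len = RTA_LENGTH(sizeof(req.dst));
	inet_pton(AF_INET6, "2001:db8::1", &req.dst);

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return 1;
	sendto(fd, &req, sizeof(req), 0, (struct sockaddr *)&sa, sizeof(sa));
	len = recv(fd, reply, sizeof(reply), 0);	/* RTM_NEWROUTE or error */
	printf("reply: %zd bytes\n", len);
	close(fd);
	return 0;
}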
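
/* Illustrative usage sketch (not part of route.c): the write-only "flush"
 * sysctl installed above (mode 0200; read attempts get -EINVAL from
 * ipv6_sysctl_rtcache_flush()) kicks fib6_run_gc() when written. This is the
 * C equivalent of `echo 0 > /proc/sys/net/ipv6/route/flush`.
 */
#include <fcntl.h>
#include <unistd.h>

int flush_ipv6_routes(void)
{
	int fd = open("/proc/sys/net/ipv6/route/flush", O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, "0\n", 2);	/* value lands via proc_dointvec() */
	close(fd);
	return n == 2 ? 0 : -1;
}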