/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static void rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(&rt->from->dst);
}

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}
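
/* Both ip6_neigh_lookup() above and ip6_confirm_neigh() below derive the
 * neighbour key via choose_neigh_daddr(): the gateway address for indirect
 * routes, otherwise the packet's (or caller's) destination address.  So a
 * flow to, say, 2001:db8::1 via gateway fe80::1 resolves the ndisc entry
 * for fe80::1, not for the final destination (addresses hypothetical).
 */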

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= ipv6_cow_metrics,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (!rt->rt6i_pcpu) {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	rt->from = NULL;
	dst_release(&from->dst);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired(rt->from);
	}
	return false;
}

static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);

	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
				 rt6i_siblings) {
		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
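
/* Illustration of the hash-threshold selection above, with hypothetical
 * bounds: three siblings whose rt6i_nh_upper_bound values split the 31-bit
 * mp_hash space as
 *
 *	nexthop A: upper bound 0x2aaaaaaa	(~1/3)
 *	nexthop B: upper bound 0x55555554	(~2/3)
 *	nexthop C: upper bound 0x7fffffff
 *
 * A flow hashing to 0x30000000 skips A (its bound is below the hash) and
 * picks B, the first sibling whose upper bound covers the hash and whose
 * score is acceptable.  The bounds themselves are computed elsewhere from
 * the configured nexthop weights.
 */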

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						const struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}

	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;

	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
		goto out;

	if (idev->cnf.ignore_routes_with_linkdown &&
	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
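
/* Scoring sketch for find_match()/rt6_score_route() above, with hypothetical
 * numbers: an exact oif match contributes m = 2; with CONFIG_IPV6_ROUTER_PREF
 * the decoded router preference lands at bits 2-3 (m |= pref << 2), so a
 * high-preference route on a non-matching device can still outrank an exact
 * device match when the oif check is not strict.  RT6_NUD_FAIL_HARD
 * disqualifies a candidate outright, while RT6_NUD_FAIL_DO_RR is mapped to
 * the lowest valid score and requests round-robin rotation in rt6_select()
 * below.
 */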

static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.ip6_null_entry)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2)
			return -EINVAL;
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1)
			return -EINVAL;
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}
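
/* Lookups below run under rcu_read_lock(), so an entry found in the tree may
 * already have dropped its last reference.  ip6_hold_safe() only succeeds if
 * the refcount is still non-zero; with null_fallback the always-valid
 * ip6_null_entry is substituted (and held) instead, which is why callers
 * such as ip6_pol_route_lookup() can return *prt unconditionally.
 */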

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(rt, fl6,
						  fl6->flowi6_oif, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through the exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}
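
/* The per-cpu slot is claimed with cmpxchg() below.  Callers only get here
 * from ip6_pol_route() after rt6_get_pcpu_route() returned NULL, with BHs
 * disabled, so no other writer can race for this CPU's slot and the
 * BUG_ON(prev) documents that the slot must still be NULL.
 */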

static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
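
/* Bucket selection sketch (hypothetical values): with
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT of, say, 10, rt6_exception_hash() folds
 * the jhash of the 128-bit destination (and, with subtrees, the source)
 * into a 10-bit index, and the find helpers below simply advance the
 * caller's bucket pointer by that index (*bucket += hval) before walking
 * the hlist chain.
 */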

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->rt6i_table->tb6_lock);
		fib6_update_sernum(ort);
		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
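
/* Typical producer/consumer flow (a sketch; all functions are in this file):
 * __ip6_rt_update_pmtu() clones the matched route with ip6_rt_cache_alloc(),
 * stamps the lowered MTU on the clone, and publishes it via
 * rt6_insert_exception(); later lookups pick it up through
 * rt6_find_cached_rt(), and rt6_age_exceptions() eventually prunes it.
 */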

void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;
				/* For RTF_CACHE with rt6i_pmtu == 0
				 * (i.e. a redirected route),
				 * the metrics of its rt->dst.from have already
				 * been updated.
				 */
				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
					entry->rt6i_pmtu = mtu;
			}
			bucket++;
		}
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still have references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock_bh(&rt6_exception_lock);
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	/* Search through the exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (rt == net->ipv6.ip6_null_entry) {
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		}
		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
			rcu_read_unlock();
			uncached_rt = rt;
			goto uncached_rt_out;
		}
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

uncached_rt_out:
		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (!pcpu_rt) {
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				pcpu_rt = rt6_make_pcpu_route(rt);
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
		}
		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	memset(keys, 0, sizeof(*keys));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->addrs.v6addrs.src = key_iph->saddr;
	keys->addrs.v6addrs.dst = key_iph->daddr;
	keys->tags.flow_label = ip6_flowinfo(key_iph);
	keys->basic.ip_proto = key_iph->nexthdr;
}
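
/* For ICMPv6 errors the keys above come from the embedded (offending)
 * header, so the error is hashed like the flow that triggered it and
 * follows the same multipath nexthop.  E.g. a hypothetical Packet Too Big
 * for a flow 2001:db8::a -> 2001:db8::b hashes on that inner address pair
 * rather than on the reporting router's own source address.
 */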

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
{
	struct flow_keys hash_keys;

	if (skb) {
		ip6_multipath_l3_keys(skb, &hash_keys);
		return flow_hash_from_keys(&hash_keys) >> 1;
	}

	return get_hash_from_flowi6(fl6) >> 1;
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (rt->from &&
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
}

static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check(rt->from, cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
	       (rt->rt6i_flags & RTF_PCPU ||
		rcu_access_pointer(rt->rt6i_node));
}
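
/* PMTU updates below follow RFC 8201: when the dst is itself an RTF_CACHE
 * clone (or is no longer linked into the tree) its rt6i_pmtu is lowered in
 * place; otherwise a host-route exception carrying the reduced MTU is
 * created via ip6_rt_cache_alloc() + rt6_insert_exception(), leaving the
 * parent route untouched.
 */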
2065 return; 2066 2067 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2068 rt6_do_update_pmtu(rt6, mtu); 2069 /* update rt6_ex->stamp for cache */ 2070 if (rt6->rt6i_flags & RTF_CACHE) 2071 rt6_update_exception_stamp_rt(rt6); 2072 } else if (daddr) { 2073 struct rt6_info *nrt6; 2074 2075 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); 2076 if (nrt6) { 2077 rt6_do_update_pmtu(nrt6, mtu); 2078 if (rt6_insert_exception(nrt6, rt6)) 2079 dst_release_immediate(&nrt6->dst); 2080 } 2081 } 2082 } 2083 2084 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2085 struct sk_buff *skb, u32 mtu) 2086 { 2087 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2088 } 2089 2090 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2091 int oif, u32 mark, kuid_t uid) 2092 { 2093 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2094 struct dst_entry *dst; 2095 struct flowi6 fl6; 2096 2097 memset(&fl6, 0, sizeof(fl6)); 2098 fl6.flowi6_oif = oif; 2099 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2100 fl6.daddr = iph->daddr; 2101 fl6.saddr = iph->saddr; 2102 fl6.flowlabel = ip6_flowinfo(iph); 2103 fl6.flowi6_uid = uid; 2104 2105 dst = ip6_route_output(net, NULL, &fl6); 2106 if (!dst->error) 2107 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2108 dst_release(dst); 2109 } 2110 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2111 2112 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2113 { 2114 struct dst_entry *dst; 2115 2116 ip6_update_pmtu(skb, sock_net(sk), mtu, 2117 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2118 2119 dst = __sk_dst_get(sk); 2120 if (!dst || !dst->obsolete || 2121 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2122 return; 2123 2124 bh_lock_sock(sk); 2125 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2126 ip6_datagram_dst_update(sk, false); 2127 bh_unlock_sock(sk); 2128 } 2129 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2130 2131 /* Handle redirects */ 2132 struct ip6rd_flowi { 2133 struct flowi6 fl6; 2134 struct in6_addr gateway; 2135 }; 2136 2137 static struct rt6_info *__ip6_route_redirect(struct net *net, 2138 struct fib6_table *table, 2139 struct flowi6 *fl6, 2140 int flags) 2141 { 2142 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2143 struct rt6_info *rt, *rt_cache; 2144 struct fib6_node *fn; 2145 2146 /* Get the "current" route for this destination and 2147 * check if the redirect has come from appropriate router. 2148 * 2149 * RFC 4861 specifies that redirects should only be 2150 * accepted if they come from the nexthop to the target. 2151 * Due to the way the routes are chosen, this notion 2152 * is a bit fuzzy and one might need to check all possible 2153 * routes. 2154 */ 2155 2156 rcu_read_lock(); 2157 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2158 restart: 2159 for_each_fib6_node_rt_rcu(fn) { 2160 if (rt->rt6i_nh_flags & RTNH_F_DEAD) 2161 continue; 2162 if (rt6_check_expired(rt)) 2163 continue; 2164 if (rt->dst.error) 2165 break; 2166 if (!(rt->rt6i_flags & RTF_GATEWAY)) 2167 continue; 2168 if (fl6->flowi6_oif != rt->dst.dev->ifindex) 2169 continue; 2170 /* rt_cache's gateway might be different from its 'parent' 2171 * in the case of an ip redirect. 2172 * So we keep searching in the exception table if the gateway 2173 * is different. 
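 * (A redirect rewrites only the cached exception, so the parent route may still point at the old router while the exception already matches rdfl->gateway.)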
2174 */ 2175 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { 2176 rt_cache = rt6_find_cached_rt(rt, 2177 &fl6->daddr, 2178 &fl6->saddr); 2179 if (rt_cache && 2180 ipv6_addr_equal(&rdfl->gateway, 2181 &rt_cache->rt6i_gateway)) { 2182 rt = rt_cache; 2183 break; 2184 } 2185 continue; 2186 } 2187 break; 2188 } 2189 2190 if (!rt) 2191 rt = net->ipv6.ip6_null_entry; 2192 else if (rt->dst.error) { 2193 rt = net->ipv6.ip6_null_entry; 2194 goto out; 2195 } 2196 2197 if (rt == net->ipv6.ip6_null_entry) { 2198 fn = fib6_backtrack(fn, &fl6->saddr); 2199 if (fn) 2200 goto restart; 2201 } 2202 2203 out: 2204 ip6_hold_safe(net, &rt, true); 2205 2206 rcu_read_unlock(); 2207 2208 trace_fib6_table_lookup(net, rt, table, fl6); 2209 return rt; 2210 } 2211 2212 static struct dst_entry *ip6_route_redirect(struct net *net, 2213 const struct flowi6 *fl6, 2214 const struct in6_addr *gateway) 2215 { 2216 int flags = RT6_LOOKUP_F_HAS_SADDR; 2217 struct ip6rd_flowi rdfl; 2218 2219 rdfl.fl6 = *fl6; 2220 rdfl.gateway = *gateway; 2221 2222 return fib6_rule_lookup(net, &rdfl.fl6, 2223 flags, __ip6_route_redirect); 2224 } 2225 2226 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2227 kuid_t uid) 2228 { 2229 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2230 struct dst_entry *dst; 2231 struct flowi6 fl6; 2232 2233 memset(&fl6, 0, sizeof(fl6)); 2234 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2235 fl6.flowi6_oif = oif; 2236 fl6.flowi6_mark = mark; 2237 fl6.daddr = iph->daddr; 2238 fl6.saddr = iph->saddr; 2239 fl6.flowlabel = ip6_flowinfo(iph); 2240 fl6.flowi6_uid = uid; 2241 2242 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); 2243 rt6_do_redirect(dst, NULL, skb); 2244 dst_release(dst); 2245 } 2246 EXPORT_SYMBOL_GPL(ip6_redirect); 2247 2248 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2249 u32 mark) 2250 { 2251 const struct ipv6hdr *iph = ipv6_hdr(skb); 2252 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2253 struct dst_entry *dst; 2254 struct flowi6 fl6; 2255 2256 memset(&fl6, 0, sizeof(fl6)); 2257 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2258 fl6.flowi6_oif = oif; 2259 fl6.flowi6_mark = mark; 2260 fl6.daddr = msg->dest; 2261 fl6.saddr = iph->daddr; 2262 fl6.flowi6_uid = sock_net_uid(net, NULL); 2263 2264 dst = ip6_route_redirect(net, &fl6, &iph->saddr); 2265 rt6_do_redirect(dst, NULL, skb); 2266 dst_release(dst); 2267 } 2268 2269 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2270 { 2271 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2272 sk->sk_uid); 2273 } 2274 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2275 2276 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2277 { 2278 struct net_device *dev = dst->dev; 2279 unsigned int mtu = dst_mtu(dst); 2280 struct net *net = dev_net(dev); 2281 2282 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2283 2284 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2285 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2286 2287 /* 2288 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2289 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
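 * (IPV6_MAXPLEN is 65535, so that cap works out to 65535 - 20 = 65515; a typical 1500 byte MTU instead yields 1500 - 40 - 20 = 1440 above.)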
2290 * IPV6_MAXPLEN is also valid and means: "any MSS, 2291 * rely only on pmtu discovery" 2292 */ 2293 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2294 mtu = IPV6_MAXPLEN; 2295 return mtu; 2296 } 2297 2298 static unsigned int ip6_mtu(const struct dst_entry *dst) 2299 { 2300 const struct rt6_info *rt = (const struct rt6_info *)dst; 2301 unsigned int mtu = rt->rt6i_pmtu; 2302 struct inet6_dev *idev; 2303 2304 if (mtu) 2305 goto out; 2306 2307 mtu = dst_metric_raw(dst, RTAX_MTU); 2308 if (mtu) 2309 goto out; 2310 2311 mtu = IPV6_MIN_MTU; 2312 2313 rcu_read_lock(); 2314 idev = __in6_dev_get(dst->dev); 2315 if (idev) 2316 mtu = idev->cnf.mtu6; 2317 rcu_read_unlock(); 2318 2319 out: 2320 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2321 2322 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2323 } 2324 2325 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2326 struct flowi6 *fl6) 2327 { 2328 struct dst_entry *dst; 2329 struct rt6_info *rt; 2330 struct inet6_dev *idev = in6_dev_get(dev); 2331 struct net *net = dev_net(dev); 2332 2333 if (unlikely(!idev)) 2334 return ERR_PTR(-ENODEV); 2335 2336 rt = ip6_dst_alloc(net, dev, 0); 2337 if (unlikely(!rt)) { 2338 in6_dev_put(idev); 2339 dst = ERR_PTR(-ENOMEM); 2340 goto out; 2341 } 2342 2343 rt->dst.flags |= DST_HOST; 2344 rt->dst.input = ip6_input; 2345 rt->dst.output = ip6_output; 2346 rt->rt6i_gateway = fl6->daddr; 2347 rt->rt6i_dst.addr = fl6->daddr; 2348 rt->rt6i_dst.plen = 128; 2349 rt->rt6i_idev = idev; 2350 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2351 2352 /* Add this dst into uncached_list so that rt6_disable_ip() can 2353 * do proper release of the net_device 2354 */ 2355 rt6_uncached_list_add(rt); 2356 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2357 2358 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2359 2360 out: 2361 return dst; 2362 } 2363 2364 static int ip6_dst_gc(struct dst_ops *ops) 2365 { 2366 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2367 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2368 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2369 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2370 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2371 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2372 int entries; 2373 2374 entries = dst_entries_get_fast(ops); 2375 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2376 entries <= rt_max_size) 2377 goto out; 2378 2379 net->ipv6.ip6_rt_gc_expire++; 2380 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2381 entries = dst_entries_get_slow(ops); 2382 if (entries < ops->gc_thresh) 2383 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2384 out: 2385 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2386 return entries > rt_max_size; 2387 } 2388 2389 static int ip6_convert_metrics(struct mx6_config *mxc, 2390 const struct fib6_config *cfg) 2391 { 2392 struct net *net = cfg->fc_nlinfo.nl_net; 2393 bool ecn_ca = false; 2394 struct nlattr *nla; 2395 int remaining; 2396 u32 *mp; 2397 2398 if (!cfg->fc_mx) 2399 return 0; 2400 2401 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); 2402 if (unlikely(!mp)) 2403 return -ENOMEM; 2404 2405 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { 2406 int type = nla_type(nla); 2407 u32 val; 2408 2409 if (!type) 2410 continue; 2411 if (unlikely(type > RTAX_MAX)) 2412 goto err; 2413 2414 if (type == RTAX_CC_ALGO) { 2415 char tmp[TCP_CA_NAME_MAX]; 2416 2417 nla_strlcpy(tmp, nla, sizeof(tmp)); 2418 val = 
tcp_ca_get_key_by_name(net, tmp, &ecn_ca); 2419 if (val == TCP_CA_UNSPEC) 2420 goto err; 2421 } else { 2422 val = nla_get_u32(nla); 2423 } 2424 if (type == RTAX_HOPLIMIT && val > 255) 2425 val = 255; 2426 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) 2427 goto err; 2428 2429 mp[type - 1] = val; 2430 __set_bit(type - 1, mxc->mx_valid); 2431 } 2432 2433 if (ecn_ca) { 2434 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); 2435 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; 2436 } 2437 2438 mxc->mx = mp; 2439 return 0; 2440 err: 2441 kfree(mp); 2442 return -EINVAL; 2443 } 2444 2445 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2446 struct fib6_config *cfg, 2447 const struct in6_addr *gw_addr, 2448 u32 tbid, int flags) 2449 { 2450 struct flowi6 fl6 = { 2451 .flowi6_oif = cfg->fc_ifindex, 2452 .daddr = *gw_addr, 2453 .saddr = cfg->fc_prefsrc, 2454 }; 2455 struct fib6_table *table; 2456 struct rt6_info *rt; 2457 2458 table = fib6_get_table(net, tbid); 2459 if (!table) 2460 return NULL; 2461 2462 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2463 flags |= RT6_LOOKUP_F_HAS_SADDR; 2464 2465 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2466 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); 2467 2468 /* if table lookup failed, fall back to full lookup */ 2469 if (rt == net->ipv6.ip6_null_entry) { 2470 ip6_rt_put(rt); 2471 rt = NULL; 2472 } 2473 2474 return rt; 2475 } 2476 2477 static int ip6_route_check_nh_onlink(struct net *net, 2478 struct fib6_config *cfg, 2479 struct net_device *dev, 2480 struct netlink_ext_ack *extack) 2481 { 2482 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2483 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2484 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2485 struct rt6_info *grt; 2486 int err; 2487 2488 err = 0; 2489 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2490 if (grt) { 2491 if (!grt->dst.error && 2492 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2493 NL_SET_ERR_MSG(extack, 2494 "Nexthop has invalid gateway or device mismatch"); 2495 err = -EINVAL; 2496 } 2497 2498 ip6_rt_put(grt); 2499 } 2500 2501 return err; 2502 } 2503 2504 static int ip6_route_check_nh(struct net *net, 2505 struct fib6_config *cfg, 2506 struct net_device **_dev, 2507 struct inet6_dev **idev) 2508 { 2509 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2510 struct net_device *dev = _dev ? 
*_dev : NULL; 2511 struct rt6_info *grt = NULL; 2512 int err = -EHOSTUNREACH; 2513 2514 if (cfg->fc_table) { 2515 int flags = RT6_LOOKUP_F_IFACE; 2516 2517 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2518 cfg->fc_table, flags); 2519 if (grt) { 2520 if (grt->rt6i_flags & RTF_GATEWAY || 2521 (dev && dev != grt->dst.dev)) { 2522 ip6_rt_put(grt); 2523 grt = NULL; 2524 } 2525 } 2526 } 2527 2528 if (!grt) 2529 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); 2530 2531 if (!grt) 2532 goto out; 2533 2534 if (dev) { 2535 if (dev != grt->dst.dev) { 2536 ip6_rt_put(grt); 2537 goto out; 2538 } 2539 } else { 2540 *_dev = dev = grt->dst.dev; 2541 *idev = grt->rt6i_idev; 2542 dev_hold(dev); 2543 in6_dev_hold(grt->rt6i_idev); 2544 } 2545 2546 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2547 err = 0; 2548 2549 ip6_rt_put(grt); 2550 2551 out: 2552 return err; 2553 } 2554 2555 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, 2556 struct netlink_ext_ack *extack) 2557 { 2558 struct net *net = cfg->fc_nlinfo.nl_net; 2559 struct rt6_info *rt = NULL; 2560 struct net_device *dev = NULL; 2561 struct inet6_dev *idev = NULL; 2562 struct fib6_table *table; 2563 int addr_type; 2564 int err = -EINVAL; 2565 2566 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2567 if (cfg->fc_flags & RTF_PCPU) { 2568 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2569 goto out; 2570 } 2571 2572 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2573 if (cfg->fc_flags & RTF_CACHE) { 2574 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2575 goto out; 2576 } 2577 2578 if (cfg->fc_dst_len > 128) { 2579 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2580 goto out; 2581 } 2582 if (cfg->fc_src_len > 128) { 2583 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2584 goto out; 2585 } 2586 #ifndef CONFIG_IPV6_SUBTREES 2587 if (cfg->fc_src_len) { 2588 NL_SET_ERR_MSG(extack, 2589 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2590 goto out; 2591 } 2592 #endif 2593 if (cfg->fc_ifindex) { 2594 err = -ENODEV; 2595 dev = dev_get_by_index(net, cfg->fc_ifindex); 2596 if (!dev) 2597 goto out; 2598 idev = in6_dev_get(dev); 2599 if (!idev) 2600 goto out; 2601 } 2602 2603 if (cfg->fc_metric == 0) 2604 cfg->fc_metric = IP6_RT_PRIO_USER; 2605 2606 if (cfg->fc_flags & RTNH_F_ONLINK) { 2607 if (!dev) { 2608 NL_SET_ERR_MSG(extack, 2609 "Nexthop device required for onlink"); 2610 err = -ENODEV; 2611 goto out; 2612 } 2613 2614 if (!(dev->flags & IFF_UP)) { 2615 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2616 err = -ENETDOWN; 2617 goto out; 2618 } 2619 } 2620 2621 err = -ENOBUFS; 2622 if (cfg->fc_nlinfo.nlh && 2623 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2624 table = fib6_get_table(net, cfg->fc_table); 2625 if (!table) { 2626 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2627 table = fib6_new_table(net, cfg->fc_table); 2628 } 2629 } else { 2630 table = fib6_new_table(net, cfg->fc_table); 2631 } 2632 2633 if (!table) 2634 goto out; 2635 2636 rt = ip6_dst_alloc(net, NULL, 2637 (cfg->fc_flags & RTF_ADDRCONF) ? 
0 : DST_NOCOUNT); 2638 2639 if (!rt) { 2640 err = -ENOMEM; 2641 goto out; 2642 } 2643 2644 if (cfg->fc_flags & RTF_EXPIRES) 2645 rt6_set_expires(rt, jiffies + 2646 clock_t_to_jiffies(cfg->fc_expires)); 2647 else 2648 rt6_clean_expires(rt); 2649 2650 if (cfg->fc_protocol == RTPROT_UNSPEC) 2651 cfg->fc_protocol = RTPROT_BOOT; 2652 rt->rt6i_protocol = cfg->fc_protocol; 2653 2654 addr_type = ipv6_addr_type(&cfg->fc_dst); 2655 2656 if (addr_type & IPV6_ADDR_MULTICAST) 2657 rt->dst.input = ip6_mc_input; 2658 else if (cfg->fc_flags & RTF_LOCAL) 2659 rt->dst.input = ip6_input; 2660 else 2661 rt->dst.input = ip6_forward; 2662 2663 rt->dst.output = ip6_output; 2664 2665 if (cfg->fc_encap) { 2666 struct lwtunnel_state *lwtstate; 2667 2668 err = lwtunnel_build_state(cfg->fc_encap_type, 2669 cfg->fc_encap, AF_INET6, cfg, 2670 &lwtstate, extack); 2671 if (err) 2672 goto out; 2673 rt->dst.lwtstate = lwtstate_get(lwtstate); 2674 lwtunnel_set_redirect(&rt->dst); 2675 } 2676 2677 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 2678 rt->rt6i_dst.plen = cfg->fc_dst_len; 2679 if (rt->rt6i_dst.plen == 128) 2680 rt->dst.flags |= DST_HOST; 2681 2682 #ifdef CONFIG_IPV6_SUBTREES 2683 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); 2684 rt->rt6i_src.plen = cfg->fc_src_len; 2685 #endif 2686 2687 rt->rt6i_metric = cfg->fc_metric; 2688 rt->rt6i_nh_weight = 1; 2689 2690 /* We cannot add true routes via loopback here, 2691 they would result in kernel looping; promote them to reject routes 2692 */ 2693 if ((cfg->fc_flags & RTF_REJECT) || 2694 (dev && (dev->flags & IFF_LOOPBACK) && 2695 !(addr_type & IPV6_ADDR_LOOPBACK) && 2696 !(cfg->fc_flags & RTF_LOCAL))) { 2697 /* hold loopback dev/idev if we haven't done so. */ 2698 if (dev != net->loopback_dev) { 2699 if (dev) { 2700 dev_put(dev); 2701 in6_dev_put(idev); 2702 } 2703 dev = net->loopback_dev; 2704 dev_hold(dev); 2705 idev = in6_dev_get(dev); 2706 if (!idev) { 2707 err = -ENODEV; 2708 goto out; 2709 } 2710 } 2711 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; 2712 switch (cfg->fc_type) { 2713 case RTN_BLACKHOLE: 2714 rt->dst.error = -EINVAL; 2715 rt->dst.output = dst_discard_out; 2716 rt->dst.input = dst_discard; 2717 break; 2718 case RTN_PROHIBIT: 2719 rt->dst.error = -EACCES; 2720 rt->dst.output = ip6_pkt_prohibit_out; 2721 rt->dst.input = ip6_pkt_prohibit; 2722 break; 2723 case RTN_THROW: 2724 case RTN_UNREACHABLE: 2725 default: 2726 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN 2727 : (cfg->fc_type == RTN_UNREACHABLE) 2728 ? -EHOSTUNREACH : -ENETUNREACH; 2729 rt->dst.output = ip6_pkt_discard_out; 2730 rt->dst.input = ip6_pkt_discard; 2731 break; 2732 } 2733 goto install_route; 2734 } 2735 2736 if (cfg->fc_flags & RTF_GATEWAY) { 2737 const struct in6_addr *gw_addr; 2738 int gwa_type; 2739 2740 gw_addr = &cfg->fc_gateway; 2741 gwa_type = ipv6_addr_type(gw_addr); 2742 2743 /* if gw_addr is local we will fail to detect this in case 2744 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2745 * will return already-added prefix route via interface that 2746 * prefix route was assigned to, which might be non-loopback. 2747 */ 2748 err = -EINVAL; 2749 if (ipv6_chk_addr_and_flags(net, gw_addr, 2750 gwa_type & IPV6_ADDR_LINKLOCAL ? 2751 dev : NULL, 0, 0)) { 2752 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2753 goto out; 2754 } 2755 rt->rt6i_gateway = *gw_addr; 2756 2757 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { 2758 /* IPv6 strictly inhibits using not link-local 2759 addresses as nexthop address. 
2760 Otherwise, the router will not be able to send redirects. 2761 It is very good, but in some (rare!) circumstances 2762 (SIT, PtP, NBMA NOARP links) it is handy to allow 2763 some exceptions. --ANK 2764 We allow IPv4-mapped nexthops to support RFC4798-type 2765 addressing. 2766 */ 2767 if (!(gwa_type & (IPV6_ADDR_UNICAST | 2768 IPV6_ADDR_MAPPED))) { 2769 NL_SET_ERR_MSG(extack, 2770 "Invalid gateway address"); 2771 goto out; 2772 } 2773 2774 if (cfg->fc_flags & RTNH_F_ONLINK) { 2775 err = ip6_route_check_nh_onlink(net, cfg, dev, 2776 extack); 2777 } else { 2778 err = ip6_route_check_nh(net, cfg, &dev, &idev); 2779 } 2780 if (err) 2781 goto out; 2782 } 2783 err = -EINVAL; 2784 if (!dev) { 2785 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2786 goto out; 2787 } else if (dev->flags & IFF_LOOPBACK) { 2788 NL_SET_ERR_MSG(extack, 2789 "Egress device can not be loopback device for this route"); 2790 goto out; 2791 } 2792 } 2793 2794 err = -ENODEV; 2795 if (!dev) 2796 goto out; 2797 2798 if (!(dev->flags & IFF_UP)) { 2799 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2800 err = -ENETDOWN; 2801 goto out; 2802 } 2803 2804 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 2805 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 2806 NL_SET_ERR_MSG(extack, "Invalid source address"); 2807 err = -EINVAL; 2808 goto out; 2809 } 2810 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; 2811 rt->rt6i_prefsrc.plen = 128; 2812 } else 2813 rt->rt6i_prefsrc.plen = 0; 2814 2815 rt->rt6i_flags = cfg->fc_flags; 2816 2817 install_route: 2818 if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && 2819 !netif_carrier_ok(dev)) 2820 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; 2821 rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 2822 rt->dst.dev = dev; 2823 rt->rt6i_idev = idev; 2824 rt->rt6i_table = table; 2825 2826 cfg->fc_nlinfo.nl_net = dev_net(dev); 2827 2828 return rt; 2829 out: 2830 if (dev) 2831 dev_put(dev); 2832 if (idev) 2833 in6_dev_put(idev); 2834 if (rt) 2835 dst_release_immediate(&rt->dst); 2836 2837 return ERR_PTR(err); 2838 } 2839 2840 int ip6_route_add(struct fib6_config *cfg, 2841 struct netlink_ext_ack *extack) 2842 { 2843 struct mx6_config mxc = { .mx = NULL, }; 2844 struct rt6_info *rt; 2845 int err; 2846 2847 rt = ip6_route_info_create(cfg, extack); 2848 if (IS_ERR(rt)) { 2849 err = PTR_ERR(rt); 2850 rt = NULL; 2851 goto out; 2852 } 2853 2854 err = ip6_convert_metrics(&mxc, cfg); 2855 if (err) 2856 goto out; 2857 2858 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack); 2859 2860 kfree(mxc.mx); 2861 2862 return err; 2863 out: 2864 if (rt) 2865 dst_release_immediate(&rt->dst); 2866 2867 return err; 2868 } 2869 2870 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) 2871 { 2872 int err; 2873 struct fib6_table *table; 2874 struct net *net = dev_net(rt->dst.dev); 2875 2876 if (rt == net->ipv6.ip6_null_entry) { 2877 err = -ENOENT; 2878 goto out; 2879 } 2880 2881 table = rt->rt6i_table; 2882 spin_lock_bh(&table->tb6_lock); 2883 err = fib6_del(rt, info); 2884 spin_unlock_bh(&table->tb6_lock); 2885 2886 out: 2887 ip6_rt_put(rt); 2888 return err; 2889 } 2890 2891 int ip6_del_rt(struct rt6_info *rt) 2892 { 2893 struct nl_info info = { 2894 .nl_net = dev_net(rt->dst.dev), 2895 }; 2896 return __ip6_del_rt(rt, &info); 2897 } 2898 2899 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) 2900 { 2901 struct nl_info *info = &cfg->fc_nlinfo; 2902 struct net *net = info->nl_net; 2903 struct sk_buff *skb = NULL; 2904 struct fib6_table *table; 2905 int err = -ENOENT; 2906 2907 if (rt == 
net->ipv6.ip6_null_entry) 2908 goto out_put; 2909 table = rt->rt6i_table; 2910 spin_lock_bh(&table->tb6_lock); 2911 2912 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { 2913 struct rt6_info *sibling, *next_sibling; 2914 2915 /* prefer to send a single notification with all hops */ 2916 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 2917 if (skb) { 2918 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 2919 2920 if (rt6_fill_node(net, skb, rt, 2921 NULL, NULL, 0, RTM_DELROUTE, 2922 info->portid, seq, 0) < 0) { 2923 kfree_skb(skb); 2924 skb = NULL; 2925 } else 2926 info->skip_notify = 1; 2927 } 2928 2929 list_for_each_entry_safe(sibling, next_sibling, 2930 &rt->rt6i_siblings, 2931 rt6i_siblings) { 2932 err = fib6_del(sibling, info); 2933 if (err) 2934 goto out_unlock; 2935 } 2936 } 2937 2938 err = fib6_del(rt, info); 2939 out_unlock: 2940 spin_unlock_bh(&table->tb6_lock); 2941 out_put: 2942 ip6_rt_put(rt); 2943 2944 if (skb) { 2945 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 2946 info->nlh, gfp_any()); 2947 } 2948 return err; 2949 } 2950 2951 static int ip6_route_del(struct fib6_config *cfg, 2952 struct netlink_ext_ack *extack) 2953 { 2954 struct rt6_info *rt, *rt_cache; 2955 struct fib6_table *table; 2956 struct fib6_node *fn; 2957 int err = -ESRCH; 2958 2959 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 2960 if (!table) { 2961 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 2962 return err; 2963 } 2964 2965 rcu_read_lock(); 2966 2967 fn = fib6_locate(&table->tb6_root, 2968 &cfg->fc_dst, cfg->fc_dst_len, 2969 &cfg->fc_src, cfg->fc_src_len, 2970 !(cfg->fc_flags & RTF_CACHE)); 2971 2972 if (fn) { 2973 for_each_fib6_node_rt_rcu(fn) { 2974 if (cfg->fc_flags & RTF_CACHE) { 2975 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 2976 &cfg->fc_src); 2977 if (!rt_cache) 2978 continue; 2979 rt = rt_cache; 2980 } 2981 if (cfg->fc_ifindex && 2982 (!rt->dst.dev || 2983 rt->dst.dev->ifindex != cfg->fc_ifindex)) 2984 continue; 2985 if (cfg->fc_flags & RTF_GATEWAY && 2986 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 2987 continue; 2988 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) 2989 continue; 2990 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) 2991 continue; 2992 if (!dst_hold_safe(&rt->dst)) 2993 break; 2994 rcu_read_unlock(); 2995 2996 /* if gateway was specified only delete the one hop */ 2997 if (cfg->fc_flags & RTF_GATEWAY) 2998 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 2999 3000 return __ip6_del_rt_siblings(rt, cfg); 3001 } 3002 } 3003 rcu_read_unlock(); 3004 3005 return err; 3006 } 3007 3008 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3009 { 3010 struct netevent_redirect netevent; 3011 struct rt6_info *rt, *nrt = NULL; 3012 struct ndisc_options ndopts; 3013 struct inet6_dev *in6_dev; 3014 struct neighbour *neigh; 3015 struct rd_msg *msg; 3016 int optlen, on_link; 3017 u8 *lladdr; 3018 3019 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3020 optlen -= sizeof(*msg); 3021 3022 if (optlen < 0) { 3023 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3024 return; 3025 } 3026 3027 msg = (struct rd_msg *)icmp6_hdr(skb); 3028 3029 if (ipv6_addr_is_multicast(&msg->dest)) { 3030 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3031 return; 3032 } 3033 3034 on_link = 0; 3035 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3036 on_link = 1; 3037 } else if (ipv6_addr_type(&msg->target) != 3038 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3039 
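/* RFC 4861: a redirect target that differs from the destination names a router and must therefore be a link-local address */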
net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3040 return; 3041 } 3042 3043 in6_dev = __in6_dev_get(skb->dev); 3044 if (!in6_dev) 3045 return; 3046 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3047 return; 3048 3049 /* RFC2461 8.1: 3050 * The IP source address of the Redirect MUST be the same as the current 3051 * first-hop router for the specified ICMP Destination Address. 3052 */ 3053 3054 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3055 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3056 return; 3057 } 3058 3059 lladdr = NULL; 3060 if (ndopts.nd_opts_tgt_lladdr) { 3061 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3062 skb->dev); 3063 if (!lladdr) { 3064 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3065 return; 3066 } 3067 } 3068 3069 rt = (struct rt6_info *) dst; 3070 if (rt->rt6i_flags & RTF_REJECT) { 3071 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3072 return; 3073 } 3074 3075 /* Redirect received -> path was valid. 3076 * Look, redirects are sent only in response to data packets, 3077 * so this nexthop apparently is reachable. --ANK 3078 */ 3079 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3080 3081 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3082 if (!neigh) 3083 return; 3084 3085 /* 3086 * We have finally decided to accept it. 3087 */ 3088 3089 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3090 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3091 NEIGH_UPDATE_F_OVERRIDE| 3092 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3093 NEIGH_UPDATE_F_ISROUTER)), 3094 NDISC_REDIRECT, &ndopts); 3095 3096 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); 3097 if (!nrt) 3098 goto out; 3099 3100 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3101 if (on_link) 3102 nrt->rt6i_flags &= ~RTF_GATEWAY; 3103 3104 nrt->rt6i_protocol = RTPROT_REDIRECT; 3105 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3106 3107 /* No need to remove rt from the exception table if rt is 3108 * a cached route because rt6_insert_exception() will 3109 * take care of it 3110 */ 3111 if (rt6_insert_exception(nrt, rt)) { 3112 dst_release_immediate(&nrt->dst); 3113 goto out; 3114 } 3115 3116 netevent.old = &rt->dst; 3117 netevent.new = &nrt->dst; 3118 netevent.daddr = &msg->dest; 3119 netevent.neigh = neigh; 3120 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3121 3122 out: 3123 neigh_release(neigh); 3124 } 3125 3126 /* 3127 * Misc support functions 3128 */ 3129 3130 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) 3131 { 3132 BUG_ON(from->from); 3133 3134 rt->rt6i_flags &= ~RTF_EXPIRES; 3135 dst_hold(&from->dst); 3136 rt->from = from; 3137 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); 3138 } 3139 3140 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) 3141 { 3142 rt->dst.input = ort->dst.input; 3143 rt->dst.output = ort->dst.output; 3144 rt->rt6i_dst = ort->rt6i_dst; 3145 rt->dst.error = ort->dst.error; 3146 rt->rt6i_idev = ort->rt6i_idev; 3147 if (rt->rt6i_idev) 3148 in6_dev_hold(rt->rt6i_idev); 3149 rt->dst.lastuse = jiffies; 3150 rt->rt6i_gateway = ort->rt6i_gateway; 3151 rt->rt6i_flags = ort->rt6i_flags; 3152 rt6_set_from(rt, ort); 3153 rt->rt6i_metric = ort->rt6i_metric; 3154 #ifdef CONFIG_IPV6_SUBTREES 3155 rt->rt6i_src = ort->rt6i_src; 3156 #endif 3157 rt->rt6i_prefsrc = ort->rt6i_prefsrc; 3158 rt->rt6i_table = ort->rt6i_table; 3159 
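/* lwtunnel state is refcounted, so grab our own reference on it */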
rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); 3160 } 3161 3162 #ifdef CONFIG_IPV6_ROUTE_INFO 3163 static struct rt6_info *rt6_get_route_info(struct net *net, 3164 const struct in6_addr *prefix, int prefixlen, 3165 const struct in6_addr *gwaddr, 3166 struct net_device *dev) 3167 { 3168 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3169 int ifindex = dev->ifindex; 3170 struct fib6_node *fn; 3171 struct rt6_info *rt = NULL; 3172 struct fib6_table *table; 3173 3174 table = fib6_get_table(net, tb_id); 3175 if (!table) 3176 return NULL; 3177 3178 rcu_read_lock(); 3179 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3180 if (!fn) 3181 goto out; 3182 3183 for_each_fib6_node_rt_rcu(fn) { 3184 if (rt->dst.dev->ifindex != ifindex) 3185 continue; 3186 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3187 continue; 3188 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) 3189 continue; 3190 ip6_hold_safe(NULL, &rt, false); 3191 break; 3192 } 3193 out: 3194 rcu_read_unlock(); 3195 return rt; 3196 } 3197 3198 static struct rt6_info *rt6_add_route_info(struct net *net, 3199 const struct in6_addr *prefix, int prefixlen, 3200 const struct in6_addr *gwaddr, 3201 struct net_device *dev, 3202 unsigned int pref) 3203 { 3204 struct fib6_config cfg = { 3205 .fc_metric = IP6_RT_PRIO_USER, 3206 .fc_ifindex = dev->ifindex, 3207 .fc_dst_len = prefixlen, 3208 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3209 RTF_UP | RTF_PREF(pref), 3210 .fc_protocol = RTPROT_RA, 3211 .fc_nlinfo.portid = 0, 3212 .fc_nlinfo.nlh = NULL, 3213 .fc_nlinfo.nl_net = net, 3214 }; 3215 3216 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3217 cfg.fc_dst = *prefix; 3218 cfg.fc_gateway = *gwaddr; 3219 3220 /* We should treat it as a default route if prefix length is 0. */ 3221 if (!prefixlen) 3222 cfg.fc_flags |= RTF_DEFAULT; 3223 3224 ip6_route_add(&cfg, NULL); 3225 3226 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3227 } 3228 #endif 3229 3230 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) 3231 { 3232 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3233 struct rt6_info *rt; 3234 struct fib6_table *table; 3235 3236 table = fib6_get_table(dev_net(dev), tb_id); 3237 if (!table) 3238 return NULL; 3239 3240 rcu_read_lock(); 3241 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3242 if (dev == rt->dst.dev && 3243 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3244 ipv6_addr_equal(&rt->rt6i_gateway, addr)) 3245 break; 3246 } 3247 if (rt) 3248 ip6_hold_safe(NULL, &rt, false); 3249 rcu_read_unlock(); 3250 return rt; 3251 } 3252 3253 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, 3254 struct net_device *dev, 3255 unsigned int pref) 3256 { 3257 struct fib6_config cfg = { 3258 .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT, 3259 .fc_metric = IP6_RT_PRIO_USER, 3260 .fc_ifindex = dev->ifindex, 3261 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3262 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3263 .fc_protocol = RTPROT_RA, 3264 .fc_nlinfo.portid = 0, 3265 .fc_nlinfo.nlh = NULL, 3266 .fc_nlinfo.nl_net = dev_net(dev), 3267 }; 3268 3269 cfg.fc_gateway = *gwaddr; 3270 3271 if (!ip6_route_add(&cfg, NULL)) { 3272 struct fib6_table *table; 3273 3274 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3275 if (table) 3276 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3277 } 3278 3279 return rt6_get_dflt_router(gwaddr, dev); 3280 } 3281 3282 static void __rt6_purge_dflt_routers(struct fib6_table *table) 3283 { 3284 struct rt6_info *rt; 3285 3286 restart: 3287 rcu_read_lock(); 3288 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3289 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3290 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { 3291 if (dst_hold_safe(&rt->dst)) { 3292 rcu_read_unlock(); 3293 ip6_del_rt(rt); 3294 } else { 3295 rcu_read_unlock(); 3296 } 3297 goto restart; 3298 } 3299 } 3300 rcu_read_unlock(); 3301 3302 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3303 } 3304 3305 void rt6_purge_dflt_routers(struct net *net) 3306 { 3307 struct fib6_table *table; 3308 struct hlist_head *head; 3309 unsigned int h; 3310 3311 rcu_read_lock(); 3312 3313 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3314 head = &net->ipv6.fib_table_hash[h]; 3315 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3316 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3317 __rt6_purge_dflt_routers(table); 3318 } 3319 } 3320 3321 rcu_read_unlock(); 3322 } 3323 3324 static void rtmsg_to_fib6_config(struct net *net, 3325 struct in6_rtmsg *rtmsg, 3326 struct fib6_config *cfg) 3327 { 3328 memset(cfg, 0, sizeof(*cfg)); 3329 3330 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3331 : RT6_TABLE_MAIN; 3332 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3333 cfg->fc_metric = rtmsg->rtmsg_metric; 3334 cfg->fc_expires = rtmsg->rtmsg_info; 3335 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3336 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3337 cfg->fc_flags = rtmsg->rtmsg_flags; 3338 3339 cfg->fc_nlinfo.nl_net = net; 3340 3341 cfg->fc_dst = rtmsg->rtmsg_dst; 3342 cfg->fc_src = rtmsg->rtmsg_src; 3343 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3344 } 3345 3346 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3347 { 3348 struct fib6_config cfg; 3349 struct in6_rtmsg rtmsg; 3350 int err; 3351 3352 switch (cmd) { 3353 case SIOCADDRT: /* Add a route */ 3354 case SIOCDELRT: /* Delete a route */ 3355 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3356 return -EPERM; 3357 err = copy_from_user(&rtmsg, arg, 3358 sizeof(struct in6_rtmsg)); 3359 if (err) 3360 return -EFAULT; 3361 3362 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3363 3364 rtnl_lock(); 3365 switch (cmd) { 3366 case SIOCADDRT: 3367 err = ip6_route_add(&cfg, NULL); 3368 break; 3369 case SIOCDELRT: 3370 err = ip6_route_del(&cfg, NULL); 3371 break; 3372 default: 3373 err = -EINVAL; 3374 } 3375 rtnl_unlock(); 3376 3377 return err; 3378 } 3379 3380 return -EINVAL; 3381 } 3382 3383 /* 3384 * Drop the packet on the floor 3385 */ 3386 3387 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3388 { 3389 int type; 3390 struct dst_entry *dst = skb_dst(skb); 3391 switch (ipstats_mib_noroutes) { 3392 case IPSTATS_MIB_INNOROUTES: 3393 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3394 if (type == IPV6_ADDR_ANY) { 3395 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3396 IPSTATS_MIB_INADDRERRORS); 3397 break; 3398 } 3399 /* FALLTHROUGH */ 3400 case IPSTATS_MIB_OUTNOROUTES: 3401 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3402 ipstats_mib_noroutes); 3403 break; 3404 } 3405 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3406 kfree_skb(skb); 3407 return 0; 3408 } 3409 3410 static int ip6_pkt_discard(struct sk_buff *skb) 3411 { 3412 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3413 } 3414 3415 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3416 { 3417 skb->dev = skb_dst(skb)->dev; 3418 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3419 } 3420 3421 static int ip6_pkt_prohibit(struct sk_buff *skb) 3422 { 3423 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3424 } 3425 3426 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3427 { 3428 skb->dev = skb_dst(skb)->dev; 3429 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3430 } 3431 3432 /* 3433 * Allocate a dst for local (unicast / anycast) address. 
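 * Such an entry backs an address owned by the host itself: it carries RTF_LOCAL or RTF_ANYCAST plus RTF_NONEXTHOP and is keyed as a /128 host route in the local table.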
3434 */ 3435 3436 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, 3437 const struct in6_addr *addr, 3438 bool anycast) 3439 { 3440 u32 tb_id; 3441 struct net *net = dev_net(idev->dev); 3442 struct net_device *dev = idev->dev; 3443 struct rt6_info *rt; 3444 3445 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); 3446 if (!rt) 3447 return ERR_PTR(-ENOMEM); 3448 3449 in6_dev_hold(idev); 3450 3451 rt->dst.flags |= DST_HOST; 3452 rt->dst.input = ip6_input; 3453 rt->dst.output = ip6_output; 3454 rt->rt6i_idev = idev; 3455 3456 rt->rt6i_protocol = RTPROT_KERNEL; 3457 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; 3458 if (anycast) 3459 rt->rt6i_flags |= RTF_ANYCAST; 3460 else 3461 rt->rt6i_flags |= RTF_LOCAL; 3462 3463 rt->rt6i_gateway = *addr; 3464 rt->rt6i_dst.addr = *addr; 3465 rt->rt6i_dst.plen = 128; 3466 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3467 rt->rt6i_table = fib6_get_table(net, tb_id); 3468 3469 return rt; 3470 } 3471 3472 /* remove a deleted IP from prefsrc entries */ 3473 struct arg_dev_net_ip { 3474 struct net_device *dev; 3475 struct net *net; 3476 struct in6_addr *addr; 3477 }; 3478 3479 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) 3480 { 3481 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3482 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3483 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3484 3485 if (((void *)rt->dst.dev == dev || !dev) && 3486 rt != net->ipv6.ip6_null_entry && 3487 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { 3488 spin_lock_bh(&rt6_exception_lock); 3489 /* remove prefsrc entry */ 3490 rt->rt6i_prefsrc.plen = 0; 3491 /* need to update cache as well */ 3492 rt6_exceptions_remove_prefsrc(rt); 3493 spin_unlock_bh(&rt6_exception_lock); 3494 } 3495 return 0; 3496 } 3497 3498 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3499 { 3500 struct net *net = dev_net(ifp->idev->dev); 3501 struct arg_dev_net_ip adni = { 3502 .dev = ifp->idev->dev, 3503 .net = net, 3504 .addr = &ifp->addr, 3505 }; 3506 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3507 } 3508 3509 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3510 3511 /* Remove routers and update dst entries when a gateway turns into a host. */ 3512 static int fib6_clean_tohost(struct rt6_info *rt, void *arg) 3513 { 3514 struct in6_addr *gateway = (struct in6_addr *)arg; 3515 3516 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3517 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { 3518 return -1; 3519 } 3520 3521 /* Further clean up cached routes in exception table. 3522 * This is needed because a cached route may have a different 3523 * gateway than its 'parent' in the case of an IP redirect. 
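 * (rt6_do_redirect() above is what installs such exceptions.)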
3524 */ 3525 rt6_exceptions_clean_tohost(rt, gateway); 3526 3527 return 0; 3528 } 3529 3530 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3531 { 3532 fib6_clean_all(net, fib6_clean_tohost, gateway); 3533 } 3534 3535 struct arg_netdev_event { 3536 const struct net_device *dev; 3537 union { 3538 unsigned int nh_flags; 3539 unsigned long event; 3540 }; 3541 }; 3542 3543 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) 3544 { 3545 struct rt6_info *iter; 3546 struct fib6_node *fn; 3547 3548 fn = rcu_dereference_protected(rt->rt6i_node, 3549 lockdep_is_held(&rt->rt6i_table->tb6_lock)); 3550 iter = rcu_dereference_protected(fn->leaf, 3551 lockdep_is_held(&rt->rt6i_table->tb6_lock)); 3552 while (iter) { 3553 if (iter->rt6i_metric == rt->rt6i_metric && 3554 rt6_qualify_for_ecmp(iter)) 3555 return iter; 3556 iter = rcu_dereference_protected(iter->rt6_next, 3557 lockdep_is_held(&rt->rt6i_table->tb6_lock)); 3558 } 3559 3560 return NULL; 3561 } 3562 3563 static bool rt6_is_dead(const struct rt6_info *rt) 3564 { 3565 if (rt->rt6i_nh_flags & RTNH_F_DEAD || 3566 (rt->rt6i_nh_flags & RTNH_F_LINKDOWN && 3567 rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) 3568 return true; 3569 3570 return false; 3571 } 3572 3573 static int rt6_multipath_total_weight(const struct rt6_info *rt) 3574 { 3575 struct rt6_info *iter; 3576 int total = 0; 3577 3578 if (!rt6_is_dead(rt)) 3579 total += rt->rt6i_nh_weight; 3580 3581 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { 3582 if (!rt6_is_dead(iter)) 3583 total += iter->rt6i_nh_weight; 3584 } 3585 3586 return total; 3587 } 3588 3589 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) 3590 { 3591 int upper_bound = -1; 3592 3593 if (!rt6_is_dead(rt)) { 3594 *weight += rt->rt6i_nh_weight; 3595 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3596 total) - 1; 3597 } 3598 atomic_set(&rt->rt6i_nh_upper_bound, upper_bound); 3599 } 3600 3601 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) 3602 { 3603 struct rt6_info *iter; 3604 int weight = 0; 3605 3606 rt6_upper_bound_set(rt, &weight, total); 3607 3608 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) 3609 rt6_upper_bound_set(iter, &weight, total); 3610 } 3611 3612 void rt6_multipath_rebalance(struct rt6_info *rt) 3613 { 3614 struct rt6_info *first; 3615 int total; 3616 3617 /* In case the entire multipath route was marked for flushing, 3618 * then there is no need to rebalance upon the removal of every 3619 * sibling route. 3620 */ 3621 if (!rt->rt6i_nsiblings || rt->should_flush) 3622 return; 3623 3624 /* During lookup routes are evaluated in order, so we need to 3625 * make sure upper bounds are assigned from the first sibling 3626 * onwards. 
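 * For example, with sibling weights 1 and 2 (total 3), the bounds work out to (1 << 31) / 3 - 1 and (1 << 31) - 1, so the 31-bit multipath hash (note the >> 1 in rt6_multipath_hash()) selects the nexthops with roughly 1/3 and 2/3 probability.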
3627 */ 3628 first = rt6_multipath_first_sibling(rt); 3629 if (WARN_ON_ONCE(!first)) 3630 return; 3631 3632 total = rt6_multipath_total_weight(first); 3633 rt6_multipath_upper_bound_set(first, total); 3634 } 3635 3636 static int fib6_ifup(struct rt6_info *rt, void *p_arg) 3637 { 3638 const struct arg_netdev_event *arg = p_arg; 3639 const struct net *net = dev_net(arg->dev); 3640 3641 if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { 3642 rt->rt6i_nh_flags &= ~arg->nh_flags; 3643 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); 3644 rt6_multipath_rebalance(rt); 3645 } 3646 3647 return 0; 3648 } 3649 3650 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3651 { 3652 struct arg_netdev_event arg = { 3653 .dev = dev, 3654 { 3655 .nh_flags = nh_flags, 3656 }, 3657 }; 3658 3659 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3660 arg.nh_flags |= RTNH_F_LINKDOWN; 3661 3662 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3663 } 3664 3665 static bool rt6_multipath_uses_dev(const struct rt6_info *rt, 3666 const struct net_device *dev) 3667 { 3668 struct rt6_info *iter; 3669 3670 if (rt->dst.dev == dev) 3671 return true; 3672 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) 3673 if (iter->dst.dev == dev) 3674 return true; 3675 3676 return false; 3677 } 3678 3679 static void rt6_multipath_flush(struct rt6_info *rt) 3680 { 3681 struct rt6_info *iter; 3682 3683 rt->should_flush = 1; 3684 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) 3685 iter->should_flush = 1; 3686 } 3687 3688 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, 3689 const struct net_device *down_dev) 3690 { 3691 struct rt6_info *iter; 3692 unsigned int dead = 0; 3693 3694 if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD) 3695 dead++; 3696 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) 3697 if (iter->dst.dev == down_dev || 3698 iter->rt6i_nh_flags & RTNH_F_DEAD) 3699 dead++; 3700 3701 return dead; 3702 } 3703 3704 static void rt6_multipath_nh_flags_set(struct rt6_info *rt, 3705 const struct net_device *dev, 3706 unsigned int nh_flags) 3707 { 3708 struct rt6_info *iter; 3709 3710 if (rt->dst.dev == dev) 3711 rt->rt6i_nh_flags |= nh_flags; 3712 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) 3713 if (iter->dst.dev == dev) 3714 iter->rt6i_nh_flags |= nh_flags; 3715 } 3716 3717 /* called with write lock held for table with rt */ 3718 static int fib6_ifdown(struct rt6_info *rt, void *p_arg) 3719 { 3720 const struct arg_netdev_event *arg = p_arg; 3721 const struct net_device *dev = arg->dev; 3722 const struct net *net = dev_net(dev); 3723 3724 if (rt == net->ipv6.ip6_null_entry) 3725 return 0; 3726 3727 switch (arg->event) { 3728 case NETDEV_UNREGISTER: 3729 return rt->dst.dev == dev ? -1 : 0; 3730 case NETDEV_DOWN: 3731 if (rt->should_flush) 3732 return -1; 3733 if (!rt->rt6i_nsiblings) 3734 return rt->dst.dev == dev ? 
-1 : 0; 3735 if (rt6_multipath_uses_dev(rt, dev)) { 3736 unsigned int count; 3737 3738 count = rt6_multipath_dead_count(rt, dev); 3739 if (rt->rt6i_nsiblings + 1 == count) { 3740 rt6_multipath_flush(rt); 3741 return -1; 3742 } 3743 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 3744 RTNH_F_LINKDOWN); 3745 fib6_update_sernum(rt); 3746 rt6_multipath_rebalance(rt); 3747 } 3748 return -2; 3749 case NETDEV_CHANGE: 3750 if (rt->dst.dev != dev || 3751 rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) 3752 break; 3753 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; 3754 rt6_multipath_rebalance(rt); 3755 break; 3756 } 3757 3758 return 0; 3759 } 3760 3761 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 3762 { 3763 struct arg_netdev_event arg = { 3764 .dev = dev, 3765 { 3766 .event = event, 3767 }, 3768 }; 3769 3770 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 3771 } 3772 3773 void rt6_disable_ip(struct net_device *dev, unsigned long event) 3774 { 3775 rt6_sync_down_dev(dev, event); 3776 rt6_uncached_list_flush_dev(dev_net(dev), dev); 3777 neigh_ifdown(&nd_tbl, dev); 3778 } 3779 3780 struct rt6_mtu_change_arg { 3781 struct net_device *dev; 3782 unsigned int mtu; 3783 }; 3784 3785 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) 3786 { 3787 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 3788 struct inet6_dev *idev; 3789 3790 /* In IPv6, PMTU discovery is not optional, 3791 so locking RTAX_MTU cannot disable it. 3792 We still use the lock to block changes 3793 caused by addrconf/ndisc. 3794 */ 3795 3796 idev = __in6_dev_get(arg->dev); 3797 if (!idev) 3798 return 0; 3799 3800 /* There is no way to discover an administrative MTU increase 3801 via PMTU discovery, so the increase must be applied here. 3802 RFC 1981 doesn't cover administrative MTU increases 3803 (e.g. enabling jumbo frames), so updating here is a MUST. 3804 */ 3805 /* 3806 If the new MTU is less than the route PMTU, the new MTU will be 3807 the lowest MTU in the path; update the route PMTU to reflect the 3808 decrease. If the new MTU is greater than the route PMTU, and the 3809 old MTU was the lowest MTU in the path, update the route PMTU 3810 to reflect the increase. In this case, if another node's MTU is 3811 now the lowest in the path, its Packet Too Big message will 3812 trigger PMTU discovery again. 
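For example, dropping a device MTU from 9000 to 1500 clamps every route PMTU on that device down to 1500, while raising it back to 9000 only lifts routes whose PMTU equalled the old device MTU.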
3813 */ 3814 if (rt->dst.dev == arg->dev && 3815 dst_metric_raw(&rt->dst, RTAX_MTU) && 3816 !dst_metric_locked(&rt->dst, RTAX_MTU)) { 3817 spin_lock_bh(&rt6_exception_lock); 3818 if (dst_mtu(&rt->dst) >= arg->mtu || 3819 (dst_mtu(&rt->dst) < arg->mtu && 3820 dst_mtu(&rt->dst) == idev->cnf.mtu6)) { 3821 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); 3822 } 3823 rt6_exceptions_update_pmtu(rt, arg->mtu); 3824 spin_unlock_bh(&rt6_exception_lock); 3825 } 3826 return 0; 3827 } 3828 3829 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 3830 { 3831 struct rt6_mtu_change_arg arg = { 3832 .dev = dev, 3833 .mtu = mtu, 3834 }; 3835 3836 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 3837 } 3838 3839 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 3840 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 3841 [RTA_OIF] = { .type = NLA_U32 }, 3842 [RTA_IIF] = { .type = NLA_U32 }, 3843 [RTA_PRIORITY] = { .type = NLA_U32 }, 3844 [RTA_METRICS] = { .type = NLA_NESTED }, 3845 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 3846 [RTA_PREF] = { .type = NLA_U8 }, 3847 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 3848 [RTA_ENCAP] = { .type = NLA_NESTED }, 3849 [RTA_EXPIRES] = { .type = NLA_U32 }, 3850 [RTA_UID] = { .type = NLA_U32 }, 3851 [RTA_MARK] = { .type = NLA_U32 }, 3852 }; 3853 3854 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 3855 struct fib6_config *cfg, 3856 struct netlink_ext_ack *extack) 3857 { 3858 struct rtmsg *rtm; 3859 struct nlattr *tb[RTA_MAX+1]; 3860 unsigned int pref; 3861 int err; 3862 3863 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 3864 NULL); 3865 if (err < 0) 3866 goto errout; 3867 3868 err = -EINVAL; 3869 rtm = nlmsg_data(nlh); 3870 memset(cfg, 0, sizeof(*cfg)); 3871 3872 cfg->fc_table = rtm->rtm_table; 3873 cfg->fc_dst_len = rtm->rtm_dst_len; 3874 cfg->fc_src_len = rtm->rtm_src_len; 3875 cfg->fc_flags = RTF_UP; 3876 cfg->fc_protocol = rtm->rtm_protocol; 3877 cfg->fc_type = rtm->rtm_type; 3878 3879 if (rtm->rtm_type == RTN_UNREACHABLE || 3880 rtm->rtm_type == RTN_BLACKHOLE || 3881 rtm->rtm_type == RTN_PROHIBIT || 3882 rtm->rtm_type == RTN_THROW) 3883 cfg->fc_flags |= RTF_REJECT; 3884 3885 if (rtm->rtm_type == RTN_LOCAL) 3886 cfg->fc_flags |= RTF_LOCAL; 3887 3888 if (rtm->rtm_flags & RTM_F_CLONED) 3889 cfg->fc_flags |= RTF_CACHE; 3890 3891 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 3892 3893 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 3894 cfg->fc_nlinfo.nlh = nlh; 3895 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 3896 3897 if (tb[RTA_GATEWAY]) { 3898 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 3899 cfg->fc_flags |= RTF_GATEWAY; 3900 } 3901 3902 if (tb[RTA_DST]) { 3903 int plen = (rtm->rtm_dst_len + 7) >> 3; 3904 3905 if (nla_len(tb[RTA_DST]) < plen) 3906 goto errout; 3907 3908 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 3909 } 3910 3911 if (tb[RTA_SRC]) { 3912 int plen = (rtm->rtm_src_len + 7) >> 3; 3913 3914 if (nla_len(tb[RTA_SRC]) < plen) 3915 goto errout; 3916 3917 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 3918 } 3919 3920 if (tb[RTA_PREFSRC]) 3921 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 3922 3923 if (tb[RTA_OIF]) 3924 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 3925 3926 if (tb[RTA_PRIORITY]) 3927 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 3928 3929 if (tb[RTA_METRICS]) { 3930 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 3931 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 3932 } 3933 3934 if (tb[RTA_TABLE]) 3935 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 3936 
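/* a u32 RTA_TABLE attribute overrides the 8-bit rtm_table id set above, allowing table ids greater than 255 */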
3937 if (tb[RTA_MULTIPATH]) { 3938 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 3939 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 3940 3941 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 3942 cfg->fc_mp_len, extack); 3943 if (err < 0) 3944 goto errout; 3945 } 3946 3947 if (tb[RTA_PREF]) { 3948 pref = nla_get_u8(tb[RTA_PREF]); 3949 if (pref != ICMPV6_ROUTER_PREF_LOW && 3950 pref != ICMPV6_ROUTER_PREF_HIGH) 3951 pref = ICMPV6_ROUTER_PREF_MEDIUM; 3952 cfg->fc_flags |= RTF_PREF(pref); 3953 } 3954 3955 if (tb[RTA_ENCAP]) 3956 cfg->fc_encap = tb[RTA_ENCAP]; 3957 3958 if (tb[RTA_ENCAP_TYPE]) { 3959 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 3960 3961 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 3962 if (err < 0) 3963 goto errout; 3964 } 3965 3966 if (tb[RTA_EXPIRES]) { 3967 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 3968 3969 if (addrconf_finite_timeout(timeout)) { 3970 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 3971 cfg->fc_flags |= RTF_EXPIRES; 3972 } 3973 } 3974 3975 err = 0; 3976 errout: 3977 return err; 3978 } 3979 3980 struct rt6_nh { 3981 struct rt6_info *rt6_info; 3982 struct fib6_config r_cfg; 3983 struct mx6_config mxc; 3984 struct list_head next; 3985 }; 3986 3987 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 3988 { 3989 struct rt6_nh *nh; 3990 3991 list_for_each_entry(nh, rt6_nh_list, next) { 3992 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 3993 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 3994 nh->r_cfg.fc_ifindex); 3995 } 3996 } 3997 3998 static int ip6_route_info_append(struct list_head *rt6_nh_list, 3999 struct rt6_info *rt, struct fib6_config *r_cfg) 4000 { 4001 struct rt6_nh *nh; 4002 int err = -EEXIST; 4003 4004 list_for_each_entry(nh, rt6_nh_list, next) { 4005 /* check if rt6_info already exists */ 4006 if (rt6_duplicate_nexthop(nh->rt6_info, rt)) 4007 return err; 4008 } 4009 4010 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4011 if (!nh) 4012 return -ENOMEM; 4013 nh->rt6_info = rt; 4014 err = ip6_convert_metrics(&nh->mxc, r_cfg); 4015 if (err) { 4016 kfree(nh); 4017 return err; 4018 } 4019 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4020 list_add_tail(&nh->next, rt6_nh_list); 4021 4022 return 0; 4023 } 4024 4025 static void ip6_route_mpath_notify(struct rt6_info *rt, 4026 struct rt6_info *rt_last, 4027 struct nl_info *info, 4028 __u16 nlflags) 4029 { 4030 /* if this is an APPEND route, then rt points to the first route 4031 * inserted and rt_last points to last route inserted. Userspace 4032 * wants a consistent dump of the route which starts at the first 4033 * nexthop. 
Since sibling routes are always added at the end of 4034 * the list, find the first sibling of the last route appended 4035 */ 4036 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { 4037 rt = list_first_entry(&rt_last->rt6i_siblings, 4038 struct rt6_info, 4039 rt6i_siblings); 4040 } 4041 4042 if (rt) 4043 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4044 } 4045 4046 static int ip6_route_multipath_add(struct fib6_config *cfg, 4047 struct netlink_ext_ack *extack) 4048 { 4049 struct rt6_info *rt_notif = NULL, *rt_last = NULL; 4050 struct nl_info *info = &cfg->fc_nlinfo; 4051 struct fib6_config r_cfg; 4052 struct rtnexthop *rtnh; 4053 struct rt6_info *rt; 4054 struct rt6_nh *err_nh; 4055 struct rt6_nh *nh, *nh_safe; 4056 __u16 nlflags; 4057 int remaining; 4058 int attrlen; 4059 int err = 1; 4060 int nhn = 0; 4061 int replace = (cfg->fc_nlinfo.nlh && 4062 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4063 LIST_HEAD(rt6_nh_list); 4064 4065 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 4066 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4067 nlflags |= NLM_F_APPEND; 4068 4069 remaining = cfg->fc_mp_len; 4070 rtnh = (struct rtnexthop *)cfg->fc_mp; 4071 4072 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4073 * rt6_info structs per nexthop 4074 */ 4075 while (rtnh_ok(rtnh, remaining)) { 4076 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4077 if (rtnh->rtnh_ifindex) 4078 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4079 4080 attrlen = rtnh_attrlen(rtnh); 4081 if (attrlen > 0) { 4082 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4083 4084 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4085 if (nla) { 4086 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4087 r_cfg.fc_flags |= RTF_GATEWAY; 4088 } 4089 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4090 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4091 if (nla) 4092 r_cfg.fc_encap_type = nla_get_u16(nla); 4093 } 4094 4095 rt = ip6_route_info_create(&r_cfg, extack); 4096 if (IS_ERR(rt)) { 4097 err = PTR_ERR(rt); 4098 rt = NULL; 4099 goto cleanup; 4100 } 4101 4102 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1; 4103 4104 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); 4105 if (err) { 4106 dst_release_immediate(&rt->dst); 4107 goto cleanup; 4108 } 4109 4110 rtnh = rtnh_next(rtnh, &remaining); 4111 } 4112 4113 /* for add and replace send one notification with all nexthops. 4114 * Skip the notification in fib6_add_rt2node and send one with 4115 * the full route when done 4116 */ 4117 info->skip_notify = 1; 4118 4119 err_nh = NULL; 4120 list_for_each_entry(nh, &rt6_nh_list, next) { 4121 rt_last = nh->rt6_info; 4122 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack); 4123 /* save reference to first route for notification */ 4124 if (!rt_notif && !err) 4125 rt_notif = nh->rt6_info; 4126 4127 /* nh->rt6_info is used or freed at this point, reset to NULL*/ 4128 nh->rt6_info = NULL; 4129 if (err) { 4130 if (replace && nhn) 4131 ip6_print_replace_route_err(&rt6_nh_list); 4132 err_nh = nh; 4133 goto add_errout; 4134 } 4135 4136 /* Because each route is added like a single route we remove 4137 * these flags after the first nexthop: if there is a collision, 4138 * we have already failed to add the first nexthop: 4139 * fib6_add_rt2node() has rejected it; when replacing, old 4140 * nexthops have been replaced by first new, the rest should 4141 * be added to it. 4142 */ 4143 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4144 NLM_F_REPLACE); 4145 nhn++; 4146 } 4147 4148 /* success ... 
tell user about new route */ 4149 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4150 goto cleanup; 4151 4152 add_errout: 4153 /* send notification for routes that were added so that 4154 * the delete notifications sent by ip6_route_del are 4155 * coherent 4156 */ 4157 if (rt_notif) 4158 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4159 4160 /* Delete routes that were already added */ 4161 list_for_each_entry(nh, &rt6_nh_list, next) { 4162 if (err_nh == nh) 4163 break; 4164 ip6_route_del(&nh->r_cfg, extack); 4165 } 4166 4167 cleanup: 4168 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4169 if (nh->rt6_info) 4170 dst_release_immediate(&nh->rt6_info->dst); 4171 kfree(nh->mxc.mx); 4172 list_del(&nh->next); 4173 kfree(nh); 4174 } 4175 4176 return err; 4177 } 4178 4179 static int ip6_route_multipath_del(struct fib6_config *cfg, 4180 struct netlink_ext_ack *extack) 4181 { 4182 struct fib6_config r_cfg; 4183 struct rtnexthop *rtnh; 4184 int remaining; 4185 int attrlen; 4186 int err = 1, last_err = 0; 4187 4188 remaining = cfg->fc_mp_len; 4189 rtnh = (struct rtnexthop *)cfg->fc_mp; 4190 4191 /* Parse a Multipath Entry */ 4192 while (rtnh_ok(rtnh, remaining)) { 4193 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4194 if (rtnh->rtnh_ifindex) 4195 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4196 4197 attrlen = rtnh_attrlen(rtnh); 4198 if (attrlen > 0) { 4199 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4200 4201 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4202 if (nla) { 4203 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4204 r_cfg.fc_flags |= RTF_GATEWAY; 4205 } 4206 } 4207 err = ip6_route_del(&r_cfg, extack); 4208 if (err) 4209 last_err = err; 4210 4211 rtnh = rtnh_next(rtnh, &remaining); 4212 } 4213 4214 return last_err; 4215 } 4216 4217 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4218 struct netlink_ext_ack *extack) 4219 { 4220 struct fib6_config cfg; 4221 int err; 4222 4223 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4224 if (err < 0) 4225 return err; 4226 4227 if (cfg.fc_mp) 4228 return ip6_route_multipath_del(&cfg, extack); 4229 else { 4230 cfg.fc_delete_all_nh = 1; 4231 return ip6_route_del(&cfg, extack); 4232 } 4233 } 4234 4235 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4236 struct netlink_ext_ack *extack) 4237 { 4238 struct fib6_config cfg; 4239 int err; 4240 4241 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4242 if (err < 0) 4243 return err; 4244 4245 if (cfg.fc_mp) 4246 return ip6_route_multipath_add(&cfg, extack); 4247 else 4248 return ip6_route_add(&cfg, extack); 4249 } 4250 4251 static size_t rt6_nlmsg_size(struct rt6_info *rt) 4252 { 4253 int nexthop_len = 0; 4254 4255 if (rt->rt6i_nsiblings) { 4256 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4257 + NLA_ALIGN(sizeof(struct rtnexthop)) 4258 + nla_total_size(16) /* RTA_GATEWAY */ 4259 + lwtunnel_get_encap_size(rt->dst.lwtstate); 4260 4261 nexthop_len *= rt->rt6i_nsiblings; 4262 } 4263 4264 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4265 + nla_total_size(16) /* RTA_SRC */ 4266 + nla_total_size(16) /* RTA_DST */ 4267 + nla_total_size(16) /* RTA_GATEWAY */ 4268 + nla_total_size(16) /* RTA_PREFSRC */ 4269 + nla_total_size(4) /* RTA_TABLE */ 4270 + nla_total_size(4) /* RTA_IIF */ 4271 + nla_total_size(4) /* RTA_OIF */ 4272 + nla_total_size(4) /* RTA_PRIORITY */ 4273 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4274 + nla_total_size(sizeof(struct rta_cacheinfo)) 4275 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4276 + nla_total_size(1) /* 
RTA_PREF */ 4277 + lwtunnel_get_encap_size(rt->dst.lwtstate) 4278 + nexthop_len; 4279 } 4280 4281 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, 4282 unsigned int *flags, bool skip_oif) 4283 { 4284 if (rt->rt6i_nh_flags & RTNH_F_DEAD) 4285 *flags |= RTNH_F_DEAD; 4286 4287 if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) { 4288 *flags |= RTNH_F_LINKDOWN; 4289 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) 4290 *flags |= RTNH_F_DEAD; 4291 } 4292 4293 if (rt->rt6i_flags & RTF_GATEWAY) { 4294 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) 4295 goto nla_put_failure; 4296 } 4297 4298 *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK); 4299 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) 4300 *flags |= RTNH_F_OFFLOAD; 4301 4302 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4303 if (!skip_oif && rt->dst.dev && 4304 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 4305 goto nla_put_failure; 4306 4307 if (rt->dst.lwtstate && 4308 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) 4309 goto nla_put_failure; 4310 4311 return 0; 4312 4313 nla_put_failure: 4314 return -EMSGSIZE; 4315 } 4316 4317 /* add multipath next hop */ 4318 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) 4319 { 4320 struct rtnexthop *rtnh; 4321 unsigned int flags = 0; 4322 4323 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4324 if (!rtnh) 4325 goto nla_put_failure; 4326 4327 rtnh->rtnh_hops = rt->rt6i_nh_weight - 1; 4328 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; 4329 4330 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4331 goto nla_put_failure; 4332 4333 rtnh->rtnh_flags = flags; 4334 4335 /* length of rtnetlink header + attributes */ 4336 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4337 4338 return 0; 4339 4340 nla_put_failure: 4341 return -EMSGSIZE; 4342 } 4343 4344 static int rt6_fill_node(struct net *net, 4345 struct sk_buff *skb, struct rt6_info *rt, 4346 struct in6_addr *dst, struct in6_addr *src, 4347 int iif, int type, u32 portid, u32 seq, 4348 unsigned int flags) 4349 { 4350 u32 metrics[RTAX_MAX]; 4351 struct rtmsg *rtm; 4352 struct nlmsghdr *nlh; 4353 long expires; 4354 u32 table; 4355 4356 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4357 if (!nlh) 4358 return -EMSGSIZE; 4359 4360 rtm = nlmsg_data(nlh); 4361 rtm->rtm_family = AF_INET6; 4362 rtm->rtm_dst_len = rt->rt6i_dst.plen; 4363 rtm->rtm_src_len = rt->rt6i_src.plen; 4364 rtm->rtm_tos = 0; 4365 if (rt->rt6i_table) 4366 table = rt->rt6i_table->tb6_id; 4367 else 4368 table = RT6_TABLE_UNSPEC; 4369 rtm->rtm_table = table; 4370 if (nla_put_u32(skb, RTA_TABLE, table)) 4371 goto nla_put_failure; 4372 if (rt->rt6i_flags & RTF_REJECT) { 4373 switch (rt->dst.error) { 4374 case -EINVAL: 4375 rtm->rtm_type = RTN_BLACKHOLE; 4376 break; 4377 case -EACCES: 4378 rtm->rtm_type = RTN_PROHIBIT; 4379 break; 4380 case -EAGAIN: 4381 rtm->rtm_type = RTN_THROW; 4382 break; 4383 default: 4384 rtm->rtm_type = RTN_UNREACHABLE; 4385 break; 4386 } 4387 } 4388 else if (rt->rt6i_flags & RTF_LOCAL) 4389 rtm->rtm_type = RTN_LOCAL; 4390 else if (rt->rt6i_flags & RTF_ANYCAST) 4391 rtm->rtm_type = RTN_ANYCAST; 4392 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) 4393 rtm->rtm_type = RTN_LOCAL; 4394 else 4395 rtm->rtm_type = RTN_UNICAST; 4396 rtm->rtm_flags = 0; 4397 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4398 rtm->rtm_protocol = rt->rt6i_protocol; 4399 4400 if (rt->rt6i_flags & RTF_CACHE) 4401 rtm->rtm_flags |= RTM_F_CLONED; 4402 4403 if (dst) { 4404 if (nla_put_in6_addr(skb, 
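/* Editorial note: dst is non-NULL only when rt6_fill_node() is answering an
 * RTM_GETROUTE query for one specific address (see inet6_rtm_getroute()
 * below); the reply then carries that exact address as RTA_DST and forces
 * rtm_dst_len to 128. Table dumps pass dst == NULL and fall back to the
 * route's own prefix. */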
RTA_DST, dst)) 4405 goto nla_put_failure; 4406 rtm->rtm_dst_len = 128; 4407 } else if (rtm->rtm_dst_len) 4408 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) 4409 goto nla_put_failure; 4410 #ifdef CONFIG_IPV6_SUBTREES 4411 if (src) { 4412 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4413 goto nla_put_failure; 4414 rtm->rtm_src_len = 128; 4415 } else if (rtm->rtm_src_len && 4416 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) 4417 goto nla_put_failure; 4418 #endif 4419 if (iif) { 4420 #ifdef CONFIG_IPV6_MROUTE 4421 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { 4422 int err = ip6mr_get_route(net, skb, rtm, portid); 4423 4424 if (err == 0) 4425 return 0; 4426 if (err < 0) 4427 goto nla_put_failure; 4428 } else 4429 #endif 4430 if (nla_put_u32(skb, RTA_IIF, iif)) 4431 goto nla_put_failure; 4432 } else if (dst) { 4433 struct in6_addr saddr_buf; 4434 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && 4435 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4436 goto nla_put_failure; 4437 } 4438 4439 if (rt->rt6i_prefsrc.plen) { 4440 struct in6_addr saddr_buf; 4441 saddr_buf = rt->rt6i_prefsrc.addr; 4442 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4443 goto nla_put_failure; 4444 } 4445 4446 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 4447 if (rt->rt6i_pmtu) 4448 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; 4449 if (rtnetlink_put_metrics(skb, metrics) < 0) 4450 goto nla_put_failure; 4451 4452 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) 4453 goto nla_put_failure; 4454 4455 /* For multipath routes, walk the siblings list and add 4456 * each as a nexthop within RTA_MULTIPATH. 4457 */ 4458 if (rt->rt6i_nsiblings) { 4459 struct rt6_info *sibling, *next_sibling; 4460 struct nlattr *mp; 4461 4462 mp = nla_nest_start(skb, RTA_MULTIPATH); 4463 if (!mp) 4464 goto nla_put_failure; 4465 4466 if (rt6_add_nexthop(skb, rt) < 0) 4467 goto nla_put_failure; 4468 4469 list_for_each_entry_safe(sibling, next_sibling, 4470 &rt->rt6i_siblings, rt6i_siblings) { 4471 if (rt6_add_nexthop(skb, sibling) < 0) 4472 goto nla_put_failure; 4473 } 4474 4475 nla_nest_end(skb, mp); 4476 } else { 4477 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4478 goto nla_put_failure; 4479 } 4480 4481 expires = (rt->rt6i_flags & RTF_EXPIRES) ? 
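/* Worked example (figures assumed for illustration, HZ=1000 and USER_HZ=100):
 * a cached entry with 30 seconds of lifetime left yields
 * expires = 30000 jiffies, which rtnl_put_cacheinfo() below reports as
 * rta_expires = 3000 user clock ticks; expires == 0 denotes an entry that
 * never ages out. */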
rt->dst.expires - jiffies : 0; 4482 4483 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) 4484 goto nla_put_failure; 4485 4486 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) 4487 goto nla_put_failure; 4488 4489 4490 nlmsg_end(skb, nlh); 4491 return 0; 4492 4493 nla_put_failure: 4494 nlmsg_cancel(skb, nlh); 4495 return -EMSGSIZE; 4496 } 4497 4498 int rt6_dump_route(struct rt6_info *rt, void *p_arg) 4499 { 4500 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4501 struct net *net = arg->net; 4502 4503 if (rt == net->ipv6.ip6_null_entry) 4504 return 0; 4505 4506 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4507 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 4508 4509 /* user wants prefix routes only */ 4510 if (rtm->rtm_flags & RTM_F_PREFIX && 4511 !(rt->rt6i_flags & RTF_PREFIX_RT)) { 4512 /* success since this is not a prefix route */ 4513 return 1; 4514 } 4515 } 4516 4517 return rt6_fill_node(net, 4518 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, 4519 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, 4520 NLM_F_MULTI); 4521 } 4522 4523 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4524 struct netlink_ext_ack *extack) 4525 { 4526 struct net *net = sock_net(in_skb->sk); 4527 struct nlattr *tb[RTA_MAX+1]; 4528 int err, iif = 0, oif = 0; 4529 struct dst_entry *dst; 4530 struct rt6_info *rt; 4531 struct sk_buff *skb; 4532 struct rtmsg *rtm; 4533 struct flowi6 fl6; 4534 bool fibmatch; 4535 4536 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4537 extack); 4538 if (err < 0) 4539 goto errout; 4540 4541 err = -EINVAL; 4542 memset(&fl6, 0, sizeof(fl6)); 4543 rtm = nlmsg_data(nlh); 4544 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4545 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4546 4547 if (tb[RTA_SRC]) { 4548 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4549 goto errout; 4550 4551 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4552 } 4553 4554 if (tb[RTA_DST]) { 4555 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4556 goto errout; 4557 4558 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4559 } 4560 4561 if (tb[RTA_IIF]) 4562 iif = nla_get_u32(tb[RTA_IIF]); 4563 4564 if (tb[RTA_OIF]) 4565 oif = nla_get_u32(tb[RTA_OIF]); 4566 4567 if (tb[RTA_MARK]) 4568 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4569 4570 if (tb[RTA_UID]) 4571 fl6.flowi6_uid = make_kuid(current_user_ns(), 4572 nla_get_u32(tb[RTA_UID])); 4573 else 4574 fl6.flowi6_uid = iif ? 
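/* Sketch of the intent (wording editorial): without an RTA_UID attribute, an
 * input-side lookup (iif set) has no local socket that owns the flow, hence
 * INVALID_UID, while an output-side lookup inherits the caller's uid so that
 * uid-range policy rules (e.g. the illustrative command
 * "ip -6 rule add uidrange 1000-1000 table 100") still steer it to the
 * right table. */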
INVALID_UID : current_uid(); 4575 4576 if (iif) { 4577 struct net_device *dev; 4578 int flags = 0; 4579 4580 rcu_read_lock(); 4581 4582 dev = dev_get_by_index_rcu(net, iif); 4583 if (!dev) { 4584 rcu_read_unlock(); 4585 err = -ENODEV; 4586 goto errout; 4587 } 4588 4589 fl6.flowi6_iif = iif; 4590 4591 if (!ipv6_addr_any(&fl6.saddr)) 4592 flags |= RT6_LOOKUP_F_HAS_SADDR; 4593 4594 dst = ip6_route_input_lookup(net, dev, &fl6, flags); 4595 4596 rcu_read_unlock(); 4597 } else { 4598 fl6.flowi6_oif = oif; 4599 4600 dst = ip6_route_output(net, NULL, &fl6); 4601 } 4602 4603 4604 rt = container_of(dst, struct rt6_info, dst); 4605 if (rt->dst.error) { 4606 err = rt->dst.error; 4607 ip6_rt_put(rt); 4608 goto errout; 4609 } 4610 4611 if (rt == net->ipv6.ip6_null_entry) { 4612 err = rt->dst.error; 4613 ip6_rt_put(rt); 4614 goto errout; 4615 } 4616 4617 if (fibmatch && rt->from) { 4618 struct rt6_info *ort = rt->from; 4619 4620 dst_hold(&ort->dst); 4621 ip6_rt_put(rt); 4622 rt = ort; 4623 } 4624 4625 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 4626 if (!skb) { 4627 ip6_rt_put(rt); 4628 err = -ENOBUFS; 4629 goto errout; 4630 } 4631 4632 skb_dst_set(skb, &rt->dst); 4633 if (fibmatch) 4634 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif, 4635 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4636 nlh->nlmsg_seq, 0); 4637 else 4638 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, 4639 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4640 nlh->nlmsg_seq, 0); 4641 if (err < 0) { 4642 kfree_skb(skb); 4643 goto errout; 4644 } 4645 4646 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 4647 errout: 4648 return err; 4649 } 4650 4651 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, 4652 unsigned int nlm_flags) 4653 { 4654 struct sk_buff *skb; 4655 struct net *net = info->nl_net; 4656 u32 seq; 4657 int err; 4658 4659 err = -ENOBUFS; 4660 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 4661 4662 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 4663 if (!skb) 4664 goto errout; 4665 4666 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, 4667 event, info->portid, seq, nlm_flags); 4668 if (err < 0) { 4669 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 4670 WARN_ON(err == -EMSGSIZE); 4671 kfree_skb(skb); 4672 goto errout; 4673 } 4674 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 4675 info->nlh, gfp_any()); 4676 return; 4677 errout: 4678 if (err < 0) 4679 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 4680 } 4681 4682 static int ip6_route_dev_notify(struct notifier_block *this, 4683 unsigned long event, void *ptr) 4684 { 4685 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 4686 struct net *net = dev_net(dev); 4687 4688 if (!(dev->flags & IFF_LOOPBACK)) 4689 return NOTIFY_OK; 4690 4691 if (event == NETDEV_REGISTER) { 4692 net->ipv6.ip6_null_entry->dst.dev = dev; 4693 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 4694 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4695 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 4696 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 4697 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 4698 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 4699 #endif 4700 } else if (event == NETDEV_UNREGISTER && 4701 dev->reg_state != NETREG_UNREGISTERED) { 4702 /* NETDEV_UNREGISTER can be fired multiple times by 4703 * netdev_wait_allrefs(). Make sure we only call this once.
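(Background: netdev_wait_allrefs() rebroadcasts NETDEV_UNREGISTER while lingering references pin the device; those replays arrive with reg_state == NETREG_UNREGISTERED, which the check above filters out.)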
4704 */ 4705 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 4706 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4707 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 4708 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 4709 #endif 4710 } 4711 4712 return NOTIFY_OK; 4713 } 4714 4715 /* 4716 * /proc 4717 */ 4718 4719 #ifdef CONFIG_PROC_FS 4720 4721 static const struct file_operations ipv6_route_proc_fops = { 4722 .open = ipv6_route_open, 4723 .read = seq_read, 4724 .llseek = seq_lseek, 4725 .release = seq_release_net, 4726 }; 4727 4728 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 4729 { 4730 struct net *net = (struct net *)seq->private; 4731 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 4732 net->ipv6.rt6_stats->fib_nodes, 4733 net->ipv6.rt6_stats->fib_route_nodes, 4734 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 4735 net->ipv6.rt6_stats->fib_rt_entries, 4736 net->ipv6.rt6_stats->fib_rt_cache, 4737 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 4738 net->ipv6.rt6_stats->fib_discarded_routes); 4739 4740 return 0; 4741 } 4742 4743 static int rt6_stats_seq_open(struct inode *inode, struct file *file) 4744 { 4745 return single_open_net(inode, file, rt6_stats_seq_show); 4746 } 4747 4748 static const struct file_operations rt6_stats_seq_fops = { 4749 .open = rt6_stats_seq_open, 4750 .read = seq_read, 4751 .llseek = seq_lseek, 4752 .release = single_release_net, 4753 }; 4754 #endif /* CONFIG_PROC_FS */ 4755 4756 #ifdef CONFIG_SYSCTL 4757 4758 static 4759 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 4760 void __user *buffer, size_t *lenp, loff_t *ppos) 4761 { 4762 struct net *net; 4763 int delay; 4764 if (!write) 4765 return -EINVAL; 4766 4767 net = (struct net *)ctl->extra1; 4768 delay = net->ipv6.sysctl.flush_delay; 4769 proc_dointvec(ctl, write, buffer, lenp, ppos); 4770 fib6_run_gc(delay <= 0 ? 
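/* Usage sketch (illustrative): "sysctl -w net.ipv6.route.flush=1" reaches
 * this handler with write != 0. A positive delay forces a gc pass with that
 * many jiffies as the entry timeout; a non-positive delay triggers an
 * opportunistic pass using the default ip6_rt_gc_interval. Note that delay
 * is sampled before proc_dointvec() stores the new value, so the previously
 * written value governs the current pass. */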
0 : (unsigned long)delay, net, delay > 0); 4771 return 0; 4772 } 4773 4774 struct ctl_table ipv6_route_table_template[] = { 4775 { 4776 .procname = "flush", 4777 .data = &init_net.ipv6.sysctl.flush_delay, 4778 .maxlen = sizeof(int), 4779 .mode = 0200, 4780 .proc_handler = ipv6_sysctl_rtcache_flush 4781 }, 4782 { 4783 .procname = "gc_thresh", 4784 .data = &ip6_dst_ops_template.gc_thresh, 4785 .maxlen = sizeof(int), 4786 .mode = 0644, 4787 .proc_handler = proc_dointvec, 4788 }, 4789 { 4790 .procname = "max_size", 4791 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 4792 .maxlen = sizeof(int), 4793 .mode = 0644, 4794 .proc_handler = proc_dointvec, 4795 }, 4796 { 4797 .procname = "gc_min_interval", 4798 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 4799 .maxlen = sizeof(int), 4800 .mode = 0644, 4801 .proc_handler = proc_dointvec_jiffies, 4802 }, 4803 { 4804 .procname = "gc_timeout", 4805 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 4806 .maxlen = sizeof(int), 4807 .mode = 0644, 4808 .proc_handler = proc_dointvec_jiffies, 4809 }, 4810 { 4811 .procname = "gc_interval", 4812 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 4813 .maxlen = sizeof(int), 4814 .mode = 0644, 4815 .proc_handler = proc_dointvec_jiffies, 4816 }, 4817 { 4818 .procname = "gc_elasticity", 4819 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 4820 .maxlen = sizeof(int), 4821 .mode = 0644, 4822 .proc_handler = proc_dointvec, 4823 }, 4824 { 4825 .procname = "mtu_expires", 4826 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 4827 .maxlen = sizeof(int), 4828 .mode = 0644, 4829 .proc_handler = proc_dointvec_jiffies, 4830 }, 4831 { 4832 .procname = "min_adv_mss", 4833 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 4834 .maxlen = sizeof(int), 4835 .mode = 0644, 4836 .proc_handler = proc_dointvec, 4837 }, 4838 { 4839 .procname = "gc_min_interval_ms", 4840 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 4841 .maxlen = sizeof(int), 4842 .mode = 0644, 4843 .proc_handler = proc_dointvec_ms_jiffies, 4844 }, 4845 { } 4846 }; 4847 4848 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 4849 { 4850 struct ctl_table *table; 4851 4852 table = kmemdup(ipv6_route_table_template, 4853 sizeof(ipv6_route_table_template), 4854 GFP_KERNEL); 4855 4856 if (table) { 4857 table[0].data = &net->ipv6.sysctl.flush_delay; 4858 table[0].extra1 = net; 4859 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 4860 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 4861 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 4862 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 4863 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 4864 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 4865 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 4866 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 4867 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 4868 4869 /* Don't export sysctls to unprivileged users */ 4870 if (net->user_ns != &init_user_ns) 4871 table[0].procname = NULL; 4872 } 4873 4874 return table; 4875 } 4876 #endif 4877 4878 static int __net_init ip6_route_net_init(struct net *net) 4879 { 4880 int ret = -ENOMEM; 4881 4882 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 4883 sizeof(net->ipv6.ip6_dst_ops)); 4884 4885 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 4886 goto out_ip6_dst_ops; 4887 4888 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 4889 sizeof(*net->ipv6.ip6_null_entry), 4890 GFP_KERNEL); 4891 if (!net->ipv6.ip6_null_entry) 4892 goto 
out_ip6_dst_entries; 4893 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 4894 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 4895 ip6_template_metrics, true); 4896 4897 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4898 net->ipv6.fib6_has_custom_rules = false; 4899 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 4900 sizeof(*net->ipv6.ip6_prohibit_entry), 4901 GFP_KERNEL); 4902 if (!net->ipv6.ip6_prohibit_entry) 4903 goto out_ip6_null_entry; 4904 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 4905 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 4906 ip6_template_metrics, true); 4907 4908 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 4909 sizeof(*net->ipv6.ip6_blk_hole_entry), 4910 GFP_KERNEL); 4911 if (!net->ipv6.ip6_blk_hole_entry) 4912 goto out_ip6_prohibit_entry; 4913 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 4914 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 4915 ip6_template_metrics, true); 4916 #endif 4917 4918 net->ipv6.sysctl.flush_delay = 0; 4919 net->ipv6.sysctl.ip6_rt_max_size = 4096; 4920 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 4921 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 4922 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 4923 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 4924 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 4925 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 4926 4927 net->ipv6.ip6_rt_gc_expire = 30*HZ; 4928 4929 ret = 0; 4930 out: 4931 return ret; 4932 4933 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4934 out_ip6_prohibit_entry: 4935 kfree(net->ipv6.ip6_prohibit_entry); 4936 out_ip6_null_entry: 4937 kfree(net->ipv6.ip6_null_entry); 4938 #endif 4939 out_ip6_dst_entries: 4940 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 4941 out_ip6_dst_ops: 4942 goto out; 4943 } 4944 4945 static void __net_exit ip6_route_net_exit(struct net *net) 4946 { 4947 kfree(net->ipv6.ip6_null_entry); 4948 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4949 kfree(net->ipv6.ip6_prohibit_entry); 4950 kfree(net->ipv6.ip6_blk_hole_entry); 4951 #endif 4952 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 4953 } 4954 4955 static int __net_init ip6_route_net_init_late(struct net *net) 4956 { 4957 #ifdef CONFIG_PROC_FS 4958 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); 4959 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops); 4960 #endif 4961 return 0; 4962 } 4963 4964 static void __net_exit ip6_route_net_exit_late(struct net *net) 4965 { 4966 #ifdef CONFIG_PROC_FS 4967 remove_proc_entry("ipv6_route", net->proc_net); 4968 remove_proc_entry("rt6_stats", net->proc_net); 4969 #endif 4970 } 4971 4972 static struct pernet_operations ip6_route_net_ops = { 4973 .init = ip6_route_net_init, 4974 .exit = ip6_route_net_exit, 4975 }; 4976 4977 static int __net_init ipv6_inetpeer_init(struct net *net) 4978 { 4979 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 4980 4981 if (!bp) 4982 return -ENOMEM; 4983 inet_peer_base_init(bp); 4984 net->ipv6.peers = bp; 4985 return 0; 4986 } 4987 4988 static void __net_exit ipv6_inetpeer_exit(struct net *net) 4989 { 4990 struct inet_peer_base *bp = net->ipv6.peers; 4991 4992 net->ipv6.peers = NULL; 4993 inetpeer_invalidate_tree(bp); 4994 kfree(bp); 4995 } 4996 4997 static struct pernet_operations ipv6_inetpeer_ops = { 4998 .init = ipv6_inetpeer_init, 4999 .exit = ipv6_inetpeer_exit, 5000 }; 5001 5002 static struct pernet_operations ip6_route_net_late_ops = { 5003 .init = ip6_route_net_init_late, 5004 .exit = 
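/* Editorial note: these late ops are registered only after ip6_route_net_ops
 * and fib6_init() have built the per-namespace tables, so the
 * /proc/net/ipv6_route and /proc/net/rt6_stats files created by
 * ip6_route_net_init_late() always see fully initialised state. */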
ip6_route_net_exit_late, 5005 }; 5006 5007 static struct notifier_block ip6_route_dev_notifier = { 5008 .notifier_call = ip6_route_dev_notify, 5009 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5010 }; 5011 5012 void __init ip6_route_init_special_entries(void) 5013 { 5014 /* Registration of the loopback device happens before this code runs, 5015 * so the loopback reference in rt6_info has not been taken; take it 5016 * manually for init_net. */ 5017 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5018 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5019 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5020 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5021 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5022 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5023 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5024 #endif 5025 } 5026 5027 int __init ip6_route_init(void) 5028 { 5029 int ret; 5030 int cpu; 5031 5032 ret = -ENOMEM; 5033 ip6_dst_ops_template.kmem_cachep = 5034 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5035 SLAB_HWCACHE_ALIGN, NULL); 5036 if (!ip6_dst_ops_template.kmem_cachep) 5037 goto out; 5038 5039 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5040 if (ret) 5041 goto out_kmem_cache; 5042 5043 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5044 if (ret) 5045 goto out_dst_entries; 5046 5047 ret = register_pernet_subsys(&ip6_route_net_ops); 5048 if (ret) 5049 goto out_register_inetpeer; 5050 5051 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5052 5053 ret = fib6_init(); 5054 if (ret) 5055 goto out_register_subsys; 5056 5057 ret = xfrm6_init(); 5058 if (ret) 5059 goto out_fib6_init; 5060 5061 ret = fib6_rules_init(); 5062 if (ret) 5063 goto xfrm6_init; 5064 5065 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5066 if (ret) 5067 goto fib6_rules_init; 5068 5069 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5070 inet6_rtm_newroute, NULL, 0); 5071 if (ret < 0) 5072 goto out_register_late_subsys; 5073 5074 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5075 inet6_rtm_delroute, NULL, 0); 5076 if (ret < 0) 5077 goto out_register_late_subsys; 5078 5079 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5080 inet6_rtm_getroute, NULL, 5081 RTNL_FLAG_DOIT_UNLOCKED); 5082 if (ret < 0) 5083 goto out_register_late_subsys; 5084 5085 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5086 if (ret) 5087 goto out_register_late_subsys; 5088 5089 for_each_possible_cpu(cpu) { 5090 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 5091 5092 INIT_LIST_HEAD(&ul->head); 5093 spin_lock_init(&ul->lock); 5094 } 5095 5096 out: 5097 return ret; 5098 5099 out_register_late_subsys: 5100 rtnl_unregister_all(PF_INET6); 5101 unregister_pernet_subsys(&ip6_route_net_late_ops); 5102 fib6_rules_init: 5103 fib6_rules_cleanup(); 5104 xfrm6_init: 5105 xfrm6_fini(); 5106 out_fib6_init: 5107 fib6_gc_cleanup(); 5108 out_register_subsys: 5109 unregister_pernet_subsys(&ip6_route_net_ops); 5110 out_register_inetpeer: 5111 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5112 out_dst_entries: 5113 dst_entries_destroy(&ip6_dst_blackhole_ops); 5114 out_kmem_cache: 5115 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5116 goto out; 5117 } 5118 5119 void ip6_route_cleanup(void) 5120 { 5121 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5122
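/* Teardown mirrors ip6_route_init() largely in reverse: the netdev notifier
 * goes first so no new events touch per-netns state, then the late pernet
 * ops (/proc files), policy rules, xfrm hooks, fib6 gc, the remaining pernet
 * subsystems, and finally the dst counters and the rt6_info kmem cache. */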
unregister_pernet_subsys(&ip6_route_net_late_ops); 5123 fib6_rules_cleanup(); 5124 xfrm6_fini(); 5125 fib6_gc_cleanup(); 5126 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5127 unregister_pernet_subsys(&ip6_route_net_ops); 5128 dst_entries_destroy(&ip6_dst_blackhole_ops); 5129 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5130 } 5131
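/*
 * Userspace usage sketch (illustrative iproute2 commands; the 2001:db8::
 * addresses are documentation prefixes, not taken from this file):
 *
 *   # RTM_NEWROUTE carrying RTA_MULTIPATH -> ip6_route_multipath_add()
 *   ip -6 route add 2001:db8::/64 \
 *           nexthop via fe80::1 dev eth0 weight 1 \
 *           nexthop via fe80::2 dev eth0 weight 2
 *
 *   # RTM_GETROUTE with RTM_F_FIB_MATCH -> inet6_rtm_getroute(), which
 *   # returns the matching FIB entry rather than a cloned dst
 *   ip -6 route get fibmatch 2001:db8::1
 *
 * On the wire "weight N" is sent as rtnh_hops = N - 1, matching the
 * rt->rt6i_nh_weight = rtnh->rtnh_hops + 1 recovery in
 * ip6_route_multipath_add() above.
 */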